import os
import re

from pyspark.sql import SparkSession
from pyspark.sql.functions import count, lit, desc
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType,
    BooleanType, IntegerType, ArrayType,
)

os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre"

# source /opt/conda-analytics/bin/activate


# Unescaping and array-splitting UDFs
def unescape(s):
    # Turn literal "\n", "\r", and "\t" sequences back into control characters
    if s is None:
        return None
    return s.replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")


# The comma splitter applies a negative lookbehind for \ to prevent splitting
# on escaped commas
def toArray(s):
    if s is None:
        return []
    return [part.strip().replace("\\,", ",") for part in re.split(r"(?<!\\),", s)]
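

# A minimal sketch of how these helpers might be wired up as Spark UDFs, since
# the comment above refers to them as UDFs; the column names below are
# illustrative assumptions, not taken from the original script.
from pyspark.sql.functions import udf

unescape_udf = udf(unescape, StringType())            # "a\\nb"    -> "a\nb"
to_array_udf = udf(toArray, ArrayType(StringType()))  # "a, b\\,c" -> ["a", "b,c"]

# Example usage on a hypothetical DataFrame `df`:
#   df = (df.withColumn("comment", unescape_udf("comment_escaped"))
#           .withColumn("tags", to_array_udf("tags_raw")))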