import re
import os

from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, IntegerType, ArrayType
#from pyspark.sql.functions import count, lit, desc, col, array
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre"

# Unescaping and array-splitting UDFs
def unescape(s):
    if s is None:
        return None
    return s.replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t")

# The comma splitter applies a negative lookbehind for \ so that escaped
# commas ("\,") are not treated as delimiters
def toArray(s):
    if s is None:
        return []
    return [part.strip().replace("\\,", ",") for part in re.split(r"(?<!\\),", s)]
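
# --- Usage sketch (an illustration, not from the original source): the helpers
# above are described as UDFs, so this shows one way to register and apply them
# with pyspark.sql.functions.udf. The names `spark`, `df`, and the column names
# "body" and "tags" are assumptions for the example; adapt them to the real
# pipeline and input data.
spark = SparkSession.builder.appName("tsv-cleanup").getOrCreate()

# Wrap the plain Python functions as Spark UDFs with explicit return types.
unescape_udf = F.udf(unescape, StringType())
to_array_udf = F.udf(toArray, ArrayType(StringType()))

# Hypothetical application: unescape a text column and split an
# escaped, comma-delimited column into an array column.
# df = spark.read.csv("data.tsv", sep="\t", header=True)
# df = (df.withColumn("body", unescape_udf(F.col("body")))
#         .withColumn("tags", to_array_udf(F.col("tags"))))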