diff --git a/spec.txt b/spec.txt index 14c4d42..7d65af3 100644 --- a/spec.txt +++ b/spec.txt @@ -14,5 +14,5 @@ otherwise, need to -[ ] collect 10 years of bot activity on mwf -[ ] organize that +[ x ] collect 10 years of bot activity on mwf +[ x ] organize that diff --git a/src/helper_scripts/decompression_script.py b/src/helper_scripts/decompression_script.py index c66230e..8b0c49b 100644 --- a/src/helper_scripts/decompression_script.py +++ b/src/helper_scripts/decompression_script.py @@ -4,7 +4,7 @@ import shutil import os import sys -#FILE_LOC_PREFIX = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/yearly_activity_files/" +#FILE_LOC_PREFIX = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/single_activity_files/" def decompress(filepath): decompressed_filepath = filepath[:-4] diff --git a/src/helper_scripts/existing_file_checking.py b/src/helper_scripts/existing_file_checking.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lib/spark-warehouse/activity_isolation.py b/src/lib/spark-warehouse/activity_isolation.py new file mode 100644 index 0000000..312c503 --- /dev/null +++ b/src/lib/spark-warehouse/activity_isolation.py @@ -0,0 +1,203 @@ +import re +import os +from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, IntegerType, ArrayType +#from pyspark.sql.functions import count, lit, desc, col, array +import pyspark.sql.functions as F +from pyspark.sql import SparkSession + +os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64" +os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre" + +# Unescaping and array-splitting UDFs +def unescape(str): + if (str is None): + return None + else: + return str.replace("\\n", "\n").replace("\\r", "\r").replace("\\t", "\t") +# The comma splitter applies a negative lookahead for \ to prevent splitting escaped commas +def toArray(str): + if (str is None): + return [] + else: + return [s.strip().replace("\\,", ",") for s in re.split("(? (447 + 56) / 8283]\r" + "[Stage 0:==> (397 + 56) / 8283]\r" ] } ], @@ -438,7 +436,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, diff --git a/src/lib/spark-warehouse/bot_isolation.py b/src/lib/spark-warehouse/bot_isolation.py index de73e18..b840475 100644 --- a/src/lib/spark-warehouse/bot_isolation.py +++ b/src/lib/spark-warehouse/bot_isolation.py @@ -7,6 +7,8 @@ from pyspark.sql import SparkSession os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64" os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre" +# source /opt/conda-analytics/bin/activate + # Unescaping and array-splitting UDFs def unescape(str): if (str is None): diff --git a/src/lib/wiki_get.py b/src/lib/wiki_get.py index e6ddd9e..d9080ca 100644 --- a/src/lib/wiki_get.py +++ b/src/lib/wiki_get.py @@ -115,7 +115,8 @@ def parse_tech_news(wikitext): if __name__ == "__main__": - dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/noticeboard" + dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/ve-rfcs" + #ve-rfcs files = os.listdir(dir_path) print(files) #file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt') @@ -124,7 +125,7 @@ if __name__ == "__main__": print(file) file_wikitext = read_file(dir_path + "/" + file) json_discussion = parse_talkpage2(file_wikitext) - json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/bnb-archives/" + file.split(".")[0][:-4] + ".json") + json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/" + file.split(".")[0] + ".json") ''' file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt") json_discussion = parse_tech_news(file_wikitext)