"""List the Wikipedia project databases known to a MediaWiki snapshot.

The script points PySpark at the local Java/Python installation, opens a
local Spark session through ``wmfdata``, and prints the distinct ``dbname``
values (aliased to ``wiki_db``) from
``wmf_raw.mediawiki_project_namespace_map`` whose hostname contains
"wikipedia", for a single snapshot.
"""

import bz2
import json
import os
import re
import sys

import findspark
import pandas as pd

# findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")

# Environment for the Spark driver and workers. These are set before pyspark
# is imported, matching the original statement order.
# NOTE(review): HADOOP_CONF_DIR points at the Spark install directory rather
# than a Hadoop configuration directory -- confirm this is intended.
os.environ['HADOOP_CONF_DIR'] = "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2"
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre"

# Shell prerequisite: source /opt/conda-analytics/bin/activate
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
import wmfdata.spark as wmfspark

## defining the spark session
spark_config = {}
spark = wmfspark.create_session(
    type="local"
)
print(spark)

PROJECT_MAP_TABLE = "wmf_raw.mediawiki_project_namespace_map"
project_map = spark.read.table(PROJECT_MAP_TABLE)

# `snapshot` was previously referenced without ever being assigned, which
# raised NameError. It can now be pinned through the MW_SNAPSHOT environment
# variable; otherwise the greatest `snapshot` value present in the table is
# used.
# NOTE(review): assumes snapshot labels sort chronologically as strings
# (e.g. "2023-01" < "2023-02") -- confirm against the table's partitions.
snapshot = os.environ.get("MW_SNAPSHOT")
if snapshot is None:
    snapshot = project_map.agg(F.max("snapshot")).first()[0]
if snapshot is None:
    raise RuntimeError(
        f"No snapshot found in {PROJECT_MAP_TABLE}; set MW_SNAPSHOT explicitly."
    )

# One row per distinct database name whose hostname contains "wikipedia",
# sorted by name.
df_projects = (
    project_map
    .where(F.col("snapshot") == snapshot)
    .where(F.col("hostname").contains("wikipedia"))
    .select(F.col("dbname").alias("wiki_db"))
    .distinct()
    .orderBy("wiki_db")
)
df_projects.show()

# TODO Get a list of bots in the project
# TODO get all mws wikis
# TODO get the page creation and page edit events of those bots for each of
#      the wikis