# Exploratory Spark script: enumerate Wikipedia wiki_dbs from the
# wmf_raw.mediawiki_project_namespace_map table (see TODOs at the bottom
# for the intended bot page-creation / page-edit analysis).
# Standard library.
import bz2
import json
import os
import re
import sys

# Third party.
import pandas as pd
import findspark

# findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")

# Point Spark/Hadoop at the local installation, and pin both the PySpark
# driver and workers to the interpreter running this script so they agree
# on the Python version.
os.environ.update({
    'HADOOP_CONF_DIR': "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2",
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'JAVA_HOME': "/usr/lib/jvm/java-11-openjdk-amd64",
    'JRE_HOME': "/usr/lib/jvm/java-11-openjdk-amd64/jre",
})
|
|
# NOTE(review): this import block was duplicated verbatim in the original
# file; the duplicate copy has been removed.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window

# Activate the analytics conda environment before running:
#   source /opt/conda-analytics/bin/activate
import wmfdata.spark as wmfspark
|
|
|
|
## Define the Spark session via wmfdata.
# NOTE(review): `spark_config` is created but never passed to
# create_session — presumably intended as a config dict; confirm
# against the wmfdata API before relying on it.
spark_config = {}

spark = wmfspark.create_session(type="local")
print(spark)
|
|
# BUG FIX: `snapshot` was referenced below but never defined anywhere in the
# script, so this query raised NameError at runtime. Derive it from the
# environment with an explicit fallback.
# TODO(review): confirm the correct snapshot partition value for wmf_raw.
snapshot = os.environ.get("MEDIAWIKI_SNAPSHOT", "2023-06")

# All Wikipedia-family wiki database names, one row per wiki_db, sorted.
df_projects = (
    spark.read.table("wmf_raw.mediawiki_project_namespace_map")
    # Restrict to a single snapshot partition of the namespace map.
    .where(F.col("snapshot") == snapshot)
    # Keep only Wikipedia projects (drops wiktionary, commons, etc.).
    .where(F.col("hostname").contains("wikipedia"))
    .select(F.col("dbname").alias("wiki_db"))
    .distinct()
    .orderBy("wiki_db")
)

df_projects.show()
|
|
|
|
# TODO: get a list of bots in the project
# TODO: get all mws wikis
# TODO: get the page-creation and page-edit events of those bots
#       for each of the wikis
|