1
0
mw-convo-collections/src/lib/old_wiki_approach/get_wiki_activity.py

50 lines
1.4 KiB
Python

import bz2
import os
import re
import json
import pandas as pd
import sys
import findspark
#findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
# Environment for PySpark: set the Hadoop config directory and the Java
# locations, and make both the driver and the workers use the interpreter
# that is running this script.
# NOTE(review): HADOOP_CONF_DIR is set to what looks like the Spark install
# root rather than a conf directory -- confirm this is intended.
os.environ.update(
    {
        'HADOOP_CONF_DIR': "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2",
        'PYSPARK_PYTHON': sys.executable,
        'PYSPARK_DRIVER_PYTHON': sys.executable,
        'JAVA_HOME': "/usr/lib/jvm/java-11-openjdk-amd64",
        'JRE_HOME': "/usr/lib/jvm/java-11-openjdk-amd64/jre",
    }
)
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
#source /opt/conda-analytics/bin/activate
import wmfdata.spark as wmfspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
#source /opt/conda-analytics/bin/activate
import wmfdata.spark as wmfspark
## Spark session: created through wmfdata's helper in "local" mode.
# NOTE(review): spark_config is never passed to create_session; it is kept
# only as a module-level name -- confirm whether it was meant to carry
# extra settings.
spark_config = {}
spark = wmfspark.create_session(type="local")
# Print the session object so the run shows that it was created.
print(spark)
# Build and display the sorted list of distinct Wikipedia database names
# (column "wiki_db") from the project/namespace map table.
#
# `snapshot` was referenced below without ever being assigned, so the script
# stopped with a NameError before the query ran. It is now resolved to the
# newest snapshot present in the table.
# NOTE(review): assumes snapshot values sort chronologically as strings
# (e.g. "YYYY-MM") -- confirm; assign a fixed value here instead if runs
# must be reproducible.
project_namespace_map = spark.read.table("wmf_raw.mediawiki_project_namespace_map")
snapshot = project_namespace_map.agg(F.max("snapshot")).first()[0]
if snapshot is None:
    # max() over an empty table yields NULL; filtering on NULL would
    # silently produce an empty project list, so fail loudly instead.
    raise RuntimeError(
        "wmf_raw.mediawiki_project_namespace_map contains no snapshots"
    )

df_projects = (
    project_namespace_map
    # restrict to the chosen snapshot
    .where(F.col("snapshot") == snapshot)
    # keep only rows whose hostname contains "wikipedia"
    .where(F.col("hostname").contains("wikipedia"))
    .select(F.col("dbname").alias("wiki_db"))
    # collapse repeated dbname rows to one row per wiki
    .distinct()
    .orderBy("wiki_db")
)
df_projects.show()
# TODO: get a list of bots in the project
# TODO: get all mws wikis
# TODO: get the page creation and page edit events of those bots
#       for each of the wikis