
cleaning and reorganizing, along with adding a data dump of current data

This commit is contained in:
Matthew Gaughan 2025-05-27 16:05:59 -05:00
parent 993bbe658b
commit 228a6b07ae
35 changed files with 0 additions and 2702543 deletions

Binary file not shown.


@@ -1,18 +0,0 @@
this tool needs to
[ ] grab conversations from phabricator (see the sketch after this list)
[ ] what was said
[ ] who said it
[ ] in response to who
[ ] grab text data from wiki pages
[ ] page edit histories
[ ] who said it
[ ~ ] talk pages?
[ x ] who said what and when
[ x ] in response to who
[ ~ ] structure both of those in json files
[ ] construct folders of the tagged json files
otherwise, need to
[ x ] collect 10 years of bot activity on mwf
[ x ] organize that
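For the Phabricator items above, a minimal sketch of what grabbing a task's conversation could look like, assuming Phabricator's Conduit API (transaction.search) and the requests library; the token, task ID, and output filename are placeholders, and the response fields should be double-checked against the Conduit documentation:

import json
import requests

PHAB_API = "https://phabricator.wikimedia.org/api/transaction.search"
API_TOKEN = "api-XXXXXXXXXXXXXXXXXXXXXXXXXXXX"  # placeholder Conduit API token

def fetch_task_conversation(task_id):
    # transaction.search returns comments, status changes, etc. for one object
    resp = requests.post(PHAB_API, data={"api.token": API_TOKEN, "objectIdentifier": task_id})
    resp.raise_for_status()
    transactions = resp.json()["result"]["data"]
    conversation = []
    for t in transactions:
        if t.get("type") != "comment" or not t.get("comments"):
            continue
        conversation.append({
            "who": t["authorPHID"],                      # who said it (PHID, not username)
            "when": t["dateCreated"],                    # unix timestamp
            "what": t["comments"][0]["content"]["raw"],  # what was said
            # Conduit returns comments as a flat list, so "in response to who"
            # has to be inferred from ordering or quoting rather than read directly
        })
    return conversation

with open("T12345.json", "w") as out:  # placeholder task ID and filename
    json.dump(fetch_task_conversation("T12345"), out)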

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,92 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import bz2\n",
"import os\n",
"import re\n",
"import json\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import findspark\n",
"findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
"os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F, types as T, Window\n",
"\n",
"#source /opt/conda-analytics/bin/activate\n",
"import wmfdata.spark as wmfspark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"24/12/12 11:55:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
]
}
],
"source": [
"## defining the spark session\n",
"spark_config = {}\n",
"spark = wmfspark.create_session(\n",
" app_name='Pyspark notebook', \n",
" type='regular'\n",
"# extra_settings=spark_config\n",
")\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"success\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -1,49 +0,0 @@
import bz2
import os
import re
import json
import pandas as pd
import sys
import findspark
#findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
os.environ['HADOOP_CONF_DIR'] = "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2"
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre"

from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window
#source /opt/conda-analytics/bin/activate
import wmfdata.spark as wmfspark

## defining the spark session
spark_config = {}
spark = wmfspark.create_session(
    type="local"
)
print(spark)

# `snapshot` was used below but never defined; placeholder value, set this to the
# monthly snapshot partition you intend to query
snapshot = "2024-11"

# list the Wikipedia project databases present in the namespace map for this snapshot
df_projects = (
    spark.read.table("wmf_raw.mediawiki_project_namespace_map")
    .where(F.col("snapshot") == snapshot)
    .where(F.col("hostname").contains("wikipedia"))
    .select(F.col("dbname").alias("wiki_db"))
    .distinct()
    .orderBy("wiki_db")
)
df_projects.show()

# TODO: get a list of bots in the project
# TODO: get all mws wikis
# TODO: get the page creation and page edit events of those bots for each of the wikis
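
One possible shape for those TODOs, sketched as an illustration rather than taken from the repository: pull page-creation and edit (revision-creation) events made by accounts flagged as bots from the Data Lake's wmf.mediawiki_history table. It reuses spark, F, and snapshot from the script above; the table and column names follow the published mediawiki_history schema and should be checked against the current version before use.

bot_events = (
    spark.read.table("wmf.mediawiki_history")
    .where(F.col("snapshot") == snapshot)
    # flagged as a bot by username convention and/or bot-group membership
    .where(F.size(F.col("event_user_is_bot_by")) > 0)
    # page creations plus ordinary edits (revision creations)
    .where(
        ((F.col("event_entity") == "page") & (F.col("event_type") == "create"))
        | ((F.col("event_entity") == "revision") & (F.col("event_type") == "create"))
    )
    .select("wiki_db", "event_timestamp", "event_user_text",
            "event_entity", "event_type", "page_id", "page_namespace")
)
bot_events.groupBy("wiki_db", "event_user_text").count().show(20, truncate=False)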


@@ -1,90 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import bz2\n",
"import os\n",
"import re\n",
"import json\n",
"import pandas as pd\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import findspark\n",
"#findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
"os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
"os.environ['PYSPARK_PYTHON'] = sys.executable\n",
"os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable\n",
"os.environ['JAVA_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n",
"os.environ['JRE_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64/jre\"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql import functions as F, types as T, Window\n",
"\n",
"#source /opt/conda-analytics/bin/activate\n",
"import wmfdata.spark as wmfspark"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'spark' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_projects \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m----> 2\u001b[0m \u001b[43mspark\u001b[49m\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mtable(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwmf_raw.mediawiki_project_namespace_map\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msnapshot\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m==\u001b[39msnapshot)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhostname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwikipedia\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 5\u001b[0m \u001b[38;5;241m.\u001b[39mselect(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdbname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39malias(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 6\u001b[0m \u001b[38;5;241m.\u001b[39mdistinct()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;241m.\u001b[39morderBy(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m )\n\u001b[1;32m 9\u001b[0m df_projects\u001b[38;5;241m.\u001b[39mshow()\n",
"\u001b[0;31mNameError\u001b[0m: name 'spark' is not defined"
]
}
],
"source": [
"df_projects = (\n",
" spark.read.table(\"wmf_raw.mediawiki_project_namespace_map\")\n",
" .where(F.col(\"snapshot\")==snapshot)\n",
" .where(F.col(\"hostname\").contains(\"wikipedia\"))\n",
" .select(F.col(\"dbname\").alias(\"wiki_db\"))\n",
" .distinct()\n",
" .orderBy(\"wiki_db\")\n",
")\n",
"df_projects.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
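
The NameError above comes from querying before any cell creates the Spark session; a minimal fix cell, mirroring the session setup used elsewhere in this commit (the snapshot value is a placeholder), would be:

spark = wmfspark.create_session(type="local")
snapshot = "2024-11"  # placeholder: the monthly snapshot partition to read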


@@ -1,136 +0,0 @@
import os
import re
import datetime
import uuid
import json
import mwparserfromhell
import mwchatter as mwc


def read_file(filename):
    with open(filename, 'r') as file:
        file_content = file.read()
    return file_content


# regex-based parser: splits a talk page into signed comments, tracking section
# headers, authors, reply threads, and signature timestamps
def parse_talkpage(wikitext):
    wikicode = mwparserfromhell.parse(wikitext)
    arraytext = wikicode.split('\n')
    user_pattern = r"\[\[User:[^\|]+\|[^\]]+\]\]"
    comment = ""
    current_title = ""
    current_header = ""
    subheader = ""
    comment_dict = {}
    thread_array = []
    discussion_array = []
    for cell in arraytext:
        if re.search(r"^==.*?==$", cell):
            current_title = cell
            continue
        if re.search(r"^===.*?===$", cell):
            current_header = cell
            continue
        if re.search(r"^====.*?====$", cell):
            subheader = cell
            continue
        comment += cell
        match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell)
        #match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell)
        if match:
            comment_id = uuid.uuid4()
            user = match.group()
            split_comment = comment.split(user)
            comment_dict['id'] = str(comment_id)
            comment_dict['text'] = split_comment[0]
            comment_dict['title'] = current_title
            comment_dict['header'] = current_header
            comment_dict['subheader'] = subheader
            comment_dict['author'] = user.split("|")[0][12:]
            # reconstruct the reply thread from the leading '*'/':' indentation depth
            if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'):
                comment_dict['thread'] = []
                thread_array = [comment_dict['id']]
            else:
                comment_ = comment_dict['text']
                level = 0
                while comment_.startswith('*') or comment_.startswith(':'):
                    level += 1
                    comment_ = comment_[1:]
                thread_array = thread_array[:level]
                comment_dict['thread'] = thread_array
                thread_array.append(comment_dict['id'])
            # recover the signature timestamp from the text after the user link
            string_time = split_comment[-1].split(" ")[-5:]
            if string_time[-1] == "":
                string_time = split_comment[-1].split(" ")[-6:]
                string_time[0] = string_time[0][-6:]
            if string_time[-1] == "(UTC)":
                comment_dict['time'] = " ".join(string_time)
                #comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
            #print(comment_dict)
            discussion_array.append(comment_dict)
            comment = ""
            comment_dict = {}
    return discussion_array


# alternative parser that delegates to mwchatter
def parse_talkpage2(wikitext):
    parsed_text = mwc.parse(wikitext)
    return parsed_text


def json_it(array_of_dicts, filename):
    json_ = json.dumps(array_of_dicts)
    with open(filename, 'w') as json_file:
        json_file.write(json_)


# splits an archive of Tech News messages into per-issue sections keyed by their
# bolded section titles
def parse_tech_news(wikitext):
    wikicode = mwparserfromhell.parse(wikitext)
    arraytext = wikicode.split('\n')
    message_array = []
    comment_dict = {}
    text_dict = {}
    raw_message = ""
    current_section = "header"
    text_dict[current_section] = []
    for cell in arraytext:
        raw_message += cell
        if re.search(r"^==.*?==$", cell):
            #issue = cell.split("Tech News: ")[1]
            comment_dict['issue'] = cell
        if re.search(r"^'''.*?'''$", cell):
            current_section = cell[2:-3]
            text_dict[current_section] = []
            continue
        text_dict[current_section].append(cell)
        if "<!--" in cell and "-->" in cell:
            comment_dict['raw'] = raw_message
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)
            raw_message = ""
            comment_dict = {}
            text_dict = {}
            current_section = "header"
            text_dict[current_section] = []
    return message_array


if __name__ == "__main__":
    #dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/ve-rfcs"
    #ve-rfcs
    #files = os.listdir(dir_path)
    #print(files)
    #file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
    #json_discussion = parse_talkpage2(file_wikitext)
    '''
    for file in files:
        print(file)
        file_wikitext = read_file(dir_path + "/" + file)
        json_discussion = parse_talkpage2(file_wikitext)
        json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/" + file.split(".")[0] + ".json")
    '''
    file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/https-talk-raw.txt")
    json_discussion = parse_talkpage2(file_wikitext)
    json_it(json_discussion, "test.json")
    #json_discussion = parse_talkpage(file_wikitext)
    #json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json")
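
A quick, hypothetical smoke test for the regex-based parser (not part of the original file), using a made-up two-comment talk-page snippet:

sample = (
    "== Proposal ==\n"
    "Support this change. [[User talk:Alice|talk]] 12:34, 5 June 2013 (UTC)\n"
    ":Agreed, worth doing. [[User talk:Bob|talk]] 13:00, 5 June 2013 (UTC)\n"
)
for entry in parse_talkpage(sample):
    print(entry["author"], entry.get("time"), entry["thread"])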