cleaning and reorganizing, along with adding a data dump of current data
parent 993bbe658b
commit 228a6b07ae
BIN
data-snapshots/052725_mw_lifecycles_data.tar.gz
Normal file
Binary file not shown.
18
spec.txt
@@ -1,18 +0,0 @@
this tool needs to
[ ] grab conversations from phabricator
[ ] what was said
[ ] who said it
[ ] in response to who
[ ] grab text data from wiki pages
[ ] page edit histories
[ ] who said it
[ ~ ] talk pages?
[ x ] who said what and when
[ x ] in response to who
[ ~ ] structure both of those in json files
[ ] construct folders of the tagged json files


otherwise, need to
[ x ] collect 10 years of bot activity on mwf
[ x ] organize that
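The "structure both of those in json files" item maps onto the comment records that parse_talkpage (in the parsing script further down this diff) emits. A minimal sketch of one such record, using that function's field names; every value here is invented for illustration:

    # hypothetical example of one structured comment record; field names
    # come from parse_talkpage below, values are made up
    comment_record = {
        "id": "3f2b8c1e-0000-4000-8000-000000000000",  # uuid4 string per comment
        "text": "I support this change because ...",   # body before the signature
        "title": "== RfC: rollout ==",                 # enclosing == section ==
        "header": "=== Discussion ===",                # enclosing === subsection ===
        "subheader": "",                               # ==== level, when present
        "author": "ExampleUser",                       # taken from the signature link
        "thread": ["parent-comment-id"],               # ancestor ids (reply nesting)
        "time": "12:34, 5 June 2013 (UTC)",
    }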
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,92 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import bz2\n",
    "import os\n",
    "import re\n",
    "import json\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
    "os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql import functions as F, types as T, Window\n",
    "\n",
    "#source /opt/conda-analytics/bin/activate\n",
    "import wmfdata.spark as wmfspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "24/12/12 11:55:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
     ]
    }
   ],
   "source": [
    "## defining the spark session\n",
    "spark_config = {}\n",
    "spark = wmfspark.create_session(\n",
    "    app_name='Pyspark notebook', \n",
    "    type='regular'\n",
    "# extra_settings=spark_config\n",
    ")\n",
    "spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"success\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
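The commented-out extra_settings argument in that notebook is the hook for per-session Spark configuration. A minimal sketch of passing it through, where the specific configuration keys and values are assumptions chosen for illustration (app_name, type, and extra_settings themselves appear in the notebook above):

    import wmfdata.spark as wmfspark

    # hypothetical Spark settings; keys are standard Spark conf names,
    # values are placeholders
    spark_config = {
        "spark.driver.memory": "4g",
        "spark.sql.shuffle.partitions": "256",
    }
    spark = wmfspark.create_session(
        app_name="Pyspark notebook",
        type="regular",
        extra_settings=spark_config,  # passed through instead of left commented out
    )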
@@ -1,49 +0,0 @@
import bz2
import os
import re
import json
import pandas as pd
import sys

import findspark
#findspark.init("/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2")
os.environ['HADOOP_CONF_DIR'] = "/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2"
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ['JRE_HOME'] = "/usr/lib/jvm/java-11-openjdk-amd64/jre"
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window

#source /opt/conda-analytics/bin/activate
import wmfdata.spark as wmfspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T, Window

#source /opt/conda-analytics/bin/activate
import wmfdata.spark as wmfspark

## defining the spark session

spark_config = {}
spark = wmfspark.create_session(
    type="local"
)
print(spark)
df_projects = (
    spark.read.table("wmf_raw.mediawiki_project_namespace_map")
    .where(F.col("snapshot")==snapshot)
    .where(F.col("hostname").contains("wikipedia"))
    .select(F.col("dbname").alias("wiki_db"))
    .distinct()
    .orderBy("wiki_db")
)
df_projects.show()

# TODO Get a list of bots in the project

# TODO get all mws wikis

# TODO get the
# page creation and page edit
# events of those bots for each of the wikis
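Note that `snapshot` is never defined in this script before the df_projects query, so it would raise a NameError as written (the same class of failure the next notebook records for `spark`). A minimal sketch of the missing definition; the partition value is an assumption, since wmf_raw tables are partitioned by monthly snapshot strings:

    # hypothetical snapshot partition; the right value depends on which
    # monthly snapshots are actually available in wmf_raw
    snapshot = "2024-11"

    df_projects = (
        spark.read.table("wmf_raw.mediawiki_project_namespace_map")
        .where(F.col("snapshot") == snapshot)  # now resolves instead of raising NameError
        .where(F.col("hostname").contains("wikipedia"))
        .select(F.col("dbname").alias("wiki_db"))
        .distinct()
        .orderBy("wiki_db")
    )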
@@ -1,90 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import bz2\n",
    "import os\n",
    "import re\n",
    "import json\n",
    "import pandas as pd\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "#findspark.init(\"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\")\n",
    "os.environ['HADOOP_CONF_DIR'] = \"/home/SOC.NORTHWESTERN.EDU/nws8519/spark-3.2.0-bin-hadoop3.2\"\n",
    "os.environ['PYSPARK_PYTHON'] = sys.executable\n",
    "os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable\n",
    "os.environ['JAVA_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n",
    "os.environ['JRE_HOME'] = \"/usr/lib/jvm/java-11-openjdk-amd64/jre\"\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql import functions as F, types as T, Window\n",
    "\n",
    "#source /opt/conda-analytics/bin/activate\n",
    "import wmfdata.spark as wmfspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'spark' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m df_projects \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m----> 2\u001b[0m \u001b[43mspark\u001b[49m\u001b[38;5;241m.\u001b[39mread\u001b[38;5;241m.\u001b[39mtable(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwmf_raw.mediawiki_project_namespace_map\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msnapshot\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m==\u001b[39msnapshot)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;241m.\u001b[39mwhere(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhostname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwikipedia\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 5\u001b[0m \u001b[38;5;241m.\u001b[39mselect(F\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdbname\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39malias(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 6\u001b[0m \u001b[38;5;241m.\u001b[39mdistinct()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;241m.\u001b[39morderBy(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwiki_db\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m )\n\u001b[1;32m 9\u001b[0m df_projects\u001b[38;5;241m.\u001b[39mshow()\n",
      "\u001b[0;31mNameError\u001b[0m: name 'spark' is not defined"
     ]
    }
   ],
   "source": [
    "df_projects = (\n",
    "    spark.read.table(\"wmf_raw.mediawiki_project_namespace_map\")\n",
    "    .where(F.col(\"snapshot\")==snapshot)\n",
    "    .where(F.col(\"hostname\").contains(\"wikipedia\"))\n",
    "    .select(F.col(\"dbname\").alias(\"wiki_db\"))\n",
    "    .distinct()\n",
    "    .orderBy(\"wiki_db\")\n",
    ")\n",
    "df_projects.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
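The NameError recorded in that notebook is an execution-order problem rather than a code problem: no cell creating `spark` is run before the query cell (execution_count 7). A minimal sketch of the missing cell, mirroring the session setup used in the deleted script above:

    # create the session before any spark.read call; mirrors the script above
    spark = wmfspark.create_session(
        type="local"  # the first notebook uses type='regular' with an app_name instead
    )
    print(spark)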
@@ -1,136 +0,0 @@
import os
import re
import datetime
import uuid
import json

import mwparserfromhell

import mwchatter as mwc

def read_file(filename):
    with open(filename, 'r') as file:
        file_content = file.read()
    return file_content

def parse_talkpage(wikitext):
    wikicode = mwparserfromhell.parse(wikitext)
    arraytext = wikicode.split('\n')
    user_pattern = r"\[\[User:[^\|]+\|[^\]]+\]\]"
    comment = ""
    current_title = ""
    current_header = ""
    subheader = ""
    comment_dict = {}
    thread_array = []
    discussion_array = []
    for cell in arraytext:
        if re.search(r"^==.*?==$", cell):
            current_title = cell
            continue
        if re.search(r"^===.*?===$", cell):
            current_header = cell
            continue
        if re.search(r"^====.*?====$", cell):
            subheader = cell
            continue
        comment += cell
        match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell)
        #match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell)
        if match:
            comment_id = uuid.uuid4()
            user = match.group()
            split_comment = comment.split(user)
            comment_dict['id'] = str(comment_id)
            comment_dict['text'] = split_comment[0]
            comment_dict['title'] = current_title
            comment_dict['header'] = current_header
            comment_dict['subheader'] = subheader
            comment_dict['author'] = user.split("|")[0][12:]
            # doing stuff to figure out replies
            if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'):
                comment_dict['thread'] = []
                thread_array = [comment_dict['id']]
            else:
                comment_ = comment_dict['text']
                level = 0
                while comment_.startswith('*') or comment_.startswith(':'):
                    level += 1
                    comment_ = comment_[1:]
                thread_array = thread_array[:level]
                comment_dict['thread'] = thread_array
                thread_array.append(comment_dict['id'])
            # doing stuff to get the timestamp
            string_time = split_comment[-1].split(" ")[-5:]
            if string_time[-1] == "":
                string_time = split_comment[-1].split(" ")[-6:]
                string_time[0] = string_time[0][-6:]
            if string_time[-1] == "(UTC)":
                comment_dict['time'] = " ".join(string_time)
                #comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
            #print(comment_dict)
            discussion_array.append(comment_dict)
            comment = ""
            comment_dict = {}
    return discussion_array

def parse_talkpage2(wikitext):
    parsed_text = mwc.parse(wikitext)
    return parsed_text

def json_it(array_of_dicts, filename):
    json_ = json.dumps(array_of_dicts)
    with open(filename, 'w') as json_file:
        json_file.write(json_)

def parse_tech_news(wikitext):
    wikicode = mwparserfromhell.parse(wikitext)
    arraytext = wikicode.split('\n')
    message_array = []
    comment_dict = {}
    text_dict = {}
    raw_message = ""
    current_section = "header"
    text_dict[current_section] = []
    for cell in arraytext:
        raw_message += cell
        if re.search(r"^==.*?==$", cell):
            #issue = cell.split("Tech News: ")[1]
            comment_dict['issue'] = cell
        if re.search(r"^'''.*?'''$", cell):
            current_section = cell[2:-3]
            text_dict[current_section] = []
            continue
        text_dict[current_section].append(cell)
        if "<!--" in cell and "-->" in cell:
            comment_dict['raw'] = raw_message
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)
            raw_message = ""
            comment_dict = {}
            text_dict = {}
            current_section = "header"
            text_dict[current_section] = []
    return message_array

if __name__ == "__main__":

    #dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/ve-rfcs"
    #ve-rfcs
    #files = os.listdir(dir_path)
    #print(files)
    #file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
    #json_discussion = parse_talkpage2(file_wikitext)
    '''
    for file in files:
        print(file)
        file_wikitext = read_file(dir_path + "/" + file)
        json_discussion = parse_talkpage2(file_wikitext)
        json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/" + file.split(".")[0] + ".json")
    '''
    file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/https-talk-raw.txt")
    json_discussion = parse_talkpage2(file_wikitext)
    json_it(json_discussion, "test.json")
    #json_discussion = parse_talkpage(file_wikitext)
    #json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json")
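The subtlest part of parse_talkpage is the thread_array bookkeeping: a comment's reply depth is the count of leading * or : characters, and the ancestor list is truncated to that depth before the comment registers itself at its own depth. A standalone sketch of that logic, with ids and comment texts invented for illustration (note the original stores thread_array itself into the dict rather than a copy, so a reply's stored thread also picks up its own id when it is appended a line later):

    # standalone sketch of the reply-depth logic in parse_talkpage;
    # the comment texts and ids below are hypothetical
    thread_array = []
    for cid, text in [
        ("c1", "Should we deploy this?"),   # depth 0: a new top-level thread
        ("c2", ":I think so."),             # depth 1: reply to c1
        ("c3", "::Agreed."),                # depth 2: reply to c2
        ("c4", ":On the other hand ..."),   # depth 1 again: reply to c1
    ]:
        level = 0
        while text.startswith(('*', ':')):
            level += 1
            text = text[1:]
        ancestors = thread_array[:level]    # truncate to this comment's depth
        print(cid, "replies under", ancestors)
        thread_array = ancestors + [cid]    # this comment is now the ancestor at its depth
    # prints: c1 under [], c2 under ['c1'], c3 under ['c1', 'c2'], c4 under ['c1']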