first draft of conversation parsing for wiki talk pages

2024-12-09 14:56:31 -06:00 · 2024-12-09 14:56:31 -06:00 · b57e4c15a3
commit b57e4c15a3
parent aa30d02c48
2 changed files with 40 additions and 7 deletions
--- a/spec.txt
+++ b/spec.txt
@@ -7,7 +7,7 @@
 	[ ] page edit histories 
 	[ ] who said it
 	[ ~ ] talk pages? 
-		[ ~ ] who said what and when 
+		[ x ] who said what and when 
-		[ ] in response to who
+		[ x ] in response to who
-[ ] structure both of those in json files 
+[ ~ ] structure both of those in json files 
 [ ] construct folders of the tagged json files
--- a/src/lib/wiki_get.py
+++ b/src/lib/wiki_get.py
@@ -1,6 +1,8 @@
 import os 
 import re
 import datetime 
 import uuid
 import json
 import mwparserfromhell
@@ -18,6 +20,8 @@ def parse_talkpage(wikitext):
    current_header = ""
    subheader = ""
    comment_dict = {}
    thread_array = []
    discussion_array = []
    for cell in arraytext:
        if re.search(r"^==.*?==$", cell):
            current_title = cell
@@ -29,26 +33,55 @@ def parse_talkpage(wikitext):
            subheader = cell
            continue
        comment += cell
-        match = re.search(r"\[\[User talk:[^\|]+\|[^\]]+\]\]", cell)
+        match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell)
        #match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell)
        if match:
            comment_id = uuid.uuid4()
            user = match.group()
            split_comment = comment.split(user)
            comment_dict['id'] = str(comment_id)
            comment_dict['text'] = split_comment[0]
            comment_dict['title'] = current_title
            comment_dict['header'] = current_header
            comment_dict['subheader'] = subheader
-            comment_dict['author_talk'] = user
+            comment_dict['author'] = user.split("|")[0][12:]
            # doing stuff to figure out replies
            if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'):
                comment_dict['thread'] = []
                thread_array = [comment_dict['id']]
            else:
                comment_ = comment_dict['text']
                level = 0 
                while comment_.startswith('*') or comment_.startswith(':'):
                    level += 1
                    comment_ = comment_[1:]
                thread_array = thread_array[:level]
                comment_dict['thread'] = thread_array
                thread_array.append(comment_dict['id'])
            # doing stuff to get the timestamp 
            string_time = split_comment[-1].split(" ")[-5:]
            if string_time[-1] == "":
                string_time = split_comment[-1].split(" ")[-6:]
            string_time[0] = string_time[0][-6:]
            if string_time[-1] == "(UTC)":
-                comment_dict['draft_time'] = string_time
+                comment_dict['time'] = " ".join(string_time)
                #comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
            #print(comment_dict)
            discussion_array.append(comment_dict)
            comment = ""
            comment_dict = {}
    return discussion_array
 def json_it(array_of_dicts, filename):
    """Serialize ``array_of_dicts`` as JSON and write it to ``filename``.

    The payload is encoded to a string before the file is opened, so a
    serialization error is raised without creating or truncating the
    target file. Any existing file at ``filename`` is overwritten.
    """
    serialized = json.dumps(array_of_dicts)
    with open(filename, 'w') as out_file:
        out_file.write(serialized)
 # Script entry point: parse one wikitext dump and write the result as JSON.
 if __name__ == "__main__":
    # Switch to this file's own directory so the relative paths below
    # resolve against it rather than against the caller's working directory.
    file_directory = os.path.dirname(os.path.abspath(__file__))
    os.chdir(file_directory)
    # read_file is defined outside this view; presumably it returns the
    # dump's contents as text -- TODO confirm.
    wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
-    parse_talkpage(wikitext)
+    json_discussion = parse_talkpage(wikitext)
    # Write the parsed comments to test.json in the directory set above.
    json_it(json_discussion, "test.json")