first draft of conversation parsing for wiki talk pages

2024-12-09 14:56:31 -06:00 · 2024-12-09 14:56:31 -06:00 · b57e4c15a3
commit b57e4c15a3
parent aa30d02c48
2 changed files with 40 additions and 7 deletions
--- a/spec.txt
+++ b/spec.txt
@@ -7,7 +7,7 @@
 	[ ] page edit histories 
 	[ ] who said it
 	[ ~ ] talk pages? 
-		[ ~ ] who said what and when 
-		[ ] in response to who
-[ ] structure both of those in json files 
+		[ x ] who said what and when 
+		[ x ] in response to who
+[ ~ ] structure both of those in json files 
 [ ] construct folders of the tagged json files
--- a/src/lib/wiki_get.py
+++ b/src/lib/wiki_get.py
@@ -1,6 +1,8 @@
 import os 
 import re
 import datetime 
+import uuid
+import json

 import mwparserfromhell

@@ -18,6 +20,8 @@ def parse_talkpage(wikitext):
    current_header = ""
    subheader = ""
    comment_dict = {}
+    thread_array = []
+    discussion_array = []
    for cell in arraytext:
        if re.search(r"^==.*?==$", cell):
            current_title = cell
@@ -29,26 +33,55 @@ def parse_talkpage(wikitext):
            subheader = cell
            continue
        comment += cell
-        match = re.search(r"\[\[User talk:[^\|]+\|[^\]]+\]\]", cell)
+        match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell)
+        #match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell)
        if match:
+            comment_id = uuid.uuid4()
            user = match.group()
            split_comment = comment.split(user)
+            comment_dict['id'] = str(comment_id)
            comment_dict['text'] = split_comment[0]
            comment_dict['title'] = current_title
            comment_dict['header'] = current_header
            comment_dict['subheader'] = subheader
-            comment_dict['author_talk'] = user
+            comment_dict['author'] = user.split("|")[0][12:]
+            # doing stuff to figure out replies
+            if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'):
+                comment_dict['thread'] = []
+                thread_array = [comment_dict['id']]
+            else:
+                comment_ = comment_dict['text']
+                level = 0 
+                while comment_.startswith('*') or comment_.startswith(':'):
+                    level += 1
+                    comment_ = comment_[1:]
+                thread_array = thread_array[:level]
+                comment_dict['thread'] = thread_array
+                thread_array.append(comment_dict['id'])
+            # doing stuff to get the timestamp 
            string_time = split_comment[-1].split(" ")[-5:]
            if string_time[-1] == "":
                string_time = split_comment[-1].split(" ")[-6:]
+            string_time[0] = string_time[0][-6:]
            if string_time[-1] == "(UTC)":
-                comment_dict['draft_time'] = string_time
+                comment_dict['time'] = " ".join(string_time)
                #comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
            #print(comment_dict)
+            discussion_array.append(comment_dict)
            comment = ""
+            comment_dict = {}
+    return discussion_array
+
+def json_it(array_of_dicts, filename):
+    """Serialize ``array_of_dicts`` to JSON and write it to ``filename``.
+
+    The data is encoded with ``json.dumps`` before the file is opened, so a
+    serialization error is raised without creating or truncating the file.
+    The target is opened in ``'w'`` mode, replacing any existing content.
+    """
+    serialized = json.dumps(array_of_dicts)
+    with open(filename, 'w') as out:
+        out.write(serialized)
+
+        

 if __name__ == "__main__":
    file_directory = os.path.dirname(os.path.abspath(__file__))
    os.chdir(file_directory)
    wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
-    parse_talkpage(wikitext)
+    # Keep the list that parse_talkpage returns and dump it as JSON. The
+    # relative name "test.json" resolves against the script's own directory
+    # because of the os.chdir() call above.
+    json_discussion = parse_talkpage(wikitext)
+    json_it(json_discussion, "test.json")