
improve tokenizer.

Nate E TeBlunthuis 2020-08-03 22:55:10 -07:00
parent ddf2adb8a6
commit b3ffaaba1d


@@ -7,6 +7,7 @@ from collections import Counter
 import pandas as pd
 import os
 import datetime
+from nltk import wordpunct_tokenize, MWETokenizer
 
 # compute term frequencies for comments in each subreddit by week
 def weekly_tf(partition):
@@ -36,13 +37,15 @@ def weekly_tf(partition):
     subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
+    tokenizer = MWETokenizer()
 
     def tf_comments(subreddit_weeks):
         for key, posts in subreddit_weeks:
             subreddit, week = key
             tfs = Counter([])
 
             for post in posts:
-                tfs.update(post.body.split())
+                tfs.update(tokenizer.tokenize(wordpunct_tokenize(post.body.lower())))
 
             for term, tf in tfs.items():
                 yield [subreddit, term, week, tf]
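
The behavioral change is in the hunk above: instead of splitting comment bodies on whitespace, each body is lower-cased, tokenized on word/punctuation boundaries with wordpunct_tokenize, and then passed through MWETokenizer, which re-joins registered multi-word expressions into single tokens. A minimal sketch of that path (the add_mwe call and sample text are illustrative, not from the source; a fresh MWETokenizer with no registered expressions passes tokens through unchanged):

# Sketch of the new tokenization path; add_mwe and the sample text
# are hypothetical, for illustration only.
from collections import Counter
from nltk import wordpunct_tokenize, MWETokenizer

tokenizer = MWETokenizer()           # no multi-word expressions registered yet
tokenizer.add_mwe(("new", "york"))   # hypothetical MWE for illustration

body = "Visiting New York! It's great."
# lower-case, split on word/punctuation boundaries, then merge registered MWEs
tokens = tokenizer.tokenize(wordpunct_tokenize(body.lower()))
print(tokens)
# ['visiting', 'new_york', '!', 'it', "'", 's', 'great', '.']

tfs = Counter()
tfs.update(tokens)                   # same update the diff performs per post

Note that wordpunct_tokenize also emits punctuation as separate tokens, so punctuation now enters the term counts unless filtered downstream.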
@@ -55,6 +58,7 @@ def weekly_tf(partition):
     while True:
         chunk = islice(outrows,outchunksize)
         pddf = pd.DataFrame(chunk, columns=schema.names)
+        print(pddf)
         table = pa.Table.from_pandas(pddf,schema=schema)
         if table.shape[0] == 0:
             break
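
The print(pddf) added in the last hunk is a debugging aid inside the chunked Parquet-writing loop. As a sketch of that loop's pattern, assuming pyarrow's ParquetWriter and stand-in values for outrows, outchunksize, and schema (the real ones come from elsewhere in the file):

# Sketch of the chunked-write pattern above; outrows, outchunksize,
# schema, and the output path are stand-ins, not from the source.
from itertools import islice
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

outrows = iter([["askreddit", "hello", "2020-07-06", 3]])  # stand-in row stream
outchunksize = 10000
schema = pa.schema([("subreddit", pa.string()),
                    ("term", pa.string()),
                    ("week", pa.string()),
                    ("tf", pa.int64())])

with pq.ParquetWriter("/tmp/tf_example.parquet", schema) as writer:
    while True:
        # islice consumes up to outchunksize rows without materializing the rest
        chunk = islice(outrows, outchunksize)
        pddf = pd.DataFrame(chunk, columns=schema.names)
        table = pa.Table.from_pandas(pddf, schema=schema)
        if table.shape[0] == 0:   # empty chunk: the row stream is exhausted
            break
        writer.write_table(table)

islice never raises StopIteration here; once the row stream is exhausted it simply yields empty chunks, which is why the loop exits on an empty table rather than on an exception.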