improve tokenizer.
commit b3ffaaba1d
parent ddf2adb8a6
@@ -7,6 +7,7 @@ from collections import Counter
 import pandas as pd
 import os
 import datetime
+from nltk import wordpunct_tokenize, MWETokenizer
 
 # compute term frequencies for comments in each subreddit by week
 def weekly_tf(partition):
@@ -36,13 +37,15 @@ def weekly_tf(partition):
 
     subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
 
+    tokenizer = MWETokenizer()
+
     def tf_comments(subreddit_weeks):
         for key, posts in subreddit_weeks:
             subreddit, week = key
             tfs = Counter([])
 
             for post in posts:
-                tfs.update(post.body.split())
+                tfs.update(tokenizer.tokenize(wordpunct_tokenize(post.body.lower())))
 
             for term, tf in tfs.items():
                 yield [subreddit, term, week, tf]
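For context, a minimal sketch of what the tokenizer change does to a single comment. The sample strings are hypothetical; with no multi-word expressions registered, MWETokenizer.tokenize() passes tokens through unchanged:

    from collections import Counter
    from nltk import wordpunct_tokenize, MWETokenizer

    tokenizer = MWETokenizer()        # no MWEs registered yet
    body = "Don't panic, it's FINE."  # hypothetical comment body

    # old behaviour: whitespace split keeps case and attached punctuation
    Counter(body.split())
    # Counter({"Don't": 1, 'panic,': 1, "it's": 1, 'FINE.': 1})

    # new behaviour: lowercase, then split on word/punctuation boundaries
    Counter(tokenizer.tokenize(wordpunct_tokenize(body.lower())))
    # Counter({"'": 2, 'don': 1, 't': 1, 'panic': 1, ',': 1, 'it': 1, 's': 1, 'fine': 1, '.': 1})

So "panic," and "panic" now count as the same term, at the cost of splitting contractions. Multi-word expressions can later be merged into single terms:

    tokenizer.add_mwe(('star', 'wars'))
    tokenizer.tokenize(wordpunct_tokenize("I like Star Wars".lower()))
    # ['i', 'like', 'star_wars']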
@@ -55,6 +58,7 @@ def weekly_tf(partition):
     while True:
         chunk = islice(outrows,outchunksize)
         pddf = pd.DataFrame(chunk, columns=schema.names)
+        print(pddf)
         table = pa.Table.from_pandas(pddf,schema=schema)
         if table.shape[0] == 0:
             break
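The loop shown in context above drains outrows in fixed-size chunks via islice. A minimal standalone sketch of that pattern, assuming only itertools; the helper name chunks is hypothetical, and it tests the chunk itself for emptiness rather than table.shape[0]:

    from itertools import islice

    def chunks(iterable, size):
        # islice takes the next `size` items each pass; once the iterator
        # is exhausted it yields nothing, which terminates the loop
        it = iter(iterable)
        while True:
            chunk = list(islice(it, size))
            if not chunk:
                break
            yield chunk

    list(chunks(range(7), 3))
    # [[0, 1, 2], [3, 4, 5], [6]]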