commit 4be8bb6bf5
parent ec5859c311
Author: Nathan TeBlunthuis
Date:   2024-12-03 19:15:07 -08:00


@@ -62,7 +62,8 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for sentence in sentences:
         if random() <= 0.1:
             grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-            with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+            Path(ngram_output).mkdir(parents=True, exist_ok=True)
+            with open(ngram_output,'a') as gram_file:
                 for ng in grams:
                     gram_file.write(' '.join(ng) + '\n')
         for token in sentence:
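
Since ngram_output is now joined onto output_10p_sample_path before being passed in (see the weekly_tf hunk below), it names the sample file itself; in that reading it is the file's parent directory that has to exist before the append. A minimal sketch of that directory-then-append pattern, using a hypothetical helper name and assuming grams is a list of token tuples:

from pathlib import Path

def append_ngrams(grams, ngram_output):
    # Ensure the directory that will hold the sampled-ngram file exists.
    # Using .parent avoids creating a directory at the file's own path,
    # which would make the open() below fail with IsADirectoryError.
    out = Path(ngram_output)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Append one space-joined ngram per line, as the hunk above does.
    with open(out, 'a') as gram_file:
        for ng in grams:
            gram_file.write(' '.join(ng) + '\n')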
@@ -193,7 +194,7 @@ def weekly_tf(partition,
     # we follow the approach described in datta, phelan, adar 2017
-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, Path(output_10p_sample_path) / ngram_output)
     outchunksize = 100000
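
This hunk replaces the hard-coded /gscratch/... prefix from the first hunk with a configurable base path joined to the per-partition file name. A small sketch of that pathlib join, with hypothetical example values for both variables:

from pathlib import Path

# Hypothetical stand-ins; in the script these come from configuration
# and from the partition being processed.
output_10p_sample_path = '/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample'
ngram_output = 'comments_2024-12.txt'

# pathlib's / operator joins path segments without worrying about
# trailing separators.
sample_path = Path(output_10p_sample_path) / ngram_output
print(sample_path)
# /gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/comments_2024-12.txt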