bugfix
commit 4be8bb6bf5
parent ec5859c311
@@ -62,7 +62,8 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for sentence in sentences:
         if random() <= 0.1:
             grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-            with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+            Path(ngram_output).mkdir(parents=True, exist_ok=True)
+            with open(ngram_output,'a') as gram_file:
                 for ng in grams:
                     gram_file.write(' '.join(ng) + '\n')
         for token in sentence:
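
For readers skimming the change, here is a minimal, self-contained sketch (not the repo's code) of the sampling-and-append pattern this hunk switches to. It assumes nltk is installed; it creates the parent directory of ngram_output rather than the path itself, which is how I read the intent of the new mkdir call, and all names and paths below are placeholders.

from itertools import chain
from pathlib import Path
from random import random

from nltk.util import ngrams

def write_sampled_ngrams(sentences, ngram_output):
    # Keep roughly 10% of sentences, mirroring `if random() <= 0.1:` above.
    for sentence in sentences:
        if random() <= 0.1:
            # n-grams for n in range(4), as in the diff.
            grams = list(chain(*map(lambda i: ngrams(sentence, i), range(4))))
            # Assumption: create the *parent* directory so the append below succeeds.
            Path(ngram_output).parent.mkdir(parents=True, exist_ok=True)
            with open(ngram_output, 'a') as gram_file:
                for ng in grams:
                    gram_file.write(' '.join(ng) + '\n')

# Example call (hypothetical path and tokens):
# write_sampled_ngrams([['a', 'b', 'c']], '/tmp/ngrams_sample/part-000.txt')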
@@ -193,7 +194,7 @@ def weekly_tf(partition,

    # we follow the approach described in datta, phelan, adar 2017

-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, Path(output_10p_sample_path) / ngram_output)

     outchunksize = 100000
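
The second hunk only changes how the ngram output location is built: instead of the bare ngram_output name, tf_func now receives a full path composed with pathlib. A tiny illustration of that composition (the values here are made up, not taken from the repo's configuration):

from pathlib import Path

# Hypothetical values standing in for the real configuration:
output_10p_sample_path = "/tmp/comment_ngrams_10p_sample"
ngram_output = "part-000.txt"

# pathlib's `/` operator joins a base directory and a file name.
ngram_path = Path(output_10p_sample_path) / ngram_output
print(ngram_path)  # -> /tmp/comment_ngrams_10p_sample/part-000.txt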