diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index 38de903..f941d1a 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -62,7 +62,8 @@ def my_tokenizer(text, mwe_pass, mwe_tokenize, stopWords, ngram_output):
     for sentence in sentences:
         if random() <= 0.1:
             grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-            with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+            Path(ngram_output).parent.mkdir(parents=True, exist_ok=True)
+            with open(ngram_output,'a') as gram_file:
                 for ng in grams:
                     gram_file.write(' '.join(ng) + '\n')
         for token in sentence:
@@ -193,7 +194,7 @@ def weekly_tf(partition,
 
     # we follow the approach described in datta, phelan, adar 2017
 
-    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, ngram_output)
+    outrows = tf_func(subreddit_weeks, mwe_pass, mwe_tokenize, stopWords, Path(output_10p_sample_path) / ngram_output)
 
     outchunksize = 100000
 
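For review, a minimal runnable sketch of the pattern the new `+` lines rely on: resolve the sample-file path with pathlib, create its parent directory, then open the file for append. The diff moves the hardcoded `/gscratch/...` location out of `my_tokenizer` by passing in `Path(output_10p_sample_path) / ngram_output` from the caller; the sketch mirrors that. Everything not in the diff is a hypothetical assumption: the `/tmp` path, the partition filename, the sentence data, and the local `ngrams()` stand-in for whatever the module actually imports.

from itertools import chain
from pathlib import Path
from random import random


def ngrams(sequence, n):
    # stand-in for the module's ngrams(); yields sliding windows of length n
    return zip(*(sequence[i:] for i in range(n)))


output_10p_sample_path = Path('/tmp/reddit_ngrams/comment_ngrams_10p_sample')  # hypothetical
ngram_output = output_10p_sample_path / 'part-00000.txt'  # hypothetical partition name

sentences = [['the', 'quick', 'brown', 'fox', 'jumps']]  # made-up data
for sentence in sentences:
    if random() <= 0.1:  # keep roughly a 10% sample of sentences
        # unigrams through trigrams; the diff's range(4) also includes a
        # degenerate n=0 case that yields nothing
        grams = list(chain(*map(lambda i: ngrams(sentence, i), range(1, 4))))
        # mkdir on the parent, not on ngram_output itself: creating a
        # directory at the file path would break the open(..., 'a') below
        ngram_output.parent.mkdir(parents=True, exist_ok=True)
        with open(ngram_output, 'a') as gram_file:
            for ng in grams:
                gram_file.write(' '.join(ng) + '\n')

The `.parent.mkdir(parents=True, exist_ok=True)` call is the key detail: it is idempotent across partitions that share an output directory, whereas calling mkdir on the file path itself would turn the output file into a directory and make the subsequent append fail.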