From 7da046735b9bf1cc6dd18ace77d626cb478d536c Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Wed, 27 Nov 2024 19:10:34 -0800
Subject: [PATCH] move function to outer scope.

---
 ngrams/term_frequencies.py | 89 +++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index b490e42..c30131c 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -19,6 +19,51 @@ from pathlib import Path
 # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
 
+def my_tokenizer(text):
+    # remove stopwords, punctuation, urls, lower case
+    # lowercase
+    text = text.lower()
+
+    # remove urls
+    text = urlregex.sub("", text)
+
+    # sentence tokenize
+    sentences = sent_tokenize(text)
+
+    # wordpunct_tokenize
+    sentences = map(wordpunct_tokenize, sentences)
+
+    # remove punctuation
+
+    sentences = map(remove_punct, sentences)
+
+    # remove sentences with less than 2 words
+    sentences = filter(lambda sentence: len(sentence) > 2, sentences)
+
+    # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
+    # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
+    # here we take a 10 percent sample of sentences
+    if mwe_pass == 'first':
+        sentences = list(sentences)
+        for sentence in sentences:
+            if random() <= 0.1:
+                grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
+                with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+                    for ng in grams:
+                        gram_file.write(' '.join(ng) + '\n')
+            for token in sentence:
+                if token not in stopWords:
+                    yield token
+
+    else:
+        # remove stopWords
+        sentences = map(mwe_tokenize, sentences)
+        sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
+        for sentence in sentences:
+            for token in sentence:
+                yield token
+
+
 def tf_comments(subreddit_weeks):
     for key, posts in subreddit_weeks:
         subreddit, week = key
@@ -138,50 +183,6 @@ def weekly_tf(partition,
     stopWords = set(stopwords.words('english'))
 
     # we follow the approach described in datta, phelan, adar 2017
-    def my_tokenizer(text):
-        # remove stopwords, punctuation, urls, lower case
-        # lowercase
-        text = text.lower()
-
-        # remove urls
-        text = urlregex.sub("", text)
-
-        # sentence tokenize
-        sentences = sent_tokenize(text)
-
-        # wordpunct_tokenize
-        sentences = map(wordpunct_tokenize, sentences)
-
-        # remove punctuation
-
-        sentences = map(remove_punct, sentences)
-
-        # remove sentences with less than 2 words
-        sentences = filter(lambda sentence: len(sentence) > 2, sentences)
-
-        # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
-        # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
-        # here we take a 10 percent sample of sentences
-        if mwe_pass == 'first':
-            sentences = list(sentences)
-            for sentence in sentences:
-                if random() <= 0.1:
-                    grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-                    with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
-                        for ng in grams:
-                            gram_file.write(' '.join(ng) + '\n')
-                for token in sentence:
-                    if token not in stopWords:
-                        yield token
-
-        else:
-            # remove stopWords
-            sentences = map(mwe_tokenize, sentences)
-            sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
-            for sentence in sentences:
-                for token in sentence:
-                    yield token
-
     outrows = tf_func(subreddit_weeks)
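For readers unfamiliar with the sampling step the in-code comments describe ("extract 1-4 grams from 10% of the sentences"), here is a minimal, self-contained sketch of that step, not part of the patch itself. The function name sample_ngrams, the sample_rate parameter, and the example text are illustrative choices; the sketch also uses range(1, 5) so that n runs from 1 to 4, whereas the patched code passes range(4) (n = 0..3) to ngrams.

# A minimal sketch of "sample ~10% of sentences and collect their 1-4 grams".
# sample_ngrams and the example text are hypothetical, not part of
# ngrams/term_frequencies.py. Requires the NLTK "punkt" tokenizer data
# (nltk.download("punkt")).
from itertools import chain
from random import random

from nltk.tokenize import sent_tokenize, wordpunct_tokenize
from nltk.util import ngrams


def sample_ngrams(text, sample_rate=0.1):
    # lowercase, sentence-tokenize, then word-tokenize each sentence
    sentences = map(wordpunct_tokenize, sent_tokenize(text.lower()))
    for sentence in sentences:
        # keep roughly sample_rate of the sentences
        if random() <= sample_rate:
            # 1- to 4-grams of the sampled sentence
            # (range(1, 5) here; the patched code uses range(4), i.e. n = 0..3)
            yield from chain.from_iterable(ngrams(sentence, n) for n in range(1, 5))


if __name__ == "__main__":
    text = ("term frequencies are computed per subreddit and week. "
            "multiword phrases are selected in a second pass.")
    for gram in sample_ngrams(text, sample_rate=1.0):  # sample everything for the demo
        print(" ".join(gram))

In the patched module the sampled grams are appended to a file under /gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/ instead of being printed, and the surviving tokens are yielded with stopwords removed; the sketch only illustrates the sampling and n-gram extraction.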