
move function to outer scope.

Nathan TeBlunthuis 2024-11-27 19:10:34 -08:00
parent 0631256956
commit 7da046735b


@@ -19,6 +19,51 @@ from pathlib import Path
 # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
+def my_tokenizer(text):
+    # remove stopwords, punctuation, urls, lower case
+    # lowercase
+    text = text.lower()
+    # remove urls
+    text = urlregex.sub("", text)
+    # sentence tokenize
+    sentences = sent_tokenize(text)
+    # wordpunct_tokenize
+    sentences = map(wordpunct_tokenize, sentences)
+    # remove punctuation
+    sentences = map(remove_punct, sentences)
+    # remove sentences with less than 2 words
+    sentences = filter(lambda sentence: len(sentence) > 2, sentences)
+    # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
+    # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
+    # here we take a 10 percent sample of sentences
+    if mwe_pass == 'first':
+        sentences = list(sentences)
+        for sentence in sentences:
+            if random() <= 0.1:
+                grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
+                with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+                    for ng in grams:
+                        gram_file.write(' '.join(ng) + '\n')
+            for token in sentence:
+                if token not in stopWords:
+                    yield token
+    else:
+        # remove stopWords
+        sentences = map(mwe_tokenize, sentences)
+        sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
+        for sentence in sentences:
+            for token in sentence:
+                yield token
 def tf_comments(subreddit_weeks):
     for key, posts in subreddit_weeks:
         subreddit, week = key
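With my_tokenizer moved to module scope in the hunk above, it no longer closes over weekly_tf's locals, so stopWords, mwe_pass, mwe_tokenize, and ngram_output must reach it some other way (module-level globals, explicit parameters, or similar). The diff does not show how the rest of the file binds them; the sketch below is only an illustration of one option, passing them as keyword arguments and rebuilding a one-argument tokenizer with functools.partial, not necessarily what this commit does.

from functools import partial
from nltk.tokenize import wordpunct_tokenize

# Illustrative sketch only: a reduced my_tokenizer that takes its former
# enclosing-scope names as explicit keyword arguments.
def my_tokenizer(text, *, stop_words, mwe_pass, mwe_tokenize, ngram_output):
    # (the real body also strips URLs, sentence-tokenizes, and samples n-grams)
    for token in wordpunct_tokenize(text.lower()):
        if token not in stop_words:
            yield token

# weekly_tf could then recover a single-argument callable for the pipeline:
# tokenize = partial(my_tokenizer, stop_words=stopWords, mwe_pass=mwe_pass,
#                    mwe_tokenize=mwe_tokenize, ngram_output=ngram_output)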
@@ -138,50 +183,6 @@ def weekly_tf(partition,
     stopWords = set(stopwords.words('english'))
     # we follow the approach described in datta, phelan, adar 2017
-    def my_tokenizer(text):
-        # remove stopwords, punctuation, urls, lower case
-        # lowercase
-        text = text.lower()
-        # remove urls
-        text = urlregex.sub("", text)
-        # sentence tokenize
-        sentences = sent_tokenize(text)
-        # wordpunct_tokenize
-        sentences = map(wordpunct_tokenize, sentences)
-        # remove punctuation
-        sentences = map(remove_punct, sentences)
-        # remove sentences with less than 2 words
-        sentences = filter(lambda sentence: len(sentence) > 2, sentences)
-        # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
-        # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
-        # here we take a 10 percent sample of sentences
-        if mwe_pass == 'first':
-            sentences = list(sentences)
-            for sentence in sentences:
-                if random() <= 0.1:
-                    grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-                    with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
-                        for ng in grams:
-                            gram_file.write(' '.join(ng) + '\n')
-                for token in sentence:
-                    if token not in stopWords:
-                        yield token
-        else:
-            # remove stopWords
-            sentences = map(mwe_tokenize, sentences)
-            sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
-            for sentence in sentences:
-                for token in sentence:
-                    yield token
     outrows = tf_func(subreddit_weeks)
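For the second pass, the else branch above maps an mwe_tokenize callable over the sentences; how that callable is built from the phrases sampled in the first pass is not part of this diff. Below is a minimal sketch of one way such a tokenizer could be assembled with NLTK's MWETokenizer, assuming a hypothetical selected_phrases list produced by the phrase-selection step ("phrases that appear often relative to the original terms").

from nltk.tokenize import MWETokenizer

# Hypothetical input: phrases chosen from the 10% n-gram sample, e.g. by
# comparing each phrase's frequency to the frequencies of its member terms.
# The actual selection logic is not shown in this commit.
selected_phrases = [("new", "york"), ("machine", "learning")]

mwe_tokenizer = MWETokenizer(selected_phrases, separator="_")
mwe_tokenize = mwe_tokenizer.tokenize  # same callable name the else branch maps over sentences

print(mwe_tokenize(["i", "moved", "to", "new", "york"]))
# ['i', 'moved', 'to', 'new_york']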