From 7da046735b9bf1cc6dd18ace77d626cb478d536c Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Wed, 27 Nov 2024 19:10:34 -0800
Subject: [PATCH] move function to outer scope.

---
 ngrams/term_frequencies.py | 89 +++++++++++++++++++-------------------
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index b490e42..c30131c 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -19,6 +19,51 @@ from pathlib import Path
 # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
 
+def my_tokenizer(text):
+    # remove stopwords, punctuation, urls, lower case
+    # lowercase
+    text = text.lower()
+
+    # remove urls
+    text = urlregex.sub("", text)
+
+    # sentence tokenize
+    sentences = sent_tokenize(text)
+
+    # wordpunct_tokenize
+    sentences = map(wordpunct_tokenize, sentences)
+
+    # remove punctuation
+
+    sentences = map(remove_punct, sentences)
+
+    # remove sentences with less than 2 words
+    sentences = filter(lambda sentence: len(sentence) > 2, sentences)
+
+    # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
+    # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
+    # here we take a 10 percent sample of sentences
+    if mwe_pass == 'first':
+        sentences = list(sentences)
+        for sentence in sentences:
+            if random() <= 0.1:
+                grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
+                with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+                    for ng in grams:
+                        gram_file.write(' '.join(ng) + '\n')
+            for token in sentence:
+                if token not in stopWords:
+                    yield token
+
+    else:
+        # remove stopWords
+        sentences = map(mwe_tokenize, sentences)
+        sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
+        for sentence in sentences:
+            for token in sentence:
+                yield token
+
+
 def tf_comments(subreddit_weeks):
     for key, posts in subreddit_weeks:
         subreddit, week = key
@@ -138,50 +183,6 @@ def weekly_tf(partition,
     stopWords = set(stopwords.words('english'))
 
     # we follow the approach described in datta, phelan, adar 2017
-    def my_tokenizer(text):
-        # remove stopwords, punctuation, urls, lower case
-        # lowercase
-        text = text.lower()
-
-        # remove urls
-        text = urlregex.sub("", text)
-
-        # sentence tokenize
-        sentences = sent_tokenize(text)
-
-        # wordpunct_tokenize
-        sentences = map(wordpunct_tokenize, sentences)
-
-        # remove punctuation
-
-        sentences = map(remove_punct, sentences)
-
-        # remove sentences with less than 2 words
-        sentences = filter(lambda sentence: len(sentence) > 2, sentences)
-
-        # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
-        # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
-        # here we take a 10 percent sample of sentences
-        if mwe_pass == 'first':
-            sentences = list(sentences)
-            for sentence in sentences:
-                if random() <= 0.1:
-                    grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-                    with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
-                        for ng in grams:
-                            gram_file.write(' '.join(ng) + '\n')
-                for token in sentence:
-                    if token not in stopWords:
-                        yield token
-
-        else:
-            # remove stopWords
-            sentences = map(mwe_tokenize, sentences)
-            sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
-            for sentence in sentences:
-                for token in sentence:
-                    yield token
-
     outrows = tf_func(subreddit_weeks)
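For readers unfamiliar with the sampling step the in-code comments describe ("extract 1-4 grams from 10% of the sentences"), here is a minimal, self-contained sketch of that step, not part of the patch itself. The function name sample_ngrams, the sample_rate parameter, and the example text are illustrative choices; the sketch also uses range(1, 5) so that n runs from 1 to 4, whereas the patched code passes range(4) (n = 0..3) to ngrams.

# A minimal sketch of "sample ~10% of sentences and collect their 1-4 grams".
# sample_ngrams and the example text are hypothetical, not part of
# ngrams/term_frequencies.py. Requires the NLTK "punkt" tokenizer data
# (nltk.download("punkt")).
from itertools import chain
from random import random

from nltk.tokenize import sent_tokenize, wordpunct_tokenize
from nltk.util import ngrams


def sample_ngrams(text, sample_rate=0.1):
    # lowercase, sentence-tokenize, then word-tokenize each sentence
    sentences = map(wordpunct_tokenize, sent_tokenize(text.lower()))
    for sentence in sentences:
        # keep roughly sample_rate of the sentences
        if random() <= sample_rate:
            # 1- to 4-grams of the sampled sentence
            # (range(1, 5) here; the patched code uses range(4), i.e. n = 0..3)
            yield from chain.from_iterable(ngrams(sentence, n) for n in range(1, 5))


if __name__ == "__main__":
    text = ("term frequencies are computed per subreddit and week. "
            "multiword phrases are selected in a second pass.")
    for gram in sample_ngrams(text, sample_rate=1.0):  # sample everything for the demo
        print(" ".join(gram))

In the patched module the sampled grams are appended to a file under /gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/ instead of being printed, and the surviving tokens are yielded with stopwords removed; the sketch only illustrates the sampling and n-gram extraction.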