move function to outer scope.
parent 0631256956
commit 7da046735b
@@ -19,6 +19,51 @@ from pathlib import Path
 # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
 
+def my_tokenizer(text):
+    # remove stopwords, punctuation, urls, lower case
+    # lowercase
+    text = text.lower()
+
+    # remove urls
+    text = urlregex.sub("", text)
+
+    # sentence tokenize
+    sentences = sent_tokenize(text)
+
+    # wordpunct_tokenize
+    sentences = map(wordpunct_tokenize, sentences)
+
+    # remove punctuation
+
+    sentences = map(remove_punct, sentences)
+
+    # remove sentences with less than 2 words
+    sentences = filter(lambda sentence: len(sentence) > 2, sentences)
+
+    # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
+    # they say that they extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
+    # here we take a 10 percent sample of sentences
+    if mwe_pass == 'first':
+        sentences = list(sentences)
+        for sentence in sentences:
+            if random() <= 0.1:
+                grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
+                with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+                    for ng in grams:
+                        gram_file.write(' '.join(ng) + '\n')
+            for token in sentence:
+                if token not in stopWords:
+                    yield token
+
+    else:
+        # remove stopWords
+        sentences = map(mwe_tokenize, sentences)
+        sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
+        for sentence in sentences:
+            for token in sentence:
+                yield token
+
+
 def tf_comments(subreddit_weeks):
     for key, posts in subreddit_weeks:
         subreddit, week = key
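The comments in the added block describe the first-pass sampling from Datta, Phelan, and Adar (2017): for roughly 10% of sentences, 1-4 grams are appended to a sample file. As a standalone sketch (not part of the commit, and assuming NLTK's ngrams helper is what the script imports), the gram extraction looks like this; note that range(4) in the committed code covers n = 0..3, while n = 1..4 is spelled out explicitly here:

# Sketch only, not part of the commit: build 1-4 grams from one tokenized sentence.
from itertools import chain
from nltk.util import ngrams

sentence = ['the', 'quick', 'brown', 'fox', 'jumps']
grams = list(chain(*(ngrams(sentence, n) for n in range(1, 5))))
# grams holds unigrams through 4-grams, e.g. ('quick',) and ('quick', 'brown', 'fox', 'jumps')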
@@ -138,50 +183,6 @@ def weekly_tf(partition,
     stopWords = set(stopwords.words('english'))
 
     # we follow the approach described in datta, phelan, adar 2017
-    def my_tokenizer(text):
-        # remove stopwords, punctuation, urls, lower case
-        # lowercase
-        text = text.lower()
-
-        # remove urls
-        text = urlregex.sub("", text)
-
-        # sentence tokenize
-        sentences = sent_tokenize(text)
-
-        # wordpunct_tokenize
-        sentences = map(wordpunct_tokenize, sentences)
-
-        # remove punctuation
-
-        sentences = map(remove_punct, sentences)
-
-        # remove sentences with less than 2 words
-        sentences = filter(lambda sentence: len(sentence) > 2, sentences)
-
-        # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
-        # they say that they extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
-        # here we take a 10 percent sample of sentences
-        if mwe_pass == 'first':
-            sentences = list(sentences)
-            for sentence in sentences:
-                if random() <= 0.1:
-                    grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-                    with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
-                        for ng in grams:
-                            gram_file.write(' '.join(ng) + '\n')
-                for token in sentence:
-                    if token not in stopWords:
-                        yield token
-
-        else:
-            # remove stopWords
-            sentences = map(mwe_tokenize, sentences)
-            sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
-            for sentence in sentences:
-                for token in sentence:
-                    yield token
-
     outrows = tf_func(subreddit_weeks)
 
 
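After the move, my_tokenizer still refers to names that the deleted copy picked up from weekly_tf's scope (mwe_pass, ngram_output, mwe_tokenize, stopWords, remove_punct), so at module level they resolve as globals. A hypothetical alternative, not something this commit does, is to pass the tokenizer's dependencies in explicitly; the sketch below covers only the plain tokenization path and omits the MWE/first-pass branch and remove_punct:

# Hypothetical sketch, not part of this commit: a module-level tokenizer that
# takes its former closure values as parameters instead of relying on globals.
import re
from nltk.tokenize import sent_tokenize, wordpunct_tokenize

urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")

def my_tokenizer(text, stopWords=frozenset()):
    # lowercase, strip urls, then sentence- and word-tokenize
    text = urlregex.sub("", text.lower())
    for sentence in map(wordpunct_tokenize, sent_tokenize(text)):
        # keep sentences with more than 2 tokens, drop stopwords
        if len(sentence) > 2:
            for token in sentence:
                if token not in stopWords:
                    yield token

# usage, e.g.: tokens = list(my_tokenizer(comment_body, stopWords=stopWords))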
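The else branch also applies mwe_tokenize, which is defined outside the hunks shown here. If it wraps NLTK's MWETokenizer (an assumption, not confirmed by this diff), merging multi-word expressions into single tokens works roughly like this:

# Assumption: mwe_tokenize is backed by something like NLTK's MWETokenizer.
# The phrases below are made-up examples, not phrases mined from the corpus.
from nltk.tokenize import MWETokenizer

mwe_tokenizer = MWETokenizer([('new', 'york'), ('climate', 'change')], separator=' ')
tokens = mwe_tokenizer.tokenize(['i', 'moved', 'to', 'new', 'york'])
# tokens == ['i', 'moved', 'to', 'new york']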