move function to outer scope.
parent 0631256956
commit 7da046735b
@@ -19,6 +19,51 @@ from pathlib import Path
 # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
 
+def my_tokenizer(text):
+    # remove stopwords, punctuation, urls, lower case
+    # lowercase
+    text = text.lower()
+
+    # remove urls
+    text = urlregex.sub("", text)
+
+    # sentence tokenize
+    sentences = sent_tokenize(text)
+
+    # wordpunct_tokenize
+    sentences = map(wordpunct_tokenize, sentences)
+
+    # remove punctuation
+
+    sentences = map(remove_punct, sentences)
+
+    # remove sentences with less than 2 words
+    sentences = filter(lambda sentence: len(sentence) > 2, sentences)
+
+    # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
+    # they say that they extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
+    # here we take a 10 percent sample of sentences
+    if mwe_pass == 'first':
+        sentences = list(sentences)
+        for sentence in sentences:
+            if random() <= 0.1:
+                grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
+                with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+                    for ng in grams:
+                        gram_file.write(' '.join(ng) + '\n')
+            for token in sentence:
+                if token not in stopWords:
+                    yield token
+
+    else:
+        # remove stopWords
+        sentences = map(mwe_tokenize, sentences)
+        sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
+        for sentence in sentences:
+            for token in sentence:
+                yield token
+
+
 def tf_comments(subreddit_weeks):
     for key, posts in subreddit_weeks:
         subreddit, week = key
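The comments in the added block describe the first-pass sampling from Datta, Phelan, and Adar (2017): for roughly 10% of sentences, 1-4 grams are appended to a sample file. As a standalone sketch (not part of the commit, and assuming NLTK's ngrams helper is what the script imports), the gram extraction looks like this; note that range(4) in the committed code covers n = 0..3, while n = 1..4 is spelled out explicitly here:

# Sketch only, not part of the commit: build 1-4 grams from one tokenized sentence.
from itertools import chain
from nltk.util import ngrams

sentence = ['the', 'quick', 'brown', 'fox', 'jumps']
grams = list(chain(*(ngrams(sentence, n) for n in range(1, 5))))
# grams holds unigrams through 4-grams, e.g. ('quick',) and ('quick', 'brown', 'fox', 'jumps')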
@@ -138,50 +183,6 @@ def weekly_tf(partition,
     stopWords = set(stopwords.words('english'))
 
     # we follow the approach described in datta, phelan, adar 2017
-    def my_tokenizer(text):
-        # remove stopwords, punctuation, urls, lower case
-        # lowercase
-        text = text.lower()
-
-        # remove urls
-        text = urlregex.sub("", text)
-
-        # sentence tokenize
-        sentences = sent_tokenize(text)
-
-        # wordpunct_tokenize
-        sentences = map(wordpunct_tokenize, sentences)
-
-        # remove punctuation
-
-        sentences = map(remove_punct, sentences)
-
-        # remove sentences with less than 2 words
-        sentences = filter(lambda sentence: len(sentence) > 2, sentences)
-
-        # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
-        # they say that they extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
-        # here we take a 10 percent sample of sentences
-        if mwe_pass == 'first':
-            sentences = list(sentences)
-            for sentence in sentences:
-                if random() <= 0.1:
-                    grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-                    with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
-                        for ng in grams:
-                            gram_file.write(' '.join(ng) + '\n')
-                for token in sentence:
-                    if token not in stopWords:
-                        yield token
-
-        else:
-            # remove stopWords
-            sentences = map(mwe_tokenize, sentences)
-            sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
-            for sentence in sentences:
-                for token in sentence:
-                    yield token
-
     outrows = tf_func(subreddit_weeks)
 
 
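After the move, my_tokenizer still refers to names that the deleted copy picked up from weekly_tf's scope (mwe_pass, ngram_output, mwe_tokenize, stopWords, remove_punct), so at module level they resolve as globals. A hypothetical alternative, not something this commit does, is to pass the tokenizer's dependencies in explicitly; the sketch below covers only the plain tokenization path and omits the MWE/first-pass branch and remove_punct:

# Hypothetical sketch, not part of this commit: a module-level tokenizer that
# takes its former closure values as parameters instead of relying on globals.
import re
from nltk.tokenize import sent_tokenize, wordpunct_tokenize

urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")

def my_tokenizer(text, stopWords=frozenset()):
    # lowercase, strip urls, then sentence- and word-tokenize
    text = urlregex.sub("", text.lower())
    for sentence in map(wordpunct_tokenize, sent_tokenize(text)):
        # keep sentences with more than 2 tokens, drop stopwords
        if len(sentence) > 2:
            for token in sentence:
                if token not in stopWords:
                    yield token

# usage, e.g.: tokens = list(my_tokenizer(comment_body, stopWords=stopWords))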
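The else branch also applies mwe_tokenize, which is defined outside the hunks shown here. If it wraps NLTK's MWETokenizer (an assumption, not confirmed by this diff), merging multi-word expressions into single tokens works roughly like this:

# Assumption: mwe_tokenize is backed by something like NLTK's MWETokenizer.
# The phrases below are made-up examples, not phrases mined from the corpus.
from nltk.tokenize import MWETokenizer

mwe_tokenizer = MWETokenizer([('new', 'york'), ('climate', 'change')], separator=' ')
tokens = mwe_tokenizer.tokenize(['i', 'moved', 'to', 'new', 'york'])
# tokens == ['i', 'moved', 'to', 'new york']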