initial import of material for public archive into git
We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share.
code/prediction/00_ngram_extraction.py | 89 lines | Normal file
@@ -0,0 +1,89 @@
from time import time

from sklearn.feature_extraction.text import CountVectorizer
import csv
import argparse

n_features = 100000  # Gets the top n_features terms
n_samples = None  # Enter an integer here for testing, so it doesn't take so long

def main():

    parser = argparse.ArgumentParser(description='Take in abstracts, output CSV of n-gram counts')
    parser.add_argument('-i', help='Location of the abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Location of the output file',
                        default='processed_data/ngram_table.csv')
    parser.add_argument('-n', type=int, help='Gets from 1 to n ngrams',
                        default=3)

    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
    print("done in %0.3fs." % (time() - t0))

    # Write the header
    write_header(args.o)

    bags_o_words = get_counts(data_samples, n_features, args.n)
    write_output(doc_ids, bags_o_words, args.o)

def get_counts(abstracts, n_features, ngram_max):
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english',
                                    ngram_range=(1, ngram_max))
    t0 = time()
    tf = tf_vectorizer.fit_transform(abstracts)
    print("done in %0.3fs." % (time() - t0))

    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the current name for the same call.
    terms = tf_vectorizer.get_feature_names_out()
    # Note: toarray() densifies the whole matrix, which can be
    # memory-hungry for large corpora.
    freqs = tf.toarray()
    bags_o_words = to_bags_o_words(terms, freqs)
    return bags_o_words

def write_header(out_file):
    # newline='' keeps the csv module from inserting blank rows on Windows
    with open(out_file, 'w', newline='') as o_f:
        out = csv.writer(o_f)
        out.writerow(['document_id', 'term', 'frequency'])

def to_bags_o_words(terms, freqs):
    '''Takes the terms and frequency matrix from the vectorizer and returns
    a list of dictionaries, one per document. The format of the dictionaries
    is term:count within that document.
    '''
    result = []
    for d in freqs:
        curr_result = {terms[i]: val for i, val in enumerate(d) if val > 0}
        result.append(curr_result)
    return result

def write_output(ids, bags_o_words, out_file):
    with open(out_file, 'a', newline='') as o_f:
        out = csv.writer(o_f)
        for i, doc in enumerate(bags_o_words):
            for k, v in doc.items():
                # For each term and count, output a row, together with the document id
                out.writerow([ids[i], k, v])

def get_ids_and_abstracts(fn, length_limit):
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        abstracts = []
        ids = []
        i = 1
        for r in in_csv:
            try:
                # Read both fields before appending, so the two lists stay
                # aligned even when a row is missing one of the keys.
                abstract = r['abstract']
                eid = r['eid']
            except KeyError:
                print(r)
                continue
            abstracts.append(abstract)
            ids.append(eid)
            # Stop once length_limit rows have been collected
            if length_limit and i >= length_limit:
                break
            i += 1
    return ids, abstracts


if __name__ == '__main__':
    main()
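
Usage note: with the defaults above, the script is run as
`python code/prediction/00_ngram_extraction.py -i processed_data/abstracts.tsv -o processed_data/ngram_table.csv -n 3`,
reading a tab-separated file with `eid` and `abstract` columns and writing a
long-format CSV with one row per (document, term) pair. A minimal sketch of
reading that table back into per-document counts (the path is just the
script's default output, not a guaranteed location):

    import csv
    from collections import defaultdict

    # Rebuild a {term: count} dictionary for each document from the
    # long-format CSV written by write_output above.
    counts = defaultdict(dict)
    with open('processed_data/ngram_table.csv', newline='') as f:
        for row in csv.DictReader(f):
            counts[row['document_id']][row['term']] = int(row['frequency'])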