We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share. The script below reads in a TSV file of abstracts and writes out a CSV of per-document n-gram counts.
from time import time
import argparse
import csv

from sklearn.feature_extraction.text import CountVectorizer

n_features = 100000  # Keep only the top n_features terms
n_samples = None  # Set to an integer when testing, so runs don't take so long
def main():
    parser = argparse.ArgumentParser(
        description='Take in abstracts, output CSV of n-gram counts')
    parser.add_argument('-i', help='Location of the abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Location of the output file',
                        default='processed_data/ngram_table.csv')
    parser.add_argument('-n', type=int, default=3,
                        help='Count n-grams from 1 to n words long')

    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
    print("done in %0.3fs." % (time() - t0))

    # Write the header, then append one row per (document, term) pair
    write_header(args.o)

    bags_o_words = get_counts(data_samples, n_features, args.n)
    write_output(doc_ids, bags_o_words, args.o)
def get_counts(abstracts, n_features, ngram_max):
    # Drop terms that appear in more than 95% of documents or in fewer
    # than two, and keep at most n_features terms overall
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english',
                                    ngram_range=(1, ngram_max))
    t0 = time()
    tf = tf_vectorizer.fit_transform(abstracts)
    print("done in %0.3fs." % (time() - t0))

    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the current equivalent
    terms = tf_vectorizer.get_feature_names_out()
    freqs = tf.toarray()
    bags_o_words = to_bags_o_words(terms, freqs)
    return bags_o_words
def write_header(out_file):
    # newline='' lets the csv module handle line endings itself
    with open(out_file, 'w', newline='') as o_f:
        out = csv.writer(o_f)
        out.writerow(['document_id', 'term', 'frequency'])
def to_bags_o_words(terms, freqs):
    '''Takes the vectorizer's terms and frequency matrix, and returns a
    list of dictionaries, one per document, mapping each term to its
    count within that document.
    '''
    result = []
    for d in freqs:
        curr_result = {terms[i]: val for i, val in enumerate(d) if val > 0}
        result.append(curr_result)
    return result
def write_output(ids, bags_o_words, out_file):
    with open(out_file, 'a', newline='') as o_f:
        out = csv.writer(o_f)
        for i, doc in enumerate(bags_o_words):
            for k, v in doc.items():
                # For each term and count, output a row together with
                # the document id
                out.writerow([ids[i], k, v])
def get_ids_and_abstracts(fn, length_limit):
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        abstracts = []
        ids = []
        for i, r in enumerate(in_csv, start=1):
            try:
                abstracts.append(r['abstract'])
                ids.append(r['eid'])
            except KeyError:
                # Print malformed rows rather than crashing on them
                print(r)
            # Stop early when a sample size was requested
            if length_limit and i >= length_limit:
                break
    return ids, abstracts
if __name__ == '__main__':
    main()
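To make the output format concrete, here is a minimal sketch that runs the two core functions on a toy corpus. The module name count_ngrams, the document ids, and the file demo.csv are assumptions for illustration; the listing above doesn't give a filename. Because get_counts sets min_df=2, a term must appear in at least two of the toy documents to survive.

# A minimal sketch, assuming the listing above is saved as count_ngrams.py
# (that filename is an assumption, not given above)
from count_ngrams import get_counts, write_header, write_output

docs = ['machine learning for text',
        'machine learning models',
        'text mining methods']

# min_df=2 inside get_counts keeps only 'learning', 'machine', and 'text',
# the terms that appear in at least two of the three documents
bags = get_counts(docs, n_features=100, ngram_max=1)
for doc in bags:
    print({str(term): count for term, count in doc.items()})
# {'learning': 1, 'machine': 1, 'text': 1}
# {'learning': 1, 'machine': 1}
# {'text': 1}

# Write the same long-format CSV the script produces, with made-up ids
write_header('demo.csv')
write_output(['d1', 'd2', 'd3'], bags, 'demo.csv')
# demo.csv now holds one row per (document, term) pair, e.g. d1,learning,1

On the real data you would instead run the script itself, for example python count_ngrams.py -n 2 to count unigrams and bigrams, and then read processed_data/ngram_table.csv into the analysis.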