We're creating a fresh archive because the history for our old chapter includes API keys, data files, and other material we can't share. The script below reads in a TSV file of abstracts and writes out a CSV of per-document n-gram counts.
from time import time
import argparse
import csv

from sklearn.feature_extraction.text import CountVectorizer

n_features = 100000  # Keep only the top n_features terms
n_samples = None  # Set to an integer when testing, so runs don't take so long
def main():
    parser = argparse.ArgumentParser(
        description='Take in abstracts, output CSV of n-gram counts')
    parser.add_argument('-i', help='Location of the abstracts file',
                        default='processed_data/abstracts.tsv')
    parser.add_argument('-o', help='Location of the output file',
                        default='processed_data/ngram_table.csv')
    parser.add_argument('-n', type=int, default=3,
                        help='Count n-grams from 1 to n words long')

    args = parser.parse_args()

    print("Loading dataset...")
    t0 = time()
    doc_ids, data_samples = get_ids_and_abstracts(args.i, n_samples)
    print("done in %0.3fs." % (time() - t0))

    # Write the header, then append one row per (document, term) pair
    write_header(args.o)

    bags_o_words = get_counts(data_samples, n_features, args.n)
    write_output(doc_ids, bags_o_words, args.o)
def get_counts(abstracts, n_features, ngram_max):
    # Drop terms that appear in more than 95% of documents or in fewer
    # than two, and keep at most n_features terms overall
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english',
                                    ngram_range=(1, ngram_max))
    t0 = time()
    tf = tf_vectorizer.fit_transform(abstracts)
    print("done in %0.3fs." % (time() - t0))

    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the current equivalent
    terms = tf_vectorizer.get_feature_names_out()
    freqs = tf.toarray()
    bags_o_words = to_bags_o_words(terms, freqs)
    return bags_o_words
def write_header(out_file):
    # newline='' lets the csv module handle line endings itself
    with open(out_file, 'w', newline='') as o_f:
        out = csv.writer(o_f)
        out.writerow(['document_id', 'term', 'frequency'])
def to_bags_o_words(terms, freqs):
    '''Takes the vectorizer's terms and frequency matrix, and returns a
    list of dictionaries, one per document, mapping each term to its
    count within that document.
    '''
    result = []
    for d in freqs:
        curr_result = {terms[i]: val for i, val in enumerate(d) if val > 0}
        result.append(curr_result)
    return result
def write_output(ids, bags_o_words, out_file):
    with open(out_file, 'a', newline='') as o_f:
        out = csv.writer(o_f)
        for i, doc in enumerate(bags_o_words):
            for k, v in doc.items():
                # For each term and count, output a row together with
                # the document id
                out.writerow([ids[i], k, v])
def get_ids_and_abstracts(fn, length_limit):
    with open(fn, 'r') as f:
        in_csv = csv.DictReader(f, delimiter='\t')
        abstracts = []
        ids = []
        for i, r in enumerate(in_csv, start=1):
            try:
                abstracts.append(r['abstract'])
                ids.append(r['eid'])
            except KeyError:
                # Print malformed rows rather than crashing on them
                print(r)
            # Stop early when a sample size was requested
            if length_limit and i >= length_limit:
                break
    return ids, abstracts
if __name__ == '__main__':
    main()
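To make the output format concrete, here is a minimal sketch that runs the two core functions on a toy corpus. The module name count_ngrams, the document ids, and the file demo.csv are assumptions for illustration; the listing above doesn't give a filename. Because get_counts sets min_df=2, a term must appear in at least two of the toy documents to survive.

# A minimal sketch, assuming the listing above is saved as count_ngrams.py
# (that filename is an assumption, not given above)
from count_ngrams import get_counts, write_header, write_output

docs = ['machine learning for text',
        'machine learning models',
        'text mining methods']

# min_df=2 inside get_counts keeps only 'learning', 'machine', and 'text',
# the terms that appear in at least two of the three documents
bags = get_counts(docs, n_features=100, ngram_max=1)
for doc in bags:
    print({str(term): count for term, count in doc.items()})
# {'learning': 1, 'machine': 1, 'text': 1}
# {'learning': 1, 'machine': 1}
# {'text': 1}

# Write the same long-format CSV the script produces, with made-up ids
write_header('demo.csv')
write_output(['d1', 'd2', 'd3'], bags, 'demo.csv')
# demo.csv now holds one row per (document, term) pair, e.g. d1,learning,1

On the real data you would instead run the script itself, for example python count_ngrams.py -n 2 to count unigrams and bigrams, and then read processed_data/ngram_table.csv into the analysis.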