# File-viewer listing metadata: Python source, 98 lines, 3.5 KiB.
import mwapi
|
|
from revscoring import Model
|
|
import articlequality
|
|
import pyarrow
|
|
import pandas as pd
|
|
import scoring_utils
|
|
from itertools import chain, zip_longest
|
|
from multiprocessing import Pool
|
|
from functools import partial
|
|
from pyRemembeR import Remember
|
|
import fire
|
|
from pathlib import Path
|
|
import tqdm
|
|
# Store for named summary values: score_sample calls remember(value, name).
# NOTE(review): the argument is presumably the file pyRemembeR persists to -- confirm against pyRemembeR.
remember = Remember("score_sample_articles.RDS")
|
|
|
|
def get_revision_text(revid_batch, api):
    """Fetch the main-slot content for one batch of revision ids.

    Args:
        revid_batch: Iterable of revision ids. ``None`` entries (the fill
            value ``grouper`` pads the last batch with) are dropped.
        api: Object exposing ``get(**params)`` that returns the decoded API
            response as a dict (an ``mwapi.Session`` in this script).

    Yields:
        One ``{'revid': ..., 'text': ...}`` dict per revision found in the
        response. A missing revid or missing text is reported as ``{}``.
    """
    # Materialise the ids so a batch that is empty after filtering can be
    # detected; no request is issued for it.
    revids = [rid for rid in revid_batch if rid is not None]
    if not revids:
        return

    response = api.get(action='query',
                       prop='revisions',
                       revids=revids,
                       rvprop=['ids', 'content'],
                       rvslots=['main'])

    pages = response.get('query', {}).get('pages', {})
    for page in pages.values():
        for revision in page.get('revisions', []):
            text = revision.get('slots', {}).get('main', {}).get('*', {})
            yield {'revid': revision.get('revid', {}), 'text': text}
|
|
|
|
def grouper(n, iterable, fillvalue=None):
    """Split ``iterable`` into consecutive ``n``-tuples, padding the last one.

    Example: ``grouper(3, 'ABCDEFG', 'x')`` yields ABC, DEF, Gxx.
    """
    # The same iterator is handed to zip_longest n times, so each output
    # tuple consumes the next n items of the input.
    shared = iter(iterable)
    return zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)
|
|
|
|
def pull_revision_texts(revids, api, api_batch_size):
    """Yield ``{'revid', 'text'}`` dicts for ``revids``, fetched in batches.

    Args:
        revids: Iterable of revision ids.
        api: API session passed through to ``get_revision_text``.
        api_batch_size: Number of revision ids grouped into each request.

    Yields:
        Whatever ``get_revision_text`` yields for each batch, in batch order.
    """
    batches = grouper(api_batch_size, revids)
    fetch_batch = partial(get_revision_text, api=api)
    # chain.from_iterable pulls one batch at a time; chain(*map(...)) would
    # unpack (and therefore build) every batch before the first result.
    yield from chain.from_iterable(map(fetch_batch, batches))
|
|
|
|
def score_revisions(revids, api, api_batch_size=50, parallel=True, ncores=48):
    """Download revision texts and yield one scored record per revision.

    Args:
        revids: Iterable of revision ids to score.
        api: API session used to download the revision texts.
        api_batch_size: Revisions requested per API call; the pool chunk
            size is four times this value.
        parallel: When true, score in a multiprocessing pool. Results are
            then produced by ``imap_unordered`` and may arrive in any order.
        ncores: Number of worker processes used when ``parallel`` is true.

    Yields:
        ``scoring_utils.to_pddict(rev, kept_keys=['revid'])`` for every
        revision returned by ``scoring_utils.add_score``.
    """
    revs = pull_revision_texts(revids, api, api_batch_size)

    # Close the model file once the model is loaded instead of leaking the handle.
    with open('articlequality/models/enwiki.nettrom_wp10.gradient_boosting.model', 'rb') as model_file:
        scorer_model = Model.load(model_file)

    add_score = partial(scoring_utils.add_score, scorer_model=scorer_model)
    to_pddict = partial(scoring_utils.to_pddict, kept_keys=['revid'])

    if not parallel:
        # Serial path: no worker pool is created at all.
        yield from map(to_pddict, map(add_score, revs))
        return

    # imap_unordered is lazy, so results are yielded from inside the
    # with-block; the pool is torn down once the generator is exhausted or closed.
    with Pool(ncores) as pool:
        scored = pool.imap_unordered(add_score, revs, chunksize=api_batch_size * 4)
        yield from map(to_pddict, scored)
|
|
|
|
# Earlier values, kept for reference:
#   sample_file_parquet = "data/article_sample_set.parquet"
#   output_feather = "data/scored_article_sample.feather"

# Module-level paths. score_sample takes parameters with the same names, so
# it does not read these globals.
# NOTE(review): these look like interactive-debugging leftovers -- confirm before removing.
sample_file = "/data/nti9383home/production_functions/data/20200301_article_labelings_sample.feather"
output = "/data/nti9383home/production_functions/data/scored_article_sample.feather"
|
|
|
|
def _swap_suffixes(path, extension):
    """Return ``path`` with every suffix of its final component replaced by ``extension``."""
    joined = "".join(path.suffixes)
    # Slice the suffixes off the file name only; a str.replace over the whole
    # path would also hit directory names and breaks when there is no suffix.
    stem = path.name[:len(path.name) - len(joined)]
    return path.with_name(stem + extension)


def score_sample(sample_file="data/article_sample_set.feather", output="data/scored_article_sample.feather"):
    """Score every revision in a sample and write the merged results to disk.

    Args:
        sample_file: Feather file holding the sample; it must have a
            ``revid`` column.
        output: Output path. Its suffixes are replaced to derive three
            sibling files: ``.json``, ``.feather`` and ``.csv``.

    Side effects:
        Writes one line per score to the ``.json`` file as scores arrive,
        writes the sample merged with its scores to the ``.feather`` and
        ``.csv`` files, and records the sample sizes through ``remember``.
    """
    sample = pd.read_feather(sample_file)

    # De-duplicate revision ids; the set size is also the progress-bar total.
    revids = set(sample.revid)
    user_agent = "Nate TeBlunthuis <nathante@uw.edu>. What's the relationship between contributors and article quality?"
    api = mwapi.Session("https://en.wikipedia.org", user_agent=user_agent)

    scores = tqdm.tqdm(score_revisions(revids, api, 50, True), total=len(revids), miniters=100, smoothing=0.2)

    p = Path(output)
    output_csv = _swap_suffixes(p, ".csv")
    output_json = _swap_suffixes(p, ".json")
    output_feather = _swap_suffixes(p, ".feather")

    saved_scores = []
    # NOTE(review): each line is str(score), i.e. a Python repr, which is not
    # guaranteed to be valid JSON despite the file extension.
    with open(output_json, 'w') as of:
        for score in scores:
            of.write(str(score) + '\n')
            saved_scores.append(score)

    scored_revids = pd.DataFrame(saved_scores)
    sample_1 = sample.merge(scored_revids, left_on="revid", right_on="revid")

    # Row count of the input sample, before the merge drops unscored rows.
    remember(sample.shape[0], "sample_size_unscored")
    remember(sample_1.shape[0], "sample_size_scored")

    sample_1.to_feather(output_feather)
    sample_1.to_csv(output_csv)
|
|
|
|
# Script entry point: hand score_sample to fire.Fire when run directly.
if __name__ == "__main__":
    fire.Fire(score_sample)
|