Initial commit

2018-06-02 15:32:19 -07:00
commit 72633c193b
202 changed files with 21929 additions and 0 deletions


@@ -0,0 +1,37 @@
"""
Prints the rev_id, characters and hash of all revisions to Willy_on_Wheels.
"""
import getpass
import hashlib
import os
import sys
try:
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import api
except: raise
api_session = api.Session("https://en.wikipedia.org/w/api.php")
print("(EN) Wikipedia credentials...")
username = input("Username: ")
password = getpass.getpass("Password: ")
api_session.login(username, password)
revisions = api_session.deleted_revisions.query(
properties={'ids', 'content'},
titles={'Willy on Wheels'},
direction="newer"
)
for rev in revisions:
print(
"{0} ({1} chars): {2}".format(
rev['revid'],
len(rev.get('*', "")),
hashlib.sha1(bytes(rev.get('*', ""), 'utf8')).hexdigest()
)
)


@@ -0,0 +1,19 @@
"""
Prints the rev_id of all revisions to User:EpochFail.
"""
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import api
api_session = api.Session("https://en.wikipedia.org/w/api.php")
revisions = api_session.revisions.query(
properties={'ids'},
titles={'User:TestAccountForMWUtils'}
)
for rev in revisions:
print(rev['revid'])


@@ -0,0 +1,30 @@
"""
Prints the rev_id and hash of the 10 oldest edits in recent_changes.
"""
import os
import sys
try:
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import api
except:
raise
api_session = api.Session("https://en.wikipedia.org/w/api.php")
changes = api_session.recent_changes.query(
type={'edit', 'new'},
properties={'ids', 'sha1', 'timestamp'},
direction="newer",
limit=10
)
for change in changes:
print(
"{0} ({1}) @ {2}: {3}".format(
change['rcid'],
change['type'],
change['timestamp'],
change.get('sha1', "")
)
)


@@ -0,0 +1,28 @@
"""
Prints the rev_id, characters and hash of all revisions to User:EpochFail.
"""
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))
import hashlib
from mw import api
api_session = api.Session("https://en.wikipedia.org/w/api.php")
revisions = api_session.revisions.query(
properties={'ids', 'content'},
titles={"User:EpochFail"},
direction="newer",
limit=51
)
for rev in revisions:
print(
"{0} ({1} chars): {2}".format(
rev['revid'],
len(rev.get('*', "")),
hashlib.sha1(bytes(rev.get('*', ""), 'utf8')).hexdigest()
)
)


@@ -0,0 +1,20 @@
"""
Prints the rev_id, characters and hash of all revisions to User:EpochFail.
"""
import os
import sys
try:
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import api
except:
raise
api_session = api.Session("https://en.wikipedia.org/w/api.php")
user_docs = api_session.users.query(
users=["EpochFail", "Halfak (WMF)"]
)
for user_doc in user_docs:
print(user_doc)


@@ -0,0 +1,31 @@
"""
"""
import os
import sys
try:
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import database
except:
raise
db = database.DB.from_params(
host="analytics-store.eqiad.wmnet",
read_default_file="~/.my.cnf",
user="research",
db="enwiki"
)
users = db.users.query(
registered_after="20140101000000",
direction="newer",
limit=10
)
for user in users:
print("{user_id}:{user_name} -- {user_editcount} edits".format(**user))


@@ -0,0 +1,59 @@
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="//www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd"
           version="0.8" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <base>http://en.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.22wmf2</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
    </namespaces>
  </siteinfo>
  <page>
    <title>Foo</title>
    <ns>0</ns>
    <id>1</id>
    <revision>
      <id>1</id>
      <timestamp>2004-08-09T09:04:08Z</timestamp>
      <contributor>
        <username>Gen0cide</username>
        <id>92182</id>
      </contributor>
      <text xml:space="preserve">Revision 1 text</text>
      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
      <model>wikitext</model>
      <format>text/x-wiki</format>
    </revision>
    <revision>
      <id>2</id>
      <timestamp>2004-08-10T09:04:08Z</timestamp>
      <contributor>
        <ip>222.152.210.109</ip>
      </contributor>
      <text xml:space="preserve">Revision 2 text</text>
      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
      <model>wikitext</model>
      <comment>Comment 2</comment>
      <format>text/x-wiki</format>
    </revision>
  </page>
  <page>
    <title>Bar</title>
    <ns>1</ns>
    <id>2</id>
    <revision>
      <id>3</id>
      <timestamp>2004-08-11T09:04:08Z</timestamp>
      <contributor>
        <ip>222.152.210.22</ip>
      </contributor>
      <text xml:space="preserve">Revision 3 text</text>
      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
      <model>wikitext</model>
      <format>text/x-wiki</format>
    </revision>
  </page>
</mediawiki>


@@ -0,0 +1,31 @@
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="//www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd"
           version="0.8" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <base>http://en.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.22wmf2</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
    </namespaces>
  </siteinfo>
  <page>
    <title>Herp</title>
    <ns>1</ns>
    <id>2</id>
    <revision>
      <id>4</id>
      <timestamp>2004-08-11T09:04:08Z</timestamp>
      <contributor>
        <id>10</id>
        <name>FOobar!?</name>
      </contributor>
      <text xml:space="preserve">Revision 4 text</text>
      <sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
      <model>wikitext</model>
      <format>text/x-wiki</format>
    </revision>
  </page>
</mediawiki>


@@ -0,0 +1,19 @@
import pprint
import re

from mw.api import Session
from mw.lib import persistence

session = Session("https://en.wikipedia.org/w/api.php")

rev, tokens_added, future_revs = persistence.api.score(session, 560561013,
                                                       properties={'user'})

words_re = re.compile(r"\w+", re.UNICODE)

print("Words added")
for token in tokens_added:
    if words_re.search(token.text):
        print("'{0}' survived:".format(token.text))

        for frev in token.revisions:
            print("\t{revid} by {user}".format(**frev))


@@ -0,0 +1,18 @@
"""
Prints the reverting rev_id, rev_id and reverted to rev_id of all reverted
revisions made by user "PermaNoob".
"""
from mw.api import Session
from mw.lib import reverts
session = Session("https://en.wikipedia.org/w/api.php")
revisions = session.user_contribs.query(user={"PermaNoob"}, direction="newer")
for rev in revisions:
revert = reverts.api.check_rev(session, rev, window=60*60*24*2)
if revert is not None:
print("{0} reverted {1} to {2}".format(
revert.reverting['revid'],
rev['revid'],
revert.reverted_to['revid'])
)


@@ -0,0 +1,23 @@
"""
Prints the reverting rev_id, rev_id and reverted to rev_id of all reverted
revisions made by user with ID 9133062.
"""
from mw.database import DB
from mw.lib import reverts
db = DB.from_params(
host="s1-analytics-slave.eqiad.wmnet",
read_default_file="~/.my.cnf",
user="research",
db="enwiki"
)
revisions = db.revisions.query(user_id=9133062)
for rev_row in revisions:
revert = reverts.database.check_row(db, rev_row)
if revert is not None:
print("{0} reverted {1} to {2}".format(
revert.reverting['rev_id'],
rev_row['rev_id'],
revert.reverted_to['rev_id'])
)


@@ -0,0 +1,21 @@
"""
Prints all reverted revisions of User:EpochFail.
"""
from mw.api import Session
from mw.lib import reverts
# Gather a page's revisions from the API
api_session = Session("https://en.wikipedia.org/w/api.php")
revs = api_session.revisions.query(
titles={"User:EpochFail"},
properties={'ids', 'sha1'},
direction="newer"
)
# Creates a revsion event iterator
rev_events = ((rev['sha1'], rev) for rev in revs)
# Detect and print reverts
for revert in reverts.detect(rev_events):
print("{0} reverted back to {1}".format(revert.reverting['revid'],
revert.reverted_to['revid']))
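
The detector is agnostic about where its (checksum, revision) pairs come from, so the same idea can in principle be run over a dump file instead of the API. The following is a minimal illustrative sketch, not one of the shipped examples; it assumes the bundled examples/dump.xml and that dump revision objects expose sha1 and id attributes.

from mw.lib import reverts
from mw.xml_dump import Iterator

# Construct a dump file iterator (same fixture as the xml_dump examples)
dump = Iterator.from_file(open("examples/dump.xml"))

for page in dump:
    # Pair each revision's checksum with its rev_id
    # (assumes Revision objects expose .sha1; the dump includes <sha1> elements)
    rev_events = ((rev.sha1, rev.id) for rev in page)

    # Detect and print identity reverts within the page
    for revert in reverts.detect(rev_events):
        print("{0} reverted back to {1}".format(revert.reverting,
                                                revert.reverted_to))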


@@ -0,0 +1,17 @@
"""
Prints out session information for user "TextAccountForMWUtils"
"""
from mw.api import Session
from mw.lib import sessions
# Gather a user's revisions from the API
api_session = Session("https://en.wikipedia.org/w/api.php")
revs = api_session.user_contribs.query(
user={"TestAccountForMWUtils"},
direction="newer"
)
rev_events = ((rev['user'], rev['timestamp'], rev) for rev in revs)
# Extract and print sessions
for user, session in sessions.cluster(rev_events):
print("{0}'s session with {1} revisions".format(user, len(session)))


@@ -0,0 +1,26 @@
"""
Demonstrates title normalization and parsing.
"""
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw.api import Session
from mw.lib import title
# Normalize titles
title.normalize("foo bar")
# > "Foo_bar"
# Construct a title parser from the API
api_session = Session("https://en.wikipedia.org/w/api.php")
parser = title.Parser.from_api(api_session)
# Handles normalization
parser.parse("user:epochFail")
# > 2, "EpochFail"
# Handles namespace aliases
parser.parse("WT:foobar")
# > 5, "Foobar"


@@ -0,0 +1,27 @@
"""
Demonstrates some simple Timestamp operations
"""
from mw import Timestamp
# Seconds since Unix Epoch
str(Timestamp(1234567890))
# > '20090213233130'
# Database format
int(Timestamp("20090213233130"))
# > 1234567890
# API format
int(Timestamp("2009-02-13T23:31:30Z"))
# > 1234567890
# Difference in seconds
Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
# > 1
# strptime and strftime
Timestamp(1234567890).strftime("%Y foobar")
# > '2009 foobar'
str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
# > '20091001000000'
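
Because differences come back as plain seconds, Timestamps can be compared against a cutoff directly. A small illustrative sketch, not part of the shipped examples, assuming a one-hour threshold:

from mw import Timestamp

CUTOFF = 60 * 60  # assumed one-hour threshold, in seconds

def within_cutoff(last, current):
    # Both arguments accept any format that Timestamp understands
    return Timestamp(current) - Timestamp(last) <= CUTOFF

within_cutoff("2009-02-13T23:31:30Z", 1234567890 + 1800)
# > True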


@@ -0,0 +1,14 @@
"""
Prints out all rev_ids that appear in dump.xml.
"""
from mw.xml_dump import Iterator
# Construct dump file iterator
dump = Iterator.from_file(open("examples/dump.xml"))
# Iterate through pages
for page in dump:
# Iterate through a page's revisions
for revision in page:
print(revision.id)


@@ -0,0 +1,15 @@
"""
Processes two dump files.
"""
from mw import xml_dump
files = ["examples/dump.xml", "examples/dump2.xml"]
def page_info(dump, path):
for page in dump:
yield page.id, page.namespace, page.title
for page_id, page_namespace, page_title in xml_dump.map(files, page_info):
print("\t".join([str(page_id), str(page_namespace), page_title]))