# mw-convo-collections/data_collection/phab_get.py
# from phabricator import Phabricator
import os
import json
import datetime
import time

import requests
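
# The functions below default to a hardcoded API token. A minimal alternative
# sketch, assuming the token were exported under the hypothetical environment
# variable name PHAB_API_TOKEN:
#
#   api_token = os.environ.get("PHAB_API_TOKEN", "")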
# the query function is taken from MGerlach's notebook
def query_task_tag(
    tag_term,
    limit=100,
    ts1=None, ts2=None,
    api_url_base='https://phabricator.wikimedia.org/api/maniphest.search',
    api_token="api-b7lr4rr2yo5kjyxwmkxqbsbelhyf",
    sleep=13,
):
    '''
    Query all tasks matching a search term (a full-text query; the
    project-tag constraint is disabled below).
    OPTIONAL:
    - limit (int, default=100): number of results per query; cannot be larger than 100
    - ts1, ts2 (int, default=None): time window for task creation (Unix timestamps)
    - sleep (int, default=13): seconds to sleep before each query
    - api_url_base (str): URL of the API endpoint
    - api_token (str): Phabricator API token (defaults to Martin's token)
    RETURNS:
    - list of dictionaries
    '''
    time.sleep(sleep)
    to_query = True
    after = None
    data = []
    while to_query:
        time.sleep(sleep)
        params = {
            'api.token': api_token,
            'constraints[query]': [tag_term],  ## term that tasks are searched for with
            ## constraints[projects] seemed to artificially limit the data that
            ## was returned, giving unrealistically low counts, so it is disabled
            # 'constraints[projects]': [tag_term],  ## term that task is tagged with
            'constraints[createdStart]': ts1,  ## timestamp of task creation (min)
            'constraints[createdEnd]': ts2,  ## timestamp of task creation (max)
            'limit': limit,
            'after': after,
            "attachments[subscribers]": "true",
        }
        response = requests.get(api_url_base, params=params)
        print(response)
        result = json.loads(response.text)['result']
        print(result)
        ## the data
        if result is not None:
            data_tmp = result['data']
            data += data_tmp
            ## check if there are more results to query
            cursor = result['cursor']
            ## if after is None, no more queries
            if cursor['after'] is None:
                to_query = False
            ## if after is not None, query the next page by passing the after-argument
            else:
                after = cursor['after']
        else:
            to_query = False
    return data
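
# Example usage (a minimal sketch; the search term and time window mirror the
# __main__ block below, and a valid API token must be configured above):
#
#   ts1 = int(datetime.datetime(2013, 10, 21).timestamp())
#   ts2 = int(datetime.datetime(2013, 12, 5).timestamp())
#   tasks = query_task_tag("http", ts1=ts1, ts2=ts2)
#   print(f"{len(tasks)} tasks returned")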
# also from MGerlach
def query_transactions_phid_task(
    task_phid,
    limit=100,
    api_url_base='https://phabricator.wikimedia.org/api/transaction.search',
    api_token='api-b7lr4rr2yo5kjyxwmkxqbsbelhyf',
    sleep=13,
):
    '''
    Query all transactions for a task (task_phid).
    OPTIONAL:
    - limit (int, default=100): number of results per query; cannot be larger than 100
    - sleep (int, default=13): seconds to sleep before each query
    - api_url_base (str): URL of the API endpoint
    - api_token (str): Phabricator API token (defaults to Martin's token)
    RETURNS:
    - list of dictionaries (or an empty dict if a response could not be parsed)
    '''
    time.sleep(sleep)
    to_query = True
    after = None
    data = []
    while to_query:
        time.sleep(sleep)
        params = {
            'api.token': api_token,
            'objectIdentifier': task_phid,  ## task-phid
            'limit': limit,
            'after': after,
        }
        response = requests.get(api_url_base, params=params)
        try:
            ## the data
            result = json.loads(response.text)['result']
            data_tmp = result['data']
            data += data_tmp
        except json.decoder.JSONDecodeError:
            ## signal an unparseable response with an empty dict and stop paging
            data = {}
            break
        ## check if there are more results to query
        cursor = result['cursor']
        ## if after is None, no more queries
        if cursor['after'] is None:
            to_query = False
        ## if after is not None, query the next page by passing the after-argument
        else:
            after = cursor['after']
    return data
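
# Example usage (a minimal sketch; "PHID-TASK-..." is a placeholder standing
# in for a real task PHID such as those returned by query_task_tag above):
#
#   transactions = query_transactions_phid_task("PHID-TASK-...")
#   comments = {t['id']: t['comments'] for t in transactions}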
def query_users(
    api_url_base='https://phabricator.wikimedia.org/api/user.search',
    api_token='api-b7lr4rr2yo5kjyxwmkxqbsbelhyf',
    sleep=13,
    limit=100,
):
    '''
    Query all users whose names match 'WMF'.
    OPTIONAL:
    - sleep (int, default=13): seconds to sleep before each query
    - limit (int, default=100): number of results per query; cannot be larger than 100
    - api_url_base (str): URL of the API endpoint
    - api_token (str): Phabricator API token (defaults to Martin's token)
    RETURNS:
    - list of dictionaries (or an empty dict if a response could not be parsed)
    '''
    time.sleep(sleep)
    to_query = True
    after = None
    data = []
    while to_query:
        time.sleep(sleep)
        params = {
            'api.token': api_token,
            'constraints[nameLike]': 'WMF',
            'limit': limit,
            'after': after,
        }
        response = requests.get(api_url_base, params=params)
        try:
            ## the data
            result = json.loads(response.text)['result']
            data_tmp = result['data']
            data += data_tmp
        except json.decoder.JSONDecodeError:
            ## signal an unparseable response with an empty dict and stop paging
            data = {}
            break
        ## check if there are more results to query
        cursor = result['cursor']
        ## if after is None, no more queries
        if cursor['after'] is None:
            to_query = False
        ## if after is not None, query the next page by passing the after-argument
        else:
            after = cursor['after']
    return data
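
# Example usage (a minimal sketch mirroring the commented-out roster dump in
# the __main__ block below; the output filename is illustrative):
#
#   users = query_users()
#   with open("wmf_phab_roster.json", "w") as f:
#       json.dump(users, f)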
if __name__ == "__main__":
    # phab = Phabricator("https://phabricator.wikimedia.org/")
    tags = [
        "http"
    ]
    tag = "http"
    # set phabricator api token
    token = "api-b7lr4rr2yo5kjyxwmkxqbsbelhyf"
    api_base = 'https://phabricator.wikimedia.org/api/'
    # p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2011, 9, 1, 0, 0, 0)))
    p_ts1 = int(datetime.datetime.timestamp(datetime.datetime(2013, 10, 21, 0, 0, 0)))
    p_ts2 = int(datetime.datetime.timestamp(datetime.datetime(2013, 12, 5, 0, 0, 0)))
    p_data = query_task_tag(tag, ts1=p_ts1, ts2=p_ts2)
    for entry in p_data:
        task_phid = entry['phid']
        print(task_phid)
        transactions = query_transactions_phid_task(task_phid)
        comments = {}
        for item in transactions:
            comments[item['id']] = item['comments']
        entry['task_comments'] = comments
    DATA_PREFIX = "/data/users/mgaughan/mw-repo-lifecycles/phab_data/"
    with open(f"{DATA_PREFIX}{tag}_10-21-2013_12-5-2013_phab_data.json", "w") as outfile1:
        json.dump(p_data, outfile1)
    '''
    user = query_users()
    with open("022825_wmf_master_phab_roster.json", "w") as outfile1:
        json.dump(user, outfile1)
    '''