import git
from tqdm import tqdm
import csv
import os
import shutil
import time
import pandas as pd
import datetime
import argparse
'''
RUNNING INSTRUCTIONS:
[1] set up tmux environment
[2] edit this file where marked "FIX BELOW"
[3] install pip packages
[4] in your tmux environment, run the following three commands
    - os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    - os.environ['GIT_ASKPASS'] = 'false'
    - os.environ['GIT_TERMINAL_PROMPT'] = '0'
[5] in tmux, run the script as follows with your START and STOP values
    - python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so I would appreciate it if you could check on the script every so often in case anything hangs
THANK YOU VERY MUCH - matt
'''
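
# Example invocation (the index values here are illustrative; use your assigned batch):
#   python3 intermediary_script.py --start_index 0 --stop_index 500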

# FIX BELOW: temp_dir is where repositories will be temporarily cloned; if you are worried about space, specify a different location here
temp_dir = "/data/users/mgaughan/tmp3/"

cst = datetime.timezone(datetime.timedelta(hours=-6))
from_date = datetime.datetime(1970, 1, 1, 12, 00, 00, tzinfo=cst)
to_date = datetime.datetime(2024, 3, 16, 12, 00, 00, tzinfo=cst)

# FIX BELOW: COMMIT_SAVE_PREFIX is where the commit data will be stored; the parent directory below must contain the subdirs contributing_commit_data and readme_commit_data
COMMIT_SAVE_PREFIX = "/data/users/mgaughan/kkex/012825_cam_revision_main/"

def temp_clone(vcs_link, temp_location):
    """
    ARGS
        vcs_link : url link to the upstream repo vcs
        temp_location : filepath that the repo should be cloned to
    RETURNS
        repo : the git.Repo object of the cloned repo
        repo_path : the filepath to the cloned repository
    """
    #print(temp_location)
    vcs_link = vcs_link.strip()
    os.makedirs(temp_location)
    repo_path = temp_location
    repo = git.Repo.clone_from(vcs_link, repo_path)
    print(f"Successfully Cloned {vcs_link}")
    return repo, repo_path

def delete_clone(temp_location):
    """
    ARGS
        temp_location : filepath to the cloned repository
    RETURNS
        0 if the clone was deleted, 1 if there was no clone at that location
    """
    if os.path.exists(temp_location):
        shutil.rmtree(temp_location)
        print(f"{temp_location} has been deleted.")
        return 0
    else:
        print("No clone at location")
        return 1

# parses through commits in reverse chronological order, hence the flipping of the terms for the arguments
def commit_analysis(repo, cutoff_date, start_date):
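    """
    ARGS
        repo : git.Repo object to analyze
        cutoff_date : commits older than this datetime stop the scan
        start_date : commits newer than this datetime are skipped
    RETURNS
        commits_info : a list of dicts, one per commit in the window, with author/committer
        details, organization (first label of the email domain), containing branches, and
        per-file diff information from diff_analysis
    """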
    print("Analyzing Commits...")
    commits_info = []
    for commit in repo.iter_commits():
        # too recent: skip; too far back: stop
        if commit.committed_datetime > start_date:
            continue
        if commit.committed_datetime < cutoff_date:
            break
        commit_info = {
            "commit_hash": commit.hexsha,
            "author_name": commit.author.name,
            "author_email": commit.author.email,
            "authored_date": commit.authored_datetime,
            "committer_name": commit.committer.name,
            "committer_email": commit.committer.email,
            "commit_date": commit.committed_datetime,
            "message": commit.message,
            "is_merge": len(commit.parents) > 1,
        }
        # author/committer org information (first label of the email domain)
        commit_info['author_org'] = commit_info["author_email"].split("@")[-1].split(".")[0]
        commit_info['committer_org'] = commit_info["committer_email"].split("@")[-1].split(".")[0]
        # some more effort to get this information
        commit_info["branches"] = repo.git.branch(
            "--contains", commit_info["commit_hash"]
        )
        # diff information
        diffs = commit.diff(
            commit.parents[0] if commit.parents else git.NULL_TREE, create_patch=True
        )
        commit_info["diff_info"] = diff_analysis(diffs)
        # print(commit_info)
        commits_info.append(commit_info)
    return commits_info

def diff_analysis(diffs):
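    """
    ARGS
        diffs : the DiffIndex produced by commit.diff(..., create_patch=True)
    RETURNS
        diff_objects : a list of dicts, one per changed file, with added/deleted line counts,
        parent/child filepaths, change type, and new/deleted/renamed flags
    """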
    diff_objects = []
    for diff in diffs:
        diff_info = {
            "lines_added": sum(
                1
                for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
                if line.startswith("+") and not line.startswith("+++")
            ),
            "lines_deleted": sum(
                1
                for line in diff.diff.decode("utf-8", errors="ignore").split("\n")
                if line.startswith("-") and not line.startswith("---")
            ),
            "parent_filepath": diff.a_path,
            "child_filepath": diff.b_path,
            "change_type": diff.change_type,
            "new_file": diff.new_file,
            "deleted_file": diff.deleted_file,
            "renamed_file": diff.renamed,
            #'diff': diff.diff.decode('utf-8')
        }
        diff_objects.append(diff_info)
    return diff_objects

def for_all_files(start_index, stop_index):
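    """
    Iterates over rows start_index through stop_index of the batching CSV, clones each
    upstream repository into temp_dir, checks it out as of 2024-03-16, and, if the
    repository has a README or CONTRIBUTING file, writes its commit data to CSV under
    COMMIT_SAVE_PREFIX. Cloning errors and summary counts are written to text files
    named after stop_index.
    """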
    cwd = os.getcwd()
    csv_path = "for_batching/deb_full_data.csv"
    index = -1
    saved = []
    empty_row = 0
    clone_error = []
    has_readme = 0
    has_contributing = 0
    try:
        with open(csv_path, 'r') as file:
            csv_reader = csv.DictReader(file)
            lines = [line for line in file]
            for row in tqdm(csv.reader(lines), total=len(lines)):
                index += 1
                if index < start_index:
                    continue
                time.sleep(4)
                if row[0] == "":
                    empty_row += 1
                    continue
                # row[5] = upstream vcs
                temp_repo_path = ""
                und_repo_id = ""
                try:
                    # non-interactive git: never prompt for credentials
                    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
                    os.environ['GIT_ASKPASS'] = 'false'
                    os.environ['GIT_TERMINAL_PROMPT'] = '0'
                    ssh_url = ""
                    try:
                        if "github" in row[5]:
                            repo_id = row[5][len('https://github.com/'):]
                            ssh_url = f'git@github.com:{repo_id}.git'
                            if ssh_url.endswith('.git.git'):
                                ssh_url = ssh_url[:-4]
                            temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                        else:
                            parts = row[5].split('/')
                            domain = parts[2]
                            repo_id = '/'.join(parts[3:])
                            try:
                                temp_repo, temp_repo_path = temp_clone(row[5], temp_dir)
                            except Exception as e:
                                print(f'non-Github cloning error, assuming HTTPS issue: {e}')
                                delete_clone(temp_dir)
                                ssh_url = f'git@{domain}:{repo_id}.git'
                                if ssh_url.endswith('.git.git'):
                                    ssh_url = ssh_url[:-4]
                                temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
                    except Exception as e:
                        print(f'cloning error at {row[5]}')
                        print(f'inside cloning error: {e}')
                        raise ValueError(e)
                    os.chdir(temp_repo_path)
                    # check out the repo as it was at the cutoff date
                    os.system(f"git checkout `git rev-list -n 1 --before='2024-03-16 12:00:00'`")
                    os.chdir(cwd)
                    has_readme_bool, has_contributing_bool = False, False
                    for filename in os.listdir(temp_repo_path):
                        if filename.startswith("README"):
                            has_readme_bool = True
                        if filename.startswith("CONTRIBUTING"):
                            has_contributing_bool = True
                    if has_readme_bool or has_contributing_bool:
                        commits_array = commit_analysis(temp_repo, from_date, to_date)
                        commits_df = pd.DataFrame.from_records(commits_array)
                        und_repo_id = '_'.join(repo_id.split("/"))
                        if has_readme_bool:
                            has_readme += 1
                            commits_df.to_csv(
                                f"{COMMIT_SAVE_PREFIX}readme_commit_data/{und_repo_id}_commits.csv",
                                index=False,
                            )
                        if has_contributing_bool:
                            has_contributing += 1
                            commits_df.to_csv(
                                f"{COMMIT_SAVE_PREFIX}contributing_commit_data/{und_repo_id}_commits.csv",
                                index=False,
                            )
                except Exception as e:
                    clone_error.append([row[5], str(e)])
                    print(f"outside cloning error: {e}")
                finally:
                    und_repo_id = ""
                    delete_clone(temp_dir)
                    os.chdir(cwd)

                if index == stop_index:
                    break
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    finally:
        print(clone_error)
        with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
            for error in clone_error:
                txt_file.write(', '.join(error) + "\n")
        with open(f"{stop_index}-success-output.txt", "w") as txt_file:
            txt_file.write(f"Number of Empty Rows: {empty_row}\n")
            txt_file.write(f"Number of Cloning Errors: {len(clone_error)}\n")
            txt_file.write(f"Number that has README: {has_readme}\n")
            txt_file.write(f"Number that has CONTRIBUTING: {has_contributing}")
        print(f"Number of Empty Rows: {empty_row}")
        print(f"Number of Cloning Errors: {len(clone_error)}")
        print(f"Number that has README: {has_readme}")
        print(f"Number that has CONTRIBUTING: {has_contributing}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="chase validation")
    parser.add_argument("--start_index", type=int, required=True, help="The starting index for the search")
    parser.add_argument("--stop_index", type=int, required=True, help="The stopping index for the search")
    args = parser.parse_args()
    for_all_files(args.start_index, args.stop_index)
    #temp_repo, temp_repo_path = temp_clone("https://gitlab.gnome.org/GNOME/almanah", temp_dir)
    #delete_clone(temp_dir)
    # python3 intermediary_script.py --start_index START --stop_index STOP