import csv
import os
import pandas as pd

def for_readme_files():
    ld_csv_path = "final_data/deb_readme_did.csv"
    ta_csv_path = "text_analysis/d_readability_readme.csv"
    topic_csv_path = "text_analysis/readme_file_topic_distributions.csv"
    # criteria for a good readme
    # median of the three examples listed on the contributing guidelines page
    # https://github.com/rails/rails/blob/main/README.md (71s)
    # https://github.com/github/docs/blob/main/README.md (48s)
    # https://github.com/opengovernment/opengovernment/blob/master/README.md (14s)
    # Median 48s
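    # (equivalently, statistics.median([71, 48, 14]) == 48; the reading_time cutoff below hard-codes this value)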
    readme_ta_df = pd.read_csv(ta_csv_path)
    time_threshold_df = readme_ta_df[readme_ta_df['reading_time'] >= 48]
    # and then making sure they're on topic, R8 > 0.10 (0.8 * 0.125)
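    # (0.125 is presumably the uniform share over the 8 readme topics, scaled by 0.8;
    #  "R8" appears to correspond to column 't7' below)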
    readme_topic_df = pd.read_csv(topic_csv_path)
    topic_threshold_df = readme_topic_df[readme_topic_df['t7'] >= 0.1]
    # LENGTH-BASED VERSION: reading-time threshold only
    readme_exemplar_df = time_threshold_df
    # STRICT VERSION: also require the topic threshold
    #readme_exemplar_df = pd.merge(time_threshold_df, topic_threshold_df, on="filename", how="inner")
    exemplary_files = readme_exemplar_df['filename'].str.split('_').str[:-1].str.join("_")
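    # (drops the last '_'-separated chunk to recover the repo name; e.g. a hypothetical
    #  filename "somerepo_README.md" would become "somerepo")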
    # manual cleaning for filenames that don't split cleanly
    exemplary_files = exemplary_files[exemplary_files != "CheMPS2_README_8md"]
    exemplary_files = pd.concat([exemplary_files, pd.Series("CheMPS2")])
    exemplary_files = exemplary_files[exemplary_files != "pg_filedump.git_README.pg"]
    exemplary_files = pd.concat([exemplary_files, pd.Series("pg_filedump.git")])
    print(len(exemplary_files))
    # connecting with the timeseries data
    ts_data_df = pd.read_csv(ld_csv_path)
    #print(ts_data_df['upstream_vcs_link'].str.split("/").str[-1])
    test_vec = ts_data_df['upstream_vcs_link'].str.split("/").str[-1]
    diff_vec = exemplary_files[~exemplary_files.isin(test_vec)]
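    # diff_vec: exemplary files with no matching upstream_vcs_link in the time series data,
    # printed below for manual inspection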
    print(diff_vec)
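    # keep only the time series rows whose repo name (the last path segment of
    # upstream_vcs_link) is in the exemplary set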
    subset_ts_data = ts_data_df[ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)]
    #if "CheMPS2" in exemplary_files:
    #    subset_ts_data = pd.concat([subset_ts_data, ts_data_df[ts_data_df['upstream_vcs_link'] == "https://github.com/SebWouters/CheMPS2"]])
    print(subset_ts_data.shape[0])
    #print("https://github.com/SebWouters/CheMPS2" in subset_ts_data["upstream_vcs_link"])
    subset_ts_data.to_csv('110124_supp_analysis/110124_readme_length_subset.csv', index=False)

def for_contributing_files():
    ld_csv_path = "final_data/deb_contrib_did.csv"
    ta_csv_path = "text_analysis/d_readability_contributing.csv"
    topic_csv_path = "text_analysis/contrib_file_topic_distributions.csv"
    # criteria for a good contributing file
    # median of the three examples listed on https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions/setting-guidelines-for-repository-contributors
    # https://github.com/rails/rails/blob/main/CONTRIBUTING.md (116s)
    # https://github.com/github/docs/blob/main/.github/CONTRIBUTING.md (59s)
    # https://github.com/opengovernment/opengovernment/blob/master/CONTRIBUTING.md (45s)
    # Median 59s
    contributing_ta_df = pd.read_csv(ta_csv_path)
    time_threshold_df = contributing_ta_df[contributing_ta_df['reading_time'] >= 59]
    # and then making sure they're on topic, C4 > 0.2 (0.8 * 0.25)
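    # (0.25 is presumably the uniform share over the 4 contributing topics, scaled by 0.8;
    #  "C4" appears to correspond to column 't3' below)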
    contributing_topic_df = pd.read_csv(topic_csv_path)
    topic_threshold_df = contributing_topic_df[contributing_topic_df['t3'] >= 0.2]
    contributing_exemplar_df = pd.merge(topic_threshold_df, time_threshold_df, on='filename', how='inner')
    print(time_threshold_df.shape[0])
    # LENGTH-BASED VERSION: reading-time threshold only
    exemplary_files = time_threshold_df['filename'].str.replace("_CONTRIBUTING.md", "")
    # STRICT VERSION: also require the topic threshold
    #exemplary_files = contributing_exemplar_df['filename'].str.replace("_CONTRIBUTING.md", "")
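    # manual cleaning: these two CONTRIBUTING.rst filenames are not stripped by the
    # ".md" replace above, so swap in the bare repo names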
    exemplary_files = exemplary_files[exemplary_files != "imbalanced-learn.git_CONTRIBUTING.rst"]
    exemplary_files = pd.concat([exemplary_files, pd.Series("imbalanced-learn.git")])
    exemplary_files = exemplary_files[exemplary_files != "synapse.git_CONTRIBUTING.rst"]
    exemplary_files = pd.concat([exemplary_files, pd.Series("synapse.git")])
    # reading in the data
    ts_data_df = pd.read_csv(ld_csv_path)
    test_vec = ts_data_df['upstream_vcs_link'].str.split("/").str[-1]
    diff_vec = exemplary_files[~exemplary_files.isin(test_vec)]
    print(diff_vec)
    subset_ts_data = ts_data_df[ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)]
    print(subset_ts_data.shape[0])
    subset_ts_data.to_csv('110124_supp_analysis/110124_contrib_length_subset.csv', index=False)

if __name__ == "__main__":
    for_contributing_files()
    for_readme_files()