diff --git a/final_data/deb_readme_did.csv b/final_data/deb_readme_did.csv index 4604fea..ab3a065 100644 --- a/final_data/deb_readme_did.csv +++ b/final_data/deb_readme_did.csv @@ -525,7 +525,7 @@ https://github.com/spanezz/django-housekeeping.git,2014-05-14 9:53:17,fe18a0159c https://github.com/spaam/svtplay-dl,2011-10-11 18:30:39,4ae1c73079d97b07201d38ea78012e1d877c2eac,"[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 6, 0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 1, 10, 3, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 10, 0, 0, 3, 0, 0, 0, 5, 0, 0, 5, 0]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,1,0,0,Tue Mar 1 17:42:12 2011 +0100,2011-03-01 11:42:12,4863.470694,224.2836458 https://github.com/SoundScapeRenderer/ssr,2013-11-27 16:08:24,282398501961d280e7845ca1c2f4841b62d40b65,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[9, 3, 8, 2, 0, 2, 24, 11, 2, 3, 0, 3, 7, 8, 4, 6, 1, 0, 0, 3, 0, 0, 1, 0, 2, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,1,2,0,Wed Nov 27 17:08:24 2013 +0100,2013-11-27 11:08:24,3861.494167,0.2083333333 https://github.com/sorich87/bootstrap-tour,2012-07-03 14:03:22,f8cf8c62981ef84ddc2561f7b2939d1f2a92832e,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[10, 22, 4, 1, 0, 11, 1, 1, 14, 0, 0, 0, 0, 2, 4, 1, 1, 9, 2, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 5, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,2,12,0,Tue Jul 3 15:02:56 2012 +0100,2012-07-03 10:02:56,4373.581296,0.1669675926 -https://github.com/agateau/yokadi/issues/new,2009-02-03 9:41:14,b68abca7bf25451a3ee24d62b3d8b4f7d6e9b81b,"[1, 0, 25, 21, 10, 4, 7, 16, 1, 0, 0, 1, 1, 7, 22, 6, 8, 35, 0, 20, 0, 1, 10, 11, 7, 15, 2]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[28, 7, 15, 0, 0, 5, 14, 4, 8, 2, 0, 1, 0, 1, 13, 8, 2, 0, 7, 0, 1, 16, 31, 2, 6, 9, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",4,0,6,2,Sun Feb 1 15:37:33 2009 -0700,2009-02-01 17:37:33,5621.223924,1.669224537 +https://github.com/agateau/yokadi,2009-02-03 9:41:14,b68abca7bf25451a3ee24d62b3d8b4f7d6e9b81b,"[1, 0, 25, 21, 10, 4, 7, 16, 1, 0, 0, 1, 1, 7, 22, 6, 8, 35, 0, 20, 0, 1, 10, 11, 7, 15, 2]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[28, 7, 15, 0, 0, 5, 14, 4, 8, 2, 0, 1, 0, 1, 13, 8, 2, 0, 7, 0, 1, 16, 31, 2, 6, 9, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",4,0,6,2,Sun Feb 1 15:37:33 2009 -0700,2009-02-01 17:37:33,5621.223924,1.669224537 https://github.com/somiaj/fvwm2-debian,2021-06-02 0:27:05,629cf2e5fbdc1d3ff8b9c28855ae2656a07be11f,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2,0,0,0,Tue Jun 1 18:27:05 2021 -0600,2021-06-01 20:27:05,1118.147859,0.1666666667 https://github.com/solvespace/solvespace/issues/new,2015-07-10 12:59:12,636b20bfa9fae2a40dba6b1b8c1adecfbfc5a9b9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,Tue Mar 25 02:02:13 2008 -0800,2008-03-25 6:02:13,5934.748461,2663.289572 https://github.com/aircrack-ng/mdk4,2018-02-06 8:23:27,218029afb9c0e9ef3a9ddfcf4a91a4c727d32ce1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7, 0, 1, 7, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",2,1,3,0,Mon Feb 5 01:54:04 2018 +0000,2018-02-04 20:54:04,2331.087454,1.478738426 diff --git a/sample_good_subset.py b/sample_good_subset.py index 27752e2..2858ba2 100644 --- a/sample_good_subset.py +++ b/sample_good_subset.py @@ -14,10 +14,29 @@ def for_readme_files(): # https://github.com/opengovernment/opengovernment/blob/master/README.md (14s) # Median 48s readme_ta_df = pd.read_csv(ta_csv_path) - threshold_count = readme_ta_df[readme_ta_df['reading_time'] >= 48].shape[0] - print(threshold_count) - # R8 > 0.125 + time_threshold_df = readme_ta_df[readme_ta_df['reading_time'] >= 48] + # R8 > 0.10 (0.8 * 0.125) readme_topic_df = pd.read_csv(topic_csv_path) + topic_threshold_df = readme_topic_df[readme_topic_df['t7'] >= 0.1] + readme_exemplar_df = pd.merge(time_threshold_df, topic_threshold_df, on="filename", how="inner") + exemplary_files = readme_exemplar_df['filename'].str.split('_').str[:-1].str.join("_") + #one manual cleaning + exemplary_files = exemplary_files[exemplary_files!= "CheMPS2_README_8md"] + exemplary_files = pd.concat([exemplary_files, pd.Series("CheMPS2")]) + print(len(exemplary_files)) + #connecting with the timeseries data + ts_data_df = pd.read_csv(ld_csv_path) + #print(ts_data_df['upstream_vcs_link'].str.split("/").str[-1]) + test_vec = ts_data_df['upstream_vcs_link'].str.split("/").str[-1] + diff_vec = exemplary_files[~exemplary_files.isin(test_vec)] + subset_ts_data = ts_data_df[ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)] + #if "CheMPS2" in exemplary_files: + # subset_ts_data = pd.concat([subset_ts_data, ts_data_df[ts_data_df['upstream_vcs_link'] == "https://github.com/SebWouters/CheMPS2"]]) + print(subset_ts_data.shape[0]) + #print("https://github.com/SebWouters/CheMPS2" in subset_ts_data["upstream_vcs_link"]) + + #subset_ts_data.to_csv('102724_readme_exemplar_subset.csv', index=False) + def for_contributing_files(): ld_csv_path = "final_data/deb_contrib_did.csv" @@ -30,10 +49,19 @@ def for_contributing_files(): # https://github.com/opengovernment/opengovernment/blob/master/CONTRIBUTING.md (45s) # Median 59s contributing_ta_df = pd.read_csv(ta_csv_path) - threshold_count = contributing_ta_df[contributing_ta_df['reading_time'] >= 59].shape[0] - print(threshold_count) - # and then making sure they're on topic, C4 > 0.25 - contributing_ta_df = pd.read_csv(topic_csv_path) + time_threshold_df = contributing_ta_df[contributing_ta_df['reading_time'] >= 59] + # and then making sure they're on topic, C4 > 0.2 (0.8 * 0.25) + contributing_topic_df = pd.read_csv(topic_csv_path) + topic_threshold_df = contributing_topic_df[contributing_topic_df['t3'] >= 0.2] + contributing_exemplar_df = pd.merge(topic_threshold_df, time_threshold_df, on='filename', how='inner') + print(contributing_exemplar_df.shape[0]) + exemplary_files = contributing_exemplar_df['filename'].str.replace("_CONTRIBUTING.md", "") + # reading in the data + ts_data_df = pd.read_csv(ld_csv_path) + subset_ts_data = ts_data_df[ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)] + print(subset_ts_data.shape[0]) + + #subset_ts_data.to_csv('102724_contrib_exemplar_subset.csv', index=False) if __name__ == "__main__": for_contributing_files()