import csv
import os

import pandas as pd


def for_readme_files():
    """Select 'exemplary' README files and subset the time-series data to them.

    A README is exemplary when (a) its estimated reading time meets the
    median of three reference READMEs and (b) it is sufficiently on-topic
    per the fitted topic model. Prints the number of exemplary files and
    the number of matching time-series rows.

    Returns:
        pd.DataFrame: rows of the DID time-series data whose upstream repo
        name matches an exemplary README's project name.
    """
    ld_csv_path = "final_data/deb_readme_did.csv"
    ta_csv_path = "text_analysis/d_readability_readme.csv"
    topic_csv_path = "text_analysis/readme_file_topic_distributions.csv"
    # Criteria for a good README: reading time >= the median (48s) of three
    # examples listed on the contributing page:
    # https://github.com/rails/rails/blob/main/README.md (71s)
    # https://github.com/github/docs/blob/main/README.md (48s)
    # https://github.com/opengovernment/opengovernment/blob/master/README.md (14s)
    readme_ta_df = pd.read_csv(ta_csv_path)
    time_threshold_df = readme_ta_df[readme_ta_df['reading_time'] >= 48]
    # On-topic check: topic R8 (0-indexed column 't7') weight > 0.10 (0.8 * 0.125)
    readme_topic_df = pd.read_csv(topic_csv_path)
    topic_threshold_df = readme_topic_df[readme_topic_df['t7'] >= 0.1]
    readme_exemplar_df = pd.merge(
        time_threshold_df, topic_threshold_df, on="filename", how="inner"
    )
    # Drop the trailing "_README*" token of the filename to recover the
    # project name (segments are rejoined with "_" since names may contain it).
    exemplary_files = (
        readme_exemplar_df['filename'].str.split('_').str[:-1].str.join("_")
    )
    # One manual cleaning: replace the malformed CheMPS2 entry with the
    # correct project name so it can match its upstream_vcs_link.
    exemplary_files = exemplary_files[exemplary_files != "CheMPS2_README_8md"]
    exemplary_files = pd.concat([exemplary_files, pd.Series(["CheMPS2"])])
    print(len(exemplary_files))
    # Connect with the time-series data: the last path segment of the
    # upstream VCS link is the repository (project) name.
    ts_data_df = pd.read_csv(ld_csv_path)
    subset_ts_data = ts_data_df[
        ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)
    ]
    print(subset_ts_data.shape[0])
    #subset_ts_data.to_csv('102724_readme_exemplar_subset.csv', index=False)
    return subset_ts_data


def for_contributing_files():
    """Select 'exemplary' CONTRIBUTING files and subset the time-series data.

    Mirrors for_readme_files(): a CONTRIBUTING file is exemplary when its
    reading time meets the median of three reference files and it is
    on-topic. Prints the number of exemplary files and the number of
    matching time-series rows.

    Returns:
        pd.DataFrame: rows of the DID time-series data whose upstream repo
        name matches an exemplary CONTRIBUTING file's project name.
    """
    ld_csv_path = "final_data/deb_contrib_did.csv"
    ta_csv_path = "text_analysis/d_readability_contributing.csv"
    topic_csv_path = "text_analysis/contrib_file_topic_distributions.csv"
    # Criteria for a good CONTRIBUTING file: reading time >= the median (59s)
    # of three examples listed on
    # https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions/setting-guidelines-for-repository-contributors
    # https://github.com/rails/rails/blob/main/CONTRIBUTING.md (116s)
    # https://github.com/github/docs/blob/main/.github/CONTRIBUTING.md (59s)
    # https://github.com/opengovernment/opengovernment/blob/master/CONTRIBUTING.md (45s)
    contributing_ta_df = pd.read_csv(ta_csv_path)
    time_threshold_df = contributing_ta_df[contributing_ta_df['reading_time'] >= 59]
    # On-topic check: topic C4 (0-indexed column 't3') weight > 0.2 (0.8 * 0.25)
    contributing_topic_df = pd.read_csv(topic_csv_path)
    topic_threshold_df = contributing_topic_df[contributing_topic_df['t3'] >= 0.2]
    contributing_exemplar_df = pd.merge(
        topic_threshold_df, time_threshold_df, on='filename', how='inner'
    )
    print(contributing_exemplar_df.shape[0])
    # Strip the fixed suffix to recover the project name.
    exemplary_files = contributing_exemplar_df['filename'].str.replace(
        "_CONTRIBUTING.md", ""
    )
    # Connect with the time-series data via the repo name in the VCS link.
    ts_data_df = pd.read_csv(ld_csv_path)
    subset_ts_data = ts_data_df[
        ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)
    ]
    print(subset_ts_data.shape[0])
    #subset_ts_data.to_csv('102724_contrib_exemplar_subset.csv', index=False)
    return subset_ts_data


if __name__ == "__main__":
    for_contributing_files()
    for_readme_files()