# Builds "exemplary documentation" subsets of the Debian DiD time-series data,
# selecting projects whose README / CONTRIBUTING files meet a reading-time
# (and optionally topic-distribution) threshold.
import csv
|
|
import os
|
|
import pandas as pd
|
|
|
|
|
|
def for_readme_files():
    """Subset the DiD time-series data to projects with an "exemplary" README.

    A README counts as exemplary when its estimated reading time is at least
    48 seconds — the median of the three example READMEs linked from GitHub's
    contributing-guidelines page (see inline citations below).  A stricter
    variant that additionally requires topic weight t7 >= 0.1 is computed but
    currently disabled.

    Reads:
        text_analysis/d_readability_readme.csv
        text_analysis/readme_file_topic_distributions.csv
        final_data/deb_readme_did.csv
    Writes:
        110124_supp_analysis/110124_readme_length_subset.csv
    """
    ld_csv_path = "final_data/deb_readme_did.csv"
    ta_csv_path = "text_analysis/d_readability_readme.csv"
    topic_csv_path = "text_analysis/readme_file_topic_distributions.csv"
    # criteria for good readme:
    # median of three examples listed on the contributing page
    # https://github.com/rails/rails/blob/main/README.md (71s)
    # https://github.com/github/docs/blob/main/README.md (48s)
    # https://github.com/opengovernment/opengovernment/blob/master/README.md (14s)
    # Median 48s
    readme_ta_df = pd.read_csv(ta_csv_path)
    time_threshold_df = readme_ta_df[readme_ta_df['reading_time'] >= 48]
    # topic criterion: R8 > 0.10 (0.8 * 0.125)
    readme_topic_df = pd.read_csv(topic_csv_path)
    topic_threshold_df = readme_topic_df[readme_topic_df['t7'] >= 0.1]
    # THIS IS JUST LENGTH BASED
    readme_exemplar_df = time_threshold_df
    # BELOW IS STRICT
    #readme_exemplar_df = pd.merge(time_threshold_df, topic_threshold_df, on="filename", how="inner")

    # Recover the repo name by dropping the final "_<README suffix>" token
    # from the filename (e.g. "repo.git_README.md" -> "repo.git").
    exemplary_files = readme_exemplar_df['filename'].str.split('_').str[:-1].str.join("_")

    # One manual cleaning: "CheMPS2_README_8md" contains an extra underscore
    # token, so the generic transform above leaves "CheMPS2_README" rather
    # than the repo name "CheMPS2".
    # BUGFIX: the filter previously compared against the *untransformed*
    # filename ("CheMPS2_README_8md"), which never occurs in the transformed
    # series, so the stale token was never removed.
    exemplary_files = exemplary_files[exemplary_files != "CheMPS2_README"]
    exemplary_files = pd.concat([exemplary_files, pd.Series("CheMPS2")])
    # NOTE(review): "pg_filedump.git_README.pg" already maps to
    # "pg_filedump.git" under the transform above, so the previous manual
    # filter for it was a no-op and its concat only appended a duplicate
    # entry; both were removed.

    print(len(exemplary_files))
    # connecting with the timeseries data
    ts_data_df = pd.read_csv(ld_csv_path)
    #print(ts_data_df['upstream_vcs_link'].str.split("/").str[-1])
    test_vec = ts_data_df['upstream_vcs_link'].str.split("/").str[-1]
    # exemplary repos that have no matching row in the time-series data
    diff_vec = exemplary_files[~exemplary_files.isin(test_vec)]
    print(diff_vec)
    # match on the final path component of the upstream VCS link
    subset_ts_data = ts_data_df[ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)]
    #if "CheMPS2" in exemplary_files:
    #    subset_ts_data = pd.concat([subset_ts_data, ts_data_df[ts_data_df['upstream_vcs_link'] == "https://github.com/SebWouters/CheMPS2"]])
    print(subset_ts_data.shape[0])
    #print("https://github.com/SebWouters/CheMPS2" in subset_ts_data["upstream_vcs_link"])

    subset_ts_data.to_csv('110124_supp_analysis/110124_readme_length_subset.csv', index=False)
|
|
|
|
|
|
def for_contributing_files():
    """Subset the DiD time-series data to projects with an "exemplary" CONTRIBUTING file.

    A CONTRIBUTING file counts as exemplary when its estimated reading time
    is at least 59 seconds — the median of the three examples linked from
    GitHub's healthy-contributions documentation (see inline citations
    below).  A stricter variant additionally requiring topic weight
    t3 >= 0.2 is computed but currently disabled.

    Reads:
        text_analysis/d_readability_contributing.csv
        text_analysis/contrib_file_topic_distributions.csv
        final_data/deb_contrib_did.csv
    Writes:
        110124_supp_analysis/110124_contrib_length_subset.csv
    """
    ld_csv_path = "final_data/deb_contrib_did.csv"
    ta_csv_path = "text_analysis/d_readability_contributing.csv"
    topic_csv_path = "text_analysis/contrib_file_topic_distributions.csv"
    # criteria for good contributing:
    # median of three examples listed on https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions/setting-guidelines-for-repository-contributors
    # https://github.com/rails/rails/blob/main/CONTRIBUTING.md (116s)
    # https://github.com/github/docs/blob/main/.github/CONTRIBUTING.md (59s)
    # https://github.com/opengovernment/opengovernment/blob/master/CONTRIBUTING.md (45s)
    # Median 59s
    contributing_ta_df = pd.read_csv(ta_csv_path)
    time_threshold_df = contributing_ta_df[contributing_ta_df['reading_time'] >= 59]
    # and then making sure they're on topic, C4 > 0.2 (0.8 * 0.25)
    contributing_topic_df = pd.read_csv(topic_csv_path)
    topic_threshold_df = contributing_topic_df[contributing_topic_df['t3'] >= 0.2]
    # used only by the STRICT version below
    contributing_exemplar_df = pd.merge(topic_threshold_df, time_threshold_df, on='filename', how='inner')
    print(time_threshold_df.shape[0])
    # LENGTH VERSION
    # regex=False: match the suffix literally.  Relying on the pandas default
    # is version-dependent (regex=True with a FutureWarning before 2.0, where
    # "." would match any character).
    exemplary_files = time_threshold_df['filename'].str.replace("_CONTRIBUTING.md", "", regex=False)
    # BELOW IS STRICT VERSION
    #exemplary_files = contributing_exemplar_df['filename'].str.replace("_CONTRIBUTING.md", "", regex=False)

    # Manual cleaning: .rst filenames are untouched by the ".md" replacement
    # above, so strip them down to the repo name by hand.
    exemplary_files = exemplary_files[exemplary_files != "imbalanced-learn.git_CONTRIBUTING.rst"]
    exemplary_files = pd.concat([exemplary_files, pd.Series("imbalanced-learn.git")])
    exemplary_files = exemplary_files[exemplary_files != "synapse.git_CONTRIBUTING.rst"]
    exemplary_files = pd.concat([exemplary_files, pd.Series("synapse.git")])

    # reading in the data
    ts_data_df = pd.read_csv(ld_csv_path)
    test_vec = ts_data_df['upstream_vcs_link'].str.split("/").str[-1]
    # exemplary repos that have no matching row in the time-series data
    diff_vec = exemplary_files[~exemplary_files.isin(test_vec)]
    print(diff_vec)
    # match on the final path component of the upstream VCS link
    subset_ts_data = ts_data_df[ts_data_df['upstream_vcs_link'].str.split("/").str[-1].isin(exemplary_files)]
    print(subset_ts_data.shape[0])

    subset_ts_data.to_csv('110124_supp_analysis/110124_contrib_length_subset.csv', index=False)
|
|
|
|
def main():
    """Entry point: run the CONTRIBUTING analysis, then the README analysis."""
    for_contributing_files()
    for_readme_files()


if __name__ == "__main__":
    main()