import pandas as pd

# Minimum ``reading_time`` for a file to count as "good". Each value is the
# median of the reading times of three reference documents (the figures
# below carry an "s" suffix, presumably seconds).
#
# README references:
#   https://github.com/rails/rails/blob/main/README.md (71s)
#   https://github.com/github/docs/blob/main/README.md (48s)
#   https://github.com/opengovernment/opengovernment/blob/master/README.md (14s)
#   Median 48s
README_MIN_READING_TIME = 48

# CONTRIBUTING references, the three examples listed on
# https://docs.github.com/en/communities/setting-up-your-project-for-healthy-contributions/setting-guidelines-for-repository-contributors
#   https://github.com/rails/rails/blob/main/CONTRIBUTING.md (116s)
#   https://github.com/github/docs/blob/main/.github/CONTRIBUTING.md (59s)
#   https://github.com/opengovernment/opengovernment/blob/master/CONTRIBUTING.md (45s)
#   Median 59s
CONTRIBUTING_MIN_READING_TIME = 59


def _count_reading_time_at_least(ta_csv_path, min_reading_time):
    """Return the number of rows in a CSV whose ``reading_time`` meets a threshold.

    Args:
        ta_csv_path: Path of a CSV file that has a ``reading_time`` column.
        min_reading_time: Inclusive lower bound for ``reading_time``.

    Returns:
        The count of rows with ``reading_time >= min_reading_time`` as an int.
        Rows with a missing ``reading_time`` never satisfy the comparison.

    Raises:
        FileNotFoundError: If ``ta_csv_path`` does not exist.
        KeyError: If the CSV has no ``reading_time`` column.
    """
    ta_df = pd.read_csv(ta_csv_path)
    return int((ta_df["reading_time"] >= min_reading_time).sum())


def for_readme_files(min_reading_time=README_MIN_READING_TIME):
    """Print how many README files meet the reading-time criterion.

    Paths are relative to the current working directory.

    Args:
        min_reading_time: Inclusive ``reading_time`` threshold; defaults to
            the median of the three reference READMEs (48).
    """
    # Not read yet; kept as a record of the companion data set.
    ld_csv_path = "final_data/deb_readme_did.csv"
    ta_csv_path = "text_analysis/d_readability_readme.csv"
    topic_csv_path = "text_analysis/readme_file_topic_distributions.csv"

    # criteria for good readme: at least as long as the median reference file
    threshold_count = _count_reading_time_at_least(ta_csv_path, min_reading_time)
    print(threshold_count)

    # TODO: apply the topic criterion R8 > 0.125. The frame is loaded (so a
    # missing input still fails here) but is not filtered on yet.
    readme_topic_df = pd.read_csv(topic_csv_path)


def for_contributing_files(min_reading_time=CONTRIBUTING_MIN_READING_TIME):
    """Print how many CONTRIBUTING files meet the reading-time criterion.

    Paths are relative to the current working directory.

    Args:
        min_reading_time: Inclusive ``reading_time`` threshold; defaults to
            the median of the three reference CONTRIBUTING files (59).
    """
    # Not read yet; kept as a record of the companion data set.
    ld_csv_path = "final_data/deb_contrib_did.csv"
    ta_csv_path = "text_analysis/d_readability_contributing.csv"
    topic_csv_path = "text_analysis/contrib_file_topic_distributions.csv"

    # criteria for good contributing: at least as long as the median reference file
    threshold_count = _count_reading_time_at_least(ta_csv_path, min_reading_time)
    print(threshold_count)

    # TODO: make sure they're on topic, C4 > 0.25. Bound to its own name so
    # the topic frame is not mistaken for the readability frame.
    contributing_topic_df = pd.read_csv(topic_csv_path)


if __name__ == "__main__":
    for_contributing_files()
    for_readme_files()