# Reconstructed from the patch "sampling for qual_analysis"
# (commit ef25337e554d6a3f12116b9213a6e8113222a80c, Matthew Gaughan,
# Sun, 16 Jun 2024 13:40:05 -0500), which creates
# text_analysis/qual_sampling.py.
import csv
import io
import shutil
import os
from random import sample

readme_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_readme"
contributing_wd = "/data/users/mgaughan/kkex/time_specific_files/partitioned_contributing"


def _count_lines(path):
    """Return the number of lines in the text file at *path*."""
    # The file is decoded as UTF-8 and undecodable bytes are replaced, so a
    # single badly-encoded document cannot abort a whole sampling run.
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        # Stream the file instead of materialising every line in memory.
        return sum(1 for _ in handle)


def sample_from_doc(sample_k, doc_directory, min_lines=10):
    """Randomly pick up to *sample_k* sufficiently long files per subdirectory.

    Every subdirectory of *doc_directory* is visited. Inside each one, files
    are examined in random order and the first *sample_k* files that have at
    least *min_lines* lines are kept. Each file is considered at most once,
    so a file can never be picked twice, and the search stops once every file
    has been examined. A subdirectory with too few qualifying files therefore
    yields a shorter (possibly empty) list instead of looping forever.

    The subdirectory name and its sample are printed as they are produced.

    Args:
        sample_k: Maximum number of files to pick from each subdirectory.
        doc_directory: Directory whose immediate subdirectories are sampled.
        min_lines: Minimum line count a file needs in order to qualify.

    Returns:
        A dict mapping each subdirectory name to a list of
        ``[file_name, line_count]`` pairs.

    Raises:
        OSError: If *doc_directory* or one of its entries cannot be read.
    """
    sampled_by_subdir = {}
    # Sorted so that the printed output has a reproducible order.
    for subdir in sorted(os.listdir(doc_directory)):
        subdir_path = os.path.join(doc_directory, subdir)
        if not os.path.isdir(subdir_path):
            # Stray plain files next to the subdirectories are not sampled.
            continue
        print(subdir)
        files = os.listdir(subdir_path)
        final_sampled = []
        # sample(files, len(files)) is a random permutation: walking it gives
        # sampling without replacement and is guaranteed to terminate.
        for candidate in sample(files, len(files)):
            if len(final_sampled) >= sample_k:
                break
            candidate_path = os.path.join(subdir_path, candidate)
            if not os.path.isfile(candidate_path):
                # Nested directories cannot be opened as documents.
                continue
            file_length = _count_lines(candidate_path)
            if file_length >= min_lines:
                final_sampled.append([candidate, file_length])
        print(final_sampled)
        sampled_by_subdir[subdir] = final_sampled
    return sampled_by_subdir


if __name__ == "__main__":
    sample_from_doc(3, readme_wd)