24_deb_pkg_gov/text_analysis/qual_sampling.py

26 lines
891 B
Python
Raw Normal View History

2024-06-16 18:40:05 +00:00
import csv
import io
import shutil
import os
from random import sample
2024-07-15 18:06:24 +00:00
# Absolute paths of the document directories available to this script.
# NOTE(review): judging by the names, these presumably hold README and
# CONTRIBUTING files partitioned into subdirectories -- confirm against the
# data layout. readme_wd is not referenced in the visible part of the file.
readme_wd = "/data/users/mgaughan/kkex/time_specific_files/dwo_partitioned_readme"
# Passed to sample_from_doc() by the __main__ guard.
contributing_wd = "/data/users/mgaughan/kkex/time_specific_files/dwo_partitioned_contributing"
2024-06-16 18:40:05 +00:00
def sample_from_doc(sample_k, doc_directory, min_lines=10):
    """Randomly pick up to ``sample_k`` sufficiently long files per subdirectory.

    For every subdirectory of ``doc_directory`` the subdirectory name is
    printed, its files are visited in random order, and the first
    ``sample_k`` files containing at least ``min_lines`` lines are collected.
    The collected ``[file_name, line_count]`` pairs are printed once per
    subdirectory.

    Args:
        sample_k: Maximum number of files to pick from each subdirectory.
        doc_directory: Path of the directory whose subdirectories are sampled.
        min_lines: Minimum number of lines a file needs to be eligible.

    Returns:
        A dict mapping each subdirectory name to its list of
        ``[file_name, line_count]`` pairs. A list is shorter than
        ``sample_k`` when the subdirectory has too few eligible files.
    """
    sampled_by_subdir = {}
    for subdir in os.listdir(doc_directory):
        subdir_path = os.path.join(doc_directory, subdir)
        # Stray files next to the partitions cannot be listed; skip them.
        if not os.path.isdir(subdir_path):
            continue
        print(subdir)
        files = os.listdir(subdir_path)
        final_sampled = []
        # sample(files, len(files)) is a random permutation: every file is
        # tried at most once, so no file is picked twice and the loop always
        # ends, even when fewer than sample_k files are eligible.
        for candidate in sample(files, len(files)):
            if len(final_sampled) >= sample_k:
                break
            candidate_path = os.path.join(subdir_path, candidate)
            # Nested directories cannot be opened as text files.
            if not os.path.isfile(candidate_path):
                continue
            with open(candidate_path, "r") as f:
                # Count lines lazily instead of loading the whole file.
                file_length = sum(1 for _ in f)
            if file_length >= min_lines:
                final_sampled.append([candidate, file_length])
        print(final_sampled)
        sampled_by_subdir[subdir] = final_sampled
    return sampled_by_subdir
if __name__ == "__main__":
    # Script entry point: sample 3 files from each subdirectory of
    # contributing_wd.
    sample_from_doc(3, contributing_wd)