24_deb_pkg_gov/text_analysis/qual_sampling.py

26 lines
877 B
Python

import csv
import io
import shutil
import os
from random import sample
# Both partitioned document collections live under the same parent directory.
_time_specific_root = "/data/users/mgaughan/kkex/time_specific_files"
# Directory whose sub-directories hold the partitioned README files.
readme_wd = f"{_time_specific_root}/partitioned_readme"
# Directory whose sub-directories hold the partitioned CONTRIBUTING files.
contributing_wd = f"{_time_specific_root}/partitioned_contributing"
def sample_from_doc(sample_k, doc_directory, min_lines=10):
    """Randomly pick up to ``sample_k`` sufficiently long files per sub-directory.

    For every sub-directory of ``doc_directory`` the files are visited in a
    random order and the first ``sample_k`` files containing at least
    ``min_lines`` lines are kept. The sub-directory name and its sample are
    printed, as before, and the samples are also returned.

    Args:
        sample_k: Maximum number of files to keep per sub-directory.
        doc_directory: Path of the directory whose sub-directories are sampled.
        min_lines: Minimum number of lines a file needs to qualify.

    Returns:
        A dict mapping each sub-directory name to a list of
        ``[file_name, line_count]`` pairs. A list is shorter than ``sample_k``
        when the sub-directory does not contain enough qualifying files.
    """
    samples_by_subdir = {}
    for subdir in os.listdir(doc_directory):
        subdir_path = os.path.join(doc_directory, subdir)
        # Stray regular files next to the partitions cannot be listed; skip them.
        if not os.path.isdir(subdir_path):
            continue
        print(subdir)
        files = os.listdir(subdir_path)
        final_sampled = []
        # Shuffle once and walk the result: every file is tried at most once,
        # so there are no duplicate picks and the loop always terminates, even
        # when fewer than sample_k files are long enough.
        for candidate in sample(files, len(files)):
            if len(final_sampled) >= sample_k:
                break
            candidate_path = os.path.join(subdir_path, candidate)
            if not os.path.isfile(candidate_path):
                continue
            with open(candidate_path, "r") as f:
                file_length = sum(1 for _ in f)
            if file_length >= min_lines:
                final_sampled.append([candidate, file_length])
        print(final_sampled)
        samples_by_subdir[subdir] = final_sampled
    return samples_by_subdir
if __name__ == "__main__":
    # Script entry point: draw three files from each README partition.
    sample_from_doc(sample_k=3, doc_directory=readme_wd)