24_deb_pkg_gov/instructions_analysis.py

35 lines
1.2 KiB
Python
Raw Normal View History

2024-01-15 17:55:54 +00:00
import csv
import os
import nltk
#from nltk.stem import WordNetLemmatizer
# Runs at import time: asks NLTK's downloader for the 'wordnet' resource.
# Presumably this is the corpus behind nltk.WordNetLemmatizer, which main() uses.
nltk.download('wordnet')
def main(
    instructions_dir="/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/",
):
    """Print word-count statistics for every file in ``instructions_dir``.

    For each directory entry this prints the file name, the marker
    ``contains keyword`` when the text mentions "checklist" or "process"
    (compared case-insensitively), and the number of whitespace-separated
    words. After all entries are processed it prints the mean number of
    distinct lemmatized words per file.

    Args:
        instructions_dir: Directory whose entries are opened and read as
            text. Defaults to the location the script was written against.

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the directory cannot be listed or an entry cannot be
            opened as a file.
    """
    filenames = os.listdir(instructions_dir)
    if not filenames:
        # With nothing to average, the final division would raise
        # ZeroDivisionError; report the situation and stop instead.
        print("no instruction files found")
        return

    keywords = ("checklist", "process")
    # One lemmatizer serves every word; constructing it per word is
    # loop-invariant work.
    lemmatizer = nltk.WordNetLemmatizer()
    all_word_counts = []

    for filename in filenames:
        print(filename)
        # Keep the file open only for the read itself.
        with open(os.path.join(instructions_dir, filename), "r") as file:
            word_list = file.read().split()

        # A set yields the same distinct-lemma count as scanning a list for
        # membership, without the quadratic cost.
        unique_lemmas = {lemmatizer.lemmatize(word) for word in word_list}

        # The lemmatizer does not lowercase its input, so "Checklist" or
        # "Process" at the start of a heading would otherwise be missed.
        # Only the keyword test is case-folded; the unique count is not.
        lowered_lemmas = {lemma.lower() for lemma in unique_lemmas}
        if any(keyword in lowered_lemmas for keyword in keywords):
            print('contains keyword')

        print(len(word_list))
        all_word_counts.append(len(unique_lemmas))

    print(sum(all_word_counts) / len(all_word_counts))
2024-01-15 17:55:54 +00:00
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()