24_deb_pkg_gov/instructions_analysis.py

30 lines
962 B
Python
Raw Normal View History

2024-01-15 17:55:54 +00:00
import csv
import os
import nltk
#from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus that nltk's WordNetLemmatizer (used in main) relies on.
# NOTE: this is a module-level call, so it runs whenever the module is imported,
# not only when the file is executed as a script.
nltk.download('wordnet')
def main():
    """Print the name and word count of every contribution-instructions file.

    Iterates over each entry in the hard-coded instructions directory, reads
    it as text, splits it on whitespace, and prints the filename followed by
    the total number of whitespace-delimited words. The number of distinct
    WordNet lemmas is also computed per file, but it is not reported yet.
    """
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    # One lemmatizer serves every word of every file; constructing a fresh
    # instance per word was pure overhead.
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        print(filename)
        with open(os.path.join(instructions_dir, filename), "r") as file:
            word_list = file.read().split()
        word_count = len(word_list)
        # Only the number of distinct lemmas matters, so a set replaces the
        # original list + linear `not in` scan (quadratic in file length).
        unique_lemmas = {lemmatizer.lemmatize(word) for word in word_list}
        # TODO: pull whether or not keywords like "Checklist" or "Process" occur?
        # TODO: pull whether "HOWTO" occurs?
        # NOTE: computed for the planned per-file metadata; not reported yet.
        unique_word_count = len(unique_lemmas)
        print(word_count)
# Run the analysis only when this file is executed directly as a script.
if __name__ == "__main__":
    main()