import csv  # NOTE(review): unused in this script, kept in case other tooling relies on it
import os

import nltk

# WordNet data is required by WordNetLemmatizer; download is idempotent.
nltk.download('wordnet')


def main():
    """Scan contribution-instruction files, count words per file, and print
    the mean number of unique (lemmatized) words across all files.

    For each file in ``instructions_dir`` this:
      * splits the text on whitespace and records the raw word count,
      * lemmatizes each word (case-normalized) and collects the unique lemmas,
      * prints a notice when keyword lemmas like "checklist"/"process" appear.
    Finally prints the average unique-word count over all files read.
    """
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    # Hoist the lemmatizer: constructing it per word is needlessly expensive.
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        print(filename)
        with open(os.path.join(instructions_dir, filename), "r") as file:
            word_list = file.read().split()
        word_count = len(word_list)
        # Set membership is O(1); the original list scan was O(n^2) per file.
        # Lowercase before lemmatizing so capitalized keywords ("Checklist",
        # "Process") can actually match the lowercase membership test below.
        lemmatized_words = {lemmatizer.lemmatize(word.lower()) for word in word_list}
        # pulling whether or not keywords like "Checklist" or "Process" occur?
        # pulling whether "HOWTO" occurs?
        unique_word_count = len(lemmatized_words)
        if "checklist" in lemmatized_words or "process" in lemmatized_words:
            print('contains keyword')
        print(word_count)
        all_word_counts.append(unique_word_count)
    # Guard against an empty directory to avoid ZeroDivisionError.
    if all_word_counts:
        print(sum(all_word_counts) / len(all_word_counts))


if __name__ == "__main__":
    main()