# NOTE(review): this file contained interleaved VCS/diff residue (timestamp
# lines and "|" gutter lines) that made it syntactically invalid; the residue
# has been stripped so the module can actually be imported and run.
import csv  # NOTE(review): unused in the visible code — confirm before removing
import os

import nltk

# from nltk.stem import WordNetLemmatizer

# Ensure the WordNet corpus is present before main() lemmatizes words.
# NOTE: this runs at import time and may hit the network on first use.
nltk.download('wordnet')
def main():
    """Summarize contribution-instruction documents by word counts.

    For every file in ``instructions_dir``: read it, split on whitespace,
    lemmatize each token with WordNet, and count the unique lemmas. Prints
    each filename, a keyword-hit notice when "checklist" or "process"
    appears among the lemmas, and the file's raw word count. After the
    loop, prints the mean unique-lemma count across all files.
    """
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    # Hoisted out of the loops: the original constructed a new
    # WordNetLemmatizer for every single word, which is pure overhead.
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        print(filename)
        with open(os.path.join(instructions_dir, filename), "r") as file:
            word_list = file.read().split()
        word_count = len(word_list)
        # A set gives O(1) membership; the original list + `not in` scan
        # was O(n^2) per document. Unique-lemma count is unchanged.
        lemmatized_words = set()
        for word in word_list:
            lemmatized_words.add(lemmatizer.lemmatize(word))
        # pulling whether or not keywords like "Checklist" or "Process" occur?
        # pulling whether "HOWTO" occurs?
        unique_word_count = len(lemmatized_words)
        # NOTE(review): tokens are not lowercased before this test, so
        # "Checklist"/"Process" would NOT match — confirm whether the
        # inputs are pre-normalized or whether .lower() is intended here.
        if "checklist" in lemmatized_words or "process" in lemmatized_words:
            print('contains keyword')
        # NOTE(review): prints the RAW word count but appends the UNIQUE
        # count to the running average — confirm this asymmetry is intended.
        print(word_count)
        all_word_counts.append(unique_word_count)
    # Guard: an empty directory previously crashed with ZeroDivisionError.
    if all_word_counts:
        print(sum(all_word_counts) / len(all_word_counts))
# Script entry point: run the analysis only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()