# NOTE(review): this file contained interleaved VCS/diff residue (timestamp
# lines and "|" gutter lines) that made it syntactically invalid; the residue
# has been stripped so the module can actually be imported and run.
import csv  # NOTE(review): unused in the visible code — confirm before removing
import os

import nltk

# from nltk.stem import WordNetLemmatizer

# Ensure the WordNet corpus is present before main() lemmatizes words.
# NOTE: this runs at import time and may hit the network on first use.
nltk.download('wordnet')
def main():
    """Summarize contribution-instruction documents by word counts.

    For every file in ``instructions_dir``: read it, split on whitespace,
    lemmatize each token with WordNet, and count the unique lemmas. Prints
    each filename, a keyword-hit notice when "checklist" or "process"
    appears among the lemmas, and the file's raw word count. After the
    loop, prints the mean unique-lemma count across all files.
    """
    instructions_dir = "/data/users/mgaughan/kkex_contrib_files_122023/contribute_inst/"
    all_word_counts = []
    # Hoisted out of the loops: the original constructed a new
    # WordNetLemmatizer for every single word, which is pure overhead.
    lemmatizer = nltk.WordNetLemmatizer()
    for filename in os.listdir(instructions_dir):
        print(filename)
        with open(os.path.join(instructions_dir, filename), "r") as file:
            word_list = file.read().split()
        word_count = len(word_list)
        # A set gives O(1) membership; the original list + `not in` scan
        # was O(n^2) per document. Unique-lemma count is unchanged.
        lemmatized_words = set()
        for word in word_list:
            lemmatized_words.add(lemmatizer.lemmatize(word))
        # pulling whether or not keywords like "Checklist" or "Process" occur?
        # pulling whether "HOWTO" occurs?
        unique_word_count = len(lemmatized_words)
        # NOTE(review): tokens are not lowercased before this test, so
        # "Checklist"/"Process" would NOT match — confirm whether the
        # inputs are pre-normalized or whether .lower() is intended here.
        if "checklist" in lemmatized_words or "process" in lemmatized_words:
            print('contains keyword')
        # NOTE(review): prints the RAW word count but appends the UNIQUE
        # count to the running average — confirm this asymmetry is intended.
        print(word_count)
        all_word_counts.append(unique_word_count)
    # Guard: an empty directory previously crashed with ZeroDivisionError.
    if all_word_counts:
        print(sum(all_word_counts) / len(all_word_counts))
# Script entry point: run the analysis only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()