{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import os \n",
"import pandas as pd \n",
"import glob\n",
"import re\n",
"import statistics"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Absolute paths to the first-version CONTRIBUTING and README document subsets\n",
"# NOTE(review): hardcoded server-local paths — assumes this exact data layout exists\n",
"contributing_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing_subset/\"\n",
"readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme_subset/\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def metadata_for_file(file):\n",
"    \"\"\"Compute descriptive statistics for one document's raw text.\n",
"\n",
"    Args:\n",
"        file: the document's contents as a single string (not a path).\n",
"\n",
"    Returns:\n",
"        (word_count, avg_word_length): the number of whitespace-separated\n",
"        tokens, and their mean character length (0 for empty text).\n",
"    \"\"\"\n",
"    word_list = file.split()\n",
"    word_count = len(word_list)\n",
"    if word_count == 0:\n",
"        # avoid ZeroDivisionError on empty/whitespace-only documents\n",
"        avg_word_length = 0\n",
"    else:\n",
"        # reuse word_count instead of recomputing len(word_list)\n",
"        avg_word_length = sum(map(len, word_list)) / word_count\n",
"    return word_count, avg_word_length"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"def get_data_from_dir(directory, min_word_count=20):\n",
"    \"\"\"Collect per-file word statistics for a directory and prune short files.\n",
"\n",
"    WARNING: destructive — any file with fewer than `min_word_count` words is\n",
"    deleted from disk via os.remove().\n",
"\n",
"    Args:\n",
"        directory: path to a directory of plain-text document files.\n",
"        min_word_count: deletion threshold; default 20 comes from the 25th\n",
"            percentile (19.0 words) of the README word-count distribution.\n",
"\n",
"    Returns:\n",
"        (data_list, word_counts, avg_word_lengths, file_list). data_list is\n",
"        kept for interface compatibility and is always empty. word_counts and\n",
"        avg_word_lengths cover every scanned file (including removed ones);\n",
"        file_list only contains the files that survived pruning.\n",
"    \"\"\"\n",
"    files = glob.glob(f\"{directory}/*\")\n",
"    data_list = []\n",
"    word_counts = []\n",
"    avg_word_lengths = []\n",
"    file_list = []\n",
"    for file in files:\n",
"        # use a context manager so the handle is closed instead of leaked\n",
"        with open(file, encoding='utf-8', errors=\"ignore\") as f:\n",
"            text = f.read()\n",
"        word_count, avg_word_length = metadata_for_file(text)\n",
"        word_counts.append(word_count)\n",
"        avg_word_lengths.append(avg_word_length)\n",
"        if word_count < min_word_count:\n",
"            os.remove(file)\n",
"            # skip the append: don't record a path that no longer exists\n",
"            continue\n",
"        file_list.append(file)\n",
"    return data_list, word_counts, avg_word_lengths, file_list"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Gather stats for the CONTRIBUTING subset.\n",
"# NOTE(review): this call also DELETES files under the threshold on disk,\n",
"# so re-running the notebook operates on an already-pruned directory.\n",
"listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(contributing_directory)\n",
"wordcount_series = pd.Series(wordcounts)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.25 19.0\n",
"0.50 98.0\n",
"0.75 266.5\n",
"dtype: float64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quartiles of the word-count distribution; the 25th percentile (~19 words)\n",
"# is what motivates the word_count < 20 pruning threshold used above.\n",
"#wordcount_series.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\n",
"wordcount_series.quantile([ 0.25, 0.5, 0.75])\n",
"#less_than_20 = wordcount_series[wordcount_series < 200]\n",
"#len(less_than_20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}