133 lines
3.5 KiB
Plaintext
133 lines
3.5 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os \n",
|
|
"import pandas as pd \n",
|
|
"import glob\n",
|
|
"import re\n",
|
|
"import statistics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Input locations for the two document corpora scanned by this notebook.\n",
"# These are absolute paths on the original author's machine; point them at\n",
"# your own copy of the data before running elsewhere.\n",
"# Judging by the folder names, one holds README files and the other\n",
"# CONTRIBUTING files -- confirm against the data layout.\n",
"readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme_subset/\"\n",
"contributing_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing_subset/\""
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"def metadata_for_file(file):\n",
"    \"\"\"Return ``(word_count, avg_word_length)`` for a piece of text.\n",
"\n",
"    ``file`` is treated as the text to analyse, not as a path to open: it is\n",
"    split directly with ``file.split()``, i.e. on runs of whitespace.\n",
"    ``word_count`` is the number of resulting pieces and ``avg_word_length``\n",
"    is their mean length in characters. When the text contains no words the\n",
"    average is reported as ``0``.\n",
"    \"\"\"\n",
"    tokens = file.split()\n",
"    token_total = len(tokens)\n",
"    # Guard first so the division below never sees an empty token list.\n",
"    if not tokens:\n",
"        return token_total, 0\n",
"    char_total = sum(len(token) for token in tokens)\n",
"    return token_total, char_total / token_total"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 41,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"def get_data_from_dir(directory, min_word_count=20, remove_short_files=True):\n",
"    \"\"\"Collect word-count statistics for every file directly inside ``directory``.\n",
"\n",
"    Each regular file matched by ``{directory}/*`` is read as UTF-8 (bytes that\n",
"    cannot be decoded are dropped) and summarised with ``metadata_for_file``.\n",
"\n",
"    WARNING -- destructive by default: every file whose word count is below\n",
"    ``min_word_count`` is deleted from disk with ``os.remove``. Pass\n",
"    ``remove_short_files=False`` to gather statistics without touching the\n",
"    corpus.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    directory : str\n",
"        Directory to scan; only its immediate entries are considered.\n",
"    min_word_count : int, default 20\n",
"        Files with fewer words than this count as too short. The default\n",
"        appears to come from an earlier note that the README corpus had a\n",
"        25th-percentile word count of 19.0 -- confirm before reusing it for\n",
"        other corpora.\n",
"    remove_short_files : bool, default True\n",
"        Whether too-short files are deleted from disk.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(data_list, word_counts, avg_word_lengths, file_list)``.\n",
"        ``data_list`` is always an empty list (file text is not retained).\n",
"        The other three lists are aligned, one entry per file read, and still\n",
"        include files that this call deleted.\n",
"    \"\"\"\n",
"    data_list = []  # kept for interface compatibility; never populated\n",
"    word_counts = []\n",
"    avg_word_lengths = []\n",
"    file_list = []\n",
"    for file in glob.glob(f\"{directory}/*\"):\n",
"        # Sub-directories also match the glob but cannot be opened as text.\n",
"        if not os.path.isfile(file):\n",
"            continue\n",
"        # The context manager closes each handle promptly instead of leaking\n",
"        # one open file per corpus document.\n",
"        with open(file, encoding='utf-8', errors=\"ignore\") as handle:\n",
"            text = handle.read()\n",
"        word_count, avg_word_length = metadata_for_file(text)\n",
"        word_counts.append(word_count)\n",
"        avg_word_lengths.append(avg_word_length)\n",
"        file_list.append(file)\n",
"        if remove_short_files and word_count < min_word_count:\n",
"            os.remove(file)\n",
"    return data_list, word_counts, avg_word_lengths, file_list"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 43,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Scan the directory named by contributing_directory and keep the per-file\n",
"# word counts as a Series for the summary statistics below.\n",
"# NOTE: get_data_from_dir() also deletes files with fewer than 20 words from\n",
"# the directory it scans, so re-running this cell can change the results.\n",
"(\n",
"    listed_corpus,\n",
"    wordcounts,\n",
"    wordlengths,\n",
"    file_list,\n",
") = get_data_from_dir(contributing_directory)\n",
"wordcount_series = pd.Series(data=wordcounts)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.25 19.0\n",
|
|
"0.50 98.0\n",
|
|
"0.75 266.5\n",
|
|
"dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
"# Quartiles (25th, 50th and 75th percentile) of the per-file word counts.\n",
"# For a finer view, pass deciles instead: [0.1, 0.2, ..., 0.9].\n",
"wordcount_series.quantile(q=[0.25, 0.5, 0.75])"
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|