{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import os \n",
"import pandas as pd \n",
"import glob\n",
"import re\n",
"import statistics"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Absolute paths to the first-version CONTRIBUTING and README document subsets\n",
"# NOTE(review): hardcoded server-local paths — assumes this exact data layout exists\n",
"contributing_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing_subset/\"\n",
"readme_directory = \"/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme_subset/\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def metadata_for_file(file):\n",
"    \"\"\"Compute descriptive statistics for one document's raw text.\n",
"\n",
"    Args:\n",
"        file: the document's contents as a single string (not a path).\n",
"\n",
"    Returns:\n",
"        (word_count, avg_word_length): the number of whitespace-separated\n",
"        tokens, and their mean character length (0 for empty text).\n",
"    \"\"\"\n",
"    word_list = file.split()\n",
"    word_count = len(word_list)\n",
"    if word_count == 0:\n",
"        # avoid ZeroDivisionError on empty/whitespace-only documents\n",
"        avg_word_length = 0\n",
"    else:\n",
"        # reuse word_count instead of recomputing len(word_list)\n",
"        avg_word_length = sum(map(len, word_list)) / word_count\n",
"    return word_count, avg_word_length"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"def get_data_from_dir(directory, min_word_count=20):\n",
"    \"\"\"Collect per-file word statistics for a directory and prune short files.\n",
"\n",
"    WARNING: destructive — any file with fewer than `min_word_count` words is\n",
"    deleted from disk via os.remove().\n",
"\n",
"    Args:\n",
"        directory: path to a directory of plain-text document files.\n",
"        min_word_count: deletion threshold; default 20 comes from the 25th\n",
"            percentile (19.0 words) of the README word-count distribution.\n",
"\n",
"    Returns:\n",
"        (data_list, word_counts, avg_word_lengths, file_list). data_list is\n",
"        kept for interface compatibility and is always empty. word_counts and\n",
"        avg_word_lengths cover every scanned file (including removed ones);\n",
"        file_list only contains the files that survived pruning.\n",
"    \"\"\"\n",
"    files = glob.glob(f\"{directory}/*\")\n",
"    data_list = []\n",
"    word_counts = []\n",
"    avg_word_lengths = []\n",
"    file_list = []\n",
"    for file in files:\n",
"        # use a context manager so the handle is closed instead of leaked\n",
"        with open(file, encoding='utf-8', errors=\"ignore\") as f:\n",
"            text = f.read()\n",
"        word_count, avg_word_length = metadata_for_file(text)\n",
"        word_counts.append(word_count)\n",
"        avg_word_lengths.append(avg_word_length)\n",
"        if word_count < min_word_count:\n",
"            os.remove(file)\n",
"            # skip the append: don't record a path that no longer exists\n",
"            continue\n",
"        file_list.append(file)\n",
"    return data_list, word_counts, avg_word_lengths, file_list"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Gather stats for the CONTRIBUTING subset.\n",
"# NOTE(review): this call also DELETES files under the threshold on disk,\n",
"# so re-running the notebook operates on an already-pruned directory.\n",
"listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(contributing_directory)\n",
"wordcount_series = pd.Series(wordcounts)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.25 19.0\n",
"0.50 98.0\n",
"0.75 266.5\n",
"dtype: float64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quartiles of the word-count distribution; the 25th percentile (~19 words)\n",
"# is what motivates the word_count < 20 pruning threshold used above.\n",
"#wordcount_series.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\n",
"wordcount_series.quantile([ 0.25, 0.5, 0.75])\n",
"#less_than_20 = wordcount_series[wordcount_series < 200]\n",
"#len(less_than_20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}