"""Word-count statistics for a directory of text documents.

For every file in a directory this computes the whitespace-delimited word
count and the mean word length, optionally deletes very short documents from
disk, and then reports word-count quartiles.
"""

# ---------------------------------------------------------------------------
# Imports
# ---------------------------------------------------------------------------
import glob
import os
import re
import statistics

import pandas as pd

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
contributing_directory = "/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/contributing_subset/"
readme_directory = "/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/first_version_documents/readme_subset/"

# Documents with fewer words than this are pruned by default.
# Original author's note next to the cutoff read "README / 0.25 19.0" --
# presumably the 0.25 quantile of README word counts motivated the value 20.
MIN_WORD_COUNT = 20


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def metadata_for_file(file):
    """Return ``(word_count, avg_word_length)`` for a document.

    Parameters
    ----------
    file : str
        The document's *text* (not a path), despite the parameter name.

    Returns
    -------
    tuple
        ``word_count`` is the number of whitespace-separated tokens.
        ``avg_word_length`` is the mean token length in characters, or ``0``
        when the text contains no tokens.
    """
    word_list = file.split()
    word_count = len(word_list)
    if word_count == 0:
        # Avoid dividing by zero on empty / whitespace-only documents.
        return 0, 0
    return word_count, sum(map(len, word_list)) / word_count


def get_data_from_dir(directory, min_word_count=MIN_WORD_COUNT, remove_short=True):
    """Collect word statistics for every regular file directly inside ``directory``.

    WARNING: with the default ``remove_short=True`` this function DELETES from
    disk every file whose word count is below ``min_word_count``. Pass
    ``remove_short=False`` for a read-only pass.

    Parameters
    ----------
    directory : str
        Directory to scan (non-recursive; a trailing slash is fine).
    min_word_count : int, optional
        Files with strictly fewer words than this count as "short".
    remove_short : bool, optional
        When true, short files are removed with ``os.remove``.

    Returns
    -------
    tuple of four lists
        ``(data_list, word_counts, avg_word_lengths, file_list)``.
        ``data_list`` is always empty (document texts are not retained); it is
        kept so existing four-way unpacking keeps working. The other three
        lists are parallel and ordered by path. ``file_list`` also contains
        the paths of files that were removed during this call, so that it
        stays aligned with the statistics lists.

    Raises
    ------
    OSError
        If a file cannot be read or removed.
    """
    # Sort for a reproducible order and skip sub-directories, which open()
    # cannot read.
    files = sorted(
        path for path in glob.glob(f"{directory}/*") if os.path.isfile(path)
    )

    data_list = []
    word_counts = []
    avg_word_lengths = []
    file_list = []

    for path in files:
        # Undecodable bytes are dropped instead of raising; the context
        # manager closes the handle before the file may be deleted below.
        with open(path, encoding="utf-8", errors="ignore") as handle:
            text = handle.read()

        word_count, avg_word_length = metadata_for_file(text)
        word_counts.append(word_count)
        avg_word_lengths.append(avg_word_length)

        if remove_short and word_count < min_word_count:
            os.remove(path)

        file_list.append(path)

    return data_list, word_counts, avg_word_lengths, file_list


# ---------------------------------------------------------------------------
# Analysis
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # NOTE: this call prunes short documents from `contributing_directory`
    # on disk (see the warning in get_data_from_dir).
    listed_corpus, wordcounts, wordlengths, file_list = get_data_from_dir(
        contributing_directory
    )
    wordcount_series = pd.Series(wordcounts)

    # Output recorded in the saved notebook (execution counts were out of
    # order, so it may stem from an earlier run):
    #   0.25     19.0
    #   0.50     98.0
    #   0.75    266.5
    print(wordcount_series.quantile([0.25, 0.5, 0.75]))