1
0
govdoc-cr-analysis/text_analysis/readme_partitioned_analysis.ipynb

547 lines
18 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Read the three input tables; paths are relative to the notebook's working directory.\n",
"# They are read in the same order as the names on the left-hand side.\n",
"merged_manifest, topic_distributions, readability_scores = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        '0203_readme_merged_manifest.csv',\n",
"        '020325_README_file_topic_distributions.csv',\n",
"        '020325_README_readability.csv',\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4247"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inner-join readability scores to topic distributions on `filename`, then copy\n",
"# `filename` into `fvf_filepath` so the result shares a key column with the manifest.\n",
"first_merge = (\n",
"    readability_scores\n",
"    .merge(topic_distributions, on=['filename'], how=\"inner\")\n",
"    .assign(fvf_filepath=lambda joined: joined['filename'])\n",
")\n",
"\n",
"# Inner join again: only rows whose key appears in the manifest are kept.\n",
"second_merge = first_merge.merge(merged_manifest, on=['fvf_filepath'], how=\"inner\")\n",
"\n",
"# Row count of the fully merged table.\n",
"len(second_merge)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filename</th>\n",
" <th>flesch_reading_ease</th>\n",
" <th>flesch_kincaid_grade</th>\n",
" <th>linsear_write_formula</th>\n",
" <th>dale_chall_readability_score</th>\n",
" <th>mcalpine_eflaw</th>\n",
" <th>reading_time</th>\n",
" <th>char_count</th>\n",
" <th>word_count</th>\n",
" <th>t0</th>\n",
" <th>...</th>\n",
" <th>effect</th>\n",
" <th>group</th>\n",
" <th>level</th>\n",
" <th>term</th>\n",
" <th>estimate</th>\n",
" <th>std.error</th>\n",
" <th>conf.low</th>\n",
" <th>conf.high</th>\n",
" <th>ranef_grouping</th>\n",
" <th>rank</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>libevent_libevent_hullabaloo_README</td>\n",
" <td>50.36</td>\n",
" <td>11.4</td>\n",
" <td>14.625000</td>\n",
" <td>12.47</td>\n",
" <td>26.3</td>\n",
" <td>7.37</td>\n",
" <td>502</td>\n",
" <td>83</td>\n",
" <td>0.001976</td>\n",
" <td>...</td>\n",
" <td>ran_vals</td>\n",
" <td>project_id</td>\n",
" <td>libevent_libevent</td>\n",
" <td>before_after:week_index</td>\n",
" <td>0.186522</td>\n",
" <td>0.478091</td>\n",
" <td>-0.750519</td>\n",
" <td>1.123563</td>\n",
" <td>1</td>\n",
" <td>3229.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>zaach_jison_hullabaloo_README.md</td>\n",
" <td>44.78</td>\n",
" <td>13.5</td>\n",
" <td>7.714286</td>\n",
" <td>10.70</td>\n",
" <td>37.2</td>\n",
" <td>33.05</td>\n",
" <td>2250</td>\n",
" <td>395</td>\n",
" <td>0.000489</td>\n",
" <td>...</td>\n",
" <td>ran_vals</td>\n",
" <td>project_id</td>\n",
" <td>zaach_jison</td>\n",
" <td>before_after:week_index</td>\n",
" <td>1.001879</td>\n",
" <td>0.217063</td>\n",
" <td>0.576443</td>\n",
" <td>1.427315</td>\n",
" <td>2</td>\n",
" <td>4068.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>lincolnloop_python-qrcode.git_hullabaloo_READM...</td>\n",
" <td>58.28</td>\n",
" <td>8.4</td>\n",
" <td>7.583333</td>\n",
" <td>10.76</td>\n",
" <td>18.3</td>\n",
" <td>6.82</td>\n",
" <td>464</td>\n",
" <td>78</td>\n",
" <td>0.002597</td>\n",
" <td>...</td>\n",
" <td>ran_vals</td>\n",
" <td>project_id</td>\n",
" <td>lincolnloop_python-qrcode.git</td>\n",
" <td>before_after:week_index</td>\n",
" <td>-0.220654</td>\n",
" <td>0.679910</td>\n",
" <td>-1.553254</td>\n",
" <td>1.111946</td>\n",
" <td>1</td>\n",
" <td>1467.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>mati75_evilwm.git_hullabaloo_README</td>\n",
" <td>66.74</td>\n",
" <td>7.2</td>\n",
" <td>5.888889</td>\n",
" <td>8.77</td>\n",
" <td>18.1</td>\n",
" <td>56.76</td>\n",
" <td>3864</td>\n",
" <td>742</td>\n",
" <td>0.000266</td>\n",
" <td>...</td>\n",
" <td>ran_vals</td>\n",
" <td>project_id</td>\n",
" <td>mati75_evilwm.git</td>\n",
" <td>before_after:week_index</td>\n",
" <td>-0.293846</td>\n",
" <td>0.697316</td>\n",
" <td>-1.660560</td>\n",
" <td>1.072868</td>\n",
" <td>1</td>\n",
" <td>1184.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>markdown-it_markdown-it_hullabaloo_README.md</td>\n",
" <td>45.76</td>\n",
" <td>11.1</td>\n",
" <td>12.000000</td>\n",
" <td>12.84</td>\n",
" <td>24.0</td>\n",
" <td>1.51</td>\n",
" <td>103</td>\n",
" <td>17</td>\n",
" <td>0.006993</td>\n",
" <td>...</td>\n",
" <td>ran_vals</td>\n",
" <td>project_id</td>\n",
" <td>markdown-it_markdown-it</td>\n",
" <td>before_after:week_index</td>\n",
" <td>0.238615</td>\n",
" <td>0.602856</td>\n",
" <td>-0.942961</td>\n",
" <td>1.420190</td>\n",
" <td>1</td>\n",
" <td>3459.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 33 columns</p>\n",
"</div>"
],
"text/plain": [
" filename flesch_reading_ease \\\n",
"0 libevent_libevent_hullabaloo_README 50.36 \n",
"1 zaach_jison_hullabaloo_README.md 44.78 \n",
"2 lincolnloop_python-qrcode.git_hullabaloo_READM... 58.28 \n",
"3 mati75_evilwm.git_hullabaloo_README 66.74 \n",
"4 markdown-it_markdown-it_hullabaloo_README.md 45.76 \n",
"\n",
" flesch_kincaid_grade linsear_write_formula dale_chall_readability_score \\\n",
"0 11.4 14.625000 12.47 \n",
"1 13.5 7.714286 10.70 \n",
"2 8.4 7.583333 10.76 \n",
"3 7.2 5.888889 8.77 \n",
"4 11.1 12.000000 12.84 \n",
"\n",
" mcalpine_eflaw reading_time char_count word_count t0 ... \\\n",
"0 26.3 7.37 502 83 0.001976 ... \n",
"1 37.2 33.05 2250 395 0.000489 ... \n",
"2 18.3 6.82 464 78 0.002597 ... \n",
"3 18.1 56.76 3864 742 0.000266 ... \n",
"4 24.0 1.51 103 17 0.006993 ... \n",
"\n",
" effect group level \\\n",
"0 ran_vals project_id libevent_libevent \n",
"1 ran_vals project_id zaach_jison \n",
"2 ran_vals project_id lincolnloop_python-qrcode.git \n",
"3 ran_vals project_id mati75_evilwm.git \n",
"4 ran_vals project_id markdown-it_markdown-it \n",
"\n",
" term estimate std.error conf.low conf.high \\\n",
"0 before_after:week_index 0.186522 0.478091 -0.750519 1.123563 \n",
"1 before_after:week_index 1.001879 0.217063 0.576443 1.427315 \n",
"2 before_after:week_index -0.220654 0.679910 -1.553254 1.111946 \n",
"3 before_after:week_index -0.293846 0.697316 -1.660560 1.072868 \n",
"4 before_after:week_index 0.238615 0.602856 -0.942961 1.420190 \n",
"\n",
" ranef_grouping rank \n",
"0 1 3229.0 \n",
"1 2 4068.0 \n",
"2 1 1467.0 \n",
"3 1 1184.0 \n",
"4 1 3459.0 \n",
"\n",
"[5 rows x 33 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the merged table.\n",
"second_merge.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">flesch_reading_ease</th>\n",
" <th colspan=\"2\" halign=\"left\">reading_time</th>\n",
" <th colspan=\"2\" halign=\"left\">word_count</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean</th>\n",
" <th>median</th>\n",
" <th>mean</th>\n",
" <th>median</th>\n",
" <th>mean</th>\n",
" <th>median</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ranef_grouping</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>55.089708</td>\n",
" <td>52.900</td>\n",
" <td>23.486886</td>\n",
" <td>9.28</td>\n",
" <td>256.973406</td>\n",
" <td>90.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>46.425909</td>\n",
" <td>51.365</td>\n",
" <td>22.760642</td>\n",
" <td>9.53</td>\n",
" <td>249.534759</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" flesch_reading_ease reading_time word_count \\\n",
" mean median mean median mean \n",
"ranef_grouping \n",
"1 55.089708 52.900 23.486886 9.28 256.973406 \n",
"2 46.425909 51.365 22.760642 9.53 249.534759 \n",
"\n",
" \n",
" median \n",
"ranef_grouping \n",
"1 90.0 \n",
"2 100.0 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean and median of each readability/length metric, per `ranef_grouping` value.\n",
"readability_aggregate = (\n",
"    second_merge\n",
"    .groupby('ranef_grouping')\n",
"    .agg({\n",
"        metric: ['mean', 'median']\n",
"        for metric in ('flesch_reading_ease', 'reading_time', 'word_count')\n",
"    })\n",
")\n",
"\n",
"readability_aggregate"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>t0</th>\n",
" <th>t1</th>\n",
" <th>t2</th>\n",
" <th>t3</th>\n",
" <th>t4</th>\n",
" <th>t5</th>\n",
" <th>t6</th>\n",
" <th>t7</th>\n",
" <th>t8</th>\n",
" <th>t9</th>\n",
" <th>t10</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" <th>mean</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ranef_grouping</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.095142</td>\n",
" <td>0.101366</td>\n",
" <td>0.084609</td>\n",
" <td>0.127050</td>\n",
" <td>0.024873</td>\n",
" <td>0.060155</td>\n",
" <td>0.088660</td>\n",
" <td>0.136191</td>\n",
" <td>0.060451</td>\n",
" <td>0.134408</td>\n",
" <td>0.087095</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.122912</td>\n",
" <td>0.102389</td>\n",
" <td>0.050782</td>\n",
" <td>0.168774</td>\n",
" <td>0.015752</td>\n",
" <td>0.072154</td>\n",
" <td>0.102839</td>\n",
" <td>0.115994</td>\n",
" <td>0.053674</td>\n",
" <td>0.119278</td>\n",
" <td>0.075450</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" t0 t1 t2 t3 t4 t5 \\\n",
" mean mean mean mean mean mean \n",
"ranef_grouping \n",
"1 0.095142 0.101366 0.084609 0.127050 0.024873 0.060155 \n",
"2 0.122912 0.102389 0.050782 0.168774 0.015752 0.072154 \n",
"\n",
" t6 t7 t8 t9 t10 \n",
" mean mean mean mean mean \n",
"ranef_grouping \n",
"1 0.088660 0.136191 0.060451 0.134408 0.087095 \n",
"2 0.102839 0.115994 0.053674 0.119278 0.075450 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean of each topic column t0..t10, per `ranef_grouping` value.\n",
"# The aggregation spec is generated rather than spelled out eleven times, so the\n",
"# column list cannot drift out of sync through a copy-paste slip.\n",
"topic_aggregate = (\n",
"    second_merge\n",
"    .groupby('ranef_grouping')\n",
"    .agg({f't{topic_index}': ['mean'] for topic_index in range(11)})\n",
")\n",
"\n",
"topic_aggregate"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}