diff --git a/text_analysis/contributing_partitioned_analysis.ipynb b/text_analysis/contributing_partitioned_analysis.ipynb new file mode 100644 index 0000000..9192569 --- /dev/null +++ b/text_analysis/contributing_partitioned_analysis.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "merged_manifest = pd.read_csv('0203_contributing_merged_manifest.csv')\n", + "topic_distributions = pd.read_csv('020125_CONTRIBUTING_file_topic_distributions.csv')\n", + "readability_scores = pd.read_csv('020125_CONTRIBUTING_readability.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "715" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first_merge = readability_scores.merge(topic_distributions, on=['filename'],how=\"inner\")\n", + "#primary_merge = first_merge.merge(readability_scores, )\n", + "first_merge['fvf_filepath'] = first_merge['filename']\n", + "second_merge = first_merge.merge(merged_manifest, on=['fvf_filepath'], how=\"inner\")\n", + "len(second_merge)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
flesch_reading_easereading_timeword_count
meanmedianmeanmedianmeanmedian
ranef_grouping
046.10207149.9923.40928613.720259.635714153.0
144.23727653.4129.06578916.880329.151703198.0
247.43222254.7834.36162723.365394.242063252.0
\n", + "
" + ], + "text/plain": [ + " flesch_reading_ease reading_time word_count \\\n", + " mean median mean median mean \n", + "ranef_grouping \n", + "0 46.102071 49.99 23.409286 13.720 259.635714 \n", + "1 44.237276 53.41 29.065789 16.880 329.151703 \n", + "2 47.432222 54.78 34.361627 23.365 394.242063 \n", + "\n", + " \n", + " median \n", + "ranef_grouping \n", + "0 153.0 \n", + "1 198.0 \n", + "2 252.0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readability_aggregate = second_merge.groupby('ranef_grouping').agg({\n", + " 'flesch_reading_ease' : ['mean', 'median'],\n", + " 'reading_time' : ['mean', 'median'],\n", + " 'word_count' : ['mean', 'median'],\n", + "})\n", + "\n", + "readability_aggregate" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
t0t1t2t3t4
meanmeanmeanmeanmean
ranef_grouping
00.0932810.3682680.1563040.0987480.283398
10.1575190.2819080.1858170.1121950.262561
20.2025650.2525550.1671630.0906780.287039
\n", + "
" + ], + "text/plain": [ + " t0 t1 t2 t3 t4\n", + " mean mean mean mean mean\n", + "ranef_grouping \n", + "0 0.093281 0.368268 0.156304 0.098748 0.283398\n", + "1 0.157519 0.281908 0.185817 0.112195 0.262561\n", + "2 0.202565 0.252555 0.167163 0.090678 0.287039" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "topic_aggregate = second_merge.groupby('ranef_grouping').agg({\n", + " 't0' : ['mean'],\n", + " 't1' : ['mean'],\n", + " 't2' : ['mean'],\n", + " 't3' : ['mean'],\n", + " 't4' : ['mean']\n", + "})\n", + "\n", + "topic_aggregate" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/text_analysis/readme_partitioned_analysis.ipynb b/text_analysis/readme_partitioned_analysis.ipynb index c1b3220..39094c5 100644 --- a/text_analysis/readme_partitioned_analysis.ipynb +++ b/text_analysis/readme_partitioned_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -31,7 +31,7 @@ "4248" ] }, - "execution_count": 10, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -43,6 +43,482 @@ "second_merge = first_merge.merge(merged_manifest, on=['fvf_filepath'], how=\"inner\")\n", "len(second_merge)" ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenameflesch_reading_easeflesch_kincaid_gradelinsear_write_formuladale_chall_readability_scoremcalpine_eflawreading_timechar_countword_countt0...effectgroupleveltermestimatestd.errorconf.lowconf.highranef_groupingrank
0libevent_libevent_hullabaloo_README50.3611.414.62500012.4726.37.37502830.001976...ran_valsproject_idlibevent_libeventbefore_after:week_index0.1865220.478091-0.7505191.12356313229.0
1zaach_jison_hullabaloo_README.md44.7813.57.71428610.7037.233.0522503950.000489...ran_valsproject_idzaach_jisonbefore_after:week_index1.0018790.2170630.5764431.42731524068.0
2lincolnloop_python-qrcode.git_hullabaloo_READM...58.288.47.58333310.7618.36.82464780.002597...ran_valsproject_idlincolnloop_python-qrcode.gitbefore_after:week_index-0.2206540.679910-1.5532541.11194611467.0
3mati75_evilwm.git_hullabaloo_README66.747.25.8888898.7718.156.7638647420.000266...ran_valsproject_idmati75_evilwm.gitbefore_after:week_index-0.2938460.697316-1.6605601.07286811184.0
4markdown-it_markdown-it_hullabaloo_README.md45.7611.112.00000012.8424.01.51103170.006993...ran_valsproject_idmarkdown-it_markdown-itbefore_after:week_index0.2386150.602856-0.9429611.42019013459.0
\n", + "

5 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " filename flesch_reading_ease \\\n", + "0 libevent_libevent_hullabaloo_README 50.36 \n", + "1 zaach_jison_hullabaloo_README.md 44.78 \n", + "2 lincolnloop_python-qrcode.git_hullabaloo_READM... 58.28 \n", + "3 mati75_evilwm.git_hullabaloo_README 66.74 \n", + "4 markdown-it_markdown-it_hullabaloo_README.md 45.76 \n", + "\n", + " flesch_kincaid_grade linsear_write_formula dale_chall_readability_score \\\n", + "0 11.4 14.625000 12.47 \n", + "1 13.5 7.714286 10.70 \n", + "2 8.4 7.583333 10.76 \n", + "3 7.2 5.888889 8.77 \n", + "4 11.1 12.000000 12.84 \n", + "\n", + " mcalpine_eflaw reading_time char_count word_count t0 ... \\\n", + "0 26.3 7.37 502 83 0.001976 ... \n", + "1 37.2 33.05 2250 395 0.000489 ... \n", + "2 18.3 6.82 464 78 0.002597 ... \n", + "3 18.1 56.76 3864 742 0.000266 ... \n", + "4 24.0 1.51 103 17 0.006993 ... \n", + "\n", + " effect group level \\\n", + "0 ran_vals project_id libevent_libevent \n", + "1 ran_vals project_id zaach_jison \n", + "2 ran_vals project_id lincolnloop_python-qrcode.git \n", + "3 ran_vals project_id mati75_evilwm.git \n", + "4 ran_vals project_id markdown-it_markdown-it \n", + "\n", + " term estimate std.error conf.low conf.high \\\n", + "0 before_after:week_index 0.186522 0.478091 -0.750519 1.123563 \n", + "1 before_after:week_index 1.001879 0.217063 0.576443 1.427315 \n", + "2 before_after:week_index -0.220654 0.679910 -1.553254 1.111946 \n", + "3 before_after:week_index -0.293846 0.697316 -1.660560 1.072868 \n", + "4 before_after:week_index 0.238615 0.602856 -0.942961 1.420190 \n", + "\n", + " ranef_grouping rank \n", + "0 1 3229.0 \n", + "1 2 4068.0 \n", + "2 1 1467.0 \n", + "3 1 1184.0 \n", + "4 1 3459.0 \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "second_merge.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
flesch_reading_easereading_timeword_count
meanmedianmeanmedianmeanmedian
ranef_grouping
155.13755852.93523.4863999.29256.98890090.0
246.42590951.36522.7606429.53249.534759100.0
\n", + "
" + ], + "text/plain": [ + " flesch_reading_ease reading_time word_count \\\n", + " mean median mean median mean \n", + "ranef_grouping \n", + "1 55.137558 52.935 23.486399 9.29 256.988900 \n", + "2 46.425909 51.365 22.760642 9.53 249.534759 \n", + "\n", + " \n", + " median \n", + "ranef_grouping \n", + "1 90.0 \n", + "2 100.0 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readability_aggregate = second_merge.groupby('ranef_grouping').agg({\n", + " 'flesch_reading_ease' : ['mean', 'median'],\n", + " 'reading_time' : ['mean', 'median'],\n", + " 'word_count' : ['mean', 'median'],\n", + "})\n", + "\n", + "readability_aggregate" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
t0t1t2t3t4t5t6t7t8t9t10
meanmeanmeanmeanmeanmeanmeanmeanmeanmeanmean
ranef_grouping
10.0359380.0825780.0968170.1565250.1113050.0949730.0687360.114930.0612430.0976050.079349
20.0243290.0539340.0958060.2096610.1014640.0766280.0820480.152260.0642410.0885260.051103
\n", + "
" + ], + "text/plain": [ + " t0 t1 t2 t3 t4 t5 \\\n", + " mean mean mean mean mean mean \n", + "ranef_grouping \n", + "1 0.035938 0.082578 0.096817 0.156525 0.111305 0.094973 \n", + "2 0.024329 0.053934 0.095806 0.209661 0.101464 0.076628 \n", + "\n", + " t6 t7 t8 t9 t10 \n", + " mean mean mean mean mean \n", + "ranef_grouping \n", + "1 0.068736 0.11493 0.061243 0.097605 0.079349 \n", + "2 0.082048 0.15226 0.064241 0.088526 0.051103 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "topic_aggregate = second_merge.groupby('ranef_grouping').agg({\n", + " 't0' : ['mean'],\n", + " 't1' : ['mean'],\n", + " 't2' : ['mean'],\n", + " 't3' : ['mean'],\n", + " 't4' : ['mean'],\n", + " 't5' : ['mean'],\n", + " 't6' : ['mean'],\n", + " 't7' : ['mean'],\n", + " 't8' : ['mean'],\n", + " 't9' : ['mean'],\n", + " 't10': ['mean']\n", + "})\n", + "\n", + "topic_aggregate" + ] } ], "metadata": {