{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Load the three CSV inputs; the paths are relative to the kernel's working directory.\n", "readability_scores = pd.read_csv('020725_README_readability.csv')\n", "topic_distributions = pd.read_csv('020725_README_file_topic_distributions.csv')\n", "merged_manifest = pd.read_csv('0207_readme_merged_manifest.csv')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4226" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pair each file's readability scores with its topic distribution, then copy the\n", "# `filename` key into `new_filepath`, the column name the manifest merge joins on.\n", "first_merge = (\n", "    readability_scores\n", "    .merge(topic_distributions, how='inner', on='filename')\n", "    .assign(new_filepath=lambda frame: frame['filename'])\n", ")\n", "\n", "# Inner join: only rows whose `new_filepath` also appears in the manifest are kept.\n", "second_merge = first_merge.merge(merged_manifest, how='inner', on='new_filepath')\n", "len(second_merge)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenameflesch_reading_easeflesch_kincaid_gradelinsear_write_formuladale_chall_readability_scoremcalpine_eflawreading_timechar_countword_countt0...effectgroupleveltermestimatestd.errorconf.lowconf.highranef_groupingrank
0libevent_libevent_hullabaloo_README50.3611.414.62500012.4726.37.37502830.001976...ran_valsproject_idlibevent_libeventbefore_after:week_index0.1865220.478091-0.7505191.12356313229.0
1zaach_jison_hullabaloo_README.md44.7813.57.71428610.7037.233.0522503950.000489...ran_valsproject_idzaach_jisonbefore_after:week_index1.0018790.2170630.5764431.42731524068.0
2lincolnloop_python-qrcode.git_hullabaloo_READM...58.288.47.58333310.7618.36.82464780.002597...ran_valsproject_idlincolnloop_python-qrcode.gitbefore_after:week_index-0.2206540.679910-1.5532541.11194611467.0
3mati75_evilwm.git_hullabaloo_README66.747.25.8888898.7718.156.7638647420.000266...ran_valsproject_idmati75_evilwm.gitbefore_after:week_index-0.2938460.697316-1.6605601.07286811184.0
4markdown-it_markdown-it_hullabaloo_README.md45.7611.112.00000012.8424.01.51103170.006993...ran_valsproject_idmarkdown-it_markdown-itbefore_after:week_index0.2386150.602856-0.9429611.42019013459.0
\n", "

5 rows × 33 columns

\n", "
" ], "text/plain": [ " filename flesch_reading_ease \\\n", "0 libevent_libevent_hullabaloo_README 50.36 \n", "1 zaach_jison_hullabaloo_README.md 44.78 \n", "2 lincolnloop_python-qrcode.git_hullabaloo_READM... 58.28 \n", "3 mati75_evilwm.git_hullabaloo_README 66.74 \n", "4 markdown-it_markdown-it_hullabaloo_README.md 45.76 \n", "\n", " flesch_kincaid_grade linsear_write_formula dale_chall_readability_score \\\n", "0 11.4 14.625000 12.47 \n", "1 13.5 7.714286 10.70 \n", "2 8.4 7.583333 10.76 \n", "3 7.2 5.888889 8.77 \n", "4 11.1 12.000000 12.84 \n", "\n", " mcalpine_eflaw reading_time char_count word_count t0 ... \\\n", "0 26.3 7.37 502 83 0.001976 ... \n", "1 37.2 33.05 2250 395 0.000489 ... \n", "2 18.3 6.82 464 78 0.002597 ... \n", "3 18.1 56.76 3864 742 0.000266 ... \n", "4 24.0 1.51 103 17 0.006993 ... \n", "\n", " effect group level \\\n", "0 ran_vals project_id libevent_libevent \n", "1 ran_vals project_id zaach_jison \n", "2 ran_vals project_id lincolnloop_python-qrcode.git \n", "3 ran_vals project_id mati75_evilwm.git \n", "4 ran_vals project_id markdown-it_markdown-it \n", "\n", " term estimate std.error conf.low conf.high \\\n", "0 before_after:week_index 0.186522 0.478091 -0.750519 1.123563 \n", "1 before_after:week_index 1.001879 0.217063 0.576443 1.427315 \n", "2 before_after:week_index -0.220654 0.679910 -1.553254 1.111946 \n", "3 before_after:week_index -0.293846 0.697316 -1.660560 1.072868 \n", "4 before_after:week_index 0.238615 0.602856 -0.942961 1.420190 \n", "\n", " ranef_grouping rank \n", "0 1 3229.0 \n", "1 2 4068.0 \n", "2 1 1467.0 \n", "3 1 1184.0 \n", "4 1 3459.0 \n", "\n", "[5 rows x 33 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "second_merge.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
flesch_reading_easereading_timeword_count
meanmedianmeanmedianmeanmedian
ranef_grouping
036.42560049.92027.60160010.06294.32000099.0
147.58042550.33027.56220415.09294.790940147.0
254.81974852.83531.47875314.04341.339329145.5
\n", "
" ], "text/plain": [ " flesch_reading_ease reading_time word_count \\\n", " mean median mean median mean \n", "ranef_grouping \n", "0 36.425600 49.920 27.601600 10.06 294.320000 \n", "1 47.580425 50.330 27.562204 15.09 294.790940 \n", "2 54.819748 52.835 31.478753 14.04 341.339329 \n", "\n", " \n", " median \n", "ranef_grouping \n", "0 99.0 \n", "1 147.0 \n", "2 145.5 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean and median of each readability measure within every `ranef_grouping` level.\n", "readability_aggregate = second_merge.groupby('ranef_grouping').agg(\n", "    {\n", "        measure: ['mean', 'median']\n", "        for measure in ('flesch_reading_ease', 'reading_time', 'word_count')\n", "    }\n", ")\n", "\n", "readability_aggregate" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
t0t1t2t3t4t5t6t7t8
meanmeanmeanmeanmeanmeanmeanmeanmean
ranef_grouping
00.0723620.1173560.0947080.0559650.1478620.0686890.1242380.0567900.262029
10.0730220.0937570.1023630.0836880.1443480.0763050.0930230.0744070.259087
20.0737840.1113560.0763070.0501780.1304410.0591230.0931420.0670750.338593
\n", "
" ], "text/plain": [ " t0 t1 t2 t3 t4 t5 \\\n", " mean mean mean mean mean mean \n", "ranef_grouping \n", "0 0.072362 0.117356 0.094708 0.055965 0.147862 0.068689 \n", "1 0.073022 0.093757 0.102363 0.083688 0.144348 0.076305 \n", "2 0.073784 0.111356 0.076307 0.050178 0.130441 0.059123 \n", "\n", " t6 t7 t8 \n", " mean mean mean \n", "ranef_grouping \n", "0 0.124238 0.056790 0.262029 \n", "1 0.093023 0.074407 0.259087 \n", "2 0.093142 0.067075 0.338593 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean of each t0-t8 column within every `ranef_grouping` level.\n", "topic_aggregate = second_merge.groupby('ranef_grouping').agg(\n", "    {\n", "        topic: ['mean']\n", "        for topic in ('t0', 't1', 't2', 't3', 't4', 't5', 't6', 't7', 't8')\n", "    }\n", ")\n", "\n", "topic_aggregate" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }