{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the three CSV inputs that the later cells join and aggregate.\n", "merged_manifest = pd.read_csv('0207_contributing_merged_manifest.csv')\n", "topic_distributions = pd.read_csv('020725_CONTRIBUTING_file_topic_distributions.csv')\n", "readability_scores = pd.read_csv('020725_CONTRIBUTING_readability.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "714" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inner-join readability scores with topic distributions on 'filename', then copy\n", "# that key into 'new_filepath', the column the manifest join below matches on.\n", "first_merge = (\n", "    readability_scores\n", "    .merge(topic_distributions, on=['filename'], how=\"inner\")\n", "    .assign(new_filepath=lambda merged: merged['filename'])\n", ")\n", "second_merge = first_merge.merge(merged_manifest, on=['new_filepath'], how=\"inner\")\n", "\n", "# Number of rows that survive both inner joins.\n", "len(second_merge)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
flesch_reading_easereading_timeword_count
meanmedianmeanmedianmeanmedian
ranef_grouping
042.56692950.0225.15078712.720279.196850127.0
146.03994252.8031.45384418.715356.794798220.5
248.24767655.5436.95029024.900426.473029273.0
\n", "
" ], "text/plain": [ " flesch_reading_ease reading_time word_count \\\n", " mean median mean median mean \n", "ranef_grouping \n", "0 42.566929 50.02 25.150787 12.720 279.196850 \n", "1 46.039942 52.80 31.453844 18.715 356.794798 \n", "2 48.247676 55.54 36.950290 24.900 426.473029 \n", "\n", " \n", " median \n", "ranef_grouping \n", "0 127.0 \n", "1 220.5 \n", "2 273.0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean and median of each readability column for every 'ranef_grouping' value.\n", "readability_aggregate = (\n", "    second_merge\n", "    .groupby('ranef_grouping')[['flesch_reading_ease', 'reading_time', 'word_count']]\n", "    .agg(['mean', 'median'])\n", ")\n", "\n", "readability_aggregate" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
t0t1t2t3t4
meanmeanmeanmeanmean
ranef_grouping
00.0902020.0520870.5005350.1842440.172932
10.1121420.0992780.4515610.1150160.222003
20.0943790.1262440.5352890.0725690.171519
\n", "
" ], "text/plain": [ " t0 t1 t2 t3 t4\n", " mean mean mean mean mean\n", "ranef_grouping \n", "0 0.090202 0.052087 0.500535 0.184244 0.172932\n", "1 0.112142 0.099278 0.451561 0.115016 0.222003\n", "2 0.094379 0.126244 0.535289 0.072569 0.171519" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean of each topic column (t0 through t4) for every 'ranef_grouping' value.\n", "topic_aggregate = second_merge.groupby('ranef_grouping').agg(\n", "    {topic: ['mean'] for topic in ('t0', 't1', 't2', 't3', 't4')}\n", ")\n", "\n", "topic_aggregate" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }