{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge file manifests with per-repository `ranef` estimates\n",
    "\n",
    "Joins each manifest link file (CONTRIBUTING and README) to the matching `dweek` `ranef` table (presumably random-effect estimates from the mixed model; confirm against the `mlm` code) on `repo_id`, then writes one merged manifest CSV per document type.\n",
    "\n",
    "- The `ranef` tables identify the repository in their `level` column; it is copied to `repo_id` to provide the join key.\n",
    "- The join is an inner join, so repositories missing from either side are dropped.\n",
    "- Saved outputs were cleared because the stored results came from out-of-order execution; use **Restart Kernel and Run All** to regenerate them. The last saved run reported 4247 rows in the merged README manifest."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input tables (relative paths are resolved against the kernel's working directory).\n",
    "CONTRIBUTING_RANEF_CSV = '../mlm/data/0201_contributing_dweek_ranefs.csv'\n",
    "README_RANEF_CSV = '../mlm/data/0203_readme_dweek_ranefs.csv'\n",
    "CONTRIBUTING_MANIFEST_CSV = '../all_fvf_CONTRIBUTING_manifest-link.csv'\n",
    "README_MANIFEST_CSV = '../all_fvf_README_manifest-link.csv'\n",
    "\n",
    "# Output tables, written to the kernel's working directory.\n",
    "# NOTE(review): the contributing input is prefixed 0201 but its output is prefixed 0203 -- confirm this is intended.\n",
    "CONTRIBUTING_MERGED_CSV = '0203_contributing_merged_manifest.csv'\n",
    "README_MERGED_CSV = '0203_readme_merged_manifest.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_manifest_with_ranefs(manifest: pd.DataFrame, ranef: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Inner-join a manifest with a ranef table on ``repo_id``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    manifest : pd.DataFrame\n",
    "        Manifest rows; must contain a ``repo_id`` column.\n",
    "    ranef : pd.DataFrame\n",
    "        Ranef rows; the ``level`` column is used as the repository identifier.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        A new frame with the manifest columns followed by the ranef columns.\n",
    "        Neither input frame is modified.\n",
    "    \"\"\"\n",
    "    # Expose `level` under the manifest's key name without mutating the caller's frame.\n",
    "    ranef_keyed = ranef.assign(repo_id=ranef['level'])\n",
    "    merged = pd.merge(manifest, ranef_keyed, on=['repo_id'], how='inner')\n",
    "    # 'Unnamed: 0' is the name pandas gives an unlabeled CSV column (typically a saved row index).\n",
    "    # errors='ignore' keeps this re-runnable when the input no longer carries that column.\n",
    "    return merged.drop(columns='Unnamed: 0', errors='ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "contributing_ranef = pd.read_csv(CONTRIBUTING_RANEF_CSV)\n",
    "readme_ranef = pd.read_csv(README_RANEF_CSV)\n",
    "contributing_manifest = pd.read_csv(CONTRIBUTING_MANIFEST_CSV)\n",
    "readme_manifest = pd.read_csv(README_MANIFEST_CSV)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check join keys\n",
    "\n",
    "A `repo_id` that occurs more than once in a manifest is matched once per occurrence, so it would be repeated in the merged output. Both counts are expected to be zero."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "duplicate_counts = pd.Series(\n",
    "    {\n",
    "        'contributing_manifest': contributing_manifest['repo_id'].duplicated(keep=False).sum(),\n",
    "        'readme_manifest': readme_manifest['repo_id'].duplicated(keep=False).sum(),\n",
    "    },\n",
    "    name='rows_with_duplicated_repo_id',\n",
    ")\n",
    "duplicate_counts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "contributing_merged = merge_manifest_with_ranefs(contributing_manifest, contributing_ranef)\n",
    "contributing_merged.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "readme_merged = merge_manifest_with_ranefs(readme_manifest, readme_ranef)\n",
    "len(readme_merged)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write merged manifests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "contributing_merged.to_csv(CONTRIBUTING_MERGED_CSV, index=False)\n",
    "readme_merged.to_csv(README_MERGED_CSV, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}