{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import shutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "contributing_ranef = pd.read_csv(\"../mlm/data/0201_contributing_dweek_ranefs.csv\")\n",
    "readme_ranef = pd.read_csv(\"../mlm/data/0203_readme_dweek_ranefs.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "contributing_ranef['repo_id'] = contributing_ranef['level']\n",
    "readme_ranef['repo_id'] = readme_ranef['level']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "contributing_manifest = pd.read_csv(\"../all_fvf_CONTRIBUTING_manifest-link.csv\")\n",
    "readme_manifest = pd.read_csv(\"../all_fvf_README_manifest-link.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>repo_id</th>\n",
       "      <th>commits_filepath</th>\n",
       "      <th>fvf_filepath</th>\n",
       "      <th>effect</th>\n",
       "      <th>group</th>\n",
       "      <th>level</th>\n",
       "      <th>term</th>\n",
       "      <th>estimate</th>\n",
       "      <th>std.error</th>\n",
       "      <th>conf.low</th>\n",
       "      <th>conf.high</th>\n",
       "      <th>ranef_grouping</th>\n",
       "      <th>rank</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>aio-libs_aiomysql.git</td>\n",
       "      <td>_aio-libs_aiomysql.git_commits.csv</td>\n",
       "      <td>aio-libs_aiomysql.git_hullabaloo_CONTRIBUTING.rst</td>\n",
       "      <td>ran_vals</td>\n",
       "      <td>project_id</td>\n",
       "      <td>aio-libs_aiomysql.git</td>\n",
       "      <td>before_after:week_index</td>\n",
       "      <td>-0.142476</td>\n",
       "      <td>0.116488</td>\n",
       "      <td>-0.370789</td>\n",
       "      <td>0.085838</td>\n",
       "      <td>1</td>\n",
       "      <td>222.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>qTox_qTox</td>\n",
       "      <td>_qTox_qTox_commits.csv</td>\n",
       "      <td>qTox_qTox_hullabaloo_CONTRIBUTING.md</td>\n",
       "      <td>ran_vals</td>\n",
       "      <td>project_id</td>\n",
       "      <td>qTox_qTox</td>\n",
       "      <td>before_after:week_index</td>\n",
       "      <td>0.385689</td>\n",
       "      <td>0.048433</td>\n",
       "      <td>0.290763</td>\n",
       "      <td>0.480615</td>\n",
       "      <td>2</td>\n",
       "      <td>684.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>gohugoio_hugo.git</td>\n",
       "      <td>_gohugoio_hugo.git_commits.csv</td>\n",
       "      <td>gohugoio_hugo.git_hullabaloo_CONTRIBUTING.md</td>\n",
       "      <td>ran_vals</td>\n",
       "      <td>project_id</td>\n",
       "      <td>gohugoio_hugo.git</td>\n",
       "      <td>before_after:week_index</td>\n",
       "      <td>0.334518</td>\n",
       "      <td>0.053172</td>\n",
       "      <td>0.230302</td>\n",
       "      <td>0.438733</td>\n",
       "      <td>2</td>\n",
       "      <td>647.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ycm-core_ycmd</td>\n",
       "      <td>_ycm-core_ycmd_commits.csv</td>\n",
       "      <td>ycm-core_ycmd_hullabaloo_CONTRIBUTING.md</td>\n",
       "      <td>ran_vals</td>\n",
       "      <td>project_id</td>\n",
       "      <td>ycm-core_ycmd</td>\n",
       "      <td>before_after:week_index</td>\n",
       "      <td>-0.000605</td>\n",
       "      <td>0.094662</td>\n",
       "      <td>-0.186139</td>\n",
       "      <td>0.184928</td>\n",
       "      <td>1</td>\n",
       "      <td>328.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>aio-libs_aiohttp.git</td>\n",
       "      <td>_aio-libs_aiohttp.git_commits.csv</td>\n",
       "      <td>aio-libs_aiohttp.git_hullabaloo_CONTRIBUTING.rst</td>\n",
       "      <td>ran_vals</td>\n",
       "      <td>project_id</td>\n",
       "      <td>aio-libs_aiohttp.git</td>\n",
       "      <td>before_after:week_index</td>\n",
       "      <td>0.348421</td>\n",
       "      <td>0.051826</td>\n",
       "      <td>0.246845</td>\n",
       "      <td>0.449997</td>\n",
       "      <td>2</td>\n",
       "      <td>658.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 repo_id                    commits_filepath  \\\n",
       "0  aio-libs_aiomysql.git  _aio-libs_aiomysql.git_commits.csv   \n",
       "1              qTox_qTox              _qTox_qTox_commits.csv   \n",
       "2      gohugoio_hugo.git      _gohugoio_hugo.git_commits.csv   \n",
       "3          ycm-core_ycmd          _ycm-core_ycmd_commits.csv   \n",
       "4   aio-libs_aiohttp.git   _aio-libs_aiohttp.git_commits.csv   \n",
       "\n",
       "                                        fvf_filepath    effect       group  \\\n",
       "0  aio-libs_aiomysql.git_hullabaloo_CONTRIBUTING.rst  ran_vals  project_id   \n",
       "1               qTox_qTox_hullabaloo_CONTRIBUTING.md  ran_vals  project_id   \n",
       "2       gohugoio_hugo.git_hullabaloo_CONTRIBUTING.md  ran_vals  project_id   \n",
       "3           ycm-core_ycmd_hullabaloo_CONTRIBUTING.md  ran_vals  project_id   \n",
       "4   aio-libs_aiohttp.git_hullabaloo_CONTRIBUTING.rst  ran_vals  project_id   \n",
       "\n",
       "                   level                     term  estimate  std.error  \\\n",
       "0  aio-libs_aiomysql.git  before_after:week_index -0.142476   0.116488   \n",
       "1              qTox_qTox  before_after:week_index  0.385689   0.048433   \n",
       "2      gohugoio_hugo.git  before_after:week_index  0.334518   0.053172   \n",
       "3          ycm-core_ycmd  before_after:week_index -0.000605   0.094662   \n",
       "4   aio-libs_aiohttp.git  before_after:week_index  0.348421   0.051826   \n",
       "\n",
       "   conf.low  conf.high  ranef_grouping   rank  \n",
       "0 -0.370789   0.085838               1  222.0  \n",
       "1  0.290763   0.480615               2  684.0  \n",
       "2  0.230302   0.438733               2  647.0  \n",
       "3 -0.186139   0.184928               1  328.0  \n",
       "4  0.246845   0.449997               2  658.0  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "contributing_merged = pd.merge(contributing_manifest, contributing_ranef, on=['repo_id'], how='inner')\n",
    "contributing_merged = contributing_merged.drop(columns='Unnamed: 0')\n",
    "contributing_merged.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "contributing_merged.to_csv('0203_contributing_merged_manifest.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Empty DataFrame\n",
      "Columns: [repo_id, commits_filepath, fvf_filepath]\n",
      "Index: []\n"
     ]
    }
   ],
   "source": [
    "#duplicates = readme_manifest[readme_manifest.duplicated(subset=['repo_id'], keep=False)]\n",
    "#len(duplicates)\n",
    "#print(duplicates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4247"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "readme_merged = pd.merge(readme_manifest, readme_ranef, on=['repo_id'], how='inner')\n",
    "readme_merged = readme_merged.drop(columns='Unnamed: 0')\n",
    "len(readme_merged)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "readme_merged.to_csv('0203_readme_merged_manifest.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}