diff --git a/src/helper_scripts/decompression_script.py b/src/helper_scripts/decompression_script.py index d83c2ef..5c578b9 100644 --- a/src/helper_scripts/decompression_script.py +++ b/src/helper_scripts/decompression_script.py @@ -15,6 +15,7 @@ def decompress(filepath): def decompress_directory(directory_name): # Traverse the directory + OSErrors = 0 for root, dirs, files in os.walk(directory_name): for file in files: if file.endswith('.bz2'): @@ -22,7 +23,13 @@ def decompress_directory(directory_name): filepath = os.path.join(root, file) print(filepath) # Apply the decompress function + #try: decompress(filepath) + #except OSError: + #OSErrors += 1 + #print(f"OSError @ {filepath}") + return OSErrors + def cleanup(directory_name): for root, dirs, files in os.walk(directory_name): @@ -35,5 +42,6 @@ def cleanup(directory_name): if __name__ == "__main__": #batch_parallel_for_single() - decompress_directory(FILE_LOC_PREFIX) + decompression_errors = decompress_directory(FILE_LOC_PREFIX) + print(f"We had {decompression_errors} OSErrors during decompression.") #cleanup(FILE_LOC_PREFIX) \ No newline at end of file diff --git a/src/lib/spark-warehouse/bot_isolation.ipynb b/src/lib/spark-warehouse/bot_isolation.ipynb index 40ac30a..e51f59c 100644 --- a/src/lib/spark-warehouse/bot_isolation.ipynb +++ b/src/lib/spark-warehouse/bot_isolation.ipynb @@ -29,7 +29,7 @@ "metadata": {}, "outputs": [], "source": [ - "mediawiki_history_path = \"/data/users/mgaughan/mw-repo-lifecycles/wiki_activity_data/single_activity_files\"" + "mediawiki_history_path = \"/data/users/mgaughan/mw-repo-lifecycles/wiki_activity_data/test\"" ] }, { @@ -149,7 +149,7 @@ "text": [ "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", - "25/01/08 11:39:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + "25/01/09 14:31:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" ] } ], @@ -303,21 +303,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "+--------------+-------+--------------+\n", - "|wiki_db |month |revision_count|\n", - "+--------------+-------+--------------+\n", - "|euwiki |2019-12|356355 |\n", - "|cewiki |2019-12|229351 |\n", - "|elwiktionary |2019-12|227666 |\n", - "|cywiki |2019-12|139174 |\n", - "|tgwiki |2019-12|65694 |\n", - "|zh_min_nanwiki|2019-12|59755 |\n", - "|bnwiki |2019-12|55698 |\n", - "|elwiki |2019-12|49604 |\n", - "|dewiktionary |2019-12|47897 |\n", - "|urwiki |2019-12|45793 |\n", - "+--------------+-------+--------------+\n", - "only showing top 10 rows\n", + "+------------+-------+--------------+\n", + "|wiki_db |month |revision_count|\n", + "+------------+-------+--------------+\n", + "|kwwiki |2019-12|1079 |\n", + "|kowikiquote |2019-12|146 |\n", + "|zuwiktionary|2019-12|135 |\n", + "+------------+-------+--------------+\n", "\n" ] } @@ -335,15 +327,93 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+-----------+--------------+\n", + "|event_entity|event_type |revision_count|\n", + "+------------+-----------+--------------+\n", + "|revision |create |267553 |\n", + "|user |create |26999 |\n", + "|page |create |25214 |\n", + "|page |delete |9842 |\n", + "|page |create-page|9365 |\n", + "|user |rename |1738 |\n", + "|page |move |1657 |\n", + "|user |alterblocks|87 |\n", + "|page |restore |20 |\n", + "|user |altergroups|2 |\n", + "+------------+-----------+--------------+\n", + "\n" + ] + } + ], + "source": [ + "mediawiki_history. \\\n", + " where(\"event_user_is_bot_by_historical is not null and event_user_is_bot_by is not null\"). \\\n", + " groupBy(\"wikidb\", \"event_entity\", \"event_type\"). \\\n", + " agg(count(lit(1)).alias(\"revision_count\")). \\\n", + " sort(desc(\"revision_count\")). \\\n", + " show(20, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+--------------+\n", + "|wiki_db |revision_count|\n", + "+------------+--------------+\n", + "|kwwiki |16625 |\n", + "|kowikiquote |6779 |\n", + "|zuwiktionary|3595 |\n", + "+------------+--------------+\n", + "\n" + ] + } + ], + "source": [ + "mediawiki_history. \\\n", + " where(\"event_entity = 'user' and event_type='create'\"). \\\n", + " selectExpr(\"wiki_db\", \"SUBSTR(event_timestamp, 0, 7) as month\"). \\\n", + " where(\"event_user_is_bot_by_historical is not null and event_user_is_bot_by is not null\"). \\\n", + " groupBy(\"wiki_db\"). \\\n", + " agg(count(lit(1)).alias(\"revision_count\")). \\\n", + " sort(desc(\"revision_count\")). \\\n", + " show(10, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "#df.write.format(\"csv\").save(filepath)\n", + "print(bot_user_creation)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" },