isolated bot activity
This commit is contained in:
parent
64ffe9f80d
commit
e0874fe5c8
@ -361,7 +361,9 @@
|
|||||||
"25/01/16 12:21:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
"25/01/16 12:21:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
||||||
"25/01/16 12:21:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
"25/01/16 12:21:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
||||||
"25/01/16 12:21:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
"25/01/16 12:21:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
||||||
"[Stage 0:==> (397 + 56) / 8283]\r"
|
"25/01/16 12:29:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
||||||
|
"25/01/16 12:29:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.\n",
|
||||||
|
"[Stage 0:==> (447 + 56) / 8283]\r"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -23,7 +23,7 @@ def toArray(str):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/yearly_activity_files/"
|
mediawiki_history_path = "/data_ext/users/nws8519/mw-repo-lifecycles/wiki_activity_data/single_activity_files/"
|
||||||
|
|
||||||
# Note: string unescaping and array conversion is done later
|
# Note: string unescaping and array conversion is done later
|
||||||
mediawiki_history_schema = StructType([
|
mediawiki_history_schema = StructType([
|
||||||
@ -190,6 +190,7 @@ if __name__ == "__main__":
|
|||||||
"to_array(revision_tags_string) AS revision_tags"
|
"to_array(revision_tags_string) AS revision_tags"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
mediawiki_history = mediawiki_history.repartitionByRange(300, "wiki_db")
|
||||||
activity_count_df = mediawiki_history.where("event_user_is_bot_by_historical is not null and event_user_is_bot_by is not null")
|
activity_count_df = mediawiki_history.where("event_user_is_bot_by_historical is not null and event_user_is_bot_by is not null")
|
||||||
activity_count_df = activity_count_df.selectExpr("wiki_db", "SUBSTR(event_timestamp, 0, 10) as day", "event_entity", "event_type")
|
activity_count_df = activity_count_df.selectExpr("wiki_db", "SUBSTR(event_timestamp, 0, 10) as day", "event_entity", "event_type")
|
||||||
activity_count_df = activity_count_df.groupBy("wiki_db", "day", "event_entity", "event_type").agg(count(lit(1)).alias("activity_count"))
|
activity_count_df = activity_count_df.groupBy("wiki_db", "day", "event_entity", "event_type").agg(count(lit(1)).alias("activity_count"))
|
||||||
@ -197,4 +198,4 @@ if __name__ == "__main__":
|
|||||||
sort(desc("day")). \
|
sort(desc("day")). \
|
||||||
show(10, False)
|
show(10, False)
|
||||||
|
|
||||||
activity_count_df.write.format("csv").save("011625_dab_yearly.csv")
|
activity_count_df.write.format("csv").save("011625_dab_single.csv")
|
Loading…
Reference in New Issue
Block a user