1
0

updated work for some m2 writing tomorrow

This commit is contained in:
Matthew Gaughan 2025-05-24 16:59:03 -07:00
parent 3573afbc1a
commit fd1479775d
14 changed files with 1309 additions and 944 deletions

View File

@ -173,3 +173,6 @@ ls ../case3
cd .. cd ..
ls ls
ls case1 ls case1
ls
cd case2
ls

View File

@ -46,35 +46,35 @@ new_unaff_authors <- new_authors_long_df |>
fill=commit_seniority)) + fill=commit_seniority)) +
geom_col(position='dodge') + geom_col(position='dodge') +
labs(x = "Relative Week", y = "Commits", fill="Contributor Tenure (New contributors <= 5 commits before deployment announcement)") + labs(x = "Relative Week", y = "Commits", fill="Contributor Tenure (New contributors <= 5 commits before deployment announcement)") +
geom_vline(data = long_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -29), aes(xintercept = -29),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = long_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -9), aes(xintercept = -9),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -4), aes(xintercept = -4),
linetype = "3313", color = "black", linewidth = 1) + linetype = "3313", color = "black", linewidth = 0.5) +
geom_vline(data = long_df |> filter(source == "c2"),
aes(xintercept = -99),
linetype = "dotted", color = "black", linewidth = 1) +
geom_vline(data = combined_df |> filter(source == "c2"), geom_vline(data = combined_df |> filter(source == "c2"),
aes(xintercept = -4), aes(xintercept = -99),
linetype = "3313", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = long_df |> filter(source == "c3"), geom_vline(data = combined_df |> filter(source == "c2"),
aes(xintercept = -4),
linetype = "3313", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c3"),
aes(xintercept = -97), aes(xintercept = -97),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c3"), geom_vline(data = combined_df |> filter(source == "c3"),
aes(xintercept = -3), aes(xintercept = -3),
linetype = "3313", color = "black", linewidth = 1) + linetype = "3313", color = "black", linewidth = 0.5) +
geom_text(data = data.frame(source = "c1", relative_week = -40, lengthened_commit_count = 90), geom_text(data = data.frame(source = "c1", relative_week = -39, lengthened_commit_count = 80),
aes(x = relative_week, y = lengthened_commit_count, label = "Opt-In Testing Deployment"), aes(x = relative_week, y = lengthened_commit_count, label = "Opt-In Testing Deployment"),
inherit.aes = FALSE, color = "black", size = 4) + inherit.aes = FALSE, color = "black", size = 4) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 1) + # Add vertical line at week 0 geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) + # Add vertical line at week 0
geom_text(data = data.frame(source = "c1", relative_week = 7, lengthened_commit_count = 90), geom_text(data = data.frame(source = "c2", relative_week = 7, lengthened_commit_count = 80),
aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment"), aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment"),
inherit.aes = FALSE, color = "black", size = 4) + inherit.aes = FALSE, color = "black", size = 4) +
geom_text(data = data.frame(source = "c3", relative_week = -15, lengthened_commit_count = 100), geom_text(data = data.frame(source = "c3", relative_week = -15, lengthened_commit_count = 80),
aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment Announcement"), aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment Announcement"),
inherit.aes = FALSE, color = "black", size = 4) + inherit.aes = FALSE, color = "black", size = 4) +
scale_fill_manual(values = c("returning_unaff_commit_count" = "#FFC107", # Color for "Returning Contributors" scale_fill_manual(values = c("returning_unaff_commit_count" = "#FFC107", # Color for "Returning Contributors"
@ -96,14 +96,14 @@ new_unaff_authors <- new_authors_long_df |>
strip.text = element_text(size = 14)# Increase legend title font size strip.text = element_text(size = 14)# Increase legend title font size
) + ) +
facet_wrap(~source, nrow = 3, labeller = labeller(source = c( facet_wrap(~source, nrow = 3, labeller = labeller(source = c(
"c1" = "VisualEditor", "c1" = "VisualEditor (2013)",
"c2" = "HTTPS-as-default", "c2" = "HTTPS-as-default (2013)",
"c3" = "HTTP-deprecation" "c3" = "HTTP-deprecation (2015)"
))) )))
new_unaff_authors new_unaff_authors
ggsave(filename = "ww-0501-bot-commits-faceted.png", plot = new_unaff_authors, width = 15, height = 9, dpi = 800) ggsave(filename = "d1-m2-bot-commits-faceted.png", plot = new_unaff_authors, width = 15, height = 9, dpi = 800)
unaff_authors <- new_authors_long_df |> unaff_authors <- new_authors_long_df |>
ggplot(aes(x=relative_week, ggplot(aes(x=relative_week,

View File

@ -38,25 +38,37 @@ commit_authors <- long_df |>
fill=factor(commit_type))) + fill=factor(commit_type))) +
geom_col(position='dodge') + geom_col(position='dodge') +
labs(x = "Relative Week", y = "Commits", fill="Commit Author") + labs(x = "Relative Week", y = "Commits", fill="Commit Author") +
geom_vline(data = long_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -29), aes(xintercept = -29),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = long_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -9), aes(xintercept = -9),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = long_df |> filter(source == "c2"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -4),
linetype = "3313", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c2"),
aes(xintercept = -99), aes(xintercept = -99),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = long_df |> filter(source == "c3"), geom_vline(data = combined_df |> filter(source == "c2"),
aes(xintercept = -4),
linetype = "3313", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c3"),
aes(xintercept = -97), aes(xintercept = -97),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_text(data = data.frame(source = "c1", relative_week = -40, lengthened_commit_count = 50), geom_vline(data = combined_df |> filter(source == "c3"),
aes(xintercept = -3),
linetype = "3313", color = "black", linewidth = 0.5) +
geom_text(data = data.frame(source = "c1", relative_week = -39, lengthened_commit_count = 50),
aes(x = relative_week, y = lengthened_commit_count, label = "Opt-In Testing Deployment"), aes(x = relative_week, y = lengthened_commit_count, label = "Opt-In Testing Deployment"),
inherit.aes = FALSE, color = "black", size = 4) + inherit.aes = FALSE, color = "black", size = 4) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 1) + # Add vertical line at week 0 geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) + # Add vertical line at week 0
geom_text(data = data.frame(source = "c1", relative_week = 7, lengthened_commit_count = 50), geom_text(data = data.frame(source = "c2", relative_week = 7, lengthened_commit_count = 50),
aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment"), aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment"),
inherit.aes = FALSE, color = "black", size = 4) + inherit.aes = FALSE, color = "black", size = 4) +
geom_text(data = data.frame(source = "c3", relative_week = -15, lengthened_commit_count = 50),
aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment Announcement"),
inherit.aes = FALSE, color = "black", size = 4) +
scale_fill_manual(values = affiliationColors, scale_fill_manual(values = affiliationColors,
labels = c("unaff_commit_count" = "Unaffiliated", labels = c("unaff_commit_count" = "Unaffiliated",
"wikimedia_commit_count" = "WMF-affiliated")) + "wikimedia_commit_count" = "WMF-affiliated")) +
@ -74,10 +86,10 @@ commit_authors <- long_df |>
strip.text = element_text(size = 14)# Increase legend title font size strip.text = element_text(size = 14)# Increase legend title font size
) + ) +
facet_wrap(~source, nrow = 3, labeller = labeller(source = c( facet_wrap(~source, nrow = 3, labeller = labeller(source = c(
"c1" = "VisualEditor (commits to extensions/visualeditor)", "c1" = "VisualEditor (2013) [commits to extensions/visualeditor]",
"c2" = "HTTPS-as-default (relevant commits to mediawiki/core)", "c2" = "HTTPS-as-default (2013) [relevant commits to mediawiki/core]",
"c3" = "HTTP-deprecation (relevant commits to mediawiki/core)" "c3" = "HTTP-deprecation (2015) [relevant commits to mediawiki/core]"
))) )))
commit_authors commit_authors
ggsave(filename = "ww-0501-commits-faceted.png", plot = commit_authors, width = 15, height = 9, dpi = 800) ggsave(filename = "d1-m2-commits-faceted.png", plot = commit_authors, width = 15, height = 9, dpi = 800)

Binary file not shown.

After

Width:  |  Height:  |  Size: 781 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 774 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 743 KiB

View File

@ -1,18 +1,17 @@
1. SSH tunnel from your workstation using the following command: 1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:39175 mjilg@klone.hyak.uw.edu ssh -N -L 8787:n3439:38329 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787 and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials: 2. log in to RStudio Server using the following credentials:
user: mjilg user: mjilg
password: twImEJor5ex498HTzJjx password: YXXLCjS/064zAiagiRdx
When done using RStudio Server, terminate the job by: When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node: 2. Issue the following command on the login node:
scancel -f 25681892 scancel -f 26402644
slurmstepd: error: *** JOB 25681892 ON n3439 CANCELLED AT 2025-05-01T23:08:23 DUE TO TIME LIMIT ***

View File

@ -80,13 +80,13 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"/tmp/ipykernel_55861/3758790231.py:41: SettingWithCopyWarning: \n", "/tmp/ipykernel_76053/3758790231.py:41: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n", "A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n", "Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n", "\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n", " mid_comment_phab_df['is_relevant'] = mid_comment_phab_df['conversation_id'].isin(relevant_conversation_ids)\n",
"/tmp/ipykernel_55861/3758790231.py:44: SettingWithCopyWarning: \n", "/tmp/ipykernel_76053/3758790231.py:44: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n", "A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n", "Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n", "\n",
@ -148,7 +148,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 8,
"id": "942344db-c8f5-4ed6-a757-c97f8454f18b", "id": "942344db-c8f5-4ed6-a757-c97f8454f18b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -172,6 +172,29 @@
"print(f\"Unique speakers: {unique_speakers}\")" "print(f\"Unique speakers: {unique_speakers}\")"
] ]
}, },
{
"cell_type": "code",
"execution_count": 9,
"id": "b9229ca3-afb9-4eec-a173-f30be8c4729b",
"metadata": {},
"outputs": [],
"source": [
"given_date = pd.Timestamp(\"2013-08-28\").tz_localize(None)\n",
"task_phab_df['timestamp'] = pd.to_datetime(task_phab_df['timestamp'], unit='s').dt.tz_localize(None)\n",
"task_phab_df['week_bin'] = ((task_phab_df['timestamp'] - given_date).dt.days // 7)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "24205386-d18f-4fb7-b37d-e81c0a5ba532",
"metadata": {},
"outputs": [],
"source": [
"task_phab_df\n",
"task_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/phab_tasks.csv\", index=False)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 7,
@ -1024,7 +1047,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.12" "version": "3.11.11"
} }
}, },
"nbformat": 4, "nbformat": 4,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -24,7 +24,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 3,
"id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85", "id": "f6448c6f-2b5d-45f5-a32e-b3b47c16ef85",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -35,7 +35,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 4,
"id": "e30e81ad", "id": "e30e81ad",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -75,7 +75,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 5,
"id": "f359805f", "id": "f359805f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -149,7 +149,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 6,
"id": "ffd0b263", "id": "ffd0b263",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -175,17 +175,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 9,
"id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5", "id": "f32f6eed-3aeb-4b05-8d40-7ed85e7235c5",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x1495ecba4bb0>" "<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x14cab225fd00>"
] ]
}, },
"execution_count": 6, "execution_count": 9,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -206,7 +206,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 10,
"id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436", "id": "a5b062d8-2d26-4a3e-a84c-ba0eaf6eb436",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -220,26 +220,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 11,
"id": "424d35e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"John is frustrated with the VisualEditor project, he thinks it doesn't work."
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"id": "999e1656-0036-4ba2-bedf-f54493f67790", "id": "999e1656-0036-4ba2-bedf-f54493f67790",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -285,7 +266,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 12,
"id": "be476647-624b-4e95-ab62-9c6b08f85368", "id": "be476647-624b-4e95-ab62-9c6b08f85368",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -298,7 +279,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 13,
"id": "a9628b54-a1df-49cd-a365-9cba59de3421", "id": "a9628b54-a1df-49cd-a365-9cba59de3421",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -308,7 +289,7 @@
"'i hate ve.interface, ve.interface always messes up i browser'" "'i hate ve.interface, ve.interface always messes up i browser'"
] ]
}, },
"execution_count": 10, "execution_count": 13,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -334,54 +315,13 @@
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" \"\"\"Entry point for launching an IPython kernel.\n", " \"\"\"Entry point for launching an IPython kernel.\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n", "Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n",
"Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n", "Token indices sequence length is longer than the specified maximum sequence length for this model (911 > 512). Running this sequence through the model will result in indexing errors\n"
"Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors\n"
] ]
} }
], ],
"source": [ "source": [
"comment_phab_df['text'] = comment_phab_df['comment_text'].apply(str)\n", "comment_phab_df['text'] = comment_phab_df['comment_text'].apply(str)\n",
"comment_phab_df['resolved_text'] = comment_phab_df['text'].apply(resolving_comment)" "comment_phab_df['resolved_text'] = comment_phab_df['text'].apply(resolving_comment)\n",
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "2b583feb-1c62-4c96-9ba0-2996d72e70d3",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "46088",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3360\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3361\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.Int64HashTable.get_item\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_61233/1116300830.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcomment_phab_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'resolved_text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m46088\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 941\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mkey_is_scalar\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 942\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 943\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 944\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_hashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m_get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1049\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1050\u001b[0m \u001b[0;31m# Similar to Index.get_value, but we do not fall back to positional\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1051\u001b[0;31m \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1052\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_values_for_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1053\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/gscratch/scrubbed/mjilg/envs/coref2-notebook/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 3361\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3362\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3363\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3364\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3365\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhasnans\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyError\u001b[0m: 46088"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "92bf47ae",
"metadata": {},
"outputs": [],
"source": [
"comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/051825_coref_rel_phab_comments.csv\", index=False)" "comment_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/051825_coref_rel_phab_comments.csv\", index=False)"
] ]
} }
@ -402,7 +342,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.12" "version": "3.11.11"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -148,7 +148,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 8,
"id": "942344db-c8f5-4ed6-a757-c97f8454f18b", "id": "942344db-c8f5-4ed6-a757-c97f8454f18b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -172,6 +172,29 @@
"print(f\"Unique speakers: {unique_speakers}\")" "print(f\"Unique speakers: {unique_speakers}\")"
] ]
}, },
{
"cell_type": "code",
"execution_count": 9,
"id": "0ef35632-ed07-478e-94ab-525169b82783",
"metadata": {},
"outputs": [],
"source": [
"given_date = pd.Timestamp(\"2015-07-02\").tz_localize(None)\n",
"task_phab_df['timestamp'] = pd.to_datetime(task_phab_df['timestamp'], unit='s').dt.tz_localize(None)\n",
"task_phab_df['week_bin'] = ((task_phab_df['timestamp'] - given_date).dt.days // 7)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "1e7bda13-4c2d-413e-b3c6-9c4b38e6cb07",
"metadata": {},
"outputs": [],
"source": [
"task_phab_df\n",
"task_phab_df.to_csv(\"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/phab_tasks.csv\", index=False)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 7,

View File

@ -1168,7 +1168,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.11" "version": "3.7.12"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -3,6 +3,9 @@ library(tidyverse)
c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/phab_tasks.csv" c1_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/phab_tasks.csv"
c1_input_df <- read.csv(c1_count , header = TRUE) c1_input_df <- read.csv(c1_count , header = TRUE)
c2_count <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case2/phab_tasks.csv"
c2_input_df <- read.csv(c2_count , header = TRUE)
c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/phab_tasks.csv" c3_count <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case3/phab_tasks.csv"
c3_input_df <- read.csv(c3_count , header = TRUE) c3_input_df <- read.csv(c3_count , header = TRUE)
@ -10,17 +13,16 @@ c1_unique_counts <- c1_input_df %>%
group_by(meta.affil, week_bin) %>% group_by(meta.affil, week_bin) %>%
summarise(unique_count = n_distinct(conversation_id), .groups = "drop") summarise(unique_count = n_distinct(conversation_id), .groups = "drop")
c2_unique_counts <- c2_input_df %>%
group_by(meta.affil, week_bin) %>%
summarise(unique_count = n_distinct(conversation_id), .groups = "drop")
c3_unique_counts <- c3_input_df %>% c3_unique_counts <- c3_input_df %>%
group_by(meta.affil, week_bin) %>% group_by(meta.affil, week_bin) %>%
summarise(unique_count = n_distinct(conversation_id), .groups = "drop") summarise(unique_count = n_distinct(conversation_id), .groups = "drop")
c1_unique_counts <- c1_unique_counts%>% mutate(source = "c1") c1_unique_counts <- c1_unique_counts%>% mutate(source = "c1")
c2_unique_counts <- data.frame( c2_unique_counts <- c2_unique_counts %>% mutate(source = "c2")
meta.affil = rep("False", 117),
week_bin = -103:13,
unique_count = rep(0, 117),
source = rep("c2", 117)
)
c3_unique_counts <- c3_unique_counts %>% mutate(source = "c3") c3_unique_counts <- c3_unique_counts %>% mutate(source = "c3")
combined_df <- bind_rows(c1_unique_counts, c2_unique_counts, c3_unique_counts) combined_df <- bind_rows(c1_unique_counts, c2_unique_counts, c3_unique_counts)
@ -37,28 +39,34 @@ commit_authors <- combined_df |>
ggplot(aes(x=week_bin, ggplot(aes(x=week_bin,
y=unique_count, y=unique_count,
fill=factor(meta.affil))) + fill=factor(meta.affil))) +
geom_col(position='dodge') + geom_col(position='dodge2') +
labs(x = "Relative Week", y = "Tasks", fill="Task Author") + labs(x = "Relative Week", y = "New Tasks Created", fill="Task Author") +
geom_vline(data = combined_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -29), aes(xintercept = -29),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -9), aes(xintercept = -9),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c1"), geom_vline(data = combined_df |> filter(source == "c1"),
aes(xintercept = -4), aes(xintercept = -4),
linetype = "3313", color = "black", linewidth = 1) + linetype = "3313", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c2"),
aes(xintercept = -99),
linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c2"),
aes(xintercept = -4),
linetype = "3313", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c3"), geom_vline(data = combined_df |> filter(source == "c3"),
aes(xintercept = -97), aes(xintercept = -97),
linetype = "dotted", color = "black", linewidth = 1) + linetype = "dotted", color = "black", linewidth = 0.5) +
geom_vline(data = combined_df |> filter(source == "c3"), geom_vline(data = combined_df |> filter(source == "c3"),
aes(xintercept = -3), aes(xintercept = -3),
linetype = "3313", color = "black", linewidth = 1) + linetype = "3313", color = "black", linewidth = 0.5) +
geom_text(data = data.frame(source = "c1", relative_week = -40, lengthened_commit_count = 130), geom_text(data = data.frame(source = "c1", relative_week = -39, lengthened_commit_count = 130),
aes(x = relative_week, y = lengthened_commit_count, label = "Opt-In Testing Deployment"), aes(x = relative_week, y = lengthened_commit_count, label = "Opt-In Testing Deployment"),
inherit.aes = FALSE, color = "black", size = 4) + inherit.aes = FALSE, color = "black", size = 4) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 1) + # Add vertical line at week 0 geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) + # Add vertical line at week 0
geom_text(data = data.frame(source = "c1", relative_week = 7, lengthened_commit_count = 130), geom_text(data = data.frame(source = "c2", relative_week = 7, lengthened_commit_count = 130),
aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment"), aes(x = relative_week, y = lengthened_commit_count, label = "Wide Deployment"),
inherit.aes = FALSE, color = "black", size = 4) + inherit.aes = FALSE, color = "black", size = 4) +
geom_text(data = data.frame(source = "c3", relative_week = -15, lengthened_commit_count = 130), geom_text(data = data.frame(source = "c3", relative_week = -15, lengthened_commit_count = 130),
@ -81,10 +89,10 @@ commit_authors <- combined_df |>
strip.text = element_text(size = 14)# Increase legend title font size strip.text = element_text(size = 14)# Increase legend title font size
) + ) +
facet_wrap(~source, nrow = 3, labeller = labeller(source = c( facet_wrap(~source, nrow = 3, labeller = labeller(source = c(
"c1" = "VisualEditor", "c1" = "VisualEditor (2013)",
"c2" = "HTTPS-as-default", "c2" = "HTTPS-as-default (2013)",
"c3" = "HTTP-deprecation" "c3" = "HTTP-deprecation (2015)"
))) )))
commit_authors commit_authors
ggsave(filename = "ww-0501-tasks-faceted.png", plot = commit_authors, width = 15, height = 9, dpi = 800) ggsave(filename = "d1-m2-tasks-faceted.png", plot = commit_authors, width = 15, height = 9, dpi = 800)