commit fffb738a4eb124ce46f0c5b66d8368730a21816e
Author: Matthew Gaughan
Date:   Wed Feb 5 20:39:48 2025 -0800

    hw1

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..198b817
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
# ignore the RStudio container image needed by hyak
rstudio_latest.sif


# do not need to include any R items
.Rhistory
.cache/
.config/
.local/
diff --git a/hw1/hw1.Rmd b/hw1/hw1.Rmd
new file mode 100644
index 0000000..7fa81db
--- /dev/null
+++ b/hw1/hw1.Rmd
@@ -0,0 +1,213 @@
---
title: "Homework 1"
output: html_document
---
Matthew Gaughan, Assignment 1 for Sociology 476: Computational Content Analysis.


## Task 0

```{r}
library(stringr)
library(dplyr)
string_0 <- "Before you get started, I have to ask you for a quick favor. I lost my
key. Could you please search for it and grab it from this string here
using regex?"

string_1 <- "Ah, thanks! But, actually, that was the wrong key. Actually, it’s this
key that I need. I don’t need the first key. Can you get this one using
a single regex?"

string_2 <- "Given the confusion we’ve had with keys lately, I was wondering whether
it would make sense to use colored keys. For instance, we’d have a yellow
key, a purple key, and a red key, etc. And then you could just grab all
those at once with a single regex, but you wouldn’t accidentally get
some other key. Couldn’t you? By the way, I found my NU-ID - this purple
keycard I had been searching for so long."

# string 0: any occurrence of "key"
str_extract_all(string_0, regex("key"))
# string 1: standalone "key" not followed by a period, which drops the wrong and first keys
str_extract_all(string_1, regex("\\bkey\\b(?!\\.)"))
# string 2: the same pattern; the word boundaries also exclude "keys" and "keycard"
str_extract_all(string_2, regex("\\bkey\\b(?!\\.)"))

```

## Task 1

The corpus consists of comment data from WikiMedia Foundation platforms in and around software feature deployments that likely impacted the operation of user bots and scripts. The user bots and scripts community is an important, peripheral software development community for the WikiMedia Foundation platforms. There are two primary formats for the data: parsed WikiTalk pages (general forum areas in various parts of the platform) and parsed Phabricator discussions (the platform's software engineering work-management board). For all data, we have access to each comment's text, original author, and time of publication. We sometimes also have information about the task or subject that the comment was published in reply to. This homework assignment focuses on the data that we have from three years (2012-01-01 to 2014-12-31) of Phabricator discussions around the VisualEditor project.

```{r}
# getting the filepath for the comment data
ve_phab <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/0205_convo_data/phab_data/visualeditor/0205_ve_phab_comments.csv"
ve_phab_df = read.csv(ve_phab, header = TRUE)
comment_level_phab_df = ve_phab_df
head(comment_level_phab_df)
```
Given that these are comments on a work board, the simplest unit of analysis is the individual comment. I have loaded the corpus as individual comments (n=32506) above. However, because Phabricator is a work-management system, these comments all discuss specific items of software engineering work. Below, I have re-shaped the data to be organized by parent task (n=4583).
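A quick count (a minimal sketch that assumes the `comment_level_phab_df` loaded above and its `TaskPHID` column) confirms both candidate units of analysis before the reshape:
```{r}
# total number of individual comments, and the number of distinct parent tasks
# that those comments attach to
nrow(comment_level_phab_df)
length(unique(comment_level_phab_df$TaskPHID))
```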
```{r}
# use R pipe operations to group by the unique task PHIDs and then collate the comments,
# separating the individual comments with a punctuation marker that will be removed at cleaning
task_level_phab_df <- ve_phab_df |>
  group_by(TaskPHID) |>
  summarise(comment_text = paste(comment_text, collapse = ":-;-:"))
head(task_level_phab_df)
```
## Task 2
```{r}
library(quanteda)
library(quanteda.textstats)

# we need unique comment ids for quanteda
comment_level_phab_df <- comment_level_phab_df |>
  mutate(comment_index = row_number())
# loading into quanteda
corpus <- quanteda::corpus(comment_level_phab_df, text_field = "comment_text", docid_field = "comment_index")
```
Given the messy nature of these discussions (significant jargon and a high count of URLs), more heavy-handed pre-processing is necessary to turn this into text that we can analyze.
```{r}
# there are a large number of URLs, so removing those will hopefully be useful;
# there is also a lot of discussion referring to different tasks by their IDs, and
# I am skeptical that I can currently make meaning of this, so I am removing numbers
tokens <- tokens(corpus, what = "word",
                 remove_punct = TRUE,
                 remove_symbols = TRUE,
                 remove_numbers = TRUE,
                 remove_url = TRUE,
                 remove_separators = TRUE,
                 split_hyphens = FALSE)
# standardize the data across sentences
tokens = tokens_tolower(tokens, keep_acronyms = TRUE)
# the "raw" dfm
comment_dfm_raw = dfm(tokens)
# we then have to remove stop words
tokens_nostop <- tokens_remove(tokens, pattern = stopwords("en"))
```
Now I will turn the mostly cleaned tokens into a DFM.
```{r}
library(lexicon)
comment_level_dfm <- dfm(tokens_nostop)
# lemmatize the words
comment_dfm_lemmatized = dfm_replace(comment_level_dfm,
                                     pattern = lexicon::hash_lemmas$token,
                                     replacement = lexicon::hash_lemmas$lemma)
```
Alternatively, we can just take the 300 most common features.
```{r}
feature_counts = colSums(comment_level_dfm)
comment_300_feat = sort(feature_counts, decreasing = T)[1:300]
```

## Task 3
To work with a simple dictionary for this homework, I must first provide context on this event. VisualEditor was a Wikipedia feature deployment in July 2013 that was met with immediate community backlash. The failed deployment reshaped the relationship between the WikiMedia Foundation and its community members for years to come. I am interested in how the subcommunity of user bot and script operators was discussed in this setting; a cursory read-over showed other community members invoking VisualEditor's impact on user bots and scripts as their justification for opposing the feature roll-out. I will populate a dictionary with terms that I associate with discussion of user bots and scripts in this context, to see how many documents contain these terms.
```{r}
breaking_bots_dict = dictionary(list(breaking_bots = c("bots", "scripts", "gadgets", "community", "foundation", "staff", "disregard", "impact",
                                                       "break", "operation", "failing", "scrap*", "parse*", "robots", "tools")),
                                file = NULL)
fast_bot_dict_counts <- dfm_lookup(comment_dfm_lemmatized, dictionary = breaking_bots_dict, levels = 1)
print(fast_bot_dict_counts)
```
This quick and dirty dictionary is not a very good measure, as shown by the 92.8% sparsity in the data set; the words that I personally associate with discussions about VisualEditor's impact on user bots and scripts are hardly present in our data.
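As a quick check at the document level (a minimal sketch, assuming the `fast_bot_dict_counts` object from the chunk above), we can compute the share of comments that contain at least one dictionary term:
```{r}
# convert the one-key lookup dfm to a data frame of per-document counts
dict_hits <- quanteda::convert(fast_bot_dict_counts, to = "data.frame")
# proportion of comments with at least one hit on the breaking_bots dictionary
mean(dict_hits$breaking_bots > 0)
```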
This deductive approach is a limitation of the dictionary method as a tool. While dictionary methods can be useful for initial data exploration, their utility falters as the bounds of the data or the aims of the researcher broaden.

## Task 4
Given the computational burden of a corpus this large, I have to subset my data features before creating the co-occurrence matrix. Thus I will study a term I know to be popular: VisualEditor. I will get co-occurrence values for when people refer to the project that their work orbits around.
```{r}
# because we are supposed to use the raw dfm, we use the co-occurrence function from lab
make_term_coocur_matrix <- function(dtm, verbose = FALSE) {
  tdm <- t(dtm)
  # term-by-term co-occurrence counts across documents
  term_coocurrence_matrix <- tdm %*% dtm %>% as.matrix()
  return(term_coocurrence_matrix)
}
# saving some compute by keeping only the 200 most frequent features
rcdfm <- dfm_select(comment_dfm_raw, pattern = names(topfeatures(comment_dfm_raw, 200)))
# make the co-occurrence matrix for the raw dfm
comment_co_occur <- make_term_coocur_matrix(rcdfm)
sort(comment_co_occur[,"visualeditor"], decreasing = T)[1:9]
```
I am a bit underwhelmed by these results, though they are surely a function of using the raw dfm instead of the cleaned and lemmatized one.

## Task 5
We first have to load in the fastText embeddings and do some pre-processing to make our later operations more efficient.
```{r}
fasttext_embeddings = data.table::fread("/mmfs1/gscratch/comdata/users/mjilg/cc.en.300.vec", skip = 1, quote = "")
# transform the embeddings into a matrix so that we can do matrix algebra on it
fasttext_mat = as.matrix(fasttext_embeddings[, -1])
rownames(fasttext_mat) = fasttext_embeddings$V1
#vector_lengths = sqrt(rowSums(fasttext_mat^2))
head(fasttext_mat[,1:10])
```
Now we can define our class for further analysis. Given that we're looking at comments on a work-management platform for software engineering, I want to establish a class of common verbiage around the different work tasks and actions of software development.
```{r}
swe_terms = c("merge", "deploy", "permissions", "rights", "code",
              "engineer", "review", "churn", "conflict", "develop",
              "sprint", "retrospective", "planning", "user", "design")
swe_mat = fasttext_mat[swe_terms, ]
#head(swe_mat)
```
Now we can use principal component analysis to try to understand the variance between these terms.
```{r}
swe_pca_results = prcomp(swe_mat)
swe_pca_sum = summary(swe_pca_results)

variance_proportion = swe_pca_sum$sdev^2 / sum(swe_pca_sum$sdev^2)
variance_proportion[1:5]
```
It looks like our first five dimensions capture a substantial share of the variance. Let's plot the data.
```{r}
library(ggplot2)
ggplot(as.data.frame(swe_pca_results$x)) +
  geom_text(aes(PC4, PC5, label = swe_terms)) +
  ylab("") +
  theme_bw()
```
I have spent some time clicking through the different PC plots and trying to interpret what the "meanings" of each of them could be. The first dimension seems to cluster terms by activity; the more active terms (deploy, merge) are more alike. The second dimension follows a similar logic, and the third seems to cluster terms via a common understanding of "engineering work". The fourth dimension is the one that I am primarily interested in, as it seems to group terms that hint at the organizational concept of 'decision rights.' Who has the permissions to make decisions and effect organizational and technical change is a pre-eminent nexus for any organization.
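To put that reading on slightly firmer footing, a minimal sketch (assuming the `swe_pca_results` object from the chunk above) lists which terms sit at the two extremes of the fourth component:
```{r}
# terms ordered by their score on PC4; the extremes suggest what the dimension separates
pc4_scores <- sort(swe_pca_results$x[, 4])
head(pc4_scores, 5)  # one end of the dimension
tail(pc4_scores, 5)  # the other end
```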
It is funny, then, that my intentional inclusion of "rights" in the class to try to provoke this semantic cluster seems to have been rejected.

I find PCA very useful for exposing different kinds of semantic variance and provoking further research avenues. However, I am hesitant to assign too much value to its capacity for meaning-making. fastText's weights are trained on Common Crawl and Wikipedia. While these corpora are incredible resources for general text analysis, their utility for robust semantic meaning is limited by that very generality. I would be curious to know about resources for more localized, context-specific word embeddings; they seem like a useful tool for developing more grounded and specific explanations of semantic variance.

## Task 6
I am interested in how references to the user bots and scripts community change over time, especially over the time window of the VisualEditor deployment. The WikiMedia platform and the MediaWiki software that supports it are organized in a core-periphery development model, a common software engineering structure. It is often difficult to delineate developers as core or peripheral, but in this case it is relatively straightforward. We can use Carliss Baldwin's framework of 'mirroring' between a technical artifact and the organization that builds it to identify the different actors in this software development: the engineers who work on core libraries are core developers, and they are largely employed by the WikiMedia Foundation; the non-employee contributors who work on extension libraries and bots are peripheral developers. My hunch is that these categorizations --- which are technically enforced via write permissions on the constituent software libraries --- are invoked in the broader community's response to popular or unpopular feature deployments by the core organization. The last jump in that analysis is to identify the underlying commit changes during these periods of community response.

I am interested in the different ways people talk about contributor agency in this setting: who has it and who doesn't. I want to take a first step toward exploring that (an initial, descriptive analysis) through methods similar to Kozlowski's class and gender explorations. This pole exploration is a useful preliminary validation analysis for later studies of how people discuss certain communities' agency. First, we can define our poles in terms of the tokens closest to and furthest from the action of "merging," the critical decision about what code is included in and excluded from the project. Core developers often have merge permissions for a project; peripheral developers often do not. By studying the weighted relationships between how these engineers discuss power and how they discuss the action of merging code, we can gain a better understanding of Phabricator engineers' discussions of work. Given that this exploration combines two methods from class to develop a method that is not introduced in class, it is also a bit of a test of the method's validity.
```{r}
fcm_matrix = tokens %>%
  tokens_remove(stopwords("en"), padding = FALSE) %>%
  quanteda::fcm(context = "window", window = 3) %>% as.matrix()

# poles: the 20 terms that co-occur most and least with "merging"
like_merge_pole <- names(sort(fcm_matrix[,"merging"], decreasing = T)[1:20])
like_merge_label <- "Like Merge"
dislike_merge_pole <- names(sort(fcm_matrix[,"merging"], decreasing = F)[1:20])
dislike_merge_label <- "Dislike Merge"

# we define our target pool as the terms that co-occur most with "power",
# standing in for the different stakeholder groups of software projects
targets <- names(sort(fcm_matrix[,"power"], decreasing = T)[1:4])

library(lsa)
```
Then, we can use the Kozlowski-style projection function to find the difference between the two poles that we have constructed.
```{r}
project_term_on_dimension = function(pole_1, label_1, pole_2, label_2, term_to_project, embedding_matrix){
  pole_1_centroid = colMeans(embedding_matrix[pole_1, ])
  pole_2_centroid = colMeans(embedding_matrix[pole_2, ])

  axis = pole_2_centroid - pole_1_centroid

  if(length(term_to_project) > 1){
    cosine_w = lsa::cosine(axis, t(embedding_matrix[term_to_project, ]))
  } else {
    cosine_w = lsa::cosine(axis, embedding_matrix[term_to_project, ])
  }
  return(data.frame(from = label_1,
                    to = label_2,
                    term = term_to_project, projection = cosine_w))
}
```
We then pass the values to the term projection.
```{r}
merge_projection <- project_term_on_dimension(like_merge_pole, like_merge_label, dislike_merge_pole, dislike_merge_label, targets, fasttext_mat)
```
The projection function fails when trying to calculate the column means. The error is that certain terms in the pole vectors cannot be found in the embedding matrix --- the localized terminology "zend" and "parsoid", and likely the acronym "HHVM", are not present in the Common Crawl and Wikipedia training data. Some of this is certainly a function of testing the bounds of the method, but I also think that it speaks to the limitations of relying on pre-trained embeddings for the analysis of localized semantics.
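One way to guard against this, sketched below under the assumption that the pole and target objects above are still in memory, is to drop any term without a fastText vector and project the surviving targets one at a time (the single-term branch of the projection function only needs a vector-to-vector cosine):
```{r}
# keep only pole/target terms that actually have fastText vectors; terms without
# a vector (e.g., localized jargon) are dropped here instead of crashing colMeans()
vocab <- rownames(fasttext_mat)
like_merge_pole_known <- intersect(like_merge_pole, vocab)
dislike_merge_pole_known <- intersect(dislike_merge_pole, vocab)
targets_known <- intersect(targets, vocab)

# project each surviving target separately and bind the rows together
merge_projection <- do.call(rbind, lapply(targets_known, function(term) {
  project_term_on_dimension(like_merge_pole_known, like_merge_label,
                            dislike_merge_pole_known, dislike_merge_label,
                            term, fasttext_mat)
}))
merge_projection
```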
diff --git a/hw1/hw1.html b/hw1/hw1.html
new file mode 100644
index 0000000..f53e209
--- /dev/null
+++ b/hw1/hw1.html
@@ -0,0 +1,437 @@
[rendered HTML output of hw1.Rmd; the knitted page repeats the Task 0 code and results above, so its stripped markup is omitted here]
diff --git a/hw1/hw1.log b/hw1/hw1.log
new file mode 100644
index 0000000..b4fc24c
--- /dev/null
+++ b/hw1/hw1.log
@@ -0,0 +1,2 @@

! sh: 1: pdflatex: not found
diff --git a/hw1/hw1.tex b/hw1/hw1.tex
new file mode 100644
index 0000000..0950c5f
--- /dev/null
+++ b/hw1/hw1.tex
@@ -0,0 +1,138 @@
[pandoc-generated LaTeX source for hw1.Rmd; the preamble and highlighted Task 0 code duplicate the content above, so they are omitted here]
diff --git a/hw1/hw1_files/figure-html/unnamed-chunk-13-1.png b/hw1/hw1_files/figure-html/unnamed-chunk-13-1.png
new file mode 100644
index 0000000..f9d0987
Binary files /dev/null and b/hw1/hw1_files/figure-html/unnamed-chunk-13-1.png differ
diff --git a/rstudio-server.job b/rstudio-server.job
new file mode 100644
index 0000000..9a66c20
--- /dev/null
+++ b/rstudio-server.job
@@ -0,0 +1,100 @@
#!/bin/sh

#SBATCH --job-name=comp-text-analysis
#SBATCH --partition=cpu-g2 # update this line - use hyakalloc to find partitions you can use

#SBATCH --time=03:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --mem=32G

#SBATCH --signal=USR2
#SBATCH --output=%x_%j.out

# This script requests a single node with four tasks and 32G of RAM for three hours.
# You can adjust --time, --nodes, --ntasks, and --mem above to change these settings for your session.

# --output=%x_%j.out creates an output file called comp-text-analysis_XXXXXXXX.out,
# where %x is shorthand for --job-name above and the X's are the jobID assigned by
# SLURM when the job is submitted.

RSTUDIO_CWD="/mmfs1/home/mjilg/git/soc-text-assignments" # UPDATE THIS LINE
RSTUDIO_SIF="rstudio_latest.sif" # update this line

# Create temp directory for ephemeral content to bind-mount in the container
RSTUDIO_TMP=$(/usr/bin/python3 -c 'import tempfile; print(tempfile.mkdtemp())')

mkdir -p -m 700 \
  ${RSTUDIO_TMP}/run \
  ${RSTUDIO_TMP}/tmp \
  ${RSTUDIO_TMP}/var/lib/rstudio-server

# [The remainder of the script is truncated in this excerpt: the here-documents that
#  write ${RSTUDIO_TMP}/database.conf and ${RSTUDIO_TMP}/rsession.sh, and the
#  apptainer invocation that launches rstudio-server inside ${RSTUDIO_SIF}.]
exit $APPTAINER_EXIT_CODE