Compare commits
No commits in common. "8f2409feb0c8dfb7711d7efa43ff581645387d5a" and "55964c754bea9ea9a4e8ef7f0b16cb933b5dc097" have entirely different histories.
8f2409feb0 ... 55964c754b
README.md (modified; name inferred from content)
@@ -2,11 +2,6 @@
Analysis scripts and code for studying the deployment processes of three MediaWiki/Wikimedia features (2013-2015)

/p1
- contains the scripts and plots from the first phase of analysis

/p2
- contains the scripts and plots from the second phase of analysis
[31 image files changed (1 added); binary diffs not shown]
mgaughan-rstudio-server_27419348.out (new file, 18 lines)
@@ -0,0 +1,18 @@
1. SSH tunnel from your workstation using the following command:

ssh -N -L 8787:n3439:50819 mjilg@klone.hyak.uw.edu

and point your web browser to http://localhost:8787

2. log in to RStudio Server using the following credentials:

user: mjilg
password: lM83HdgeT310p2tkyoCk

When done using RStudio Server, terminate the job by:

1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:

scancel -f 27419348

slurmstepd: error: *** JOB 27419348 ON n3439 CANCELLED AT 2025-07-07T13:08:38 ***
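The tunnel command above follows a general pattern that is useful when reusing these instructions for a different job; the placeholders below are generic stand-ins, not values from this repository:

ssh -N -L <local_port>:<compute_node>:<server_port> <user>@<login_host>
# -N: open the tunnel without running a remote command
# -L: forward localhost:<local_port> to <compute_node>:<server_port> via the login host

squeue -u <user>     # find the job ID and compute node if you lose track of them
scancel -f <job_id>  # cancel the job when finished, as in step 2 above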
neurobiber_labeling.py (deleted file, 97 lines; name inferred from the batch script below)
@@ -1,97 +0,0 @@
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512

# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]

def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to("cuda")
    model.eval()
    return model, tokenizer

def chunk_text(text, chunk_size=CHUNK_SIZE):
    # Split on whitespace and group into chunks of at most chunk_size tokens
    tokens = text.strip().split()
    if not tokens:
        return []
    return [" ".join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]

def get_predictions_chunked_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    chunked_texts = []
    chunk_indices = []
    for idx, text in enumerate(texts):
        start = len(chunked_texts)
        text_chunks = chunk_text(text, chunk_size)
        chunked_texts.extend(text_chunks)
        chunk_indices.append({
            'original_idx': idx,
            'chunk_range': (start, start + len(text_chunks))
        })

    # If there are no chunks (empty inputs), return zeros
    if not chunked_texts:
        return np.zeros((len(texts), model.config.num_labels))

    all_chunk_preds = []
    for i in range(0, len(chunked_texts), subbatch_size):
        batch_chunks = chunked_texts[i : i + subbatch_size]
        encodings = tokenizer(
            batch_chunks,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=chunk_size
        ).to("cuda")

        with torch.no_grad(), torch.amp.autocast("cuda"):
            outputs = model(**encodings)
            probs = torch.sigmoid(outputs.logits)
        all_chunk_preds.append(probs.cpu())

    all_chunk_preds = torch.cat(all_chunk_preds, dim=0) if all_chunk_preds else torch.empty(0)
    predictions = [None] * len(texts)

    for info in chunk_indices:
        start, end = info['chunk_range']
        if start == end:
            # No tokens => no features
            pred = torch.zeros(model.config.num_labels)
        else:
            # Take max across chunks for each feature
            chunk_preds = all_chunk_preds[start:end]
            pred, _ = torch.max(chunk_preds, dim=0)
        predictions[info['original_idx']] = (pred > 0.5).int().numpy()

    return np.array(predictions)

def predict_batch(model, tokenizer, texts, chunk_size=CHUNK_SIZE, subbatch_size=32):
    return get_predictions_chunked_batch(model, tokenizer, texts, chunk_size, subbatch_size)

def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32):
    batch_preds = predict_batch(model, tokenizer, [text], chunk_size, subbatch_size)
    return batch_preds[0]

if __name__ == "__main__":
    print("my brain hurts!")
# https://huggingface.co/Blablablab/neurobiber
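A minimal usage sketch for the deleted script above, assuming a CUDA-capable machine and that the file is importable as neurobiber_labeling (the sample sentence is invented):

from neurobiber_labeling import load_model_and_tokenizer, predict_text, BIBER_FEATURES

model, tokenizer = load_model_and_tokenizer()
pred = predict_text(model, tokenizer, "Honestly, we can't be sure it works, but it might.")
# pred is a length-96 binary vector aligned with BIBER_FEATURES
fired = [feat for feat, p in zip(BIBER_FEATURES, pred) if p == 1]
print(fired)

Because per-chunk probabilities are max-pooled before thresholding, a feature is reported as present if its probability exceeds 0.5 in any single 512-token chunk of the input.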
(deleted Slurm batch script, 32 lines; filename not preserved)
@@ -1,32 +0,0 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=neurobiber-categorization
#SBATCH --output=neurobiber-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate neurobiber

echo "running the p1 categorization script"

python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py

echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"
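How a batch script like this is typically submitted and monitored (standard Slurm commands; the script filename below is hypothetical):

sbatch neurobiber_categorization.sh      # submit the job to the scheduler
squeue -u $USER                          # check queue state and assigned node
tail -f neurobiber-categorization.log    # follow the file named by #SBATCH --output

One design note: the script requests --nodes=2 with --ntasks-per-node=1 but launches a single python process, so the second node would sit idle; --nodes=1 would normally suffice for a one-GPU job.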
(deleted Slurm batch script, 31 lines; filename not preserved)
@@ -1,31 +0,0 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=p1-categorization
#SBATCH --output=p1-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

echo "starting the job at: $(date)"

echo "setting up the environment"

module purge
eval "$(conda shell.bash hook)"
conda activate olmo

echo "running the p1 categorization script"


echo "job finished, cleaning up"

conda deactivate

echo "job pau at: $(date)"
(deleted TODO checklist, 6 lines; filename not preserved)
@@ -1,6 +0,0 @@
[ ] generate clean rows for each comment from the discussion data
[ ] get data onto quest
[ ] run neurobiber over the data set, appending vectors onto the array
[ ] set up the unsupervised classification pipeline
[ ] iterate the prompt
[ ] run the classification of the prompt
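The third checklist item (running neurobiber over the data set and appending the vectors) could be implemented roughly as below, reusing the functions from the deleted labeling script; the file paths and column name are hypothetical:

import pandas as pd
from neurobiber_labeling import load_model_and_tokenizer, predict_batch, BIBER_FEATURES

df = pd.read_csv("discussion_comments.csv")            # hypothetical input file
model, tokenizer = load_model_and_tokenizer()
preds = predict_batch(model, tokenizer, df["comment_text"].fillna("").tolist())
df[BIBER_FEATURES] = preds                             # one binary column per Biber feature
df.to_csv("discussion_comments_biber.csv", index=False)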
[13 image files changed; binary diffs not shown]