
Compare commits


17 Commits

Author  SHA1  Message  Date
mgaughan  c50a3b57ff  updating new git organization to remove sif file  2025-06-03 09:52:20 -05:00
mgaughan  ff8ca0b46e  updating with new container, collected categorizations  2025-06-03 09:43:25 -05:00
mgaughan  c6f4a244f4  updating (and failing) to plot categorization with sankey diagram  2025-06-02 22:40:48 -05:00
mgaughan  9403c79c44  pulling new olmocr image and new categorization stuff  2025-06-02 21:24:53 -05:00
mgaughan  c5df6cb6c6  removing ill categorizations  2025-06-02 11:35:45 -05:00
mgaughan  63450ba7ef  now with updated categorizations  2025-06-02 11:29:59 -05:00
mgaughan  5ed797e971  trying to get olmocr to run, updated categorization values  2025-06-02 11:27:23 -05:00
mgaughan  d8b9ca9dea  updating with docker images and categorized citations  2025-06-02 09:01:18 -05:00
mgaughan  c7448f2fc2  trying to load-balance the few-shot a bit more  2025-05-30 21:45:30 -05:00
mgaughan  225d7f53c8  bad categorization data, some restructuring of the repo  2025-05-30 21:36:18 -05:00
mgaughan  9985e190e7  updated with preliminary categorization  2025-05-30 21:20:36 -05:00
mgaughan  c3bb0801a2  ~final~ update to categorization script  2025-05-30 16:39:24 -05:00
mgaughan  86e2cd3ed8  updating with manual dedup of citations  2025-05-30 16:37:03 -05:00
mgaughan  1d63537027  redoing the dedup csv, something wrong with the other one  2025-05-30 13:52:13 -05:00
mgaughan  9d86f24c41  updating scripts and models for classification; errors in citation csv  2025-05-30 13:33:41 -05:00
mgaughan  17c69a6c92  updating prompts for categorization trial  2025-05-20 23:12:11 -05:00
mgaughan  7aedc1edbb  mid-point on setting up the olmo models on quest; updating the organization of different scripts  2025-05-20 21:32:44 -05:00
23 changed files with 12273 additions and 13380 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
*.sif

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

3049
cites/auto_dedup_results.csv Normal file

File diff suppressed because one or more lines are too long

24
cites/bib_to_csv.py Normal file

@@ -0,0 +1,24 @@
import csv
import bibtexparser
from tqdm import tqdm
import pandas as pd

# keep only the deduplicated rows whose titles also appear in the bad-abstracts list
df1 = pd.read_csv('auto_dedup_results.csv')
df2 = pd.read_csv('auto_dedup_results_bad_abstracts.csv')
filtered_df1 = df1[df1['title'].isin(df2.iloc[:, 2])]
filtered_df1.to_csv('filtered_dedup_correct.csv', index=False)

# earlier pass, kept for reference: converts the BibTeX export into the CSV read above
'''
with open("auto_dedup_results.bib") as bibfile:
    bib_db = bibtexparser.load(bibfile)

fields = ['duplicate_id', 'bibtype', 'title', 'abstract', 'doi']
with open('auto_dedup_results.csv', 'w', newline="", encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    for entry in tqdm(bib_db.entries, desc="Converting BibTeX to CSV"):
        row = {field: entry.get(field, '') for field in fields}
        writer.writerow(row)
'''


@@ -0,0 +1,9 @@
starting the job at: Mon Jun 2 11:43:44 CDT 2025
setting up the environment
running the p1 categorization script
cuda
NVIDIA A100-SXM4-80GB
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=af8da8da-1900-3762-4351-d9c80d33463b, L2_cache_size=40MB)
Loading checkpoint shards: 100%|██████████| 6/6 [00:05<00:00, 1.14it/s]
job finished, cleaning up
job pau at: Mon Jun 2 13:20:49 CDT 2025


@@ -0,0 +1,9 @@
starting the job at: Mon Jun 2 22:58:46 CDT 2025
setting up the environment
running the p1 categorization script
cuda
NVIDIA A100-PCIE-40GB
_CudaDeviceProperties(name='NVIDIA A100-PCIE-40GB', major=8, minor=0, total_memory=40442MB, multi_processor_count=108, uuid=a48cfab5-6d74-8479-c725-d4a6e53059e3, L2_cache_size=40MB)
Loading checkpoint shards: 100%|██████████| 6/6 [00:08<00:00, 1.35s/it]
job finished, cleaning up
job pau at: Tue Jun 3 00:46:04 CDT 2025

17
cites/shell.nix Normal file

@@ -0,0 +1,17 @@
{
  pkgs ? import <nixpkgs> { },
}:
pkgs.mkShell {
  name = "trial-env";
  packages = with pkgs; [
    pkgs.python312
    pkgs.python312Packages.bibtexparser
    pkgs.python312Packages.tqdm
    pkgs.python312Packages.pandas
    git
  ];
  shellHook = ''
    echo "trying to find a good quote for here"
  '';
}
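
To enter this environment, nix-shell can be pointed at the file directly; a minimal usage sketch (paths assume the repository root):

nix-shell cites/shell.nix
# or run a bare `nix-shell` from inside cites/, which picks up shell.nix automatically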

20
containers/ocr_run.sh Normal file

@@ -0,0 +1,20 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=olmocr-pull-docker-img
#SBATCH --output=olmocr-pull.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
# using singularity to run the allenai olmocr container image
module load singularity
echo "running ocr container"
# assumed image name: the sif produced by the companion pull script
singularity run olmocr_container.sif


@@ -0,0 +1,21 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=olmocr-pull-docker-img
#SBATCH --output=olmocr-pull.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
# using singularity to pull the allenai olmocr docker image
module load singularity
export SINGULARITY_CACHEDIR=$TMPDIR
singularity pull olmocr_container.sif docker://alleninstituteforai/olmocr:latest
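
For context, a sketch of how this pull job would be submitted and what it produces. The script's own path is not shown in this diff, so containers/ocr_pull.sh is a hypothetical name:

# hypothetical filename for the pull script above
sbatch containers/ocr_pull.sh
# on success, olmocr_container.sif lands in the submission directory;
# the new .gitignore rule (*.sif) keeps it out of version control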


@@ -0,0 +1,65 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM
import torch
import csv
import pandas as pd
#load in the different models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0))
olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-7B").to(device)
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-7B")
#priming prompt
prompt_1 = "For the GIVEN DATA, Please categorize it based on the following numbered characteristics: \n\n 1: YES/NO (Characteristic 1. This is an English language empirical study. Empirical studies discuss data or observations.) \n 2: YES/NO (Characteristic 2. This discusses free and open source software (FOSS or OSS). The focus of the GIVEN DATA is on free or open source software projects or ecosystems.) \n 3: YES/NO (Characteristic 3. The GIVEN DATA discusses FOSS project evolution. FOSS project evolution describes any changes to free and open source projects.) \n 4: YES/NO (Characteristic 4. This GIVEN DATA discusses FOSS project adaptation. FOSS project adaptation describes the intentional strategic changes made by projects to better align with the project's broader environment.) \n\n Characteristics 2, 3, and 4 can only be YES if the preceding characteristic was also a YES. \n\n Only respond with the appropriate number followed by 'YES' if the characteristic is present in the provided data or 'NO' if it is not (e.g. '1. YES; 2. NO;'). Do not provide any additional information."
example_4 = "Example 4: TITLE - Analysis of Open Source Software Evolution Using Evolution Curve Method \n ABSTRACT - Design and evolution of modem information systems is influenced by many factors: technical, organizational, social, and psychological. This is especially true for open source software systems (OSSS), when many developers from different backgrounds interact, share their ideas and contribute towards the development and improvement of a software product. The evolution of all OSSS is a continuous process of source code development, adaptation, improvement and maintenance. Studying changes to the various characteristics of source code can help us understand the evolution of a software system. In this paper, the software evolution process is analyzed using a proposed Evolution curve (E-curve) method, which is based on information theoretic metrics of source code. The method allows identifying major evolution stages and transition points of an analyzed software system. The application of the E-curves is demonstrated for the eMule system. .\n CATEGORIES: 1. YES; 2. YES; 3.YES; 4. NO"
example_1 = "Example 1: TITLE - Thermal Insulation Properties of Milkweed Floss Nonwovens: Influence of Temperature, Relative Humidity, and Fiber Content \n ABSTRACT - This study investigated the influence of fiber content, temperature, and relative humidity on the thermal insulation properties of nonwoven mats made of seed fibers from Asclepias Syriaca, commonly known as milkweed floss. Nonwoven mats with a 1-inch thickness were produced by uniformly arranging milkweed fibers within a mold. Various quantities of fiber were employed to obtain nonwoven mats with a fiber content ranging from 5 to 35 kg/m3. Thermal conductivity and thermal diffusivity were measured across diverse relative humidity levels and temperatures. Simultaneously, milkweed floss samples were exposed to identical environmental conditions to assess the moisture regain and specific heat capacities of the fiber. The specific heat capacity of milkweed and thermal conductivity of the nonwovens exhibited a linear increase with temperature. The thermal diffusivity and thermal conductivity of the nonwovens decreased with rising fiber content. The thermal insulation properties of the nonwovens remained partially stable below 30\\% relative humidity but substantially deteriorated at higher levels. The nonwovens exhibited optimal thermal insulation properties at a fiber content between 20 and 25 kg/m3. The results of this study highlighted several technical advantages of employing milkweed floss as a sustainable and lightweight solution for thermal insulation. \n CATEGORIES: 1. YES; 2. NO; 3. NO; 4. NO;"
example_3 = "Example 3: TITLE - Social network structures in open source software development teams \n ABSTRACT - Drawing on social network theories and previous studies, this research examines the dynamics of social network structures in open source software (OSS) teams. Three projects were selected from SourceForge.net in terms of their similarities as well as their differences. Monthly data were extracted from the bug tracking systems in order to achieve a longitudinal view of the interaction pattern of each project. Social network analysis was used to generate the indices of social structure. The finding suggests that the interaction pattern of OSS projects evolves from a single hub at the beginning to a corel periphery model as the projects move forward.\n CATEGORIES: 1. YES; 2. YES; 3. NO; 4. NO"
example_2 = "Example 2: TITLE - An Exploratory Mixed-methods Study on General Data Protection Regulation (GDPR) Compliance in Open-Source Software \n ABSTRACT- Background: Governments worldwide are considering data privacy regulations. These laws, such as the European Unions General Data Protection Regulation (GDPR), require software developers to meet privacy-related requirements when interacting with users data. Prior research describes the impact of such laws on software development, but only for commercial software. Although open-source software is commonly integrated into regulated software, and thus must be engineered or adapted for compliance, we do not know how such laws impact open-source software development. Aims: To understand how data privacy laws affect open-source software (OSS) development, we focus on the European Unions GDPR, as it is the most prominent such law. We investigated how GDPR compliance activities influence OSS developer activity (RQ1), how OSS developers perceive fulfilling GDPR requirements (RQ2), the most challenging GDPR requirements to implement (RQ3), and how OSS developers assess GDPR compliance (RQ4). Method: We distributed an online survey to explore perceptions of GDPR implementations from open-source developers (N=56). To augment this analysis, we further conducted a repository mining study to analyze development metrics on pull requests (N=31,462) submitted to open-source GitHub repositories. Results: Our results suggest GDPR policies complicate OSS development and introduce challenges, primarily regarding the management of users data, implementation costs and time, and assessments of compliance. Moreover, we observed negative perceptions of the GDPR from OSS developers and significant increases in development activity, in particular metrics related to coding and reviewing, on GitHub pull requests related to GDPR compliance. Conclusions: Our findings provide future research directions and implications for improving data privacy policies, motivating the need for relevant resources and automated tools to support data privacy regulation implementation and compliance efforts in OSS. \n CATEGORIES: 1. YES; 2. YES; 3. YES; 4. YES;"
with open("cites/053025_man_filtered_dedup.csv", mode='r', newline='') as file:
reader = csv.reader(file)
array_of_categorizations = []
index = -1
for row in reader:
index += 1
if index <= 0:
continue
cite_dict = {}
#organizing the data from each citation
cite_dict['key'] = row[0]
cite_dict['title'] = row[1]
cite_dict['abstract'] = row[2]
#prompt construction
given_data = f"GIVEN DATA: Title - {cite_dict['title']} \n Abstract - {cite_dict['abstract']}"
prompt = f"{prompt_1}\n\n{example_1}\n\n{example_2}\n\n{example_3}\n\n{example_4}\n\n{given_data}\n"
#handoff to the model
inputs = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)
#deterministic sampling and getting the response back
response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False)
response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
#getting the resulting codes
codes_id = response_txt.rfind("CATEGORIES:")
if codes_id != -1:
result = response_txt[codes_id + len("CATEGORIES:"):].strip()
else:
cite_dict["1"] = "NULL"
cite_dict["2"] = "NULL"
cite_dict["3"] = "NULL"
cite_dict["4"] = "NULL"
#writing them to the citation_dict
for item in result.strip(";").split(";"):
key_value = item.strip().split('. ')
if len(key_value) == 2:
key = key_value[0]
value = key_value[1]
cite_dict[key] = value
array_of_categorizations.append(cite_dict)
#CSV everything
df = pd.DataFrame(array_of_categorizations)
df.to_csv('060325_olmo_categorized_citations.csv', index=False)
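
As a quick illustration of the parsing contract above, a standalone sketch with a made-up response string (not actual model output):

# hypothetical model answer in the format the priming prompt requests
result = "1. YES; 2. YES; 3. NO; 4. NO"
codes = {}
for item in result.strip(";").split(";"):
    key_value = item.strip().split('. ')
    if len(key_value) == 2:
        codes[key_value[0]] = key_value[1]
print(codes)  # {'1': 'YES', '2': 'YES', '3': 'NO', '4': 'NO'}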


@@ -1,8 +0,0 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM
import torch
#load in the different models
olmo = AutoModelForCausalLLM.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
#


@@ -2,11 +2,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM
 import torch
-#load in the different models
-olmo = AutoModelForCausalLLM.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
-tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
+#load in the different models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-0325-32B-Instruct").to(device)
+tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0325-32B-Instruct")
 #prompt
 first_prompt = "What are the characteristic patterns (e.g. empirical setting, methodology, analytical framing) of the following studies? In your response format the patterns identified in the data set as discrete qualitative codes."
 #hand the model the data
 data_prompt = "TKTK"
 #put together
 prompt = f"{first_prompt}\n{data_prompt}"
 #collect the response

23
models/p2-ocr.py Normal file

@@ -0,0 +1,23 @@
# https://huggingface.co/allenai/olmOCR-7B-0225-preview
import torch
import base64
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from pypdf import PdfReader

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#for all pages in a pdf (pdf_path is a placeholder; olmocr pages are 1-indexed)
pdf_path = "paper.pdf"
num_pages = len(PdfReader(pdf_path).pages)
for page_num in range(1, num_pages + 1):
    #render the page and build its anchored prompt, per the model card
    image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)
    #generation step: see the sketch just below
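
The generation step is not in the file yet; a sketch of the remaining loop body, following the usage example on the olmOCR-7B-0225-preview model card (generation parameters are illustrative):

    # continuation of the per-page loop body above
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
    inputs = processor(text=[text], images=[main_image], padding=True, return_tensors="pt")
    inputs = {key: value.to(device) for (key, value) in inputs.items()}
    output = model.generate(**inputs, max_new_tokens=4096, do_sample=False)
    prompt_length = inputs["input_ids"].shape[1]
    page_text = processor.batch_decode(output[:, prompt_length:], skip_special_tokens=True)[0]
    # olmOCR answers are JSON; the page's prose sits under the "natural_text" key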


@@ -12,3 +12,11 @@ models\
studies\
- the pdf files for the final sample of studies
containers\
- requisite containers for running language analysis on HPC
viz\
- scripts for making visualizations out of any of our analyses
successful_logs\
- for posterity's sake, the logs of correct analyses

32
scripts/p1-cat.sh Normal file

@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=p1-categorization
#SBATCH --output=p1-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
echo "starting the job at: $(date)"
echo "setting up the environment"
module purge
eval "$(conda shell.bash hook)"
conda activate olmo
echo "running the p1 categorization script"
python /home/nws8519/git/adaptation-slr/models/p1-categorization.py
echo "job finished, cleaning up"
conda deactivate
echo "job pau at: $(date)"

0
scripts/p2-ir.sh Normal file


@@ -1,6 +0,0 @@
setting up the environment
running the pdf to json ocr conversion
ERROR:olmocr.check:pdftoppm is not installed.
ERROR:olmocr.check:Check the README in the https://github.com/allenai/olmocr/blob/main/README.md for installation instructions
job finished, cleaning up
job pau at: Tue May 20 14:29:36 CDT 2025

50
viz/sankey.R Normal file

@@ -0,0 +1,50 @@
library(ggplot2)
library(ggalluvial)
library(ggsankey)
library(dplyr)

data <- read.csv("/home/mgaughan/git/adaptation-slr/cites/060225_olmo_categorized_citations.csv")
table(data$X1)

# normalize each characteristic column to a clean YES/NO
cols <- c("X1", "X2", "X3", "X4")
data[cols] <- lapply(data[cols], function(x) ifelse(x == "YES", "YES", "NO"))
#data <- data %>%
#  mutate(across(starts_with("X"), ~ ifelse(.x, "Yes", "No")))
data$freq <- 1

# first attempt: alluvial plot (ggalluvial)
# https://r-charts.com/flow/sankey-diagram-ggplot2/
alluvial_plot <- ggplot(data,
       aes(axis1 = X1, axis2 = X2, axis3 = X3, axis4 = X4, y = freq)) +
  geom_alluvium(fill = "grey70", width = 1/12, alpha = 0.7) +
  geom_stratum(aes(fill = after_stat(stratum)), width = 1/12, color = "black") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 4) +
  scale_x_discrete(limits = c("X1", "X2", "X3", "X4"), expand = c(.05, .05)) +
  scale_fill_manual(values = c("YES" = "#4CAF50", "NO" = "#F44336")) +
  labs(title = "Alluvial Plot: YES/NO at Each Stage",
       x = "Stage", y = "Count") +
  theme_minimal()
alluvial_plot

# second attempt: sankey diagram (ggsankey)
# make_long() reshapes the wide YES/NO columns directly into the
# x / node / next_x / next_node format that geom_sankey() expects
sankey_ready <- data %>%
  make_long(X1, X2, X3, X4)

ggplot(sankey_ready, aes(x = x,
                         next_x = next_x,
                         node = node,
                         next_node = next_node,
                         fill = factor(node))) +
  geom_sankey(flow.alpha = 0.6, node.color = "black") +
  geom_sankey_label(size = 3, color = "black") +
  theme_sankey(base_size = 12) +
  labs(title = "Sankey Diagram", fill = "Value")

21
viz/shell.nix Normal file

@@ -0,0 +1,21 @@
{
  pkgs ? import <nixpkgs> { },
}:
pkgs.mkShell {
  name = "viz-slr";
  packages = with pkgs; [
    pkgs.R
    pkgs.rPackages.httr
    pkgs.rPackages.ggplot2
    pkgs.rPackages.tidytuesdayR
    pkgs.rPackages.tidyverse
    pkgs.rPackages.ggalluvial
    pkgs.rPackages.dplyr
    git
  ];
  shellHook = ''
    echo "trying to find a good quote for here"
  '';
}