
Compare commits


17 Commits

Author  SHA1  Message  Date
mgaughan  c50a3b57ff  updating new git organization to remove sif file  2025-06-03 09:52:20 -05:00
mgaughan  ff8ca0b46e  updating with new container, collected categorizations  2025-06-03 09:43:25 -05:00
mgaughan  c6f4a244f4  updating (and failing) to plot categorization with sankey diagram  2025-06-02 22:40:48 -05:00
mgaughan  9403c79c44  pulling new olmocr image and new categorization stuff  2025-06-02 21:24:53 -05:00
mgaughan  c5df6cb6c6  removing ill categorizations  2025-06-02 11:35:45 -05:00
mgaughan  63450ba7ef  now with updated categorizations  2025-06-02 11:29:59 -05:00
mgaughan  5ed797e971  trying to get olmocr to run, updated categorization values  2025-06-02 11:27:23 -05:00
mgaughan  d8b9ca9dea  updating with docker images and categorized citations  2025-06-02 09:01:18 -05:00
mgaughan  c7448f2fc2  trying to load-balance the few-shot a bit more  2025-05-30 21:45:30 -05:00
mgaughan  225d7f53c8  bad categorization data, some restructuring of the repo  2025-05-30 21:36:18 -05:00
mgaughan  9985e190e7  updated with preliminary categorization  2025-05-30 21:20:36 -05:00
mgaughan  c3bb0801a2  ~final~ update to categorization script  2025-05-30 16:39:24 -05:00
mgaughan  86e2cd3ed8  updating with manual dedup of citations  2025-05-30 16:37:03 -05:00
mgaughan  1d63537027  redoing the dedup csv, something wrong with the other one  2025-05-30 13:52:13 -05:00
mgaughan  9d86f24c41  updating scripts and models for classification; errors in citation csv  2025-05-30 13:33:41 -05:00
mgaughan  17c69a6c92  updating prompts for categorization trial  2025-05-20 23:12:11 -05:00
mgaughan  7aedc1edbb  mid-point on setting up the olmo models on quest; updating the organization of different scripts  2025-05-20 21:32:44 -05:00
23 changed files with 12273 additions and 13380 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
*.sif

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

3049
cites/auto_dedup_results.csv Normal file

File diff suppressed because one or more lines are too long

24
cites/bib_to_csv.py Normal file

@@ -0,0 +1,24 @@
import csv
import bibtexparser
from tqdm import tqdm
import pandas as pd

# keep only the deduplicated rows whose titles also appear in the bad-abstracts list
df1 = pd.read_csv('auto_dedup_results.csv')
df2 = pd.read_csv('auto_dedup_results_bad_abstracts.csv')
filtered_df1 = df1[df1['title'].isin(df2.iloc[:, 2])]
filtered_df1.to_csv('filtered_dedup_correct.csv', index=False)

# earlier pass, kept for reference: converts the BibTeX export into the CSV read above
'''
with open("auto_dedup_results.bib") as bibfile:
    bib_db = bibtexparser.load(bibfile)

fields = ['duplicate_id', 'bibtype', 'title', 'abstract', 'doi']
with open('auto_dedup_results.csv', 'w', newline="", encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    for entry in tqdm(bib_db.entries, desc="Converting BibTeX to CSV"):
        row = {field: entry.get(field, '') for field in fields}
        writer.writerow(row)
'''


@@ -0,0 +1,9 @@
starting the job at: Mon Jun 2 11:43:44 CDT 2025
setting up the environment
running the p1 categorization script
cuda
NVIDIA A100-SXM4-80GB
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=af8da8da-1900-3762-4351-d9c80d33463b, L2_cache_size=40MB)
Loading checkpoint shards: 100%|██████████| 6/6 [00:05<00:00, 1.14it/s]
job finished, cleaning up
job pau at: Mon Jun 2 13:20:49 CDT 2025


@@ -0,0 +1,9 @@
starting the job at: Mon Jun 2 22:58:46 CDT 2025
setting up the environment
running the p1 categorization script
cuda
NVIDIA A100-PCIE-40GB
_CudaDeviceProperties(name='NVIDIA A100-PCIE-40GB', major=8, minor=0, total_memory=40442MB, multi_processor_count=108, uuid=a48cfab5-6d74-8479-c725-d4a6e53059e3, L2_cache_size=40MB)
Loading checkpoint shards: 100%|██████████| 6/6 [00:08<00:00, 1.35s/it]
job finished, cleaning up
job pau at: Tue Jun 3 00:46:04 CDT 2025

17
cites/shell.nix Normal file

@@ -0,0 +1,17 @@
{
  pkgs ? import <nixpkgs> { },
}:
pkgs.mkShell {
  name = "trial-env";
  packages = with pkgs; [
    pkgs.python312
    pkgs.python312Packages.bibtexparser
    pkgs.python312Packages.tqdm
    pkgs.python312Packages.pandas
    git
  ];
  shellHook = ''
    echo "trying to find a good quote for here"
  '';
}
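
To enter this environment, nix-shell can be pointed at the file directly; a minimal usage sketch (paths assume the repository root):

nix-shell cites/shell.nix
# or run a bare `nix-shell` from inside cites/, which picks up shell.nix automatically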

20
containers/ocr_run.sh Normal file

@@ -0,0 +1,20 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=olmocr-pull-docker-img
#SBATCH --output=olmocr-pull.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
# using singularity to run the allenai olmocr container image
module load singularity
echo "running ocr container"
# assumed image name: the sif produced by the companion pull script
singularity run olmocr_container.sif


@@ -0,0 +1,21 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=olmocr-pull-docker-img
#SBATCH --output=olmocr-pull.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
# using singularity to pull the allenai olmocr docker image
module load singularity
export SINGULARITY_CACHEDIR=$TMPDIR
singularity pull olmocr_container.sif docker://alleninstituteforai/olmocr:latest
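
For context, a sketch of how this pull job would be submitted and what it produces. The script's own path is not shown in this diff, so containers/ocr_pull.sh is a hypothetical name:

# hypothetical filename for the pull script above
sbatch containers/ocr_pull.sh
# on success, olmocr_container.sif lands in the submission directory;
# the new .gitignore rule (*.sif) keeps it out of version control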


@@ -0,0 +1,65 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM
import torch
import csv
import pandas as pd
#load in the different models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0))
olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-7B").to(device)
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-7B")
#priming prompt
prompt_1 = "For the GIVEN DATA, Please categorize it based on the following numbered characteristics: \n\n 1: YES/NO (Characteristic 1. This is an English language empirical study. Empirical studies discuss data or observations.) \n 2: YES/NO (Characteristic 2. This discusses free and open source software (FOSS or OSS). The focus of the GIVEN DATA is on free or open source software projects or ecosystems.) \n 3: YES/NO (Characteristic 3. The GIVEN DATA discusses FOSS project evolution. FOSS project evolution describes any changes to free and open source projects.) \n 4: YES/NO (Characteristic 4. This GIVEN DATA discusses FOSS project adaptation. FOSS project adaptation describes the intentional strategic changes made by projects to better align with the project's broader environment.) \n\n Characteristics 2, 3, and 4 can only be YES if the preceding characteristic was also a YES. \n\n Only respond with the appropriate number followed by 'YES' if the characteristic is present in the provided data or 'NO' if it is not (e.g. '1. YES; 2. NO;'). Do not provide any additional information."
example_4 = "Example 4: TITLE - Analysis of Open Source Software Evolution Using Evolution Curve Method \n ABSTRACT - Design and evolution of modem information systems is influenced by many factors: technical, organizational, social, and psychological. This is especially true for open source software systems (OSSS), when many developers from different backgrounds interact, share their ideas and contribute towards the development and improvement of a software product. The evolution of all OSSS is a continuous process of source code development, adaptation, improvement and maintenance. Studying changes to the various characteristics of source code can help us understand the evolution of a software system. In this paper, the software evolution process is analyzed using a proposed Evolution curve (E-curve) method, which is based on information theoretic metrics of source code. The method allows identifying major evolution stages and transition points of an analyzed software system. The application of the E-curves is demonstrated for the eMule system. .\n CATEGORIES: 1. YES; 2. YES; 3.YES; 4. NO"
example_1 = "Example 1: TITLE - Thermal Insulation Properties of Milkweed Floss Nonwovens: Influence of Temperature, Relative Humidity, and Fiber Content \n ABSTRACT - This study investigated the influence of fiber content, temperature, and relative humidity on the thermal insulation properties of nonwoven mats made of seed fibers from Asclepias Syriaca, commonly known as milkweed floss. Nonwoven mats with a 1-inch thickness were produced by uniformly arranging milkweed fibers within a mold. Various quantities of fiber were employed to obtain nonwoven mats with a fiber content ranging from 5 to 35 kg/m3. Thermal conductivity and thermal diffusivity were measured across diverse relative humidity levels and temperatures. Simultaneously, milkweed floss samples were exposed to identical environmental conditions to assess the moisture regain and specific heat capacities of the fiber. The specific heat capacity of milkweed and thermal conductivity of the nonwovens exhibited a linear increase with temperature. The thermal diffusivity and thermal conductivity of the nonwovens decreased with rising fiber content. The thermal insulation properties of the nonwovens remained partially stable below 30\\% relative humidity but substantially deteriorated at higher levels. The nonwovens exhibited optimal thermal insulation properties at a fiber content between 20 and 25 kg/m3. The results of this study highlighted several technical advantages of employing milkweed floss as a sustainable and lightweight solution for thermal insulation. \n CATEGORIES: 1. YES; 2. NO; 3. NO; 4. NO;"
example_3 = "Example 3: TITLE - Social network structures in open source software development teams \n ABSTRACT - Drawing on social network theories and previous studies, this research examines the dynamics of social network structures in open source software (OSS) teams. Three projects were selected from SourceForge.net in terms of their similarities as well as their differences. Monthly data were extracted from the bug tracking systems in order to achieve a longitudinal view of the interaction pattern of each project. Social network analysis was used to generate the indices of social structure. The finding suggests that the interaction pattern of OSS projects evolves from a single hub at the beginning to a corel periphery model as the projects move forward.\n CATEGORIES: 1. YES; 2. YES; 3. NO; 4. NO"
example_2 = "Example 2: TITLE - An Exploratory Mixed-methods Study on General Data Protection Regulation (GDPR) Compliance in Open-Source Software \n ABSTRACT- Background: Governments worldwide are considering data privacy regulations. These laws, such as the European Unions General Data Protection Regulation (GDPR), require software developers to meet privacy-related requirements when interacting with users data. Prior research describes the impact of such laws on software development, but only for commercial software. Although open-source software is commonly integrated into regulated software, and thus must be engineered or adapted for compliance, we do not know how such laws impact open-source software development. Aims: To understand how data privacy laws affect open-source software (OSS) development, we focus on the European Unions GDPR, as it is the most prominent such law. We investigated how GDPR compliance activities influence OSS developer activity (RQ1), how OSS developers perceive fulfilling GDPR requirements (RQ2), the most challenging GDPR requirements to implement (RQ3), and how OSS developers assess GDPR compliance (RQ4). Method: We distributed an online survey to explore perceptions of GDPR implementations from open-source developers (N=56). To augment this analysis, we further conducted a repository mining study to analyze development metrics on pull requests (N=31,462) submitted to open-source GitHub repositories. Results: Our results suggest GDPR policies complicate OSS development and introduce challenges, primarily regarding the management of users data, implementation costs and time, and assessments of compliance. Moreover, we observed negative perceptions of the GDPR from OSS developers and significant increases in development activity, in particular metrics related to coding and reviewing, on GitHub pull requests related to GDPR compliance. Conclusions: Our findings provide future research directions and implications for improving data privacy policies, motivating the need for relevant resources and automated tools to support data privacy regulation implementation and compliance efforts in OSS. \n CATEGORIES: 1. YES; 2. YES; 3. YES; 4. YES;"
with open("cites/053025_man_filtered_dedup.csv", mode='r', newline='') as file:
reader = csv.reader(file)
array_of_categorizations = []
index = -1
for row in reader:
index += 1
if index <= 0:
continue
cite_dict = {}
#organizing the data from each citation
cite_dict['key'] = row[0]
cite_dict['title'] = row[1]
cite_dict['abstract'] = row[2]
#prompt construction
given_data = f"GIVEN DATA: Title - {cite_dict['title']} \n Abstract - {cite_dict['abstract']}"
prompt = f"{prompt_1}\n\n{example_1}\n\n{example_2}\n\n{example_3}\n\n{example_4}\n\n{given_data}\n"
#handoff to the model
inputs = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)
#deterministic sampling and getting the response back
response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False)
response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
#getting the resulting codes
codes_id = response_txt.rfind("CATEGORIES:")
if codes_id != -1:
result = response_txt[codes_id + len("CATEGORIES:"):].strip()
else:
cite_dict["1"] = "NULL"
cite_dict["2"] = "NULL"
cite_dict["3"] = "NULL"
cite_dict["4"] = "NULL"
#writing them to the citation_dict
for item in result.strip(";").split(";"):
key_value = item.strip().split('. ')
if len(key_value) == 2:
key = key_value[0]
value = key_value[1]
cite_dict[key] = value
array_of_categorizations.append(cite_dict)
#CSV everything
df = pd.DataFrame(array_of_categorizations)
df.to_csv('060325_olmo_categorized_citations.csv', index=False)
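
As a quick illustration of the parsing contract above, a standalone sketch with a made-up response string (not actual model output):

# hypothetical model answer in the format the priming prompt requests
result = "1. YES; 2. YES; 3. NO; 4. NO"
codes = {}
for item in result.strip(";").split(";"):
    key_value = item.strip().split('. ')
    if len(key_value) == 2:
        codes[key_value[0]] = key_value[1]
print(codes)  # {'1': 'YES', '2': 'YES', '3': 'NO', '4': 'NO'}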


@@ -1,8 +0,0 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM
import torch
#load in the different models
olmo = AutoModelForCausalLLM.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
#


@@ -2,11 +2,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM
 import torch
-#load in the different models
-olmo = AutoModelForCausalLLM.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
-tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0425-1B-Instruct")
+#load in the different models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-0325-32B-Instruct").to(device)
+tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0325-32B-Instruct")
 #prompt
 first_prompt = "What are the characteristic patterns (e.g. empirical setting, methodology, analytical framing) of the following studies? In your response format the patterns identified in the data set as discrete qualitative codes."
 #hand the model the data
 data_prompt = "TKTK"
 #put together
 prompt = f"{first_prompt}\n{data_prompt}"
 #collect the response

23
models/p2-ocr.py Normal file

@@ -0,0 +1,23 @@
# https://huggingface.co/allenai/olmOCR-7B-0225-preview
import torch
import base64
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts import build_finetuning_prompt
from olmocr.prompts.anchor import get_anchor_text
from pypdf import PdfReader

# Initialize the model
model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#for all pages in a pdf (pdf_path is a placeholder; olmocr pages are 1-indexed)
pdf_path = "paper.pdf"
num_pages = len(PdfReader(pdf_path).pages)
for page_num in range(1, num_pages + 1):
    #render the page and build its anchored prompt, per the model card
    image_base64 = render_pdf_to_base64png(pdf_path, page_num, target_longest_image_dim=1024)
    anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)
    #generation step: see the sketch just below
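
The generation step is not in the file yet; a sketch of the remaining loop body, following the usage example on the olmOCR-7B-0225-preview model card (generation parameters are illustrative):

    # continuation of the per-page loop body above
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
    inputs = processor(text=[text], images=[main_image], padding=True, return_tensors="pt")
    inputs = {key: value.to(device) for (key, value) in inputs.items()}
    output = model.generate(**inputs, max_new_tokens=4096, do_sample=False)
    prompt_length = inputs["input_ids"].shape[1]
    page_text = processor.batch_decode(output[:, prompt_length:], skip_special_tokens=True)[0]
    # olmOCR answers are JSON; the page's prose sits under the "natural_text" key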


@@ -12,3 +12,11 @@ models\
studies\
- the pdf files for the final sample of studies
containers\
- requisite containers for running language analysis on HPC
viz\
- scripts for making visualizations out of any of our analyses
successful_logs\
- for posterity's sake, the logs of correct analyses

32
scripts/p1-cat.sh Normal file

@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=p1-categorization
#SBATCH --output=p1-categorization.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
echo "starting the job at: $(date)"
echo "setting up the environment"
module purge
eval "$(conda shell.bash hook)"
conda activate olmo
echo "running the p1 categorization script"
python /home/nws8519/git/adaptation-slr/models/p1-categorization.py
echo "job finished, cleaning up"
conda deactivate
echo "job pau at: $(date)"

0
scripts/p2-ir.sh Normal file


@@ -1,6 +0,0 @@
setting up the environment
running the pdf to json ocr conversion
ERROR:olmocr.check:pdftoppm is not installed.
ERROR:olmocr.check:Check the README in the https://github.com/allenai/olmocr/blob/main/README.md for installation instructions
job finished, cleaning up
job pau at: Tue May 20 14:29:36 CDT 2025

50
viz/sankey.R Normal file

@@ -0,0 +1,50 @@
library(ggplot2)
library(ggalluvial)
library(ggsankey)
library(dplyr)

data <- read.csv("/home/mgaughan/git/adaptation-slr/cites/060225_olmo_categorized_citations.csv")
table(data$X1)

# normalize each characteristic column to a clean YES/NO
cols <- c("X1", "X2", "X3", "X4")
data[cols] <- lapply(data[cols], function(x) ifelse(x == "YES", "YES", "NO"))
#data <- data %>%
#  mutate(across(starts_with("X"), ~ ifelse(.x, "Yes", "No")))
data$freq <- 1

# first attempt: alluvial plot (ggalluvial)
# https://r-charts.com/flow/sankey-diagram-ggplot2/
alluvial_plot <- ggplot(data,
       aes(axis1 = X1, axis2 = X2, axis3 = X3, axis4 = X4, y = freq)) +
  geom_alluvium(fill = "grey70", width = 1/12, alpha = 0.7) +
  geom_stratum(aes(fill = after_stat(stratum)), width = 1/12, color = "black") +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 4) +
  scale_x_discrete(limits = c("X1", "X2", "X3", "X4"), expand = c(.05, .05)) +
  scale_fill_manual(values = c("YES" = "#4CAF50", "NO" = "#F44336")) +
  labs(title = "Alluvial Plot: YES/NO at Each Stage",
       x = "Stage", y = "Count") +
  theme_minimal()
alluvial_plot

# second attempt: sankey diagram (ggsankey)
# make_long() reshapes the wide YES/NO columns directly into the
# x / node / next_x / next_node format that geom_sankey() expects
sankey_ready <- data %>%
  make_long(X1, X2, X3, X4)

ggplot(sankey_ready, aes(x = x,
                         next_x = next_x,
                         node = node,
                         next_node = next_node,
                         fill = factor(node))) +
  geom_sankey(flow.alpha = 0.6, node.color = "black") +
  geom_sankey_label(size = 3, color = "black") +
  theme_sankey(base_size = 12) +
  labs(title = "Sankey Diagram", fill = "Value")

21
viz/shell.nix Normal file

@@ -0,0 +1,21 @@
{
  pkgs ? import <nixpkgs> { },
}:
pkgs.mkShell {
  name = "viz-slr";
  packages = with pkgs; [
    pkgs.R
    pkgs.rPackages.httr
    pkgs.rPackages.ggplot2
    pkgs.rPackages.tidytuesdayR
    pkgs.rPackages.tidyverse
    pkgs.rPackages.ggalluvial
    pkgs.rPackages.dplyr
    git
  ];
  shellHook = ''
    echo "trying to find a good quote for here"
  '';
}