diff --git a/models/p1-categorization.py b/models/p1-categorization.py new file mode 100644 index 0000000..5c26aae --- /dev/null +++ b/models/p1-categorization.py @@ -0,0 +1,26 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM +import torch + +#load the OLMo model (on the GPU when CUDA is available, else the CPU) and its tokenizer +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-0425-1B-Instruct").to(device) +tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0425-1B-Instruct") + +#prompt pieces: task framing, the four study characteristics, and the required response format +first_prompt = "You are a multi-category classifier model. You are tasked with applying qualitative codes to title-abstract pairs of academic studies. We define the following study characteristics below:" +characteristics_prompt = "1. English language empirical studies: academic papers written in English that study or analyze evidence. Literature reviews are not empirical studies. 2. Focus on FOSS projects: is the focus of the research work on the domain of free and open source software projects. 3. Study FOSS project evolution: is the focus of the research work on longitudinal changes to free and open source projects. 4. Study FOSS project adaptation: is the focus of the research work on intentional changes made by free and open source software projects to better align themselves with their broader environment." +formatting_prompt = "For each code that we have specified, provide a binary YES or NO classification depending on whether or not the code applies to the title-abstract pair. Responses should only include YES or NO responses to each characteristic's inclusion and should be formatted as [characteristic number]:[classification] for ALL four study characteristics that we have defined. 
Here is the title-abstract pair: " + +data_prompt = "Title - Underproduction: An Approach for Measuring Risk in Open Source Software \n Abstract - The widespread adoption of Free/Libre and Open Source Software (FLOSS) means that the ongoing maintenance of many widely used software components relies on the collaborative effort of volunteers who set their own priorities and choose their own tasks. We argue that this has created a new form of risk that we call 'underproduction' which occurs when the supply of software engineering labor becomes out of alignment with the demand of people who rely on the software produced. We present a conceptual framework for identifying relative underproduction in software as well as a statistical method for applying our framework to a comprehensive dataset from the Debian GNU/Linux distribution that includes 21,902 source packages and the full history of 461,656 bugs. We draw on this application to present two experiments: (1) a demonstration of how our technique can be used to identify at-risk software packages in a large FLOSS repository and (2) a validation of these results using an alternate indicator of package risk. Our analysis demonstrates both the utility of our approach and reveals the existence of widespread underproduction in a range of widely-installed software components in Debian. 
" + +prompt = f"{first_prompt}\n{characteristics_prompt}\n{formatting_prompt}\n{data_prompt}" + +inputs = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device) + +#generate without sampling (do_sample=False), capped at 256 new tokens +response = olmo.generate(**inputs, max_new_tokens=256, do_sample=False) +response_txt = tokenizer.batch_decode(response, skip_special_tokens=True)[0] + +with open('/home/nws8519/git/adaptation-slr/trial-output.txt', 'w') as file: + file.write(response_txt) + diff --git a/models/p1-classification.py b/models/p1-classification.py deleted file mode 100644 index ef8fc4b..0000000 --- a/models/p1-classification.py +++ /dev/null @@ -1,8 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM -import torch - -#load in the different models -olmo = AutoModelForCausalLLM.from_pretrained("allenai/OLMo-2-0425-1B-Instruct") -tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0425-1B-Instruct") - -# diff --git a/models/p2-first-ir.py b/models/p2-first-ir.py index ed32f3c..69ad657 100644 --- a/models/p2-first-ir.py +++ b/models/p2-first-ir.py @@ -2,11 +2,15 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM import torch #load in the different models -olmo = AutoModelForCausalLLM.from_pretrained("allenai/OLMo-2-0425-1B-Instruct") -tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0425-1B-Instruct") +#use the GPU when CUDA is available, otherwise the CPU +device = "cuda" if torch.cuda.is_available() else "cpu" +olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-0325-32B-Instruct").to(device) +tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0325-32B-Instruct") #prompt - +first_prompt = "What are the characteristic patterns (e.g. empirical setting, methodology, analytical framing) of the following studies? In your response format the patterns identified in the data set as discrete qualitative codes." 
#hand the model the data - +data_prompt = "TKTK" +#put together the instruction and the data into one prompt +prompt = f"{first_prompt}\n{data_prompt}" #collect the response diff --git a/models/p2-ocr.py b/models/p2-ocr.py new file mode 100644 index 0000000..e69de29 diff --git a/p1-categorization.log b/p1-categorization.log new file mode 100644 index 0000000..56d2d57 --- /dev/null +++ b/p1-categorization.log @@ -0,0 +1,50 @@ +setting up the environment +running the p1 categorization script + Fetching 14 files: 0%| | 0/14 [00:00 + olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-0325-32B-Instruct").to(device) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained + return model_class.from_pretrained( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/modeling_utils.py", line 309, in _wrapper + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4420, in from_pretrained + checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/modeling_utils.py", line 1178, in _get_resolved_checkpoint_files + checkpoint_files, sharded_metadata = get_checkpoint_shard_files( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/utils/hub.py", line 1110, in get_checkpoint_shard_files + cached_filenames = cached_files( + ^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/utils/hub.py", line 557, in cached_files + raise e + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/utils/hub.py", line 485, in cached_files + snapshot_download( + File 
"/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/huggingface_hub/_snapshot_download.py", line 297, in snapshot_download + thread_map( + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/tqdm/contrib/concurrent.py", line 69, in thread_map + return _executor_map(ThreadPoolExecutor, fn, *iterables, **tqdm_kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/tqdm/contrib/concurrent.py", line 51, in _executor_map + return list(tqdm_class(ex.map(fn, *iterables, chunksize=chunksize), **kwargs)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/tqdm/std.py", line 1181, in __iter__ + for obj in iterable: + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/concurrent/futures/_base.py", line 619, in result_iterator + yield _result_or_cancel(fs.pop()) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/concurrent/futures/_base.py", line 317, in _result_or_cancel + return fut.result(timeout) + ^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/concurrent/futures/_base.py", line 456, in result + return self.__get_result() + ^^^^^^^^^^^^^^^^^^^ +object address : 0x14ebbe9730c0 + diff --git a/scripts/p1-cat.sh b/scripts/p1-cat.sh new file mode 100644 index 0000000..66f46b8 --- /dev/null +++ b/scripts/p1-cat.sh @@ -0,0 +1,30 @@ +#!/bin/bash +#SBATCH -A p32852 +#SBATCH -p gengpu +#SBATCH --gres=gpu:a100:1 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --time=24:00:00 +#SBATCH --mem=64G +#SBATCH --cpus-per-task=4 +#SBATCH --job-name=p1-categorization +#SBATCH --output=p1-categorization.log +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH 
--mail-user=gaughan@u.northwestern.edu + +echo "setting up the environment" + +module purge +eval "$(conda shell.bash hook)" +conda activate olmo + +echo "running the p1 categorization script" + +python /home/nws8519/git/adaptation-slr/models/p1-categorization.py + +echo "job finished, cleaning up" + +conda deactivate + +echo "job pau at: $(date)" + diff --git a/scripts/p2-ir.sh b/scripts/p2-ir.sh new file mode 100644 index 0000000..e69de29 diff --git a/scripts/quest_srun.sh b/scripts/quest_srun_OCR.sh similarity index 100% rename from scripts/quest_srun.sh rename to scripts/quest_srun_OCR.sh diff --git a/slr_ocr_logs.log b/slr_ocr_logs.log deleted file mode 100644 index c6bdec5..0000000 --- a/slr_ocr_logs.log +++ /dev/null @@ -1,6 +0,0 @@ -setting up the environment -running the pdf to json ocr conversion -ERROR:olmocr.check:pdftoppm is not installed. -ERROR:olmocr.check:Check the README in the https://github.com/allenai/olmocr/blob/main/README.md for installation instructions -job finished, cleaning up -job pau at: Tue May 20 14:29:36 CDT 2025