From cb2fe737cdae92e822f829efc57411e61b563fd9 Mon Sep 17 00:00:00 2001
From: mgaughan
Date: Sat, 11 Oct 2025 07:38:11 -0500
Subject: [PATCH] updating batching script, preparing for run

---
 p2/quest/101025-batched-mw-olmo-info-cat.log    | 15 +++++++++++++++
 .../python_scripts/090425_batched_olmo_cat.py   | 13 +++++++++----
 p2/quest/slurm_jobs/090425_olmo_batched_cat.sh  |  2 +-
 3 files changed, 25 insertions(+), 5 deletions(-)
 create mode 100644 p2/quest/101025-batched-mw-olmo-info-cat.log

diff --git a/p2/quest/101025-batched-mw-olmo-info-cat.log b/p2/quest/101025-batched-mw-olmo-info-cat.log
new file mode 100644
index 0000000..33e557d
--- /dev/null
+++ b/p2/quest/101025-batched-mw-olmo-info-cat.log
@@ -0,0 +1,15 @@
+setting up the environment by loading in conda environment at Sat Oct 11 00:24:37 CDT 2025
+running the batched olmo categorization job at Sat Oct 11 00:24:37 CDT 2025
+[nltk_data] Downloading package punkt_tab to
+[nltk_data]     /home/nws8519/nltk_data...
+[nltk_data]   Package punkt_tab is already up-to-date!
+cuda
+NVIDIA A100-SXM4-80GB
+_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=393ab5c3-2bcb-e4c6-52ad-eb4896a9d4fe, L2_cache_size=40MB)
+Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]
+Traceback (most recent call last):
+    with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", mode='r', newline='') as file:
+         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+FileNotFoundError: [Errno 2] No such file or directory: '/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv'
+unsupervised batched olmo categorization pau at Sat Oct 11 00:27:22 CDT 2025
diff --git a/p2/quest/python_scripts/090425_batched_olmo_cat.py b/p2/quest/python_scripts/090425_batched_olmo_cat.py
index 8066565..b81e8e6 100644
--- a/p2/quest/python_scripts/090425_batched_olmo_cat.py
+++ b/p2/quest/python_scripts/090425_batched_olmo_cat.py
@@ -40,6 +40,8 @@ TYPOLOGY:
 [[TASK PROGRESS]], in which stakeholders request or report progress of tasks and sub-tasks towards the solution of the issue. For example, “I made an initial stab at it... - this is just a proof of concept that gets the version string into nodejs. I’ll start working on adding the swig interfaces...”
 
 [[TESTING]], in which participants discuss the testing procedure and results, as well as the system environment, code, data, and feedback involved in testing. For example, “Tested on ‘0.101’ and ‘master’ - the issue seems to be fixed on ‘master’ not just for the example document, but for the entire corpus...”
+
+[[NA]], in which the sentence contents are entirely incomprehensible or only consist of punctuation or numerals. For example, "***", "ve-ce-protectedNode", or "T8597".
 
 [[FUTURE PLAN]], in which participants discuss the long-term plan related to the issue; such plans usually involve work/ideas that are not required to close the current issue.
 For example, “For the futures, stay tuned, as we’re prototyping something in this direction.”
@@ -57,7 +59,7 @@
 """
 instructions="The sentence's category is: "
 
-with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", mode='r', newline='') as file:
+with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", mode='r', newline='') as file:
     reader = csv.reader(file)
     array_of_categorizations = []
     index = -1
@@ -70,7 +72,10 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
             text_dict['id'] = row[0]
             text_dict['task_title'] = row[1]
             text_dict['comment_text'] = row[2]
-            text_dict['comment_type'] = row[12]
+            text_dict['date_created'] = row[3]
+            text_dict['comment_type'] = row[6]
+            text_dict['TaskPHID'] = row[5]
+            text_dict['AuthorPHID'] = row[4]
             if text_dict['comment_type'] == "task_description":
                 raw_text = text_dict['task_title'] + ". \n\n" + text_dict['comment_text']
             else:
@@ -102,7 +107,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
         batch = comment_sentences[i:i+batch_size]
         prompts = []
         for sent in batch:
-            given_data = f"**GIVEN SENTENCE: \n ' Type -text_dict['comment_type'] \n Text -{sent}**'\n"
+            given_data = f"**GIVEN SENTENCE: \n ' Type -text_dict['task_title'] \n Text -{sent}**'\n"
             prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}"
             prompts.append(prompt)
         inputs = tokenizer(prompts, return_tensors='pt', return_token_type_ids=False, padding=True, truncation=True).to(device)
@@ -122,7 +127,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
         array_of_categorizations.append(text_dict)
     df = pd.DataFrame(array_of_categorizations)
     #print(df.head())
-    df.to_csv('all_092225_olmo_batched_categorized.csv', index=False)
+    df.to_csv('all_101025_olmo_batched_categorized.csv', index=False)
 
 
 
diff --git a/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh b/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh
index c251c09..098bd97 100644
--- a/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh
+++ b/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh
@@ -9,7 +9,7 @@
 #SBATCH --mem=64G
 #SBATCH --cpus-per-task=4
 #SBATCH --job-name=batched-MW-info-typology
-#SBATCH --output=batched-mw-olmo-info-cat.log
+#SBATCH --output=101025-batched-mw-olmo-info-cat.log
 #SBATCH --mail-type=BEGIN,END,FAIL
 #SBATCH --mail-user=gaughan@u.northwestern.edu
 