
hopefully last update to human sampling

This commit is contained in:
mgaughan 2025-09-16 12:16:10 -05:00
parent 89969daab5
commit bb67fea96b
7 changed files with 1198 additions and 1571 deletions

View File

@@ -0,0 +1,10 @@
setting up the environment by loading in conda environment at Tue Sep 16 11:46:51 CDT 2025
running the batched olmo categorization job at Tue Sep 16 11:46:51 CDT 2025
[nltk_data] Downloading package punkt_tab to
[nltk_data] /home/nws8519/nltk_data...
[nltk_data] Package punkt_tab is already up-to-date!
cuda
NVIDIA A100-SXM4-80GB
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=b6c5753c-65f3-91cd-dd90-e56a02d2cf99, L2_cache_size=40MB)
Loading checkpoint shards: 100%|██████████| 12/12 [00:06<00:00, 1.82it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -71,6 +71,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
     text_dict['task_title'] = row[1]
     text_dict['comment_text'] = row[2]
     text_dict['comment_type'] = row[12]
+    text_dict['TaskPHID'] = row[11]
     #making sure the comment title is included in things
     if text_dict['comment_type'] == "task_description":
         raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
@@ -123,8 +124,12 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
     '''
     array_of_categorizations.append(text_dict)
 df = pd.DataFrame(array_of_categorizations)
-random_df = df.sample(n=300, random_state=8)
-random_df.to_csv('091625_human_text_sample.csv', index=False)
+
+#taking a random sample of 25 task discussions
+unique_tasks = df['TaskPHID'].unique()
+sampled_tasks = pd.Series(unique_tasks).sample(n=25, random_state=8)
+random_df = df[df['TaskPHID'].isin(sampled_tasks)]
+random_df.to_csv('091625_human_conversation_sample.csv', index=False)
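
In plain terms, this change switches the unit of sampling from individual comments (300 random rows) to whole task discussions: it samples 25 unique TaskPHID values and keeps every comment row belonging to those tasks. A minimal self-contained sketch of the same pattern, assuming the column names from the diff (the toy frame below is hypothetical):

import pandas as pd

# hypothetical stand-in for the full comment dataframe
df = pd.DataFrame({
    'TaskPHID': ['T1', 'T1', 'T2', 'T3', 'T3'],
    'comment_text': ['a', 'b', 'c', 'd', 'e'],
})

# sample task IDs, not rows, so each sampled task keeps all of its comments
unique_tasks = df['TaskPHID'].unique()
sampled_tasks = pd.Series(unique_tasks).sample(n=2, random_state=8)
conversation_sample = df[df['TaskPHID'].isin(sampled_tasks)]
print(conversation_sample)  # every comment from the 2 sampled tasks

Fixing random_state keeps the draw reproducible across reruns, which matters when the sample is handed off for human labeling.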

View File

@@ -23,8 +23,8 @@ conda activate olmo
 echo "running the batched olmo categorization job at $(date)"
-# python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py
-python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/label_sampling.py
+python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py
+#python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/label_sampling.py
 echo "unsupervised batched olmo categorization pau at $(date)"

View File

@ -0,0 +1,30 @@
#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
#SBATCH --constraint=sxm
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=48:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=sampling-MW-info-typology
#SBATCH --output=sampling-mw-olmo-info-cat.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
module purge
eval "$(conda shell.bash hook)"
echo "setting up the environment by loading in conda environment at $(date)"
conda activate olmo
echo "running the sampling job at $(date)"
#python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py
python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/label_sampling.py
echo "sampling pau at $(date)"

View File

@@ -0,0 +1,10 @@
setting up the environment by loading in conda environment at Tue Sep 16 12:11:09 CDT 2025
running the sampling job at Tue Sep 16 12:11:09 CDT 2025
[nltk_data] Downloading package punkt_tab to
[nltk_data] /home/nws8519/nltk_data...
[nltk_data] Package punkt_tab is already up-to-date!
cuda
NVIDIA A100-SXM4-80GB
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=342f78d3-1877-6c6d-fb71-9a90e928d24e, L2_cache_size=40MB)
Loading checkpoint shards: 100%|██████████| 12/12 [00:07<00:00, 1.64it/s]
sampling pau at Tue Sep 16 12:14:57 CDT 2025