hopefully last update to human sampling

commit bb67fea96b (parent 89969daab5)
batched-mw-olmo-info-cat.log (new file, +10 lines)
@@ -0,0 +1,10 @@
+setting up the environment by loading in conda environment at Tue Sep 16 11:46:51 CDT 2025
+running the batched olmo categorization job at Tue Sep 16 11:46:51 CDT 2025
+[nltk_data] Downloading package punkt_tab to
+[nltk_data]     /home/nws8519/nltk_data...
+[nltk_data]   Package punkt_tab is already up-to-date!
+cuda
+NVIDIA A100-SXM4-80GB
+_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=b6c5753c-65f3-91cd-dd90-e56a02d2cf99, L2_cache_size=40MB)
+Loading checkpoint shards: 100%|██████████| 12/12 [00:06<00:00, 1.82it/s]
+Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
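The last log line is the standard Hugging Face tokenizer warning. A minimal sketch of how it could be silenced, assuming the script tokenizes with a transformers AutoTokenizer; the checkpoint name and length bound are placeholders, not taken from the commit:

import torch
from transformers import AutoTokenizer

# Hypothetical fix for the warning above: give the tokenizer an explicit
# truncation bound instead of relying on a model-defined maximum length.
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-7B")  # placeholder checkpoint

raw_text = "Task title\n\nComment body"  # stand-in for the task title + comment text
inputs = tokenizer(
    raw_text,
    return_tensors="pt",
    truncation=True,   # explicitly enable truncation ...
    max_length=4096,   # ... with a concrete bound, silencing the warning
)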
dsl/091625_human_conversation_sample.csv (new file, +1139 lines)
(File diff suppressed because it is too large.)
@@ -71,6 +71,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
 text_dict['task_title'] = row[1]
 text_dict['comment_text'] = row[2]
 text_dict['comment_type'] = row[12]
+text_dict['TaskPHID'] = row[11]
 #making sure the comment title is included in things
 if text_dict['comment_type'] == "task_description":
     raw_text = text_dict['task_title'] + "\n\n" + text_dict['comment_text']
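For context, a self-contained sketch of the loop this hunk extends, reconstructed from the diff context alone; the input path, header row, and overall csv layout are assumptions:

import csv

# Reconstruction of the surrounding loop (assumed), showing where the new
# TaskPHID field slots into text_dict; the column indices come from the diff.
array_of_categorizations = []
with open("comments.csv", newline="") as f:  # placeholder path
    reader = csv.reader(f)
    next(reader)  # assume the real file has a header row
    for row in reader:
        text_dict = {
            "task_title": row[1],
            "comment_text": row[2],
            "TaskPHID": row[11],      # new in this commit
            "comment_type": row[12],
        }
        array_of_categorizations.append(text_dict)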
@@ -123,8 +124,12 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_
 '''
 array_of_categorizations.append(text_dict)
 df = pd.DataFrame(array_of_categorizations)
-random_df = df.sample(n=300, random_state=8)
-random_df.to_csv('091625_human_text_sample.csv', index=False)
+#taking a random sample of 25 task discussions
+unique_tasks = df['TaskPHID'].unique()
+sampled_tasks = pd.Series(unique_tasks).sample(n=25, random_state=8)
+random_df = df[df['TaskPHID'].isin(sampled_tasks)]
+random_df.to_csv('091625_human_conversation_sample.csv', index=False)
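The switch from sampling 300 individual comments to sampling whole task discussions is the core of this commit. A runnable sketch of the new logic on toy data (n=2 here stands in for the commit's n=25):

import pandas as pd

# Toy stand-in for the comment-level DataFrame built above.
df = pd.DataFrame({
    "TaskPHID": ["T1", "T1", "T2", "T3", "T3", "T3"],
    "comment_text": ["a", "b", "c", "d", "e", "f"],
})

# Sample task IDs, not rows, so each sampled discussion stays intact.
unique_tasks = df["TaskPHID"].unique()
sampled_tasks = pd.Series(unique_tasks).sample(n=2, random_state=8)  # n=25 in the commit
random_df = df[df["TaskPHID"].isin(sampled_tasks)]  # every comment on the sampled tasks
random_df.to_csv("091625_human_conversation_sample.csv", index=False)

Sampling at the TaskPHID level keeps conversations whole, whereas the old df.sample(n=300) drew comments independently of their threads.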
@@ -23,8 +23,8 @@ conda activate olmo
 
 echo "running the batched olmo categorization job at $(date)"
 
-# python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py
+python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py
 
-python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/label_sampling.py
+#python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/label_sampling.py
 
 echo "unsupervised batched olmo categorization pau at $(date)"
p2/quest/slurm_jobs/091625_sampling.sh (new file, +30 lines)
@@ -0,0 +1,30 @@
+#!/bin/bash
+#SBATCH -A p32852
+#SBATCH -p gengpu
+#SBATCH --gres=gpu:a100:1
+#SBATCH --constraint=sxm
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=1
+#SBATCH --time=48:00:00
+#SBATCH --mem=64G
+#SBATCH --cpus-per-task=4
+#SBATCH --job-name=sampling-MW-info-typology
+#SBATCH --output=sampling-mw-olmo-info-cat.log
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=gaughan@u.northwestern.edu
+
+module purge
+
+eval "$(conda shell.bash hook)"
+
+echo "setting up the environment by loading in conda environment at $(date)"
+
+conda activate olmo
+
+echo "running the sampling job at $(date)"
+
+#python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py
+
+python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/label_sampling.py
+
+echo "sampling pau at $(date)"
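A usage note (an assumption, not part of the commit): on a Slurm cluster this job would be submitted with sbatch p2/quest/slurm_jobs/091625_sampling.sh, and the --output directive above is what produces the sampling-mw-olmo-info-cat.log file added below.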
sampling-mw-olmo-info-cat.log (new file, +10 lines)
@@ -0,0 +1,10 @@
+setting up the environment by loading in conda environment at Tue Sep 16 12:11:09 CDT 2025
+running the sampling job at Tue Sep 16 12:11:09 CDT 2025
+[nltk_data] Downloading package punkt_tab to
+[nltk_data]     /home/nws8519/nltk_data...
+[nltk_data]   Package punkt_tab is already up-to-date!
+cuda
+NVIDIA A100-SXM4-80GB
+_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=342f78d3-1877-6c6d-fb71-9a90e928d24e, L2_cache_size=40MB)
+Loading checkpoint shards: 100%|██████████| 12/12 [00:07<00:00, 1.64it/s]
+sampling pau at Tue Sep 16 12:14:57 CDT 2025
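The cuda / device-name / _CudaDeviceProperties lines in both logs match the output of PyTorch's device-introspection calls. A minimal sketch of the probe that would print them (an assumption about the script, though the calls themselves are standard torch APIs):

import torch

# Print the active device type, then the GPU's name and full properties,
# mirroring the three diagnostic lines in the logs above.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
if device == "cuda":
    print(torch.cuda.get_device_name(0))        # e.g. NVIDIA A100-SXM4-80GB
    print(torch.cuda.get_device_properties(0))  # _CudaDeviceProperties(...)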