changes for archiving.
datasets/Makefile (new file, 28 lines)
@@ -0,0 +1,28 @@
+all: ../../data/reddit_comments_by_subreddit.parquet ../../data/reddit_submissions_by_subreddit.parquet
+
+../../data/reddit_comments_by_subreddit.parquet: ../../data/temp/reddit_comments.parquet
+	../start_spark_and_run.sh 4 comments_2_parquet_part2.py
+
+../../data/temp/reddit_comments.parquet: comments_task_list.sh run_comments_jobs.sbatch
+	mkdir -p comments_jobs
+	mkdir -p ../../data/temp/
+	sbatch --wait --array=1-$(shell cat comments_task_list.sh | wc -l) run_comments_jobs.sbatch 0
+
+temp_reddit_comments.parquet: ../../data/temp/reddit_comments.parquet
+
+comments_task_list.sh: comments_2_parquet_part1.py
+	srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 bash -c "source ~/.bashrc && python3 comments_2_parquet_part1.py gen_task_list --overwrite=False"
+
+submissions_task_list.sh: submissions_2_parquet_part1.py
+	srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 submissions_2_parquet_part1.py gen_task_list
+
+../../data/reddit_submissions_by_subreddit.parquet: ../../data/temp/reddit_submissions.parquet
+	../start_spark_and_run.sh 4 submissions_2_parquet_part2.py
+
+../../data/temp/reddit_submissions.parquet: submissions_task_list.sh run_submissions_jobs.sbatch
+	mkdir -p submissions_jobs
+	rm -rf ../../data/temp/reddit_submissions.parquet
+	mkdir -p ../../data/temp/
+	sbatch --wait --array=1-$(shell cat submissions_task_list.sh | wc -l) run_submissions_jobs.sbatch 0
+
+temp_reddit_submissions.parquet: ../../data/temp/reddit_submissions.parquet
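The sbatch --wait rules above are what let Make drive a Slurm job array as an ordinary build step: the array is sized to the task list, and --wait blocks until every array task has exited, so the recipe only returns once all partitions are parsed. A minimal Python sketch of the same pattern (submit_array and its offset parameter are illustrative stand-ins, not part of the repository):

import subprocess
from pathlib import Path

# Sketch: size a Slurm array from a task list and submit it synchronously,
# mirroring sbatch --wait --array=1-$(shell cat ... | wc -l) in the rule above.
def submit_array(task_list, sbatch_script, offset=0):
    with open(task_list) as f:
        n_tasks = sum(1 for _ in f)   # one array task per task-list line
    Path("../../data/temp/").mkdir(parents=True, exist_ok=True)
    # --wait makes sbatch block until the whole array completes, which is
    # what makes the submission usable as a synchronous Make recipe step.
    subprocess.run(["sbatch", "--wait", f"--array=1-{n_tasks}",
                    sbatch_script, str(offset)], check=True)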
datasets/comments_2_parquet_part1.py (modified)
@@ -47,11 +47,11 @@ def parse_comment(comment, names= None):
     return tuple(row)
 
 
-# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')])
+# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','../../data/spark_tmp')])
 
 def parse_dump(partition):
 
-    dumpdir = f"/gscratch/comdata/raw_data/reddit_dumps/comments/{partition}"
+    dumpdir = f"../../data/reddit_dumps/comments/{partition}"
 
     stream = open_input_file(dumpdir)
     rows = map(parse_comment, stream)
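The unchanged lines at the end of this hunk set up a fully lazy read path: open_input_file yields lines from the compressed dump and map applies parse_comment one line at a time, so no dump is ever held in memory at once. A self-contained sketch of that pattern, with open_dump and parse_line as hypothetical stand-ins for open_input_file and parse_comment:

import bz2

# Sketch of the lazy read pipeline: generators end to end, so memory use
# stays bounded no matter how large the dump file is.
def open_dump(path):
    # Reddit dumps are compressed newline-delimited JSON; yield decoded lines.
    with bz2.open(path, "rt") as f:
        yield from f

def parse_line(line):
    # Stand-in parser: pull out whatever fields the output schema needs.
    return (line.strip()[:40],)

def rows(path):
    # Nothing is read until the caller starts iterating.
    return map(parse_line, open_dump(path))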
@@ -76,11 +76,11 @@ def parse_dump(partition):
         pa.field('error', pa.string(), nullable=True),
     ])
 
-    p = Path("/gscratch/comdata/output/temp/reddit_comments.parquet")
+    p = Path("../../data/temp/reddit_comments.parquet")
     p.mkdir(exist_ok=True,parents=True)
 
     N=10000
-    with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet",
+    with pq.ParquetWriter(f"../../data/temp/reddit_comments.parquet/{partition}.parquet",
                           schema=schema,
                           compression='snappy',
                           flavor='spark') as writer:
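Around this hunk (unchanged here) the parsed rows are written in chunks of N=10000, each chunk becoming one Parquet row group, so memory stays bounded while a whole dump streams through. A minimal sketch of that batched-writer pattern; the two-field schema and dict-shaped rows are stand-ins for the real ones:

import itertools
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([pa.field('id', pa.string()),
                    pa.field('body', pa.string())])

# Sketch: drain an unbounded row iterator into Parquet in fixed-size chunks,
# emitting one row group per chunk.
def write_batched(rows, outpath, N=10000):
    with pq.ParquetWriter(outpath, schema,
                          compression='snappy', flavor='spark') as writer:
        while True:
            chunk = list(itertools.islice(rows, N))
            if not chunk:
                break
            # Rows here are dicts keyed by field name; from_pylist builds a
            # table matching the declared schema.
            writer.write_table(pa.Table.from_pylist(chunk, schema=schema))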
@@ -96,12 +96,12 @@ def parse_dump(partition):
     writer.close()
 
 
-def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/comments", overwrite=True):
+def gen_task_list(dumpdir="../../data/raw_data/reddit_dumps/comments", overwrite=True):
     files = list(find_dumps(dumpdir,base_pattern="RC_20*.*"))
     with open("comments_task_list.sh",'w') as of:
         for fpath in files:
             partition = os.path.split(fpath)[1]
-            if (not Path(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True):
+            if (not Path(f"../../data/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True):
                 of.write(f'python3 comments_2_parquet_part1.py parse_dump {partition}\n')
 
 
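The task-list entries invoke the module as python3 comments_2_parquet_part1.py parse_dump {partition}, and the Makefile calls gen_task_list the same way, which implies a CLI dispatcher mapping subcommand names to these functions. A hypothetical sketch using python-fire (an assumption; the repository may wire its CLI differently):

import fire

def parse_dump(partition):
    ...  # stream one dump partition into Parquet, as in the hunks above

def gen_task_list(dumpdir="../../data/raw_data/reddit_dumps/comments", overwrite=True):
    ...  # regenerate comments_task_list.sh, as in the hunk above

# fire.Fire exposes each dict entry as a subcommand with keyword flags,
# which matches invocations like gen_task_list --overwrite=False.
if __name__ == '__main__':
    fire.Fire({'parse_dump': parse_dump,
               'gen_task_list': gen_task_list})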
@@ -1,6 +0,0 @@
|
||||
#!/usr/bin/bash
|
||||
source ~/.bashrc
|
||||
echo $(hostname)
|
||||
start_spark_cluster.sh
|
||||
spark-submit --verbose --master spark://$(hostname):43015 submissions_2_parquet_part2.py
|
||||
stop-all.sh
|
||||
datasets/run_comments_jobs.sbatch (new file, 24 lines)
@@ -0,0 +1,24 @@
+#!/bin/bash
+## tf reddit comments
+#SBATCH --job-name="cdsc_reddit; parse comment dumps"
+## Allocation Definition
+#SBATCH --account=comdata
+#SBATCH --partition=compute-bigmem
+## Resources
+## Nodes. This should always be 1 for parallel-sql.
+#SBATCH --nodes=1
+## Walltime (24 hours)
+#SBATCH --time=24:00:00
+## Memory per node
+#SBATCH --mem=8G
+#SBATCH --cpus-per-task=1
+#SBATCH --ntasks=1
+#SBATCH
+#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/datasets
+#SBATCH --output=comments_jobs/%A_%a.out
+#SBATCH --error=comments_jobs/%A_%a.out
+. /opt/ohpc/admin/lmod/lmod/init/profile
+source ~/.bashrc
+TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
+TASK_CALL=$(sed -n ${TASK_NUM}p ./comments_task_list.sh)
+${TASK_CALL}
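The last three lines are the array-dispatch idiom: Slurm exports SLURM_ARRAY_TASK_ID, the script offsets it by its first argument, pulls that line out of the task list with sed, and executes it. The same logic restated in Python, purely for clarity (not part of the repository):

import os
import subprocess

# Equivalent of the TASK_NUM/TASK_CALL lines above: pick line
# (SLURM_ARRAY_TASK_ID + offset) from the task list and run it.
def run_array_task(task_list="./comments_task_list.sh", offset=0):
    task_num = int(os.environ["SLURM_ARRAY_TASK_ID"]) + offset
    with open(task_list) as f:
        line = f.read().splitlines()[task_num - 1]   # sed -n "${TASK_NUM}p"
    subprocess.run(line, shell=True, check=True)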
datasets/run_submissions_jobs.sbatch (new file, 23 lines)
@@ -0,0 +1,23 @@
+#!/bin/bash
+## tf reddit submissions
+#SBATCH --job-name="cdsc_reddit; parse submission dumps"
+## Allocation Definition
+#SBATCH --account=comdata-ckpt
+#SBATCH --partition=ckpt
+## Resources
+## Nodes. This should always be 1 for parallel-sql.
+#SBATCH --nodes=1
+## Walltime (24 hours)
+#SBATCH --time=24:00:00
+## Memory per node
+#SBATCH --mem=8G
+#SBATCH --cpus-per-task=1
+#SBATCH --ntasks=1
+#SBATCH
+#SBATCH --chdir /gscratch/comdata/users/nathante/cdsc_reddit/datasets
+#SBATCH --output=submissions_jobs/%A_%a.out
+#SBATCH --error=submissions_jobs/%A_%a.out
+
+TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
+TASK_CALL=$(sed -n ${TASK_NUM}p ./submissions_task_list.sh)
+${TASK_CALL}