Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex
This commit is contained in:
		
						commit
						65deba5e4e
					
				| @@ -6,7 +6,7 @@ from os import path | ||||
| import hashlib | ||||
| 
 | ||||
| shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text | ||||
| shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text | ||||
| #shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text | ||||
| 
 | ||||
| shasums = shasums1 + shasums2 | ||||
| dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments" | ||||
|  | ||||
| @@ -1,12 +1,12 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| user_agent='nathante teblunthuis <nathante@uw.edu>' | ||||
| user_agent='"nathante teblunthuis <nathante@uw.edu>"' | ||||
| output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments' | ||||
| base_url='https://files.pushshift.io/reddit/comments/' | ||||
| 
 | ||||
| wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RC_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RC_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RC_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| 
 | ||||
| 
 | ||||
| ./check_comments_shas.py | ||||
|  | ||||
| @@ -1,14 +1,14 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| user_agent='nathante teblunthuis <nathante@uw.edu>' | ||||
| user_agent='"nathante teblunthuis <nathante@uw.edu>"' | ||||
| output_dir='/gscratch/comdata/raw_data/reddit_dumps/submissions' | ||||
| base_url='https://files.pushshift.io/reddit/submissions/' | ||||
| 
 | ||||
| wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ | ||||
| wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ | ||||
| wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ | ||||
| wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url | ||||
| wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ | ||||
| wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ | ||||
| wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ | ||||
| 
 | ||||
| ./check_submission_shas.py | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user