Update reddit comments data with daily dumps.
parent 2740f55915
commit 4ced659d19

check_comments_shas.py  4  (Normal file → Executable file)

check_comments_shas.py
@@ -5,8 +5,10 @@ import requests
 from os import path
 import hashlib
 
-shasums = requests.get("https://files.pushshift.io/reddit/comments/sha256sums.txt").text
+shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text
+shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
 
+shasums = shasums1 + shasums2
 dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments"
 
 for l in shasums.strip().split('\n'):
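
The rest of check_comments_shas.py falls outside this hunk. Purely as an illustrative sketch (assumed, not the repository's actual code), a verification loop that consumes the combined shasums text could look like the following, assuming each line uses sha256sum's "<hex digest>  <filename>" output format:

# Rough sketch only: verify each downloaded dump against the combined
# checksum list fetched above. Everything past the diffed lines is assumed.
from os import path
import hashlib
import requests

shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text
shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
shasums = shasums1 + shasums2
dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments"

for l in shasums.strip().split('\n'):
    # sha256sum line format: "<hex digest>  <filename>"
    sha256, fname = l.split()
    fpath = path.join(dumpdir, fname)
    if not path.isfile(fpath):
        continue  # that dump hasn't been downloaded yet; nothing to verify
    h = hashlib.sha256()
    with open(fpath, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    if h.hexdigest() != sha256:
        print(f"checksum mismatch: {fname}")
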
@@ -1,9 +1,6 @@
 ## needs to be run by hand since i don't have a nice way of waiting on a parallel-sql job to complete 
 
 #!/usr/bin/env bash
-
-
-
 echo "#!/usr/bin/bash" > job_script.sh
 echo "source $(pwd)/../bin/activate" >> job_script.sh
 echo "python3 $(pwd)/comments_2_parquet_part1.py" >> job_script.sh
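
For reference, the echo lines above generate job_script.sh; for a hypothetical checkout at /path/to/repo, the generated script would begin:

#!/usr/bin/bash
source /path/to/repo/../bin/activate
python3 /path/to/repo/comments_2_parquet_part1.py

The hunk is truncated, so any later lines written to job_script.sh are not shown here.
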
@@ -40,6 +40,8 @@ def open_input_file(input_filename):
         cmd = ["xzcat",'-dk', '-T 20',input_filename]
     elif re.match(r'.*\.zst',input_filename):
         cmd = ['zstd','-dck', input_filename]
+    elif re.match(r'.*\.gz',input_filename):
+        cmd = ['gzip','-dc', input_filename]
     try:
         input_file = Popen(cmd, stdout=PIPE).stdout
     except NameError as e:
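
Only the lines around the new .gz branch are visible in the hunk above. A self-contained sketch of the extension-dispatch pattern it extends might look like this; the parts outside the diffed lines are assumptions, not the repository's exact code:

# Rough sketch of open_input_file's dispatch pattern: pick an external
# decompressor by file extension and stream its stdout.
import re
from subprocess import Popen, PIPE

def open_input_file(input_filename):
    if re.match(r'.*\.xz', input_filename):
        cmd = ["xzcat", '-dk', '-T 20', input_filename]
    elif re.match(r'.*\.zst', input_filename):
        cmd = ['zstd', '-dck', input_filename]
    elif re.match(r'.*\.gz', input_filename):
        # new in this commit: the daily pushshift dumps are gzip-compressed
        cmd = ['gzip', '-dc', input_filename]
    else:
        raise ValueError(f"no decompressor known for {input_filename}")
    # stream the decompressor's stdout so the dump is never stored uncompressed
    return Popen(cmd, stdout=PIPE).stdout

Iterating the returned pipe yields the decompressed dump line by line (as bytes), which is presumably how the downstream parquet-conversion scripts consume it.
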
@@ -4,8 +4,11 @@ user_agent='nathante teblunthuis <nathante@uw.edu>'
 output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments'
 base_url='https://files.pushshift.io/reddit/comments/'
 
-wget -r --no-parent -A 'RC_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
-wget -r --no-parent -A 'RC_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
-wget -r --no-parent -A 'RC_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
 
-./check_comment_shas.py
+# starting in 2020 we use daily dumps not monthly dumps
+wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/
+
+./check_comments_shas.py
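
A note on the wget flags used throughout this script: -r with --no-parent crawls the directory listing without ascending above it, -A restricts downloads to filenames matching the given pattern, -U sets the user agent, -P sets the local download directory, -nd keeps the files flat instead of recreating the remote directory tree, and -nc skips files that already exist locally, which is what makes the script safe to re-run as an incremental update.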