Created
November 30, 2014 23:20
-
-
Save raymonstah/c9d27c559a5078dc5f71 to your computer and use it in GitHub Desktop.
A reddit imgur ripper. Prevents downloading the same files repeatedly if run multiple times.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python3
# Raymond Ho
# November 29, 2014
# Downloads imgur uploads from reddit and saves them in a new directory.
# Rerun every so often to download new pictures without downloading the
# same pictures multiple times. A text file keeps track of what has
# already been downloaded.
import requests # To access Reddit | |
import urllib.request # To download files | |
import re # To find RegEx, | |
import os # To check if directory exists | |
def rip_from_reddit(subreddit='MechanicalKeyboards', post_limit=100):
    """
    Download new imgur images posted to a subreddit.

    Fetches the subreddit's JSON listing, extracts direct i.imgur.com
    image links, appends each previously-unseen link to the tracking
    file, and downloads those images into a per-subreddit directory.

    Relies on module-level globals set under __main__:
      seen_pics -- links recorded by previous runs
      file_name -- path of the tracking text file

    subreddit  -- subreddit name to scrape (default 'MechanicalKeyboards')
    post_limit -- number of posts to request from the listing (default 100)
    """
    url = ('http://www.reddit.com/r/' + subreddit + '/.json?' +
           'limit=' + str(post_limit))
    # Reddit requires a descriptive User-Agent, which belongs in the
    # request *headers*. (The old code passed a tuple via auth=, which
    # requests interprets as HTTP basic-auth credentials, not a header.)
    headers = {'User-Agent': 'Raymonds redditrip'}
    r = requests.get(url, headers=headers)
    # Raw string with escaped dots: an unescaped '.' matches any
    # character, so the old pattern would also match e.g. 'imgurXcom'.
    images = re.findall(r"http://i\.imgur\.com/\w+\.(?:jpg|gif|png)", r.text)
    unseen_pics = []  # Links to download this run, in listing order.
    # Record each genuinely new link so future runs skip it.
    with open(file_name, 'a+') as f:
        for link in images:
            # Skip links seen on earlier runs AND duplicates within this
            # listing (the same image can appear in several posts, which
            # previously caused double writes and double downloads).
            if link not in seen_pics and link not in unseen_pics:
                unseen_pics.append(link)
                f.write(link + '\n')
                print('Adding', link)
    if not unseen_pics:
        print('No new files to download.'
              '\nCheck subreddit spelling / Increase limit.')
    # Create the per-subreddit download directory if it doesn't exist.
    picdir = 'reddit_' + subreddit.lower() + '/'
    if not os.path.exists(picdir):
        os.makedirs(picdir)
    # Download everything we haven't seen already, naming each file
    # after the last path component of its URL.
    for pic in unseen_pics:
        urllib.request.urlretrieve(pic, picdir + pic[pic.rfind('/') + 1:])
        print('Downloading..', pic)
if __name__ == '__main__':
    # Links already downloaded on previous runs; rip_from_reddit reads
    # this (and file_name) as module-level globals.
    seen_pics = []
    file_name = 'redditlinks.txt'
    # Load the tracking file so reruns skip what was already downloaded;
    # on the very first run the file doesn't exist yet, so create it.
    try:
        with open(file_name, 'r') as f:
            for line in f:
                seen_pics.append(line.strip())
    except FileNotFoundError:
        # Use 'with' so the handle is closed immediately (the old code
        # called open() bare and leaked the file object).
        with open(file_name, 'w'):
            pass
    rip_from_reddit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment