104 changes: 65 additions & 39 deletions redditdownload/reddit.py
@@ -6,14 +6,17 @@
from urllib2 import urlopen, Request, HTTPError
from json import JSONDecoder

import time

def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):

def getitems(subreddit, multireddit=False, previd='', reddit_sort=None, search_timestamps=None):
"""Return list of items from a subreddit.

:param subreddit: subreddit to load posts from
:param multireddit: treat `subreddit` as a multireddit if True
:param previd: previous post id, used to fetch the next batch of posts
:param reddit_sort: type of post sorting
:param search_timestamps: pair of timestamps; perform a reddit search for posts between them
:returns: list -- list of post url

:Example:
@@ -29,6 +32,10 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
... print '\t%s - %s' % (item['title'], item['url']) # doctest: +SKIP
"""

items = []

url_args = []

if multireddit:
if '/m/' not in subreddit:
warning = ('That doesn\'t look like a multireddit. Are you sure'
@@ -43,17 +50,24 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
'Call --help for more info')
print warning
sys.exit(1)
# no sorting needed
if reddit_sort is None:
url = 'http://www.reddit.com/r/{}.json'.format(subreddit)
# if sort is top or controversial, it may include an advanced sort (i.e. week, all, etc.)
elif 'top' in reddit_sort:
url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'top')
elif 'controversial' in reddit_sort:
url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'controversial')
# use default

if search_timestamps is not None:
url = 'http://www.reddit.com/r/{}/search.json'.format(subreddit)
url_args.append(('q', 'timestamp%3A{}..{}'.format(search_timestamps[0], search_timestamps[1])))
url_args.append(('restrict_sr', 'on'))
url_args.append(('syntax', 'cloudsearch'))
else:
url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, reddit_sort)
# no sorting needed
if reddit_sort is None:
url = 'http://www.reddit.com/r/{}.json'.format(subreddit)
# if sort is top or controversial, it may include an advanced sort (i.e. week, all, etc.)
elif 'top' in reddit_sort:
url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'top')
elif 'controversial' in reddit_sort:
url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'controversial')
# use default
else:
url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, reddit_sort)
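# --- Editor's note: for illustration, subreddit='pics' together with
# search_timestamps=(1417392000, 1417996800) makes the branch above
# produce, once url_args is flattened into the query string below:
#
#     http://www.reddit.com/r/pics/search.json?q=timestamp%3A1417392000..1417996800&restrict_sr=on&syntax=cloudsearch
#
# Both the subreddit name and the timestamps are invented examples.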

# Get items after item with 'id' of previd.

@@ -62,7 +76,7 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
# this is where the query string starts
# query for posts after previd
if previd:
url = '%s?after=t3_%s' % (url, previd)
url_args.append(('after', 't3_{}'.format(previd)))
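# --- Editor's note: reddit paginates listings by "fullname", where the
# t3_ prefix marks a link/post id; e.g. an invented previd='4blope'
# yields the query pair ('after', 't3_4blope').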

# query for more advanced top and controversial sort
# available time limits: hour, day, week, month, year, all
@@ -85,35 +99,47 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
sort_type = 'controversial'

if is_advanced_sort:
# check if url already has a query string
if '?' in url.split('/')[-1]:
url += '&'
else: # url has no query string yet
url += '?'
# add advanced sort
url += 'sort={}&t={}'.format(sort_type, sort_time_limit)

try:
req = Request(url, headers=hdr)
json = urlopen(req).read()
data = JSONDecoder().decode(json)
if isinstance(data, dict):
items = [x['data'] for x in data['data']['children']]
elif isinstance(data, list):
# e.g. https://www.reddit.com/r/photoshopbattles/comments/29evni.json
items = [x['data'] for subdata in data for x in subdata['data']['children']]
items = [item for item in items if item.get('url')]
except HTTPError as ERROR:
error_message = '\tHTTP ERROR: Code %s for %s' % (ERROR.code, url)
sys.exit(error_message)
except ValueError as ERROR:
if ERROR.args[0] == 'No JSON object could be decoded':
error_message = 'ERROR: subreddit "%s" does not exist' % (subreddit)
url_args.append(('sort', sort_type))
url_args.append(('t', sort_time_limit))
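# --- Editor's note: the is_advanced_sort split itself is collapsed in
# this view; assuming it turns e.g. reddit_sort='topweek' into
# sort_type='top' and sort_time_limit='week', the request for
# subreddit='pics' would end up as
#
#     http://www.reddit.com/r/pics/top.json?sort=top&t=week
#
# ('topweek' and 'pics' are invented for the example).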

if url_args:
url += '?'
for key, value in url_args:
url += '{}={}&'.format(key, value)
url = url[:-1]
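# --- Editor's note: a minimal sketch of the same assembly via the
# Python 2 standard library, which percent-escapes values itself
# (the 'q' argument above is stored pre-escaped, so it would have to
# be kept unescaped, e.g. 'timestamp:{}..{}', before switching over):
#
#     from urllib import urlencode
#     if url_args:
#         url += '?' + urlencode(url_args)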

DONE = False
while not DONE:
try:
req = Request(url, headers=hdr)
json = urlopen(req).read()
data = JSONDecoder().decode(json)
if isinstance(data, dict):
items.extend([x['data'] for x in data['data']['children']])
elif isinstance(data, list):
# e.g. https://www.reddit.com/r/photoshopbattles/comments/29evni.json
items.extend([x['data'] for subdata in data for x in subdata['data']['children']])
items = [item for item in items if item.get('url')] # keep only posts that have an url
DONE = True
except HTTPError as ERROR:
error_message = '\tHTTP ERROR: Code %s for %s' % (ERROR.code, url)

if ERROR.code == 503:
# don't abort
print('\tGot error 503, waiting 10 seconds before resuming...')
time.sleep(10)
else:
DONE = True

except ValueError as ERROR:
if ERROR.args[0] == 'No JSON object could be decoded':
error_message = 'ERROR: subreddit "%s" does not exist' % (subreddit)
DONE = True
raise ERROR
except KeyboardInterrupt as ERROR:
error_message = '\tKeyboardInterrupt: url:{}.'.format(url)
sys.exit(error_message)
raise ERROR
except KeyboardInterrupt as ERROR:
error_message = '\tKeyboardInterrupt: url:{}.'.format(url)
sys.exit(error_message)
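# --- Editor's note: the loop above retries 503 responses forever with
# a fixed 10-second delay. A common alternative (not what this PR
# implements) is exponential backoff with a retry cap, sketched here:
#
#     delay, attempts = 10, 0
#     while attempts < 5:
#         try:
#             json = urlopen(Request(url, headers=hdr)).read()
#             break
#         except HTTPError as error:
#             if error.code != 503:
#                 raise
#             time.sleep(delay)
#             delay *= 2
#             attempts += 1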

# This is weird but apparently necessary: reddit's json data
# returns `url` values html-escaped, whereas we normally need them
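# --- Editor's note: the remainder of this hunk is not shown in the
# diff. For illustration only, unescaping an html-escaped url in
# Python 2 could look like the following (an assumption, not
# necessarily what the file actually does further down):
#
#     from HTMLParser import HTMLParser
#     item['url'] = HTMLParser().unescape(item['url'])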