From f1b02e695b3e150cebe6849bb491fb7f68b74b95 Mon Sep 17 00:00:00 2001
From: Valentin
Date: Thu, 6 Apr 2017 01:05:46 -0400
Subject: [PATCH] Added option to download images in a specific time range.

The range can be very large, allowing you to bypass the 1000-post limit and
download virtually every post of a subreddit. Simply call with
'--timestamps 1206826595 1342905912', for example, where the two numbers are
UNIX timestamps.
---
 redditdownload/reddit.py         | 104 ++++++----
 redditdownload/redditdownload.py | 337 +++++++++++++++++--------------
 2 files changed, 251 insertions(+), 190 deletions(-)

diff --git a/redditdownload/reddit.py b/redditdownload/reddit.py
index 9130af7..4451787 100755
--- a/redditdownload/reddit.py
+++ b/redditdownload/reddit.py
@@ -6,14 +6,17 @@
 from urllib2 import urlopen, Request, HTTPError
 from json import JSONDecoder
+import time


-def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
+
+def getitems(subreddit, multireddit=False, previd='', reddit_sort=None, search_timestamps=None):
     """Return list of items from a subreddit.

     :param subreddit: subreddit to load the post
     :param multireddit: multireddit if given instead subreddit
     :param previd: previous post id, to get more post
     :param reddit_sort: type of sorting post
+    :param search_timestamps: performs a reddit search between two timestamps
     :returns: list -- list of post url

     :Example:
@@ -29,6 +32,10 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
         ...     print '\t%s - %s' % (item['title'], item['url'])  # doctest: +SKIP

     """
+    items = []
+
+    url_args = []
+
     if multireddit:
         if '/m/' not in subreddit:
             warning = ('That doesn\'t look like a multireddit. Are you sure'
                        'you have a multireddit?'
                        'Call --help for more info')
             print warning
             sys.exit(1)
-    # no sorting needed
-    if reddit_sort is None:
-        url = 'http://www.reddit.com/r/{}.json'.format(subreddit)
-    # if sort is top or controversial, may include advanced sort (ie week, all etc)
-    elif 'top' in reddit_sort:
-        url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'top')
-    elif 'controversial' in reddit_sort:
-        url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'controversial')
-    # use default
+
+    if search_timestamps is not None:
+        url = 'http://www.reddit.com/r/{}/search.json'.format(subreddit)
+        url_args.append(('q', 'timestamp%3A{}..{}'.format(search_timestamps[0], search_timestamps[1])))
+        url_args.append(('restrict_sr', 'on'))
+        url_args.append(('syntax', 'cloudsearch'))
     else:
-        url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, reddit_sort)
+        # no sorting needed
+        if reddit_sort is None:
+            url = 'http://www.reddit.com/r/{}.json'.format(subreddit)
+        # if sort is top or controversial, may include advanced sort (i.e. week, all, etc.)
+        elif 'top' in reddit_sort:
+            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'top')
+        elif 'controversial' in reddit_sort:
+            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, 'controversial')
+        # use default
+        else:
+            url = 'http://www.reddit.com/r/{}/{}.json'.format(subreddit, reddit_sort)

     # Get items after item with 'id' of previd.
@@ -62,7 +76,7 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
     # here where is query start
     # query for previd comment
     if previd:
-        url = '%s?after=t3_%s' % (url, previd)
+        url_args.append(('after', 't3_{}'.format(previd)))

     # query for more advanced top and controversial sort
     # available extension : hour, day, week, month, year, all
@@ -85,35 +99,47 @@ def getitems(subreddit, multireddit=False, previd='', reddit_sort=None):
             sort_type = 'controversial'

     if is_advanced_sort:
-        # check if url have already query
-        if '?' in url.split('/')[-1]:
-            url += '&'
-        else:  # url dont have query yet
-            url += '?'
-        # add advanced sort
-        url += 'sort={}&t={}'.format(sort_type, sort_time_limit)
-
-    try:
-        req = Request(url, headers=hdr)
-        json = urlopen(req).read()
-        data = JSONDecoder().decode(json)
-        if isinstance(data, dict):
-            items = [x['data'] for x in data['data']['children']]
-        elif isinstance(data, list):
-            # e.g. https://www.reddit.com/r/photoshopbattles/comments/29evni.json
-            items = [x['data'] for subdata in data for x in subdata['data']['children']]
-        items = [item for item in items if item.get('url')]
-    except HTTPError as ERROR:
-        error_message = '\tHTTP ERROR: Code %s for %s' % (ERROR.code, url)
-        sys.exit(error_message)
-    except ValueError as ERROR:
-        if ERROR.args[0] == 'No JSON object could be decoded':
-            error_message = 'ERROR: subreddit "%s" does not exist' % (subreddit)
+        url_args.append(('sort', sort_type))
+        url_args.append(('t', sort_time_limit))
+
+    if len(url_args) > 0:
+        url += '?'
+        for a in url_args:
+            url += '{}={}&'.format(a[0], a[1])
+        url = url[:-1]
+
+    DONE = False
+    while not DONE:
+        try:
+            req = Request(url, headers=hdr)
+            json = urlopen(req).read()
+            data = JSONDecoder().decode(json)
+            if isinstance(data, dict):
+                items.extend([x['data'] for x in data['data']['children']])
+            elif isinstance(data, list):
+                # e.g. https://www.reddit.com/r/photoshopbattles/comments/29evni.json
+                items.extend([x['data'] for subdata in data for x in subdata['data']['children']])
+            items = [item for item in items if item.get('url')]
+            DONE = True
+        except HTTPError as ERROR:
+            error_message = '\tHTTP ERROR: Code %s for %s' % (ERROR.code, url)
+
+            if ERROR.code == 503:
+                # don't abort
+                print('\tGot error 503, waiting 10 seconds before resuming...')
+                time.sleep(10)
+            else:
+                print(error_message)
+                DONE = True
+
+        except ValueError as ERROR:
+            if ERROR.args[0] == 'No JSON object could be decoded':
+                error_message = 'ERROR: subreddit "%s" does not exist' % (subreddit)
+                sys.exit(error_message)
+            raise ERROR
+        except KeyboardInterrupt as ERROR:
+            error_message = '\tKeyboardInterrupt: url:{}.'.format(url)
             sys.exit(error_message)
-        raise ERROR
-    except KeyboardInterrupt as ERROR:
-        error_message = '\tKeyboardInterrupt: url:{}.'.format(url)
-        sys.exit(error_message)

     # This is weird but apparently necessary: reddit's json data
     # returns `url` values html-escaped, whereas we normally need them
diff --git a/redditdownload/redditdownload.py b/redditdownload/redditdownload.py
index 6c807c1..1ea5bde 100755
--- a/redditdownload/redditdownload.py
+++ b/redditdownload/redditdownload.py
@@ -16,6 +16,7 @@
                     splitext as pathsplitext)
 from os import mkdir, getcwd
 import time
+import math

 from .gfycat import gfycat
 from .reddit import getitems
@@ -253,6 +254,8 @@ def parse_args(args):
                         help='Minimum score of images to download.')
     PARSER.add_argument('--num', metavar='n', default=1000, type=int, required=False,
                         help='Number of images to download. Set to 0 to disable the limit')
+    PARSER.add_argument('--timestamps', metavar='ts', default=None, type=int, required=False, nargs='*',
+                        help='Perform a reddit search between two UNIX timestamps')
     PARSER.add_argument('--update', default=False, action='store_true', required=False,
                         help='Run until you encounter a file already downloaded.')
     PARSER.add_argument('--sfw', default=False, action='store_true', required=False,
@@ -302,7 +305,6 @@ def main():
     print(parse_reddit_argument(ARGS.reddit))

     TOTAL = DOWNLOADED = ERRORS = SKIPPED = FAILED = 0
-    FINISHED = False

     # Create the specified directory if it doesn't already exist.
     if not pathexists(ARGS.dir):
@@ -325,167 +327,200 @@ def main():
     if sort_type:
         sort_type = sort_type.lower()

-    while not FINISHED:
-        ITEMS = getitems(
-            ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST,
-            reddit_sort=sort_type)
+    timestamps = ARGS.timestamps
+    if timestamps is not None and len(timestamps) != 2:
+        print('Error: --timestamps requires exactly 2 values!')
+        exit(1)

-        # measure time and set the program to wait 4 second between request
-        # as per reddit api guidelines
-        end_time = time.clock()
+    # Divide the interval into small sub-intervals of 3 days each.
+    # If a sub-interval were larger, not all of its items might be returned.
+    three_days_diff = 3 * 86400
+    if timestamps is None:
+        subintervals = 1
+    else:
+        subintervals = int(math.ceil((timestamps[1] - timestamps[0]) / float(three_days_diff)))
+        print('Dividing timestamp interval into {} small sub-intervals'.format(subintervals))

+    for si in range(subintervals):
+        if timestamps is not None:
+            # search from newest to oldest
+            interval = (max(timestamps[0], timestamps[1] - (si + 1) * three_days_diff),
+                        timestamps[1] - si * three_days_diff)
+            if si >= 1:
+                LAST = None
+
+            print('Search in sub-interval [{}, {}]'.format(interval[0], interval[1]))
+        else:
+            interval = None

-        if start_time is not None:
-            elapsed_time = end_time - start_time
+        FINISHED = False

-            if elapsed_time <= 4:  # throttling
-                time.sleep(4 - elapsed_time)
+        while not FINISHED:
+            ITEMS = getitems(
+                ARGS.reddit, multireddit=ARGS.multireddit, previd=LAST,
+                reddit_sort=sort_type,
+                search_timestamps=interval)

-        start_time = time.clock()
+            # measure time and set the program to wait 4 seconds between requests
+            # as per reddit api guidelines
+            end_time = time.clock()

-        if not ITEMS:
-            # No more items to process
-            break
+            if start_time is not None:
+                elapsed_time = end_time - start_time
+
+                if elapsed_time <= 4:  # throttling
+                    time.sleep(4 - elapsed_time)
+
+            start_time = time.clock()
+
+            if not ITEMS:
+                # No more items to process
+                break
+
+            for ITEM in ITEMS:
+                TOTAL += 1
+
+                # not downloading if url is reddit comment
+                if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
+                        re.match(reddit_comment_regex, ITEM['url']) is not None):
+                    # print('    Skip:[{}]'.format(ITEM['url']))
+                    continue
+
+                if ITEM['score'] < ARGS.score:
+                    if ARGS.verbose:
+                        print('    SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']),
+                              'which is lower than required score of {}.'.format(ARGS.score))
+
+                    SKIPPED += 1
+                    continue
+                elif ARGS.sfw and ITEM['over_18']:
+                    if ARGS.verbose:
+                        print('    NSFW: %s is marked as NSFW.' % (ITEM['id']))
+
+                    SKIPPED += 1
+                    continue
+                elif ARGS.nsfw and not ITEM['over_18']:
+                    if ARGS.verbose:
+                        print('    Not NSFW, skipping %s' % (ITEM['id']))
-        for ITEM in ITEMS:
-            TOTAL += 1
-
-            # not downloading if url is reddit comment
-            if ('reddit.com/r/' + ARGS.reddit + '/comments/' in ITEM['url'] or
-                    re.match(reddit_comment_regex, ITEM['url']) is not None):
-                print('    Skip:[{}]'.format(ITEM['url']))
-                continue
-
-            if ITEM['score'] < ARGS.score:
-                if ARGS.verbose:
-                    print('    SCORE: {} has score of {}'.format(ITEM['id'], ITEM['score']),
-                          'which is lower than required score of {}.'.format(ARGS.score))
-
-                SKIPPED += 1
-                continue
-            elif ARGS.sfw and ITEM['over_18']:
-                if ARGS.verbose:
-                    print('    NSFW: %s is marked as NSFW.' % (ITEM['id']))
-
-                SKIPPED += 1
-                continue
-            elif ARGS.nsfw and not ITEM['over_18']:
-                if ARGS.verbose:
-                    print('    Not NSFW, skipping %s' % (ITEM['id']))
-
-                SKIPPED += 1
-                continue
-            elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
-                if ARGS.verbose:
-                    print('    Regex not matched')
-
-                SKIPPED += 1
-                continue
-            elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
-                if ARGS.verbose:
-                    print('    Album found, skipping %s' % (ITEM['id']))
-
-                SKIPPED += 1
-                continue
-
-            if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower():
-                if ARGS.verbose:
-                    print('    Title does not contain "{}",'.format(ARGS.title_contain),
-                          'skipping {}'.format(ITEM['id']))
-
-                SKIPPED += 1
-                continue
-
-            FILECOUNT = 0
-            try:
-                URLS = extract_urls(ITEM['url'])
-            except Exception:
-                _log.exception("Failed to extract urls for %r", URLS)
-                continue
-            for URL in URLS:
+
+                    SKIPPED += 1
+                    continue
+                elif ARGS.regex and not re.match(RE_RULE, ITEM['title']):
+                    if ARGS.verbose:
+                        print('    Regex not matched')
+
+                    SKIPPED += 1
+                    continue
+                elif ARGS.skipAlbums and 'imgur.com/a/' in ITEM['url']:
+                    if ARGS.verbose:
+                        print('    Album found, skipping %s' % (ITEM['id']))
+
+                    SKIPPED += 1
+                    continue
+
+                if ARGS.title_contain and ARGS.title_contain.lower() not in ITEM['title'].lower():
+                    if ARGS.verbose:
+                        print('    Title does not contain "{}",'.format(ARGS.title_contain),
+                              'skipping {}'.format(ITEM['id']))
+
+                    SKIPPED += 1
+                    continue
+
+                FILECOUNT = 0
                 try:
-                    # Find gfycat if requested
-                    if URL.endswith('gif') and ARGS.mirror_gfycat:
-                        check = gfycat().check(URL)
-                        if check.get("urlKnown"):
-                            URL = check.get('webmUrl')
-
-                    FILEEXT = pathsplitext(URL)[1]
-                    # Trim any http query off end of file extension.
-                    FILEEXT = re.sub(r'\?.*$', '', FILEEXT)
-                    if not FILEEXT:
-                        # A more usable option that empty.
-                        # The extension can be fixed after downloading, but then the 'already downloaded' check will be harder.
-                        FILEEXT = '.jpg'
-
-                    # Only append numbers if more than one file
-                    FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')
-
-                    # create filename based on given input from user
-                    if ARGS.filename_format == 'url':
-                        FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT)
-                    elif ARGS.filename_format == 'title':
-                        FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT)
-                        if len(FILENAME) >= 256:
-                            shortened_item_title = slugify(ITEM['title'])[:256-len(FILENAME)]
-                            FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT)
-                    else:
-                        FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
-                    # join file with directory
-                    FILEPATH = pathjoin(ARGS.dir, FILENAME)
-
-                    # Improve debuggability list URL before download too.
-                    # url may be wrong so skip that
-                    if URL.encode('utf-8') == 'http://':
-                        raise URLError('Url is empty')
-                    else:
-                        text_templ = '    Attempting to download URL[{}] as [{}].'
-                        print(text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8')))
-
-                    # Download the image
+                    URLS = extract_urls(ITEM['url'])
+                except Exception:
+                    _log.exception("Failed to extract urls for %r", URLS)
+                    continue
+                for URL in URLS:
                     try:
-                        download_from_url(URL, FILEPATH)
-                        # Image downloaded successfully!
-                        print('    Sucessfully downloaded URL [%s] as [%s].' % (URL, FILENAME))
-                        DOWNLOADED += 1
-                        FILECOUNT += 1
-
-                    except Exception as exc:
-                        print('    %s' % (exc,))
+                        # Find gfycat if requested
+                        if URL.endswith('gif') and ARGS.mirror_gfycat:
+                            check = gfycat().check(URL)
+                            if check.get("urlKnown"):
+                                URL = check.get('webmUrl')
+
+                        FILEEXT = pathsplitext(URL)[1]
+                        # Trim any http query off end of file extension.
+                        FILEEXT = re.sub(r'\?.*$', '', FILEEXT)
+                        if not FILEEXT:
+                            # A more usable option than empty.
+                            # The extension can be fixed after downloading, but then the 'already downloaded' check will be harder.
+                            FILEEXT = '.jpg'
+
+                        # Only append numbers if more than one file
+                        FILENUM = ('_%d' % FILECOUNT if len(URLS) > 1 else '')
+
+                        # create filename based on given input from user
+                        if ARGS.filename_format == 'url':
+                            FILENAME = '%s%s%s' % (pathsplitext(pathbasename(URL))[0], '', FILEEXT)
+                        elif ARGS.filename_format == 'title':
+                            FILENAME = '%s%s%s' % (slugify(ITEM['title']), FILENUM, FILEEXT)
+                            if len(FILENAME) >= 256:
+                                shortened_item_title = slugify(ITEM['title'])[:256 - len(FILENAME)]
+                                FILENAME = '%s%s%s' % (shortened_item_title, FILENUM, FILEEXT)
+                        else:
+                            FILENAME = '%s%s%s' % (ITEM['id'], FILENUM, FILEEXT)
+                        # join file with directory
+                        FILEPATH = pathjoin(ARGS.dir, FILENAME)
+
+                        # Improve debuggability: list URL before download too.
+                        # url may be wrong so skip that
+                        if URL.encode('utf-8') == 'http://':
+                            raise URLError('Url is empty')
+                        else:
+                            text_templ = '    Attempting to download URL[{}] as [{}]'
+                            print(text_templ.format(URL.encode('utf-8'), FILENAME.encode('utf-8')))
+
+                        # Download the image
+                        try:
+                            download_from_url(URL, FILEPATH)
+                            # Image downloaded successfully!
+                            text_templ = '    Successfully downloaded URL[{}] as [{}]'
+                            print(text_templ.format(URL, FILENAME))
+                            DOWNLOADED += 1
+                            FILECOUNT += 1
+
+                        except Exception as exc:
+                            print('    %s' % (exc,))
+                            ERRORS += 1
+
+                        if ARGS.num and DOWNLOADED >= ARGS.num:
+                            FINISHED = True
+                            break
+                    except WrongFileTypeException as ERROR:
+                        print('    %s' % (ERROR,))
+                        _log_wrongtype(url=URL, target_dir=ARGS.dir,
+                                       filecount=FILECOUNT, _downloaded=DOWNLOADED,
+                                       filename=FILENAME)
+                        SKIPPED += 1
+                    except FileExistsException as ERROR:
+                        print('    %s' % (ERROR,))
                         ERRORS += 1
+                        if ARGS.update:
+                            print('    Update complete, exiting.')
+                            FINISHED = True
+                            break
+                    except HTTPError as ERROR:
+                        print('    HTTP ERROR: Code %s for %s.' % (ERROR.code, URL))
+
+                        if ERROR.code == 503:
+                            print('Waiting 10 seconds before resuming')
+                            time.sleep(10)
+                        FAILED += 1
+                    except URLError as ERROR:
+                        print('    URL ERROR: %s!' % (URL,))
+                        FAILED += 1
+                    except InvalidURL as ERROR:
+                        print('    Invalid URL: %s!' % (URL,))
+                        FAILED += 1
+                    except Exception as exc:
+                        _log.exception("Problem with %r: %r", URL, exc)
+                        FAILED += 1

-                    if ARGS.num and DOWNLOADED >= ARGS.num:
-                        FINISHED = True
-                        break
-                except WrongFileTypeException as ERROR:
-                    print('    %s' % (ERROR,))
-                    _log_wrongtype(url=URL, target_dir=ARGS.dir,
-                                   filecount=FILECOUNT, _downloaded=DOWNLOADED,
-                                   filename=FILENAME)
-                    SKIPPED += 1
-                except FileExistsException as ERROR:
-                    print('    %s' % (ERROR,))
-                    ERRORS += 1
-                    if ARGS.update:
-                        print('    Update complete, exiting.')
-                        FINISHED = True
-                        break
-                except HTTPError as ERROR:
-                    print('    HTTP ERROR: Code %s for %s.' % (ERROR.code, URL))
-                    FAILED += 1
-                except URLError as ERROR:
-                    print('    URL ERROR: %s!' % (URL,))
-                    FAILED += 1
-                except InvalidURL as ERROR:
-                    print('    Invalid URL: %s!' % (URL,))
-                    FAILED += 1
-                except Exception as exc:
-                    _log.exception("Problem with %r: %r", URL, exc)
-                    FAILED += 1
-
-        if FINISHED:
-            break
+            if FINISHED:
+                break

-        LAST = ITEM['id'] if ITEM is not None else None
+            LAST = ITEM['id'] if ITEM is not None else None

     print('Downloaded {} files'.format(DOWNLOADED),
           '(Processed {}, Skipped {}, Exists {})'.format(TOTAL, SKIPPED, ERRORS))
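
Note on the search request: getitems() above queries reddit's search endpoint
with the 'cloudsearch' syntax, using a 'timestamp:<start>..<end>' range query.
A minimal standalone sketch of the URL being built (build_search_url is an
illustrative name, not part of the patch):

    # Sketch: the timestamp-bounded search URL the patch constructs
    # (Python 2, matching the codebase).
    def build_search_url(subreddit, start_ts, end_ts, after_id=None):
        url = 'http://www.reddit.com/r/{}/search.json'.format(subreddit)
        args = [
            ('q', 'timestamp%3A{}..{}'.format(start_ts, end_ts)),  # %3A is ':'
            ('restrict_sr', 'on'),      # search only within this subreddit
            ('syntax', 'cloudsearch'),  # enables the timestamp range query
        ]
        if after_id:
            args.append(('after', 't3_{}'.format(after_id)))  # pagination cursor
        return url + '?' + '&'.join('{}={}'.format(k, v) for k, v in args)

    # build_search_url('pics', 1206826595, 1342905912) ->
    # http://www.reddit.com/r/pics/search.json?q=timestamp%3A1206826595..1342905912&restrict_sr=on&syntax=cloudsearch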
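
Note on the sub-interval loop: main() above walks the requested range from
newest to oldest in 3-day windows, so each individual search stays well under
reddit's 1000-result cap. The same arithmetic as a standalone sketch
(subinterval_bounds is an illustrative name; the window size is assumed to
stay at the patch's 3 days):

    import math

    THREE_DAYS = 3 * 86400  # window size used by the patch

    def subinterval_bounds(start_ts, end_ts):
        # Number of 3-day windows needed to cover [start_ts, end_ts].
        n = int(math.ceil((end_ts - start_ts) / float(THREE_DAYS)))
        for si in range(n):
            # Walk backwards from end_ts; clamp the oldest window to start_ts.
            yield (max(start_ts, end_ts - (si + 1) * THREE_DAYS),
                   end_ts - si * THREE_DAYS)

    # list(subinterval_bounds(1206826595, 1342905912))[0]
    # -> (1342646712, 1342905912), i.e. the newest 3 days come first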
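
Usage note: the two values given to --timestamps are plain UNIX epoch seconds
(UTC). If you have calendar dates instead, the standard library can convert
them, e.g.:

    import calendar
    start = calendar.timegm((2008, 3, 29, 0, 0, 0))  # 2008-03-29 00:00 UTC
    end = calendar.timegm((2012, 7, 21, 0, 0, 0))    # 2012-07-21 00:00 UTC
    # then pass: --timestamps <start> <end>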