diff --git a/commonsdownloader/commonsdownloader.py b/commonsdownloader/commonsdownloader.py index 8dd8598..8310b7b 100644 --- a/commonsdownloader/commonsdownloader.py +++ b/commonsdownloader/commonsdownloader.py @@ -3,16 +3,12 @@ """Download files from Wikimedia Commons.""" -import sys - -reload(sys) -sys.setdefaultencoding("utf-8") - import os import logging import argparse -from thumbnaildownload import download_file, DownloadException -from itertools import izip_longest +from commonsdownloader.thumbnaildownload import download_file, DownloadException + +from itertools import zip_longest def get_category_files_from_api(category_name): @@ -20,14 +16,13 @@ def get_category_files_from_api(category_name): import mwclient site = mwclient.Site('commons.wikimedia.org') category = site.Categories[category_name] - return (x.page_title.encode('utf-8') - for x in category.members(namespace=6)) + return (x.page_title for x in category.members(namespace=6)) def download_from_category(category_name, output_path, width): """Download files of a given category.""" file_names = get_category_files_from_api(category_name) - files_to_download = izip_longest(file_names, [], fillvalue=width) + files_to_download = zip_longest(file_names, [], fillvalue=width) download_files_if_not_in_manifest(files_to_download, output_path) @@ -52,7 +47,7 @@ def download_from_file_list(file_list, output_path): def get_files_from_arguments(files, width): """Yield the file names and chosen width.""" - return izip_longest(files, [], fillvalue=width) + return zip_longest(files, [], fillvalue=width) def download_from_files(files, output_path, width): @@ -101,7 +96,7 @@ def download_files_if_not_in_manifest(files_iterator, output_path): try: download_file(file_name, output_path, width=width) write_file_to_manifest(file_name, width, manifest_fh) - except DownloadException, e: + except DownloadException as e: logging.error("Could not download %s: %s", file_name, e.message) diff --git a/commonsdownloader/thumbnaildownload.py b/commonsdownloader/thumbnaildownload.py index e1b1f9b..2023fe0 100644 --- a/commonsdownloader/thumbnaildownload.py +++ b/commonsdownloader/thumbnaildownload.py @@ -4,9 +4,8 @@ import os import re -import urllib2 import logging - +import urllib DEFAULT_WIDTH = 100 @@ -41,13 +40,15 @@ class CouldNotWriteFileOnDiskException(DownloadException): def clean_up_filename(file_name): """Return the cleaned-up file title.""" - return file_name.strip().replace(' ', '_') + file_name = file_name.strip() + file_name = file_name.replace(' ', '_') + return file_name def make_thumb_url(image_name, width): """Return the URL to the thumbnail of the file, at the given width.""" base_url = "http://commons.wikimedia.org/w/index.php?title=Special:FilePath&file=%s&width=%s" - return base_url % (urllib2.quote(image_name), width) + return base_url % (urllib.parse.quote(image_name), width) def make_full_size_url(image_name): @@ -72,13 +73,14 @@ def get_thumbnail_of_file(image_name, width): """Return the file contents of the thumbnail of the given file.""" hdr = {'User-Agent': 'Python urllib2'} url = make_thumb_url(image_name, width) - req = urllib2.Request(url, headers=hdr) + req = urllib.request.Request(url, headers=hdr) try: logging.debug("Retrieving %s", url) - opened = urllib2.urlopen(req) - extension = opened.headers.subtype - return opened.read(), make_thumbnail_name(image_name, extension) - except urllib2.HTTPError, e: + opened = urllib.request.urlopen(req) + extension = opened.info().get_content_subtype() + contents = opened.read() + return contents, make_thumbnail_name(image_name, extension) + except urllib.error.HTTPError as e: message = e.fp.read() raise get_exception_based_on_api_message(message, image_name) @@ -87,13 +89,13 @@ def get_full_size_file(image_name): """Return the file contents of given file at full size.""" hdr = {'User-Agent': 'Python urllib2'} url = make_full_size_url(image_name) - req = urllib2.Request(url, headers=hdr) + req = urllib.request.Request(url, headers=hdr) try: logging.debug("Retrieving %s", url) - opened = urllib2.urlopen(req) - extension = opened.headers.subtype + opened = urllib.request.urlopen(req) + extension = opened.info().get_content_subtype() return opened.read(), make_thumbnail_name(image_name, extension) - except urllib2.HTTPError, e: + except urllib.error.HTTPError as e: message = e.fp.read() raise get_exception_based_on_api_message(message, image_name) @@ -131,12 +133,12 @@ def download_file(image_name, output_path, width=DEFAULT_WIDTH): logging.debug("Writing as %s", output_file_path) f.write(contents) return output_file_path - except IOError, e: + except IOError as e: msg = 'Could not write file %s on disk to %s: %s' % \ (image_name, output_path, e.message) logging.error(msg) raise CouldNotWriteFileOnDiskException(msg) - except Exception, e: + except Exception as e: logging.critical(e.message) msg = 'An unexpected error occured when downloading %s to %s: %s' % \ (image_name, output_path, e.message)