Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions commonsdownloader/commonsdownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,26 @@

"""Download files from Wikimedia Commons."""

import sys

reload(sys)
sys.setdefaultencoding("utf-8")

import os
import logging
import argparse
from thumbnaildownload import download_file, DownloadException
from itertools import izip_longest
from commonsdownloader.thumbnaildownload import download_file, DownloadException

from itertools import zip_longest


def get_category_files_from_api(category_name):
    """Yield the file names of a category by querying the MediaWiki API.

    Connects to commons.wikimedia.org through ``mwclient`` and returns a
    lazy generator over the ``page_title`` of every member of
    ``category_name`` in namespace 6.

    Titles are returned as text (``str``), not UTF-8 encoded bytes, so they
    can be passed straight to the string-based file name helpers.
    """
    # Imported lazily so mwclient is only required when a category is used.
    import mwclient
    site = mwclient.Site('commons.wikimedia.org')
    category = site.Categories[category_name]
    # namespace=6 restricts the listing to file pages (MediaWiki "File:").
    return (x.page_title for x in category.members(namespace=6))


def download_from_category(category_name, output_path, width):
    """Download files of a given category.

    Looks up the file names of ``category_name``, pairs every name with the
    requested ``width`` and hands the resulting ``(file_name, width)``
    iterator to ``download_files_if_not_in_manifest`` together with
    ``output_path``.
    """
    file_names = get_category_files_from_api(category_name)
    # Zipping against an empty list makes zip_longest fill the second slot
    # of every pair with `width`.
    files_to_download = zip_longest(file_names, [], fillvalue=width)
    download_files_if_not_in_manifest(files_to_download, output_path)


Expand All @@ -52,7 +47,7 @@ def download_from_file_list(file_list, output_path):

def get_files_from_arguments(files, width):
    """Yield the file names and chosen width.

    Returns an iterator of ``(file_name, width)`` pairs, one per entry of
    ``files``; an empty ``files`` yields nothing.
    """
    # Zipping against an empty list makes zip_longest fill the second slot
    # of every pair with `width`.
    return zip_longest(files, [], fillvalue=width)


def download_from_files(files, output_path, width):
Expand Down Expand Up @@ -101,7 +96,7 @@ def download_files_if_not_in_manifest(files_iterator, output_path):
try:
download_file(file_name, output_path, width=width)
write_file_to_manifest(file_name, width, manifest_fh)
except DownloadException, e:
except DownloadException as e:
logging.error("Could not download %s: %s", file_name, e.message)


Expand Down
32 changes: 17 additions & 15 deletions commonsdownloader/thumbnaildownload.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@

import os
import re
import urllib2
import logging

import urllib

DEFAULT_WIDTH = 100

Expand Down Expand Up @@ -41,13 +40,15 @@ class CouldNotWriteFileOnDiskException(DownloadException):

def clean_up_filename(file_name):
    """Return the cleaned-up file title.

    Leading and trailing whitespace is stripped, then every remaining
    space is replaced by an underscore.
    """
    file_name = file_name.strip()
    file_name = file_name.replace(' ', '_')
    return file_name


def make_thumb_url(image_name, width):
    """Return the URL to the thumbnail of the file, at the given width.

    ``image_name`` is percent-encoded before being inserted into the
    Special:FilePath query string; ``width`` is interpolated as-is.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote
    base_url = "http://commons.wikimedia.org/w/index.php?title=Special:FilePath&file=%s&width=%s"
    return base_url % (quote(image_name), width)


def make_full_size_url(image_name):
Expand All @@ -72,13 +73,14 @@ def get_thumbnail_of_file(image_name, width):
"""Return the file contents of the thumbnail of the given file."""
hdr = {'User-Agent': 'Python urllib2'}
url = make_thumb_url(image_name, width)
req = urllib2.Request(url, headers=hdr)
req = urllib.request.Request(url, headers=hdr)
try:
logging.debug("Retrieving %s", url)
opened = urllib2.urlopen(req)
extension = opened.headers.subtype
return opened.read(), make_thumbnail_name(image_name, extension)
except urllib2.HTTPError, e:
opened = urllib.request.urlopen(req)
extension = opened.info().get_content_subtype()
contents = opened.read()
return contents, make_thumbnail_name(image_name, extension)
except urllib.error.HTTPError as e:
message = e.fp.read()
raise get_exception_based_on_api_message(message, image_name)

Expand All @@ -87,13 +89,13 @@ def get_full_size_file(image_name):
"""Return the file contents of given file at full size."""
hdr = {'User-Agent': 'Python urllib2'}
url = make_full_size_url(image_name)
req = urllib2.Request(url, headers=hdr)
req = urllib.request.Request(url, headers=hdr)
try:
logging.debug("Retrieving %s", url)
opened = urllib2.urlopen(req)
extension = opened.headers.subtype
opened = urllib.request.urlopen(req)
extension = opened.info().get_content_subtype()
return opened.read(), make_thumbnail_name(image_name, extension)
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
message = e.fp.read()
raise get_exception_based_on_api_message(message, image_name)

Expand Down Expand Up @@ -131,12 +133,12 @@ def download_file(image_name, output_path, width=DEFAULT_WIDTH):
logging.debug("Writing as %s", output_file_path)
f.write(contents)
return output_file_path
except IOError, e:
except IOError as e:
msg = 'Could not write file %s on disk to %s: %s' % \
(image_name, output_path, e.message)
logging.error(msg)
raise CouldNotWriteFileOnDiskException(msg)
except Exception, e:
except Exception as e:
logging.critical(e.message)
msg = 'An unexpected error occured when downloading %s to %s: %s' % \
(image_name, output_path, e.message)
Expand Down