Changes from all commits
20 commits
f04f80e
Add Mangasee scraper
matoro Feb 8, 2019
01b3823
handle requests.exceptions.StreamConsumedError in page_download_task()
matoro Feb 9, 2019
9d382ee
mangasee: improve speed with persistent session
matoro Feb 11, 2019
9d4325c
scraper: do not sanitize download_dir when explicitly set
matoro Feb 13, 2019
c708c3a
add mangahere scraper; numerous misc fixes
matoro Feb 15, 2019
0f28bca
mangahere: fix chapter scraping for adult content warning
matoro Feb 18, 2019
6b49af6
mangahere: fix chapter regex pattern, add test
matoro Feb 24, 2019
c5eba73
allow downloading individual chapters via the get command
matoro Feb 24, 2019
61395ee
mangahere: print error message when bad status code received
matoro Nov 23, 2019
6774150
mangasee: allow multiple retries of failed page fetch
matoro Nov 27, 2019
b313be1
Merge remote-tracking branch 'upstream/master' into mangasee
matoro Nov 28, 2019
79115df
Add Manganelo scraper
matoro Mar 5, 2020
6b7480c
scraper: check if chapter already exists in database
matoro Mar 12, 2020
cb7ad09
scraper: add retries for entire chapter
matoro Mar 13, 2020
695771c
scraper: add proper support for retrying pages
matoro Mar 25, 2020
fb53267
mangadex: add resiliency to initial image request
matoro Mar 25, 2020
d4ce9b4
mangahere: overhaul for new anti-scraping system
matoro Apr 5, 2020
0c42254
mangahere: mobile site changed again, move completely to desktop site
matoro Apr 21, 2020
2c3fcb8
mangasee: move from elem.text to elem.contents
matoro Apr 22, 2020
81bbd5b
mangahere: update to use .contents instead of .text
matoro May 7, 2020
cum/cum.py: 8 changes (4 additions, 4 deletions)
@@ -265,21 +265,21 @@ def get(input, directory):
     """
     chapter_list = []
     for item in input:
+        series = None
         try:
             series = utility.series_by_url(item)
         except exceptions.ScrapingError:
-            output.warning('Scraping error ({})'.format(item))
-            continue
+            pass
         except exceptions.LoginError as e:
             output.warning('{} ({})'.format(e.message, item))
             continue
         if series:
             chapter_list += series.chapters
+        chapter = None
         try:
             chapter = utility.chapter_by_url(item)
         except exceptions.ScrapingError:
-            output.warning('Scraping error ({})'.format(item))
-            continue
+            pass
         except exceptions.LoginError as e:
             output.warning('{} ({})'.format(e.message, item))
             continue
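For reference, a hedged sketch of the lookup order this hunk enables: a bare chapter URL no longer gets skipped when the series lookup fails; it simply falls through to the chapter lookup. The URL below is illustrative only, and the imports mirror the modules cum/cum.py already uses.

```python
from cum import exceptions, utility  # same modules cum/cum.py relies on

item = 'https://manganelo.com/chapter/example_series/chapter_10'  # illustrative URL
series = None
try:
    series = utility.series_by_url(item)    # not a series page -> ScrapingError
except exceptions.ScrapingError:
    pass                                    # fall through instead of skipping the item
chapter = None
try:
    chapter = utility.chapter_by_url(item)  # resolved by a chapter scraper instead
except exceptions.ScrapingError:
    pass
```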
cum/db.py: 10 changes (9 additions, 1 deletion)
@@ -242,7 +242,15 @@ def to_object(self):
         if parse.netloc == 'www.yuri-ism.net':
             from cum.scrapers.yuriism import YuriismChapter
             return YuriismChapter(**kwargs)
 
+        if parse.netloc == 'mangaseeonline.us':
+            from cum.scrapers.mangasee import MangaseeChapter
+            return MangaseeChapter(**kwargs)
+        if parse.netloc in ('www.mangahere.cc', 'm.mangahere.cc'):
+            from cum.scrapers.mangahere import MangahereChapter
+            return MangahereChapter(**kwargs)
+        if parse.netloc == 'manganelo.com':
+            from cum.scrapers.manganelo import ManganeloChapter
+            return ManganeloChapter(**kwargs)
 
 class Group(Base):
     __tablename__ = 'groups'
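To make the dispatch above concrete: to_object() routes purely on the URL's netloc, so the three new hosts map to their chapter classes as follows. The hostnames come from the branches above; the paths are made up for illustration.

```python
from urllib.parse import urlparse

examples = [
    'https://mangaseeonline.us/read-online/Example-chapter-1.html',  # -> MangaseeChapter
    'https://www.mangahere.cc/manga/example/c001/1.html',            # -> MangahereChapter
    'https://m.mangahere.cc/manga/example/c001/1.html',              # -> MangahereChapter
    'https://manganelo.com/chapter/example/chapter_1',               # -> ManganeloChapter
]
for url in examples:
    print(urlparse(url).netloc)  # the value compared against parse.netloc above
```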
cum/scrapers/__init__.py: 9 changes (9 additions, 0 deletions)
@@ -2,19 +2,28 @@
 from cum.scrapers.dynastyscans import DynastyScansChapter, DynastyScansSeries
 from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries
 from cum.scrapers.mangadex import MangadexSeries, MangadexChapter
+from cum.scrapers.manganelo import ManganeloSeries, ManganeloChapter
+from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter
+from cum.scrapers.mangahere import MangahereSeries, MangahereChapter
 from cum.scrapers.yuriism import YuriismChapter, YuriismSeries
 
 series_scrapers = [
     DokiReaderSeries,
     DynastyScansSeries,
     MadokamiSeries,
     MangadexSeries,
+    ManganeloSeries,
+    MangaseeSeries,
+    MangahereSeries,
     YuriismSeries,
 ]
 chapter_scrapers = [
     DokiReaderChapter,
     DynastyScansChapter,
     MadokamiChapter,
     MangadexChapter,
+    ManganeloChapter,
+    MangaseeChapter,
+    MangahereChapter,
     YuriismChapter,
 ]
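For orientation, a minimal sketch of how a registry like series_scrapers is typically consumed. It assumes each scraper class exposes a compiled url_re to match candidate URLs against; that attribute is not shown in this diff, and cum's real lookup lives in its utility helpers rather than here.

```python
# Hypothetical helper, not part of this PR: return the first scraper whose
# (assumed) url_re matches the given URL, or None if nothing matches.
def find_scraper(url, scrapers):
    for scraper in scrapers:
        if scraper.url_re.match(url):
            return scraper
    return None
```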
cum/scrapers/base.py: 73 changes (59 additions, 14 deletions)
@@ -1,6 +1,6 @@
 from abc import ABCMeta, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
-from cum import config, db, output
+from cum import config, db, exceptions, output
 from mimetypes import guess_extension
 from re import match, sub
 from sqlalchemy.exc import IntegrityError, SQLAlchemyError
@@ -195,6 +195,11 @@ def filename(self):
         elif match(r'[0-9]*\.[0-9]*$', self.chapter):
             number, decimal = self.chapter.split('.')
             chapter = 'c{:0>3} x{}'.format(number, decimal)
+        # Individually numbered chapter with double-decimal (e.g. '2.164.5').
+        # Used by titles with multiple volumes/seasons and special chapters.
+        elif match(r'[0-9]*(\.[0-9]*){2}$', self.chapter):
+            volume, number, decimal = self.chapter.split('.')
+            chapter = 'c{:0>3} x{:0>3}.{}'.format(volume, number, decimal)
         # Failing all else, e.g. 'Special'. Becomes 'c000 [Special]'.
         else:
             chapter = 'c000 [{}]'.format(self.chapter)
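A worked example of the new branch, using the '2.164.5' value from its comment: the split yields volume '2', number '164' and decimal '5', and the format string below reproduces the resulting name.

```python
# Reproduces the new elif body in isolation.
volume, number, decimal = '2.164.5'.split('.')
print('c{:0>3} x{:0>3}.{}'.format(volume, number, decimal))  # -> c002 x164.5
```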
@@ -209,13 +214,20 @@ def filename(self):
         else:
             ext = 'zip'
 
+        directory_set = False
         if self.directory:
             directory = os.path.expanduser(self.directory)
+            directory_set = True
         else:
             directory = name
         download_dir = os.path.expanduser(config.get().download_directory)
         download_dir = os.path.join(download_dir, directory)
-        download_dir = self._strip_unwanted_characters(download_dir)
+        # only sanitize download_dir if the user did not explicitly set it
+        # assume that if it is set, the user wanted it exactly as set
+        # if they include bad characters and it breaks things, that's their
+        # fault.
+        if not directory_set:
+            download_dir = self._strip_unwanted_characters(download_dir)
         download_dir = self.create_directory(download_dir)
 
         # Format the filename somewhat based on Daiz's manga naming scheme.
@@ -292,16 +304,45 @@ def page_download_finish(bar, files, fs):
         bar.update(1)
 
     @staticmethod
-    def page_download_task(page_num, r):
+    def page_download_task(page_num, r, page_url = None):
         """Saves the response body of a single request, returning the file
         handle and the passed through number of the page to allow for non-
         sequential downloads in parallel.
         """
         ext = BaseChapter.guess_extension(r.headers.get('content-type'))
         f = NamedTemporaryFile(suffix=ext, delete=False)
-        for chunk in r.iter_content(chunk_size=4096):
-            if chunk:
-                f.write(chunk)
+        retries = 20
+        while retries > 0:
+            try:
+                for chunk in r.iter_content(chunk_size=4096):
+                    if chunk:
+                        f.write(chunk)
+                retries = 0
+            # basically ignores this exception that requests throws. my
+            # understanding is that it is raised when you attempt to iter_content()
+            # over the same content twice. don't understand how that situation
+            # arises with the current code but it did somehow.
+            # https://stackoverflow.com/questions/45379903/
+            except requests.exceptions.StreamConsumedError:
+                pass
+            # when under heavy load, Mangadex will often kill the connection in
+            # the middle of an image download. in the original architecture,
+            # the requests are all opened in the scrapers in stream mode, then
+            # the actual image payloads are downloaded in the asynchronous
+            # callbacks. when this occurs we have no choice but to re-request
+            # the image from the beginning (easier than playing around with range
+            # headers). this means each thread may issue multiple new requests.
+            # I have found the performance overhead to be mostly negligible.
+            except requests.exceptions.ChunkedEncodingError:
+                if not page_url:
+                    output.error("Connection killed on page {} but scraper does not support retries".format(str(page_num)))
+                    raise exceptions.ScrapingError
+                output.warning("Connection killed on page {}, {} retries remaining".format(str(page_num), str(retries)))
+                retries = retries - 1
+                if retries <= 0:
+                    output.error("Connection killed on page {}, no retries remaining - aborting chapter".format(str(page_num)))
+                    raise exceptions.ScrapingError
+                r = requests.get(page_url, stream = True)
         f.flush()
         f.close()
         r.close()
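Scrapers opt in to this retry path by passing the page's original URL as the new page_url argument; without it, a killed connection still aborts the chapter as before. A fragment showing the call shape (it mirrors the mangadex.py hunk further down, where i, r, image, bar and files are defined by the surrounding download loop):

```python
# Fragment, not standalone: names come from a scraper's download() loop.
fut = download_pool.submit(self.page_download_task, i, r, page_url=image)
fut.add_done_callback(partial(self.page_download_finish, bar, files))
futures.append(fut)
```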
@@ -326,15 +367,19 @@ def progress_bar(self, arg):
 
     def save(self, series, ignore=False):
         """Save a chapter to database."""
+        # check if chapter already exists in database
         try:
-            c = db.Chapter(self, series)
-        except IntegrityError:
-            db.session.rollback()
-        else:
-            if ignore:
-                c.downloaded = -1
-            db.session.add(c)
+            c = db.session.query(db.Chapter).filter_by(url=self.url).one()
+        except NoResultFound:
             try:
-                db.session.commit()
+                c = db.Chapter(self, series)
             except IntegrityError:
                 db.session.rollback()
+            else:
+                if ignore:
+                    c.downloaded = -1
+                db.session.add(c)
+                try:
+                    db.session.commit()
+                except IntegrityError:
+                    db.session.rollback()
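One note on the new lookup: Query.one() raises NoResultFound when no row matches, which is what the added except branch relies on. The corresponding import is outside the displayed hunk; in SQLAlchemy it would be:

```python
from sqlalchemy.orm.exc import NoResultFound  # assumed import, not visible in this hunk
```

An alternative would be Query.one_or_none(), which returns None instead of raising, but the exception-based form matches the surrounding try/except style.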
cum/scrapers/mangadex.py: 22 changes (17 additions, 5 deletions)
@@ -124,13 +124,25 @@ def download(self):
             if guess_type(page)[0]:
                 image = server + chapter_hash + '/' + page
             else:
-                print('Unkown image type for url {}'.format(page))
-                raise ValueError
-            r = requests.get(image, stream=True)
+                print('Unknown image type for url {}'.format(page))
+                raise exceptions.ScrapingError
+            retries = 3
+            r = None
+            while retries > 0:
+                try:
+                    r = requests.get(image, stream=True)
+                    break
+                except requests.exceptions.ConnectionError:
+                    output.warning("Initial request for page {} failed, {} retries remaining".format(str(i), str(retries)))
+                    retries = retries - 1
+            if not r:
+                output.error("Failed to request page {}".format(str(i)))
+                raise exceptions.ScrapingError
             if r.status_code == 404:
                 r.close()
-                raise ValueError
-            fut = download_pool.submit(self.page_download_task, i, r)
+                raise exceptions.ScrapingError
+            fut = download_pool.submit(self.page_download_task,
+                                       i, r, page_url = image)
             fut.add_done_callback(partial(self.page_download_finish,
                                           bar, files))
             futures.append(fut)
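The initial-request handling above amounts to a small bounded-retry loop; pulled out as a standalone sketch (the retry count and ConnectionError handling mirror the hunk, the URL is a placeholder):

```python
import requests

def get_with_retries(url, attempts=3):
    """Hedged sketch of the pattern above: retry the initial request only on
    ConnectionError and give up after a fixed number of attempts."""
    for attempt in range(attempts):
        try:
            return requests.get(url, stream=True)
        except requests.exceptions.ConnectionError:
            print('request for {} failed ({}/{})'.format(url, attempt + 1, attempts))
    raise RuntimeError('all {} attempts failed for {}'.format(attempts, url))
```

In the hunk itself the failure case raises exceptions.ScrapingError instead, so the chapter-level retry logic added elsewhere in this PR can take over.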