From f04f80e62b3ec171603aca0d02f9f477aed30b5e Mon Sep 17 00:00:00 2001 From: matoro Date: Thu, 7 Feb 2019 22:01:48 -0500 Subject: [PATCH 01/19] Add Mangasee scraper Adds support for Mangasee (mangaseeonline.us) --- cum/db.py | 3 + cum/scrapers/__init__.py | 3 + cum/scrapers/mangasee.py | 114 +++++++++++++++++++++++++++ tests/test_scraper_mangasee.py | 138 +++++++++++++++++++++++++++++++++ 4 files changed, 258 insertions(+) create mode 100644 cum/scrapers/mangasee.py create mode 100644 tests/test_scraper_mangasee.py diff --git a/cum/db.py b/cum/db.py index 8f05241..b12ef3c 100644 --- a/cum/db.py +++ b/cum/db.py @@ -242,6 +242,9 @@ def to_object(self): if parse.netloc == 'www.yuri-ism.net': from cum.scrapers.yuriism import YuriismChapter return YuriismChapter(**kwargs) + if parse.netloc == 'mangaseeonline.us': + from cum.scrapers.mangasee import MangaseeChapter + return MangaseeChapter(**kwargs) class Group(Base): diff --git a/cum/scrapers/__init__.py b/cum/scrapers/__init__.py index cb05f04..8266d07 100644 --- a/cum/scrapers/__init__.py +++ b/cum/scrapers/__init__.py @@ -2,6 +2,7 @@ from cum.scrapers.dynastyscans import DynastyScansChapter, DynastyScansSeries from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries from cum.scrapers.mangadex import MangadexSeries, MangadexChapter +from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter from cum.scrapers.yuriism import YuriismChapter, YuriismSeries series_scrapers = [ @@ -9,6 +10,7 @@ DynastyScansSeries, MadokamiSeries, MangadexSeries, + MangaseeSeries, YuriismSeries, ] chapter_scrapers = [ @@ -16,5 +18,6 @@ DynastyScansChapter, MadokamiChapter, MangadexChapter, + MangaseeChapter, YuriismChapter, ] diff --git a/cum/scrapers/mangasee.py b/cum/scrapers/mangasee.py new file mode 100644 index 0000000..a7e4130 --- /dev/null +++ b/cum/scrapers/mangasee.py @@ -0,0 +1,114 @@ +from bs4 import BeautifulSoup +from cum import config, exceptions +from cum.scrapers.base import BaseChapter, BaseSeries, download_pool +from functools import partial +import concurrent.futures +import json +import re +import requests +import traceback + + +class MangaseeSeries(BaseSeries): + url_re = re.compile(r'https?://mangaseeonline\.us/manga/.+') + + def __init__(self, url, **kwargs): + super().__init__(url, **kwargs) + spage = requests.get(url) + self.soup = BeautifulSoup(spage.text, config.get().html_parser) + self.chapters = self.get_chapters() + + def get_chapters(self): + try: + rows = self.soup.find_all("a", class_="list-group-item") + except AttributeError: + raise exceptions.ScrapingError() + chapters = [] + for i, row in enumerate(rows): + chap_num = re.match(r"Read .+ Chapter ([0-9\.]+) For Free Online", + row["title"]).groups()[0] + chap_url = "https://mangaseeonline.us" + row["href"] + chap_name = row.find_all("span")[0].text + chap_date = row.find_all("time")[0].text + result = MangaseeChapter(name=self.name, + alias=self.alias, + chapter=chap_num, + url=chap_url, + title=chap_name, + groups=[], + upload_date=chap_date) + chapters.append(result) + return chapters + + @property + def name(self): + try: + return re.match(r"Read (.+) Man[a-z]+ For Free \| MangaSee", + self.soup.find_all("title")[0].text).groups()[0] + except AttributeError: + print(traceback.format_exc()) + raise exceptions.ScrapingError + + +class MangaseeChapter(BaseChapter): + url_re = re.compile((r'https?://mangaseeonline\.us/' + r'read-online/.+-chapter-[0-9\.]+-page-[0-9]+\.html')) + upload_date = None + uses_pages = True + + def download(self): + if not getattr(self, 
"cpage", None): + self.cpage = requests.get(self.url) + if not getattr(self, "soup", None): + self.soup = BeautifulSoup(self.cpage.text, + config.get().html_parser) + + for script in self.soup.find_all("script"): + if re.match("\n\tChapterArr=.+", script.text): + image_list = script.text + continue + + image_list = re.sub("\n\tChapterArr=", "", image_list) + image_list = re.sub(";\n\t?", "", image_list) + image_list = re.sub("PageArr=", ",", image_list) + image_list = "[" + image_list + "]" + image_list = json.loads(image_list)[1] + pages = [] + for image in image_list: + if image != "CurPage": + if re.match(".+blogspot.+", image_list[image]): + image_list[image] = image_list[image].\ + replace("http://", "https://") + pages.append(image_list[image]) + + futures = [] + files = [None] * len(pages) + with self.progress_bar(pages) as bar: + for i, page in enumerate(pages): + retries = 0 + while retries < 3: + try: + r = requests.get(page, stream=True) + break + except requests.exceptions.ConnectionError: + retries += 1 + if r.status_code != 200: + r.close() + raise ValueError + fut = download_pool.submit(self.page_download_task, i, r) + fut.add_done_callback(partial(self.page_download_finish, + bar, files)) + futures.append(fut) + concurrent.futures.wait(futures) + self.create_zip(files) + + def from_url(url): + cpage = requests.get(url) + soup = BeautifulSoup(cpage.text, config.get().html_parser) + chap_num = soup.find_all("span", class_="CurChapter")[0].text + iname = soup.find_all("a", class_="list-link")[0]["href"] + series = MangaseeSeries("https://mangaseeonline.us" + iname) + for chapter in series.chapters: + if chapter.chapter == str(chap_num): + return chapter + return None diff --git a/tests/test_scraper_mangasee.py b/tests/test_scraper_mangasee.py new file mode 100644 index 0000000..ca11277 --- /dev/null +++ b/tests/test_scraper_mangasee.py @@ -0,0 +1,138 @@ +from bs4 import BeautifulSoup +from cum import config, exceptions +from nose.tools import nottest +from urllib.parse import urljoin +import cumtest +import os +import requests +import unittest +import zipfile + + +class TestMangasee(cumtest.CumTest): + MANGASEE_URL = 'https://mangaseeonline.us/' + + def setUp(self): + super().setUp() + global mangasee + from cum.scrapers import mangasee + + def tearDown(self): + self.directory.cleanup() + + def get_five_latest_releases(self): + r = requests.get(self.MANGASEE_URL) + soup = BeautifulSoup(r.text, config.get().html_parser) + chapters = soup.find_all("a", class_="latestSeries") + links = [urljoin(self.MANGASEE_URL, x.get("href")) for x in chapters] + return links[:5] + + @nottest + def series_information_tester(self, data): + series = mangasee.MangaseeSeries(data['url']) + self.assertEqual(series.name, data['name']) + self.assertEqual(series.alias, data['alias']) + self.assertEqual(series.url, data['url']) + self.assertIs(series.directory, None) + self.assertEqual(len(series.chapters), len(data['chapters'])) + for chapter in series.chapters: + self.assertEqual(chapter.name, data['name']) + self.assertEqual(chapter.alias, data['alias']) + self.assertIn(chapter.chapter, data['chapters']) + data['chapters'].remove(chapter.chapter) + self.assertIs(chapter.directory, None) + self.assertEqual(len(data['chapters']), 0) + + def test_chapter_download_latest(self): + latest_releases = self.get_five_latest_releases() + for release in latest_releases: + try: + chapter = mangasee.MangaseeChapter.from_url(release) + except exceptions.ScrapingError as e: + print('scraping error for {} - 
{}'.format(release, e)) + continue + else: + chapter.get(use_db=False) + + def test_chapter_filename_decimal(self): + URL = "https://mangaseeonline.us/read-online/" + \ + "Citrus-S-A-B-U-R-O-Uta-chapter-20.5-page-1.html" + chapter = mangasee.MangaseeChapter.from_url(URL) + path = os.path.join(self.directory.name, 'Citrus SABURO Uta', + 'Citrus SABURO Uta - c020 x5 [Unknown].zip') + self.assertEqual(chapter.chapter, '20.5') + self.assertEqual(chapter.filename, path) + + def test_chapter_information_normal(self): + URL = "https://mangaseeonline.us/read-online/" + \ + "Ramen-Daisuki-Koizumi-San-chapter-18-page-1.html" + chapter = mangasee.MangaseeChapter.from_url(URL) + self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san') + self.assertTrue(chapter.available()) + self.assertEqual(chapter.chapter, '18') + self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-san') + self.assertEqual(chapter.title, 'Chapter 18') + path = os.path.join(self.directory.name, + 'Ramen Daisuki Koizumi-san', + 'Ramen Daisuki Koizumi-san - c018 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 8) + + def test_chapter_information_chapterzero(self): + URL = "https://mangaseeonline.us/read-online/" + \ + "Inu-To-Hasami-Wa-Tsukaiyou-chapter-0-page-1.html" + chapter = mangasee.MangaseeChapter.from_url(URL) + self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou') + self.assertEqual(chapter.chapter, '0') + self.assertEqual(chapter.name, 'Inu to Hasami wa Tsukaiyou') + self.assertEqual(chapter.title, 'Chapter 0') + path = os.path.join( + self.directory.name, 'Inu to Hasami wa Tsukaiyou', + 'Inu to Hasami wa Tsukaiyou - c000 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 51) + + def test_chapter_unavailable(self): + URL = "https://mangaseeonline.us/read-online/" + \ + "Oyasumi-Punpun-chapter-999-page-1.html" + chapter = mangasee.MangaseeChapter(url=URL) + self.assertFalse(chapter.available()) + + def test_series_oneword(self): + data = {'alias': 'aria', + 'chapters': ['1', '2', '3', '4', '5', '6', '7', '8', + '9', '10', '10.5', '11', '12', '13', '14', '15', + '16', '17', '18', '19', '20', '21', '22', '23', + '24', '25', '26', '27', '28', '29', '30', '30.5', + '31', '32', '33', '34', '35', '35.5', '36', + '37', '37.5', '38', '39', '40', '41', '42', '43', + '44', '45', '45.5', '46', '47', '48', '49', + '50', '50.5', '51', '52', '53', '54', '55', '56', + '57', '57.5', '58', '59', '60', '60.5'], + 'name': 'Aria', + 'url': 'https://mangaseeonline.us/manga/Aria'} + self.series_information_tester(data) + + def test_series_multiplewords(self): + data = {'alias': 'prunus-girl', + 'chapters': ['1', '2', '3', '4', '5', '6', '7', '8', + '9', '10', '11', '12', '13', '14', '15', + '16', '17', '18', '19', '20', '21', '22', + '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '32.5', '33', '34', '35', '36', '37', + '38', '39', '40', '41', '42', '43'], + 'name': 'Prunus Girl', + 'url': 'https://mangaseeonline.us/manga/Prunus-Girl'} + self.series_information_tester(data) + + +if __name__ == '__main__': + unittest.main() From 01b3823b40b06f01abbb00762bfeb13d54c4bfb8 Mon Sep 17 00:00:00 2001 From: matoro Date: Sat, 9 Feb 2019 11:26:11 -0500 Subject: [PATCH 02/19] 
handle requests.exceptions.StreamConsumedError in page_download_task() This basically ignores this exception that requests throws. My understanding is that it is raised when you attempt to iter_content() over the same content twice. Don't understand how that situation arises with the current code but it did somehow. https://stackoverflow.com/questions/45379903/ --- cum/scrapers/base.py | 14 +++++++++++--- cum/scrapers/mangasee.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cum/scrapers/base.py b/cum/scrapers/base.py index b074658..1a86f5e 100644 --- a/cum/scrapers/base.py +++ b/cum/scrapers/base.py @@ -299,9 +299,17 @@ def page_download_task(page_num, r): """ ext = BaseChapter.guess_extension(r.headers.get('content-type')) f = NamedTemporaryFile(suffix=ext, delete=False) - for chunk in r.iter_content(chunk_size=4096): - if chunk: - f.write(chunk) + try: + for chunk in r.iter_content(chunk_size=4096): + if chunk: + f.write(chunk) + # basically ignores this exception that requests throws. my + # understanding is that it is raised when you attempt to iter_content() + # over the same content twice. don't understand how that situation + # arises with the current code but it did somehow. + # https://stackoverflow.com/questions/45379903/ + except requests.exceptions.StreamConsumedError: + pass f.flush() f.close() r.close() diff --git a/cum/scrapers/mangasee.py b/cum/scrapers/mangasee.py index a7e4130..2c5748d 100644 --- a/cum/scrapers/mangasee.py +++ b/cum/scrapers/mangasee.py @@ -86,7 +86,7 @@ def download(self): with self.progress_bar(pages) as bar: for i, page in enumerate(pages): retries = 0 - while retries < 3: + while retries < 10: try: r = requests.get(page, stream=True) break From 9d382eed7d5245908e02e0994ee055cf8996893d Mon Sep 17 00:00:00 2001 From: matoro Date: Sun, 10 Feb 2019 21:15:28 -0500 Subject: [PATCH 03/19] mangasee: improve speed with persistent session --- cum/scrapers/mangasee.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cum/scrapers/mangasee.py b/cum/scrapers/mangasee.py index 2c5748d..577bdb9 100644 --- a/cum/scrapers/mangasee.py +++ b/cum/scrapers/mangasee.py @@ -83,12 +83,13 @@ def download(self): futures = [] files = [None] * len(pages) + req_session = requests.Session() with self.progress_bar(pages) as bar: for i, page in enumerate(pages): retries = 0 while retries < 10: try: - r = requests.get(page, stream=True) + r = req_session.get(page, stream=True) break except requests.exceptions.ConnectionError: retries += 1 From 9d4325cb55c6f2340ac27317db8f2a11df2e7d0b Mon Sep 17 00:00:00 2001 From: matoro Date: Tue, 12 Feb 2019 21:06:18 -0500 Subject: [PATCH 04/19] scraper: do not sanitize download_dir when explicitly set Assume that if the download_dir is explicitly set, the user wanted it exactly that way. If they include bad characters and it breaks things, that is their fault. 
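In rough terms, the relevant part of BaseChapter.filename() now behaves
like this (a sketch only; the helper names are the ones the method
already uses, and the real change is in the diff below):

    directory_set = bool(self.directory)
    directory = os.path.expanduser(self.directory) if directory_set else name
    download_dir = os.path.expanduser(config.get().download_directory)
    download_dir = os.path.join(download_dir, directory)
    if not directory_set:
        # auto-derived from the series name, so strip unsafe characters
        download_dir = self._strip_unwanted_characters(download_dir)
    download_dir = self.create_directory(download_dir)

An explicitly set directory never goes through _strip_unwanted_characters().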
--- cum/scrapers/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cum/scrapers/base.py b/cum/scrapers/base.py index 1a86f5e..f0e5012 100644 --- a/cum/scrapers/base.py +++ b/cum/scrapers/base.py @@ -209,13 +209,20 @@ def filename(self): else: ext = 'zip' + directory_set = False if self.directory: directory = os.path.expanduser(self.directory) + directory_set = True else: directory = name download_dir = os.path.expanduser(config.get().download_directory) download_dir = os.path.join(download_dir, directory) - download_dir = self._strip_unwanted_characters(download_dir) + # only sanitize download_dir if the user did not explicitly set it + # assume that if it is set, the user wanted it exactly as set + # if they include bad characters and it breaks things, that's their + # fault. + if not directory_set: + download_dir = self._strip_unwanted_characters(download_dir) download_dir = self.create_directory(download_dir) # Format the filename somewhat based on Daiz's manga naming scheme. From c708c3a89a860b1dd54c63edb7c81932d1268977 Mon Sep 17 00:00:00 2001 From: matoro Date: Thu, 14 Feb 2019 21:45:36 -0500 Subject: [PATCH 05/19] add mangahere scraper; numerous misc fixes remove debugging imports, add more tests to mangasee scraper, add support for multi-volume/multi-season titles, fix 404 detection on mangasee scraper, change beautifulsoup element parsing to find() instead of find_all() --- cum/db.py | 3 + cum/scrapers/__init__.py | 3 + cum/scrapers/base.py | 5 + cum/scrapers/mangahere.py | 129 +++++++++++++++++++++ cum/scrapers/mangasee.py | 42 +++++-- tests/test_scraper_mangahere.py | 199 ++++++++++++++++++++++++++++++++ tests/test_scraper_mangasee.py | 42 +++++++ 7 files changed, 415 insertions(+), 8 deletions(-) create mode 100644 cum/scrapers/mangahere.py create mode 100644 tests/test_scraper_mangahere.py diff --git a/cum/db.py b/cum/db.py index b12ef3c..db1af2a 100644 --- a/cum/db.py +++ b/cum/db.py @@ -245,6 +245,9 @@ def to_object(self): if parse.netloc == 'mangaseeonline.us': from cum.scrapers.mangasee import MangaseeChapter return MangaseeChapter(**kwargs) + if parse.netloc in ('www.mangahere.cc', 'm.mangahere.cc'): + from cum.scrapers.mangahere import MangahereChapter + return MangahereChapter(**kwargs) class Group(Base): diff --git a/cum/scrapers/__init__.py b/cum/scrapers/__init__.py index 8266d07..b7a68d9 100644 --- a/cum/scrapers/__init__.py +++ b/cum/scrapers/__init__.py @@ -3,6 +3,7 @@ from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries from cum.scrapers.mangadex import MangadexSeries, MangadexChapter from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter +from cum.scrapers.mangahere import MangahereSeries, MangahereChapter from cum.scrapers.yuriism import YuriismChapter, YuriismSeries series_scrapers = [ @@ -11,6 +12,7 @@ MadokamiSeries, MangadexSeries, MangaseeSeries, + MangahereSeries, YuriismSeries, ] chapter_scrapers = [ @@ -19,5 +21,6 @@ MadokamiChapter, MangadexChapter, MangaseeChapter, + MangahereChapter, YuriismChapter, ] diff --git a/cum/scrapers/base.py b/cum/scrapers/base.py index f0e5012..b0d1866 100644 --- a/cum/scrapers/base.py +++ b/cum/scrapers/base.py @@ -195,6 +195,11 @@ def filename(self): elif match(r'[0-9]*\.[0-9]*$', self.chapter): number, decimal = self.chapter.split('.') chapter = 'c{:0>3} x{}'.format(number, decimal) + # Individually numbered chapter with double-decimal (e.g. '2.164.5'). + # Used by titles with multiple volumes/seasons and special chapters. 
+ elif match(r'[0-9]*(\.[0-9]*){2}$', self.chapter): + volume, number, decimal = self.chapter.split('.') + chapter = 'c{:0>3} x{:0>3}.{}'.format(volume, number, decimal) # Failing all else, e.g. 'Special'. Becomes 'c000 [Special]'. else: chapter = 'c000 [{}]'.format(self.chapter) diff --git a/cum/scrapers/mangahere.py b/cum/scrapers/mangahere.py new file mode 100644 index 0000000..41c72d8 --- /dev/null +++ b/cum/scrapers/mangahere.py @@ -0,0 +1,129 @@ +from bs4 import BeautifulSoup +from cum import config, exceptions +from cum.scrapers.base import BaseChapter, BaseSeries, download_pool +from functools import partial +import concurrent.futures +import re +import requests + + +class MangahereSeries(BaseSeries): + url_re = re.compile(r'https?://((www|m)\.)?mangahere\.cc/manga/.+') + + def __init__(self, url, **kwargs): + super().__init__(url, **kwargs) + # convert mobile link to desktop + spage = requests.get(url.replace("m.", "www.")) + if spage.status_code == 404: + raise exceptions.ScrapingError + self.soup = BeautifulSoup(spage.text, config.get().html_parser) + self.chapters = self.get_chapters() + + def get_chapters(self): + try: + rows = self.soup.find("ul", class_="detail-main-list")\ + .find_all("li") + except AttributeError: + raise exceptions.ScrapingError() + chapters = [] + for i, row in enumerate(rows): + chap_num = re.match((r"/manga/[^/]+((/v[0-9]+)?" + r"/c[0-9\.]+)/[0-9]+\.html$"), + row.find("a")["href"]).groups()[0]\ + .replace("/", "") + if "v" in chap_num: + chap_num = chap_num.replace("v", "").replace("c", ".") + else: + chap_num = chap_num.replace("c", "") + if chap_num == "000": + chap_num = "0" + else: + chap_num = chap_num.lstrip("0") + # convert mobile link to desktop + chap_url = "https://www.mangahere.cc" + \ + row.find("a")["href"].replace("/roll_manga/", "/manga/") + chap_name = row.find("p", class_="title3").text + chap_date = row.find("p", class_="title2").text + result = MangahereChapter(name=self.name, + alias=self.alias, + chapter=chap_num, + url=chap_url, + title=chap_name, + groups=[], + upload_date=chap_date) + chapters.append(result) + return chapters + + @property + def name(self): + try: + return re.match(r".+ - Read (.+) Online at MangaHere$", + self.soup.find("title").text).groups()[0] + except AttributeError: + raise exceptions.ScrapingError + + +class MangahereChapter(BaseChapter): + url_re = re.compile((r'https?://((www|m)\.)?mangahere\.cc' + r'/(roll_)?manga(/v[0-9]+)?/c[0-9\.]+/[0-9]+\.html$')) + upload_date = None + uses_pages = True + + def download(self): + if not getattr(self, "cpage", None): + self.cpage = requests.get(self.url.replace("www.", "m.") + .replace("/manga/", "/roll_manga/")) + if not getattr(self, "soup", None): + self.soup = BeautifulSoup(self.cpage.text, + config.get().html_parser) + + image_list = self.soup.find("div", class_="mangaread-img")\ + .find_all("img") + pages = [] + for image in image_list: + pages.append(image["data-original"].replace("http://", "https://")) + + futures = [] + files = [None] * len(pages) + req_session = requests.Session() + with self.progress_bar(pages) as bar: + for i, page in enumerate(pages): + retries = 0 + while retries < 10: + try: + r = req_session.get(page, stream=True) + break + except requests.exceptions.ConnectionError: + retries += 1 + if r.status_code != 200: + r.close() + raise ValueError + fut = download_pool.submit(self.page_download_task, i, r) + fut.add_done_callback(partial(self.page_download_finish, + bar, files)) + futures.append(fut) + concurrent.futures.wait(futures) + 
self.create_zip(files) + + def from_url(url): + chap_num = re.match((r"https?://((www|m)\.)?mangahere\.cc/(roll_)?" + r"manga/[^/]+((/v[0-9]+)?/c[0-9\.]+)" + r"/[0-9]+\.html"), url)\ + .groups()[3].replace("/", "") + if "v" in chap_num: + chap_num = chap_num.replace("v", "").replace("c", ".") + else: + chap_num = chap_num.replace("c", "") + if chap_num == "000": + chap_num = "0" + else: + chap_num = chap_num.lstrip("0") + parent_url = re.match((r"(https?://((www|m)\.)?mangahere\.cc/(roll_)?" + r"manga/[^/]+)(/v[0-9]+)?/" + r"c[0-9\.]+/[0-9]+\.html"), + url).groups()[0] + series = MangahereSeries(parent_url) + for chapter in series.chapters: + if chapter.chapter == str(chap_num): + return chapter + return None diff --git a/cum/scrapers/mangasee.py b/cum/scrapers/mangasee.py index 577bdb9..67d9d9c 100644 --- a/cum/scrapers/mangasee.py +++ b/cum/scrapers/mangasee.py @@ -6,18 +6,32 @@ import json import re import requests -import traceback class MangaseeSeries(BaseSeries): url_re = re.compile(r'https?://mangaseeonline\.us/manga/.+') + multi_season_regex = re.compile((r"(https?://mangaseeonline\.us)" + r"?/read-online/" + r".+-chapter-[0-9\.]+-index-" + r"([0-9]+)-page-[0-9]+\.html")) def __init__(self, url, **kwargs): super().__init__(url, **kwargs) spage = requests.get(url) + if spage.status_code == 404: + raise exceptions.ScrapingError self.soup = BeautifulSoup(spage.text, config.get().html_parser) self.chapters = self.get_chapters() + def _get_chapnum_multiseason_series(self, url, chap_num): + if not re.match(self.multi_season_regex, url): + # chapter is from season 1 + return "01." + chap_num.zfill(3) + else: + # chapter is from season >1 + season = re.match(self.multi_season_regex, url).groups()[1] + return season.zfill(2) + "." + chap_num.zfill(3) + def get_chapters(self): try: rows = self.soup.find_all("a", class_="list-group-item") @@ -27,9 +41,12 @@ def get_chapters(self): for i, row in enumerate(rows): chap_num = re.match(r"Read .+ Chapter ([0-9\.]+) For Free Online", row["title"]).groups()[0] + if not hasattr(self, "is_multi_season"): + if re.match(self.multi_season_regex, row["href"]): + self.is_multi_season = True chap_url = "https://mangaseeonline.us" + row["href"] - chap_name = row.find_all("span")[0].text - chap_date = row.find_all("time")[0].text + chap_name = row.find("span").text + chap_date = row.find("time").text result = MangaseeChapter(name=self.name, alias=self.alias, chapter=chap_num, @@ -38,15 +55,24 @@ def get_chapters(self): groups=[], upload_date=chap_date) chapters.append(result) + # the chapters in the first season of a multi-season title + # are indistinguishable from a non-multi-season title. 
thus + # we must retroactively reanalyze all chapters and adjust + # chapter numbers if *any* are multi-season + if hasattr(self, "is_multi_season"): + for chapter in chapters: + chapter.chapter = self.\ + _get_chapnum_multiseason_series(chapter.url, + chapter.chapter) + return chapters @property def name(self): try: return re.match(r"Read (.+) Man[a-z]+ For Free \| MangaSee", - self.soup.find_all("title")[0].text).groups()[0] + self.soup.find("title").text).groups()[0] except AttributeError: - print(traceback.format_exc()) raise exceptions.ScrapingError @@ -106,10 +132,10 @@ def download(self): def from_url(url): cpage = requests.get(url) soup = BeautifulSoup(cpage.text, config.get().html_parser) - chap_num = soup.find_all("span", class_="CurChapter")[0].text - iname = soup.find_all("a", class_="list-link")[0]["href"] + # chap_num = soup.find("span", class_="CurChapter").text + iname = soup.find("a", class_="list-link")["href"] series = MangaseeSeries("https://mangaseeonline.us" + iname) for chapter in series.chapters: - if chapter.chapter == str(chap_num): + if chapter.url == url: return chapter return None diff --git a/tests/test_scraper_mangahere.py b/tests/test_scraper_mangahere.py new file mode 100644 index 0000000..70b3d70 --- /dev/null +++ b/tests/test_scraper_mangahere.py @@ -0,0 +1,199 @@ +from bs4 import BeautifulSoup +from cum import config, exceptions +from nose.tools import nottest +from urllib.parse import urljoin +import cumtest +import os +import requests +import unittest +import zipfile + + +class TestMangahere(cumtest.CumTest): + MANGAHERE_URL = 'https://www.mangahere.cc/' + + def setUp(self): + super().setUp() + global mangahere + from cum.scrapers import mangahere + + def tearDown(self): + self.directory.cleanup() + + def get_five_latest_releases(self): + r = requests.get(self.MANGAHERE_URL) + soup = BeautifulSoup(r.text, config.get().html_parser) + chapters = soup.find("ul", class_="manga-list-1-list").find_all("li") + links = [urljoin(self.MANGAHERE_URL, + x.find("p", class_="manga-list-1-item-subtitle") + .find("a").get("href")) for x in chapters] + return links[:5] + + @nottest + def series_information_tester(self, data): + series = mangahere.MangahereSeries(data['url']) + self.assertEqual(series.name, data['name']) + self.assertEqual(series.alias, data['alias']) + self.assertEqual(series.url, data['url']) + self.assertIs(series.directory, None) + self.assertEqual(len(series.chapters), len(data['chapters'])) + for chapter in series.chapters: + self.assertEqual(chapter.name, data['name']) + self.assertEqual(chapter.alias, data['alias']) + self.assertIn(chapter.chapter, data['chapters']) + data['chapters'].remove(chapter.chapter) + self.assertIs(chapter.directory, None) + self.assertEqual(len(data['chapters']), 0) + + def test_chapter_download_latest(self): + latest_releases = self.get_five_latest_releases() + for release in latest_releases: + try: + chapter = mangahere.MangahereChapter.from_url(release) + except exceptions.ScrapingError as e: + print('scraping error for {} - {}'.format(release, e)) + continue + else: + chapter.get(use_db=False) + + def test_chapter_filename_decimal(self): + URL = "https://www.mangahere.cc/manga/citrus_saburouta/" + URL = "https://www.mangahere.cc/manga/citrus_saburouta/" + \ + "c020.5/1.html" + chapter = mangahere.MangahereChapter.from_url(URL) + path = os.path.join(self.directory.name, 'Citrus Saburouta', + 'Citrus Saburouta - c020 x5 [Unknown].zip') + self.assertEqual(chapter.chapter, '20.5') + self.assertEqual(chapter.filename, 
path) + + def test_chapter_information_normal(self): + URL = "https://www.mangahere.cc/manga/" + \ + "ramen_daisuki_koizumi_san/c018/1.html" + chapter = mangahere.MangahereChapter.from_url(URL) + self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san') + self.assertTrue(chapter.available()) + self.assertEqual(chapter.chapter, '18') + self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi san') + self.assertEqual(chapter.title, + 'Ch.018 - Eighteenth Bowl: Strange-flavored Ramen') + path = os.path.join(self.directory.name, + 'Ramen Daisuki Koizumi san', + 'Ramen Daisuki Koizumi san - c018 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 8) + + def test_chapter_information_chapterzero(self): + URL = "https://www.mangahere.cc/manga/" + \ + "hidamari_sketch/v01/c000/1.html" + URL = "https://www.mangahere.cc/manga/" + \ + "inu_to_hasami_wa_tsukaiyou/c000/1.html" + chapter = mangahere.MangahereChapter.from_url(URL) + self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou') + self.assertEqual(chapter.chapter, '0') + self.assertEqual(chapter.name, 'Inu to Hasami wa Tsukaiyou') + self.assertEqual(chapter.title, 'Ch.000') + path = os.path.join( + self.directory.name, 'Inu to Hasami wa Tsukaiyou', + 'Inu to Hasami wa Tsukaiyou - c000 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 32) + + def test_chapter_information_volume(self): + URL = "https://www.mangahere.cc/manga/" + \ + "full_metal_alchemist/v026/c107/1.html" + chapter = mangahere.MangahereChapter.from_url(URL) + self.assertEqual(chapter.alias, 'full-metal-alchemist') + self.assertEqual(chapter.chapter, '26.107') + self.assertEqual(chapter.name, 'Full Metal Alchemist') + self.assertEqual(chapter.title, 'Vol.026 Ch.107 - The Final Battle') + path = os.path.join( + self.directory.name, 'Full Metal Alchemist', + 'Full Metal Alchemist - c026 x107 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 69) + + def test_chapter_information_volume_decimal(self): + URL = "https://www.mangahere.cc/manga/" + \ + "ai_yori_aoshi/v16/c133.5/1.html" + chapter = mangahere.MangahereChapter.from_url(URL) + self.assertEqual(chapter.alias, 'ai-yori-aoshi') + self.assertEqual(chapter.chapter, '16.133.5') + self.assertEqual(chapter.name, 'Ai Yori Aoshi') + self.assertEqual(chapter.title, 'Vol.16 Ch.133.5 ' + + '- Special Chapter - Hanakotoba - Language of Flower') + path = os.path.join( + self.directory.name, 'Ai Yori Aoshi', + 'Ai Yori Aoshi - c016 x133.5 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 14) + + def test_series_invalid(self): + URL = "https://www.mangahere.cc/manga/not_a_manga" + with self.assertRaises(exceptions.ScrapingError): + series = mangahere.MangahereSeries(url=URL) + + def test_chapter_unavailable_badvolume(self): + URL = "https://www.mangahere.cc/manga/oyasumi_punpun/v99/c147/1.html" + chapter = 
mangahere.MangahereChapter(url=URL) + self.assertFalse(chapter.available()) + + def test_chapter_unavailable_badchapter(self): + URL = "https://www.mangahere.cc/manga/oyasumi_punpun/v09/c999/1.html" + chapter = mangahere.MangahereChapter(url=URL) + self.assertFalse(chapter.available()) + + def test_chapter_unavailable_flatchapters(self): + URL = "https://www.mangahere.cc/manga/nikoniko_x_punpun/c999/1.html" + chapter = mangahere.MangahereChapter(url=URL) + self.assertFalse(chapter.available()) + + def test_series_flatchapters(self): + data = {'alias': 'aria', + 'chapters': ['1', '2', '3', '4', '5', '6', '7', '8', + '9', '10', '10.5', '11', '12', '13', '14', '15', + '16', '17', '18', '19', '20', '21', '22', '23', + '24', '25', '26', '27', '28', '29', '30', '30.5', + '31', '32', '33', '34', '35', '35.5', '36', + '37', '37.5', '38', '39', '40', '41', '42', '43', + '44', '45', '45.5', '46', '47', '48', '49', + '50', '50.5', '51', '52', '53', '54', '55', '56', + '57', '57.5', '58', '59', '60', '60.1'], + 'name': 'Aria', + 'url': 'https://www.mangahere.cc/manga/aria'} + self.series_information_tester(data) + + def test_series_volumes(self): + data = {'alias': 'prunus-girl', + 'chapters': ['1.001', '1.001.5', '1.002', '1.003', '1.004', + '1.005', '1.005.5', '1.006', '1.007', '1.008', + '1.009', '1.010', '1.011', '1.011.5', '2.012', + '2.013', '2.014', '2.015', '3.014', '3.015', + '3.016', '3.017', '3.018', '3.019', '3.020', + '3.021', '3.022', '3.023', '3.024', '3.025', + '3.026', '3.027', '5.028', '5.029', '5.030', + '5.031', '5.032', '5.032.5', '5.033', '5.034', + '5.035', '5.036', '5.037', '5.038', '5.039', + '5.040', '5.041', '5.042', '5.042.5'], + 'name': 'Prunus Girl', + 'url': 'https://www.mangahere.cc/manga/prunus_girl'} + self.series_information_tester(data) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_scraper_mangasee.py b/tests/test_scraper_mangasee.py index ca11277..ccf9176 100644 --- a/tests/test_scraper_mangasee.py +++ b/tests/test_scraper_mangasee.py @@ -100,6 +100,48 @@ def test_chapter_information_chapterzero(self): files = chapter_zip.infolist() self.assertEqual(len(files), 51) + def test_chapter_information_multiseason(self): + URL = "https://mangaseeonline.us/read-online/" + \ + "Kubera-chapter-3-index-2-page-1.html" + chapter = mangasee.MangaseeChapter.from_url(URL) + self.assertEqual(chapter.alias, 'kubera') + self.assertEqual(chapter.chapter, '02.003') + self.assertEqual(chapter.name, 'Kubera') + self.assertEqual(chapter.title, 'S2 - Chapter 3') + path = os.path.join( + self.directory.name, 'Kubera', + 'Kubera - c002 x003 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 39) + + def test_chapter_information_multiseason_decimal(self): + URL = "https://mangaseeonline.us/read-online/" + \ + "Kubera-chapter-164.5-index-2-page-1.html" + chapter = mangasee.MangaseeChapter.from_url(URL) + self.assertEqual(chapter.alias, 'kubera') + self.assertEqual(chapter.chapter, '02.164.5') + self.assertEqual(chapter.name, 'Kubera') + self.assertEqual(chapter.title, 'S2 - Chapter 164.5') + path = os.path.join( + self.directory.name, 'Kubera', + 'Kubera - c002 x164.5 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 
45) + + def test_series_invalid(self): + URL = "https://mangaseeonline.us/read-online/" + \ + "not_a_manga" + with self.assertRaises(exceptions.ScrapingError): + series = mangasee.MangaseeSeries(url=URL) + def test_chapter_unavailable(self): URL = "https://mangaseeonline.us/read-online/" + \ "Oyasumi-Punpun-chapter-999-page-1.html" From 0f28bcacb611159e517eb40612aa49d5fa0c747f Mon Sep 17 00:00:00 2001 From: matoro Date: Mon, 18 Feb 2019 12:46:11 -0500 Subject: [PATCH 06/19] mangahere: fix chapter scraping for adult content warning --- cum/scrapers/mangahere.py | 29 ++++++++++++++++------------- tests/test_scraper_mangahere.py | 25 ++++++++++++++----------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/cum/scrapers/mangahere.py b/cum/scrapers/mangahere.py index 41c72d8..1669949 100644 --- a/cum/scrapers/mangahere.py +++ b/cum/scrapers/mangahere.py @@ -12,8 +12,9 @@ class MangahereSeries(BaseSeries): def __init__(self, url, **kwargs): super().__init__(url, **kwargs) - # convert mobile link to desktop - spage = requests.get(url.replace("m.", "www.")) + # convert desktop link to mobile + # bypasses adult content warning js + spage = requests.get(url.replace("www.", "m.")) if spage.status_code == 404: raise exceptions.ScrapingError self.soup = BeautifulSoup(spage.text, config.get().html_parser) @@ -21,14 +22,15 @@ def __init__(self, url, **kwargs): def get_chapters(self): try: - rows = self.soup.find("ul", class_="detail-main-list")\ - .find_all("li") + rows = self.soup.find("div", class_="manga-chapters")\ + .find("ul").find_all("li") except AttributeError: raise exceptions.ScrapingError() chapters = [] for i, row in enumerate(rows): - chap_num = re.match((r"/manga/[^/]+((/v[0-9]+)?" - r"/c[0-9\.]+)/[0-9]+\.html$"), + chap_num = re.match((r"//m\.mangahere\.cc" + r"/manga/[^/]+((/v[0-9]+)?" 
+ r"/c[0-9\.]+)/?$"), row.find("a")["href"]).groups()[0]\ .replace("/", "") if "v" in chap_num: @@ -40,24 +42,23 @@ def get_chapters(self): else: chap_num = chap_num.lstrip("0") # convert mobile link to desktop - chap_url = "https://www.mangahere.cc" + \ - row.find("a")["href"].replace("/roll_manga/", "/manga/") - chap_name = row.find("p", class_="title3").text - chap_date = row.find("p", class_="title2").text + chap_url = "https:" + row.find("a")["href"]\ + .replace("/roll_manga/", "/manga/")\ + .replace("m.", "www.") + chap_name = row.text result = MangahereChapter(name=self.name, alias=self.alias, chapter=chap_num, url=chap_url, title=chap_name, - groups=[], - upload_date=chap_date) + groups=[]) chapters.append(result) return chapters @property def name(self): try: - return re.match(r".+ - Read (.+) Online at MangaHere$", + return re.match(r"(.+) - MangaHere Mobile$", self.soup.find("title").text).groups()[0] except AttributeError: raise exceptions.ScrapingError @@ -73,6 +74,8 @@ def download(self): if not getattr(self, "cpage", None): self.cpage = requests.get(self.url.replace("www.", "m.") .replace("/manga/", "/roll_manga/")) + if self.cpage.status_code == 404: + raise exceptions.ScrapingError if not getattr(self, "soup", None): self.soup = BeautifulSoup(self.cpage.text, config.get().html_parser) diff --git a/tests/test_scraper_mangahere.py b/tests/test_scraper_mangahere.py index 70b3d70..03880b3 100644 --- a/tests/test_scraper_mangahere.py +++ b/tests/test_scraper_mangahere.py @@ -45,6 +45,13 @@ def series_information_tester(self, data): self.assertIs(chapter.directory, None) self.assertEqual(len(data['chapters']), 0) + # This test is disabled because I have discovered (via this test) + # that for some series, the mobile links for chapters return 404s, + # even the links on the actual mobile index page, making those + # chapters unavailable via mobile. Until I can get around to + # reverse-engineering the obfuscation on the desktop site, + # some series may not be able to be downloaded/followed. 
+ @nottest def test_chapter_download_latest(self): latest_releases = self.get_five_latest_releases() for release in latest_releases: @@ -73,12 +80,11 @@ def test_chapter_information_normal(self): self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san') self.assertTrue(chapter.available()) self.assertEqual(chapter.chapter, '18') - self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi san') - self.assertEqual(chapter.title, - 'Ch.018 - Eighteenth Bowl: Strange-flavored Ramen') + self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-san') + self.assertEqual(chapter.title, 'C.18') path = os.path.join(self.directory.name, - 'Ramen Daisuki Koizumi san', - 'Ramen Daisuki Koizumi san - c018 [Unknown].zip') + 'Ramen Daisuki Koizumi-san', + 'Ramen Daisuki Koizumi-san - c018 [Unknown].zip') self.assertEqual(chapter.filename, path) chapter.download() self.assertTrue(os.path.isfile(path)) @@ -87,15 +93,13 @@ def test_chapter_information_normal(self): self.assertEqual(len(files), 8) def test_chapter_information_chapterzero(self): - URL = "https://www.mangahere.cc/manga/" + \ - "hidamari_sketch/v01/c000/1.html" URL = "https://www.mangahere.cc/manga/" + \ "inu_to_hasami_wa_tsukaiyou/c000/1.html" chapter = mangahere.MangahereChapter.from_url(URL) self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou') self.assertEqual(chapter.chapter, '0') self.assertEqual(chapter.name, 'Inu to Hasami wa Tsukaiyou') - self.assertEqual(chapter.title, 'Ch.000') + self.assertEqual(chapter.title, 'C.0') path = os.path.join( self.directory.name, 'Inu to Hasami wa Tsukaiyou', 'Inu to Hasami wa Tsukaiyou - c000 [Unknown].zip') @@ -113,7 +117,7 @@ def test_chapter_information_volume(self): self.assertEqual(chapter.alias, 'full-metal-alchemist') self.assertEqual(chapter.chapter, '26.107') self.assertEqual(chapter.name, 'Full Metal Alchemist') - self.assertEqual(chapter.title, 'Vol.026 Ch.107 - The Final Battle') + self.assertEqual(chapter.title, 'V.26 C.107') path = os.path.join( self.directory.name, 'Full Metal Alchemist', 'Full Metal Alchemist - c026 x107 [Unknown].zip') @@ -131,8 +135,7 @@ def test_chapter_information_volume_decimal(self): self.assertEqual(chapter.alias, 'ai-yori-aoshi') self.assertEqual(chapter.chapter, '16.133.5') self.assertEqual(chapter.name, 'Ai Yori Aoshi') - self.assertEqual(chapter.title, 'Vol.16 Ch.133.5 ' + - '- Special Chapter - Hanakotoba - Language of Flower') + self.assertEqual(chapter.title, 'V.16 C.133.5') path = os.path.join( self.directory.name, 'Ai Yori Aoshi', 'Ai Yori Aoshi - c016 x133.5 [Unknown].zip') From 6b49af679f07630b6a558881266c113d1dec1f4b Mon Sep 17 00:00:00 2001 From: matoro Date: Sat, 23 Feb 2019 23:33:48 -0500 Subject: [PATCH 07/19] mangahere: fix chapter regex pattern, add test as discovered by this fix, there is a lack of test coverage on guaranteeing series/chapter URL regex matches which needs to be addressed, since directly invoking from_url() does not check matches against .url_re --- cum/scrapers/mangahere.py | 2 +- tests/test_scraper_mangahere.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/cum/scrapers/mangahere.py b/cum/scrapers/mangahere.py index 1669949..70e6fd0 100644 --- a/cum/scrapers/mangahere.py +++ b/cum/scrapers/mangahere.py @@ -66,7 +66,7 @@ def name(self): class MangahereChapter(BaseChapter): url_re = re.compile((r'https?://((www|m)\.)?mangahere\.cc' - r'/(roll_)?manga(/v[0-9]+)?/c[0-9\.]+/[0-9]+\.html$')) + r'/(roll_)?manga/[^/]+(/v[0-9]+)?/c[0-9\.]+/[0-9]+\.html$')) upload_date = None uses_pages = True 
diff --git a/tests/test_scraper_mangahere.py b/tests/test_scraper_mangahere.py index 03880b3..e3bcb14 100644 --- a/tests/test_scraper_mangahere.py +++ b/tests/test_scraper_mangahere.py @@ -92,6 +92,25 @@ def test_chapter_information_normal(self): files = chapter_zip.infolist() self.assertEqual(len(files), 8) + def test_chapter_information_multidigit(self): + URL = "https://www.mangahere.cc/manga/" + \ + "tsurezure_children/c192/1.html" + chapter = mangahere.MangahereChapter.from_url(URL) + self.assertEqual(chapter.alias, 'tsurezure-children') + self.assertTrue(chapter.available()) + self.assertEqual(chapter.chapter, '192') + self.assertEqual(chapter.name, 'Tsurezure Children') + self.assertEqual(chapter.title, 'C.192') + path = os.path.join(self.directory.name, + 'Tsurezure Children', + 'Tsurezure Children - c192 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 6) + def test_chapter_information_chapterzero(self): URL = "https://www.mangahere.cc/manga/" + \ "inu_to_hasami_wa_tsukaiyou/c000/1.html" From c5eba73a73982cf4123bef21da646398ddc6030e Mon Sep 17 00:00:00 2001 From: matoro Date: Sat, 23 Feb 2019 23:48:44 -0500 Subject: [PATCH 08/19] allow download individual chapters via with get command --- cum/cum.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cum/cum.py b/cum/cum.py index 689c205..041a961 100755 --- a/cum/cum.py +++ b/cum/cum.py @@ -265,21 +265,21 @@ def get(input, directory): """ chapter_list = [] for item in input: + series = None try: series = utility.series_by_url(item) except exceptions.ScrapingError: - output.warning('Scraping error ({})'.format(item)) - continue + pass except exceptions.LoginError as e: output.warning('{} ({})'.format(e.message, item)) continue if series: chapter_list += series.chapters + chapter = None try: chapter = utility.chapter_by_url(item) except exceptions.ScrapingError: - output.warning('Scraping error ({})'.format(item)) - continue + pass except exceptions.LoginError as e: output.warning('{} ({})'.format(e.message, item)) continue From 61395ee2a4c460e18b8d3086ae7f0a9a314cb808 Mon Sep 17 00:00:00 2001 From: matoro Date: Sat, 23 Nov 2019 11:05:59 -0500 Subject: [PATCH 09/19] mangahere: print error message when bad status code received --- cum/scrapers/mangahere.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cum/scrapers/mangahere.py b/cum/scrapers/mangahere.py index 70e6fd0..e40701b 100644 --- a/cum/scrapers/mangahere.py +++ b/cum/scrapers/mangahere.py @@ -1,5 +1,5 @@ from bs4 import BeautifulSoup -from cum import config, exceptions +from cum import config, exceptions, output from cum.scrapers.base import BaseChapter, BaseSeries, download_pool from functools import partial import concurrent.futures @@ -100,6 +100,7 @@ def download(self): retries += 1 if r.status_code != 200: r.close() + output.error("Page download got status code {}".format(str(r.status_code))) raise ValueError fut = download_pool.submit(self.page_download_task, i, r) fut.add_done_callback(partial(self.page_download_finish, From 677415007bde275746cb06eb4457fb78feb55f01 Mon Sep 17 00:00:00 2001 From: matoro Date: Wed, 27 Nov 2019 18:53:43 -0500 Subject: [PATCH 10/19] mangasee: allow multiple retries of failed page fetch --- cum/scrapers/mangasee.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cum/scrapers/mangasee.py 
b/cum/scrapers/mangasee.py index 67d9d9c..0dfc7da 100644 --- a/cum/scrapers/mangasee.py +++ b/cum/scrapers/mangasee.py @@ -1,5 +1,5 @@ from bs4 import BeautifulSoup -from cum import config, exceptions +from cum import config, exceptions, output from cum.scrapers.base import BaseChapter, BaseSeries, download_pool from functools import partial import concurrent.futures @@ -116,11 +116,17 @@ def download(self): while retries < 10: try: r = req_session.get(page, stream=True) - break + if r.status_code != 200: + output.warning('Failed to fetch page with status {}, retrying #{}' + .format(str(r.status_code), str(retries))) + retries += 1 + else: + break except requests.exceptions.ConnectionError: retries += 1 if r.status_code != 200: - r.close() + output.error('Failed to fetch page with status {}, giving up' + .format(str(r.status_code))) raise ValueError fut = download_pool.submit(self.page_download_task, i, r) fut.add_done_callback(partial(self.page_download_finish, From 79115df21fa645fc88e269a9733f084979c7db97 Mon Sep 17 00:00:00 2001 From: matoro Date: Thu, 5 Mar 2020 08:08:37 -0500 Subject: [PATCH 11/19] Add Manganelo scraper --- cum/db.py | 4 +- cum/scrapers/__init__.py | 3 + cum/scrapers/manganelo.py | 124 ++++++++++++++++++++++++++ tests/test_scraper_manganelo.py | 149 ++++++++++++++++++++++++++++++++ 4 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 cum/scrapers/manganelo.py create mode 100644 tests/test_scraper_manganelo.py diff --git a/cum/db.py b/cum/db.py index db1af2a..9b1bf66 100644 --- a/cum/db.py +++ b/cum/db.py @@ -248,7 +248,9 @@ def to_object(self): if parse.netloc in ('www.mangahere.cc', 'm.mangahere.cc'): from cum.scrapers.mangahere import MangahereChapter return MangahereChapter(**kwargs) - + if parse.netloc == 'manganelo.com': + from cum.scrapers.manganelo import ManganeloChapter + return ManganeloChapter(**kwargs) class Group(Base): __tablename__ = 'groups' diff --git a/cum/scrapers/__init__.py b/cum/scrapers/__init__.py index b7a68d9..4823bf9 100644 --- a/cum/scrapers/__init__.py +++ b/cum/scrapers/__init__.py @@ -2,6 +2,7 @@ from cum.scrapers.dynastyscans import DynastyScansChapter, DynastyScansSeries from cum.scrapers.madokami import MadokamiChapter, MadokamiSeries from cum.scrapers.mangadex import MangadexSeries, MangadexChapter +from cum.scrapers.manganelo import ManganeloSeries, ManganeloChapter from cum.scrapers.mangasee import MangaseeSeries, MangaseeChapter from cum.scrapers.mangahere import MangahereSeries, MangahereChapter from cum.scrapers.yuriism import YuriismChapter, YuriismSeries @@ -11,6 +12,7 @@ DynastyScansSeries, MadokamiSeries, MangadexSeries, + ManganeloSeries, MangaseeSeries, MangahereSeries, YuriismSeries, @@ -20,6 +22,7 @@ DynastyScansChapter, MadokamiChapter, MangadexChapter, + ManganeloChapter, MangaseeChapter, MangahereChapter, YuriismChapter, diff --git a/cum/scrapers/manganelo.py b/cum/scrapers/manganelo.py new file mode 100644 index 0000000..35f7ac9 --- /dev/null +++ b/cum/scrapers/manganelo.py @@ -0,0 +1,124 @@ +from bs4 import BeautifulSoup +from cum import config, exceptions, output +from cum.scrapers.base import BaseChapter, BaseSeries, download_pool +from functools import partial +from warnings import filterwarnings +import concurrent.futures +import json +import re +import requests + + +class ManganeloSeries(BaseSeries): + url_re = re.compile(r'https?://manganelo\.com/manga/.+') + + def __init__(self, url, **kwargs): + super().__init__(url, **kwargs) + filterwarnings(action = "ignore", message = "unclosed", category 
= ResourceWarning) + spage = requests.get(url) + if spage.status_code == 404: + raise exceptions.ScrapingError + self.soup = BeautifulSoup(spage.text, config.get().html_parser) + # 404 pages actually return HTTP 200 + if self.soup.find("title").text == "404 Not Found": + raise exceptions.ScrapingError + self.chapters = self.get_chapters() + + def get_chapters(self): + try: + rows = self.soup.find_all("li", class_="a-h") + except AttributeError: + raise exceptions.ScrapingError() + chapters = [] + for i, row in enumerate(rows): + chap_num = re.match(r"https?://manganelo\.com/chapter/.+/?chapter_([0-9\.]+)", + row.find("a")["href"]).groups()[0] + chap_url = row.find("a")["href"] + chap_name = row.find("a")["title"] + chap_date = row.find_all("span")[1]["title"] + result = ManganeloChapter(name=self.name, + alias=self.alias, + chapter=chap_num, + url=chap_url, + title=chap_name, + groups=[], + upload_date=chap_date) + chapters.append(result) + return chapters + + @property + def name(self): + try: + return re.match(r"(.+) Manga Online Free - Manganelo", + self.soup.find("title").text).groups()[0] + except AttributeError: + raise exceptions.ScrapingError + + +class ManganeloChapter(BaseChapter): + url_re = re.compile((r'https?://manganelo\.com/' + r'chapter/.+/chapter_[0-9\.]')) + upload_date = None + uses_pages = True + + # 404 pages actually return HTTP 200 + # thus this method override + def available(self): + if not getattr(self, "cpage", None): + self.cpage = requests.get(self.url) + if not getattr(self, "soup", None): + self.soup = BeautifulSoup(self.cpage.text, + config.get().html_parser) + return self.soup.find("title").text != "404 Not Found" + + def download(self): + if not getattr(self, "cpage", None): + self.cpage = requests.get(self.url) + if not getattr(self, "soup", None): + self.soup = BeautifulSoup(self.cpage.text, + config.get().html_parser) + + # 404 pages actually return HTTP 200 + if self.soup.find("title").text == "404 Not Found": + raise exceptions.ScrapingError + pages = [ image["src"] for image in self.soup.find("div", class_ = "container-chapter-reader").find_all("img") ] + + futures = [] + files = [None] * len(pages) + req_session = requests.Session() + with self.progress_bar(pages) as bar: + for i, page in enumerate(pages): + retries = 0 + while retries < 10: + try: + r = req_session.get(page, stream=True) + if r.status_code != 200: + output.warning('Failed to fetch page with status {}, retrying #{}' + .format(str(r.status_code), str(retries))) + retries += 1 + else: + break + except requests.exceptions.ConnectionError: + retries += 1 + if r.status_code != 200: + output.error('Failed to fetch page with status {}, giving up' + .format(str(r.status_code))) + raise ValueError + fut = download_pool.submit(self.page_download_task, i, r) + fut.add_done_callback(partial(self.page_download_finish, + bar, files)) + futures.append(fut) + concurrent.futures.wait(futures) + self.create_zip(files) + req_session.close() + + def from_url(url): + cpage = requests.get(url) + soup = BeautifulSoup(cpage.text, config.get().html_parser) + iname = re.match("https?://manganelo\.com/chapter/(.+)/chapter_[0-9\.]+", + url).groups()[0] + series = ManganeloSeries("https://manganelo.com/manga/" + iname) + for chapter in series.chapters: + if chapter.url == url: + return chapter + return None diff --git a/tests/test_scraper_manganelo.py b/tests/test_scraper_manganelo.py new file mode 100644 index 0000000..f28a69c --- /dev/null +++ b/tests/test_scraper_manganelo.py @@ -0,0 +1,149 @@ +from bs4 
import BeautifulSoup +from cum import config, exceptions +from nose.tools import nottest +from urllib.parse import urljoin +from warnings import filterwarnings +import cumtest +import os +import requests +import unittest +import zipfile + + +class TestManganelo(cumtest.CumTest): + MANGANELO_URL = 'https://manganelo.com/genre-all' + + def setUp(self): + super().setUp() + global manganelo + filterwarnings(action = "ignore", message = "unclosed", category = ResourceWarning) + from cum.scrapers import manganelo + + def tearDown(self): + self.directory.cleanup() + + def get_five_latest_releases(self): + r = requests.get(self.MANGANELO_URL) + soup = BeautifulSoup(r.text, config.get().html_parser) + chapters = soup.find_all("a", class_="genres-item-chap") + links = [x["href"] for x in chapters] + return links[:5] + + @nottest + def series_information_tester(self, data): + series = manganelo.ManganeloSeries(data['url']) + self.assertEqual(series.name, data['name']) + self.assertEqual(series.alias, data['alias']) + self.assertEqual(series.url, data['url']) + self.assertIs(series.directory, None) + self.assertEqual(len(series.chapters), len(data['chapters'])) + for chapter in series.chapters: + self.assertEqual(chapter.name, data['name']) + self.assertEqual(chapter.alias, data['alias']) + self.assertIn(chapter.chapter, data['chapters']) + data['chapters'].remove(chapter.chapter) + self.assertIs(chapter.directory, None) + self.assertEqual(len(data['chapters']), 0) + + # This test is disabled temporarily due to the architecture of + # the chapter.from_url method, which assumes that if a chapter + # exists then it will be listed on the series page. Manganelo + # seems to violate this assumption, in that there are chapters + # which are accessible from the "latest chapters" page but which + # are not listed on their respective series' pages, at least + # not immediately. 
+ # TODO: come back to this test and find a way to construct a + # chapter without requiring metadata from the series page + def _test_chapter_download_latest(self): + latest_releases = self.get_five_latest_releases() + for release in latest_releases: + try: + chapter = manganelo.ManganeloChapter.from_url(release) + except exceptions.ScrapingError as e: + print('scraping error for {} - {}'.format(release, e)) + continue + else: + chapter.get(use_db=False) + + def test_chapter_filename_decimal(self): + URL = "https://manganelo.com/chapter/citrus_saburo_uta/chapter_24.6" + chapter = manganelo.ManganeloChapter.from_url(URL) + path = os.path.join(self.directory.name, 'Citrus Saburo Uta', + 'Citrus Saburo Uta - c024 x6 [Unknown].zip') + self.assertEqual(chapter.chapter, '24.6') + self.assertEqual(chapter.filename, path) + + def test_chapter_information_normal(self): + URL = "https://manganelo.com/chapter/ramen_daisuki_koizumisan/chapter_18" + chapter = manganelo.ManganeloChapter.from_url(URL) + self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san') + self.assertTrue(chapter.available()) + self.assertEqual(chapter.chapter, '18') + self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-San') + self.assertEqual(chapter.title, 'Ramen Daisuki Koizumi-san Chapter 18') + path = os.path.join(self.directory.name, + 'Ramen Daisuki Koizumi-San', + 'Ramen Daisuki Koizumi-San - c018 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 8) + + def test_chapter_information_chapterzero(self): + URL = "https://manganelo.com/chapter/inu_to_hasami_wa_tsukaiyou/chapter_0" + chapter = manganelo.ManganeloChapter.from_url(URL) + self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou') + self.assertEqual(chapter.chapter, '0') + self.assertEqual(chapter.name, 'Inu To Hasami Wa Tsukaiyou') + self.assertEqual(chapter.title, 'Inu to Hasami wa Tsukaiyou Vol.1 Chapter 0') + path = os.path.join( + self.directory.name, 'Inu To Hasami Wa Tsukaiyou', + 'Inu To Hasami Wa Tsukaiyou - c000 [Unknown].zip') + self.assertEqual(chapter.filename, path) + chapter.download() + self.assertTrue(os.path.isfile(path)) + with zipfile.ZipFile(path) as chapter_zip: + files = chapter_zip.infolist() + self.assertEqual(len(files), 32) + + def test_series_invalid(self): + URL = "https://manganelo.com/manga/test_bad_manga_name" + with self.assertRaises(exceptions.ScrapingError): + series = manganelo.ManganeloSeries(url=URL) + + def test_chapter_unavailable(self): + URL = "https://manganelo.com/chapter/oyasumi_punpun/chapter_999" + chapter = manganelo.ManganeloChapter(url=URL) + self.assertFalse(chapter.available()) + + def test_series_oneword(self): + data = {'alias': 'aria', + 'chapters': ['1', '2', '3', '4', '5', '6', '7', '8', + '9', '10', '10.5', '11', '12', '13', '14', '15', + '16', '17', '18', '19', '20', '21', '22', '23', + '24', '25', '26', '27', '28', '29', '30', '30.5', + '31', '32', '33', '34', '35', '35.5', '36', + '37', '37.5', '38', '39', '40', '41', '42', '43', + '44', '45', '45.5', '46', '47', '48', '49', + '50', '50.5', '51', '52', '53', '54', '55', '56', + '57', '57.5', '58', '59', '60', '60.1'], + 'name': 'Aria', + 'url': 'https://manganelo.com/manga/aria'} + self.series_information_tester(data) + + def test_series_multiplewords(self): + data = {'alias': 'prunus-girl', + 'chapters': ['1', '1.5', '2', '3', '4', '5', '5.5', '6', '7', '8', + '9', '10', 
'11', '11.5', '12', '13', '14', '15', + '16', '16.5', '17', '18', '19', '20', '21', '22', + '23', '24', '25', '26', '27', '28', '29', '30', + '31', '32', '32.5', '33', '34', '35', '36', '37', + '38', '39', '40', '41', '42', '42.5'], + 'name': 'Prunus Girl', + 'url': 'https://manganelo.com/manga/prunus_girl'} + self.series_information_tester(data) + +if __name__ == '__main__': + unittest.main() From 6b7480ca6df4a6d7276d8811f4c22e52a479f6f7 Mon Sep 17 00:00:00 2001 From: matoro Date: Thu, 12 Mar 2020 07:46:04 -0400 Subject: [PATCH 12/19] scraper: check if chapter already exists in database Somehow I was still getting sqlalchemy.exc.IntegrityError exceptions thrown despite the explicit catch due to violating the unique url constraint. This only happened for Mangadex chapters. I still don't understand why, but this fixes it at least. --- cum/scrapers/base.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cum/scrapers/base.py b/cum/scrapers/base.py index b0d1866..58a4e24 100644 --- a/cum/scrapers/base.py +++ b/cum/scrapers/base.py @@ -346,15 +346,19 @@ def progress_bar(self, arg): def save(self, series, ignore=False): """Save a chapter to database.""" + # check if chapter already exists in database try: - c = db.Chapter(self, series) - except IntegrityError: - db.session.rollback() - else: - if ignore: - c.downloaded = -1 - db.session.add(c) + c = db.session.query(db.Chapter).filter_by(url=self.url).one() + except NoResultFound: try: - db.session.commit() + c = db.Chapter(self, series) except IntegrityError: db.session.rollback() + else: + if ignore: + c.downloaded = -1 + db.session.add(c) + try: + db.session.commit() + except IntegrityError: + db.session.rollback() From cb7ad097b74b18943101d6c9f509f6d64d114034 Mon Sep 17 00:00:00 2001 From: matoro Date: Thu, 12 Mar 2020 21:43:58 -0400 Subject: [PATCH 13/19] scraper: add retries for entire chapter --- cum/scrapers/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cum/scrapers/base.py b/cum/scrapers/base.py index 58a4e24..7e8aef0 100644 --- a/cum/scrapers/base.py +++ b/cum/scrapers/base.py @@ -259,7 +259,14 @@ def get(self, use_db=True): mark the chapter as downloaded if `db_remove` is set to False. """ if self.available(): - self.download() + self.retries = 3 + while self.retries > 0: + try: + self.download() + break + except requests.exceptions.ChunkedEncodingError: + output.warnings('Connection terminated, retry #{}'.format(str(3 - self.retries))) + self.retries = self.retries - 1 if use_db: self.mark_downloaded() elif use_db: From 695771c562572458ebf18c01a949ddbc4ec46347 Mon Sep 17 00:00:00 2001 From: matoro Date: Tue, 24 Mar 2020 21:04:22 -0400 Subject: [PATCH 14/19] scraper: add proper support for retrying pages note that this requires an optional change to be made to scrapers. if you want your scraper to support retrying, you must pass the url of the image to the worker payload so that it can be re-requested if necessary. if the worker fails to download an image and the scraper has not passed this parameter, an error message will be emitted and the old behavior will be used, i.e., crash everything. 
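as a rough sketch (the names below are illustrative, not lifted from any
one scraper), the opt-in is a single extra keyword argument when the open
page request is handed off to the download pool:

    # before: the worker only has the open response object
    r = requests.get(image, stream=True)
    fut = download_pool.submit(self.page_download_task, i, r)

    # after: the worker also knows the image url and can re-request it
    fut = download_pool.submit(self.page_download_task, i, r, page_url=image)

the mangadex scraper in this patch is updated in exactly this way and can
serve as a reference for other scrapers.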
--- cum/scrapers/base.py | 56 +++++++++++++++++++++++++--------------- cum/scrapers/mangadex.py | 9 ++++--- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/cum/scrapers/base.py b/cum/scrapers/base.py index 7e8aef0..b7bc448 100644 --- a/cum/scrapers/base.py +++ b/cum/scrapers/base.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod from concurrent.futures import ThreadPoolExecutor -from cum import config, db, output +from cum import config, db, exceptions, output from mimetypes import guess_extension from re import match, sub from sqlalchemy.exc import IntegrityError, SQLAlchemyError @@ -259,14 +259,7 @@ def get(self, use_db=True): mark the chapter as downloaded if `db_remove` is set to False. """ if self.available(): - self.retries = 3 - while self.retries > 0: - try: - self.download() - break - except requests.exceptions.ChunkedEncodingError: - output.warnings('Connection terminated, retry #{}'.format(str(3 - self.retries))) - self.retries = self.retries - 1 + self.download() if use_db: self.mark_downloaded() elif use_db: @@ -311,24 +304,45 @@ def page_download_finish(bar, files, fs): bar.update(1) @staticmethod - def page_download_task(page_num, r): + def page_download_task(page_num, r, page_url = None): """Saves the response body of a single request, returning the file handle and the passed through number of the page to allow for non- sequential downloads in parallel. """ ext = BaseChapter.guess_extension(r.headers.get('content-type')) f = NamedTemporaryFile(suffix=ext, delete=False) - try: - for chunk in r.iter_content(chunk_size=4096): - if chunk: - f.write(chunk) - # basically ignores this exception that requests throws. my - # understanding is that it is raised when you attempt to iter_content() - # over the same content twice. don't understand how that situation - # arises with the current code but it did somehow. - # https://stackoverflow.com/questions/45379903/ - except requests.exceptions.StreamConsumedError: - pass + retries = 20 + while retries > 0: + try: + for chunk in r.iter_content(chunk_size=4096): + if chunk: + f.write(chunk) + retries = 0 + # basically ignores this exception that requests throws. my + # understanding is that it is raised when you attempt to iter_content() + # over the same content twice. don't understand how that situation + # arises with the current code but it did somehow. + # https://stackoverflow.com/questions/45379903/ + except requests.exceptions.StreamConsumedError: + pass + # when under heavy load, Mangadex will often kill the connection in + # the middle of an image download. in the original architecture, + # the requests are all opened in the scrapers in stream mode, then + # the actual image payloads are downloaded in the asynchronous + # callbacks. when this occurs we have not choice but to re-request + # the image from the beginning (easier than playing around with range + # headers). this means each thread may issue multiple new requests. + # I have found the performance overhead to be mostly negligible. 
+ except requests.exceptions.ChunkedEncodingError: + if not page_url: + output.error("Connection killed on page {} but scraper does not support retries".format(str(page_num))) + raise exceptions.ScrapingError + output.warning("Connection killed on page {}, {} retries remaining".format(str(page_num), str(retries))) + retries = retries - 1 + if retries <= 0: + output.error("Connection killed on page {}, no retries remaining - aborting chapter".format(str(page_num))) + raise exceptions.ScrapingError + r = requests.get(page_url, stream = True) f.flush() f.close() r.close() diff --git a/cum/scrapers/mangadex.py b/cum/scrapers/mangadex.py index 1d7c0c7..f2140c4 100644 --- a/cum/scrapers/mangadex.py +++ b/cum/scrapers/mangadex.py @@ -124,13 +124,14 @@ def download(self): if guess_type(page)[0]: image = server + chapter_hash + '/' + page else: - print('Unkown image type for url {}'.format(page)) - raise ValueError + print('Unknown image type for url {}'.format(page)) + raise exceptions.ScrapingError r = requests.get(image, stream=True) if r.status_code == 404: r.close() - raise ValueError - fut = download_pool.submit(self.page_download_task, i, r) + raise exceptions.ScrapingError + fut = download_pool.submit(self.page_download_task, + i, r, page_url = image) fut.add_done_callback(partial(self.page_download_finish, bar, files)) futures.append(fut) From fb53267e84055a2bbd7bfcd4b13936736388ce7c Mon Sep 17 00:00:00 2001 From: matoro Date: Tue, 24 Mar 2020 21:41:58 -0400 Subject: [PATCH 15/19] mangadex: add resiliency to initial image request --- cum/scrapers/mangadex.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cum/scrapers/mangadex.py b/cum/scrapers/mangadex.py index f2140c4..0bdbc18 100644 --- a/cum/scrapers/mangadex.py +++ b/cum/scrapers/mangadex.py @@ -126,7 +126,18 @@ def download(self): else: print('Unknown image type for url {}'.format(page)) raise exceptions.ScrapingError - r = requests.get(image, stream=True) + retries = 3 + r = None + while retries > 0: + try: + r = requests.get(image, stream=True) + break + except requests.exceptions.ConnectionError: + output.warning("Initial request for page {} failed, {} retries remaining".format(str(i), str(retries))) + retries = retries - 1 + if not r: + output.error("Failed to request page {}".format(str(i))) + raise exceptions.ScrapingError if r.status_code == 404: r.close() raise exceptions.ScrapingError From d4ce9b42489e5b58773f508df82611cdef0a567e Mon Sep 17 00:00:00 2001 From: matoro Date: Sun, 5 Apr 2020 11:55:24 -0500 Subject: [PATCH 16/19] mangahere: overhaul for new anti-scraping system Mangahere has removed its legacy mobile interface that made for easy scraping; they are now protected by CloudFlare Bot Management (the desktop version already was). In my testing, the heuristic measures implemented here have managed to reliably bypass the protection. However, since their anti-bot measures are now heuristic-based, there is no guarantee that it will work for every host from every network location. Feedback is appreciated. 
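For reference, the heuristic measures amount to sending a static set of
Chrome-like request headers with the page and AJAX requests, plus an
isAdult cookie to skip the adult-content warning.  A minimal sketch of the
idea (the full header set lives at the top of cum/scrapers/mangahere.py in
the diff below):

    import requests

    chrome_headers = {
        "accept-language": "en-US,en;q=0.9",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
    }
    page = requests.get(url, headers=chrome_headers, cookies={"isAdult": "1"})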
--- cum/scrapers/mangahere.py | 145 +++++++++++++++++++++++++------- setup.py | 3 +- tests/test_scraper_mangahere.py | 44 +++++----- 3 files changed, 139 insertions(+), 53 deletions(-) diff --git a/cum/scrapers/mangahere.py b/cum/scrapers/mangahere.py index e40701b..b76c0f9 100644 --- a/cum/scrapers/mangahere.py +++ b/cum/scrapers/mangahere.py @@ -2,10 +2,28 @@ from cum import config, exceptions, output from cum.scrapers.base import BaseChapter, BaseSeries, download_pool from functools import partial +from jsbeautifier import beautify +from json import loads import concurrent.futures import re import requests +# as of 2020/04/04, the old mobile interface which allowed easy scraping +# has been removed, and mobile now copies desktop which is protected +# by Cloudflare's Bot Management +# https://www.cloudflare.com/products/bot-management/ +# in my personal testing, the following heuristic headers do reliably bypass +# it, at least to the extent necessary to hit their new progressive page load system +# however, because it is a heuristic-based system, there is no guarantee +# that just because it works from one machine and/or network location +# that it will work for others. +# feedback is appreciated. +chrome_headers = { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "accept-language": "en-US,en;q=0.9", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36" + } class MangahereSeries(BaseSeries): url_re = re.compile(r'https?://((www|m)\.)?mangahere\.cc/manga/.+') @@ -14,7 +32,7 @@ def __init__(self, url, **kwargs): super().__init__(url, **kwargs) # convert desktop link to mobile # bypasses adult content warning js - spage = requests.get(url.replace("www.", "m.")) + spage = requests.get(url.replace("www.", "m."), cookies = { "isAdult": "1" }) if spage.status_code == 404: raise exceptions.ScrapingError self.soup = BeautifulSoup(spage.text, config.get().html_parser) @@ -22,16 +40,18 @@ def __init__(self, url, **kwargs): def get_chapters(self): try: - rows = self.soup.find("div", class_="manga-chapters")\ - .find("ul").find_all("li") + # broken 2020/04/04 + # rows = self.soup.find("div", class_="manga-chapters")\ + # .find("ul").find_all("li") + rows = self.soup.find("div", class_="detail-chapters-list")\ + .find_all("a") except AttributeError: raise exceptions.ScrapingError() chapters = [] for i, row in enumerate(rows): - chap_num = re.match((r"//m\.mangahere\.cc" - r"/manga/[^/]+((/v[0-9]+)?" - r"/c[0-9\.]+)/?$"), - row.find("a")["href"]).groups()[0]\ + chap_num = re.match((r"/manga/[^/]+(?:(?:/v[0-9]+)?" 
+ r"/c([0-9\.]+))/[0-9]+\.html$"), + row.get("href")).groups()[0]\ .replace("/", "") if "v" in chap_num: chap_num = chap_num.replace("v", "").replace("c", ".") @@ -42,10 +62,9 @@ def get_chapters(self): else: chap_num = chap_num.lstrip("0") # convert mobile link to desktop - chap_url = "https:" + row.find("a")["href"]\ - .replace("/roll_manga/", "/manga/")\ - .replace("m.", "www.") - chap_name = row.text + chap_url = "https://www.mangahere.cc" + row.get("href")\ + .replace("/roll_manga/", "/manga/") + chap_name = row.text.strip() result = MangahereChapter(name=self.name, alias=self.alias, chapter=chap_num, @@ -70,50 +89,109 @@ class MangahereChapter(BaseChapter): upload_date = None uses_pages = True + def _request_pages(self, mid, cid, pages): + base_url = re.search(r"(.+/)[0-9]\.html", self.url.replace("www.", "m.")).groups()[0] + data_url = base_url + "chapterfun.ashx?cid=" + str(cid) + "&page=" + str(len(pages) + 1) + "&key=" + chrome_headers["accept"] = "*/*" + chrome_headers["referer"] = self.url.replace("www.", "m.") + chrome_headers["x-requested-with"] = "XMLHttpRequest" + data = self.session.get(data_url, headers = chrome_headers) + if data.text == "": + raise cum.exceptions.ScrapingError + try: + data_clean = beautify(data.text) + if not getattr(self, "pvalue", None): + self.pvalue = "https:" + re.search(r"pvalue\[i\] = \"(.+)\" \+ pvalue\[i\];", data_clean).groups()[0] + # formatted_chap_num = re.search(r".+/c([0-9\.]+)/[0-9]\.html", self.url).groups()[0] + # if "." not in formatted_chap_num: + # formatted_chap_num += ".0" + for page in loads(re.search("var pvalue = (.+);", data_clean).groups()[0]): + full_page = self.pvalue + page + if full_page not in pages: + pages.append(full_page) + except Exception: + raise exceptions.ScrapingError + return pages + def download(self): + + self.session = requests.Session() + if not getattr(self, "cpage", None): - self.cpage = requests.get(self.url.replace("www.", "m.") - .replace("/manga/", "/roll_manga/")) + self.cpage = self.session.get(self.url.replace("www.", "m."), headers = chrome_headers) if self.cpage.status_code == 404: raise exceptions.ScrapingError + if not getattr(self, "soup", None): self.soup = BeautifulSoup(self.cpage.text, config.get().html_parser) - image_list = self.soup.find("div", class_="mangaread-img")\ - .find_all("img") + # broken 2020/04/04 + # image_list = self.soup.find("div", class_="mangaread-img")\ + # .find_all("img") + # pages = [] + # for image in image_list: + # pages.append(image["data-original"].replace("http://", "https://")) + pages = [] - for image in image_list: - pages.append(image["data-original"].replace("http://", "https://")) + (mid, cid) = (None, None) + # index of script with ids may vary + for f in [ 5, 6 ]: + try: + mid = re.search("var comicid = ([0-9]+)", self.soup.find_all("script")[f].text).groups()[0] + cid = re.search("var chapterid =([0-9]+)", self.soup.find_all("script")[f].text).groups()[0] + except AttributeError: + pass + if mid and cid: + old_num_pages = -1 + while old_num_pages != len(pages): + old_num_pages = len(pages) + pages = self._request_pages(mid, cid, pages) + else: + # some titles (seems to be ones with low page counts like webtoons) + # don't use progressively-loaded pages. 
for these, the image list + # can be extracted directly off the main page + pages = loads(re.search("var newImgs = (.+);var newImginfos", beautify(self.soup.find_all("script")[6].text).replace("\\", "").replace("'", "\"")).groups()[0]) + for i, page in enumerate(pages): + pages[i] = "https:" + page futures = [] files = [None] * len(pages) - req_session = requests.Session() with self.progress_bar(pages) as bar: for i, page in enumerate(pages): retries = 0 while retries < 10: try: - r = req_session.get(page, stream=True) + r = self.session.get(page, stream=True) break except requests.exceptions.ConnectionError: retries += 1 - if r.status_code != 200: - r.close() - output.error("Page download got status code {}".format(str(r.status_code))) - raise ValueError - fut = download_pool.submit(self.page_download_task, i, r) - fut.add_done_callback(partial(self.page_download_finish, - bar, files)) - futures.append(fut) + # end of chapter detection in the web ui is done by issuing requests + # for nonexistent pages which return 404s (who comes up with this) + if r.status_code != 404: + if r.status_code != 200: + r.close() + output.error("Page download got status code {}".format(str(r.status_code))) + raise exceptions.ScrapingError + fut = download_pool.submit(self.page_download_task, i, r) + fut.add_done_callback(partial(self.page_download_finish, + bar, files)) + futures.append(fut) + else: + try: + del files[i] + except IndexError: + self.session.close() + raise exceptions.ScrapingError concurrent.futures.wait(futures) self.create_zip(files) + self.session.close() def from_url(url): - chap_num = re.match((r"https?://((www|m)\.)?mangahere\.cc/(roll_)?" - r"manga/[^/]+((/v[0-9]+)?/c[0-9\.]+)" + chap_num = re.match((r"https?://(?:(?:www|m)\.)?mangahere\.cc/(?:roll_)?" 
+ r"manga/[^/]+(?:(?:/v[0-9]+)?/c([0-9\.]+))" r"/[0-9]+\.html"), url)\ - .groups()[3].replace("/", "") + .groups()[0] if "v" in chap_num: chap_num = chap_num.replace("v", "").replace("c", ".") else: @@ -131,3 +209,10 @@ def from_url(url): if chapter.chapter == str(chap_num): return chapter return None + + def available(self): + if not getattr(self, "cpage", None): + self.cpage = requests.get(self.url.replace("www.", "m.")) + if not getattr(self, "soup", None): + self.soup = BeautifulSoup(self.cpage.text, config.get().html_parser) + return self.soup.find("title").text != "Error - MangaHere Mobile" diff --git a/setup.py b/setup.py index f254766..9f4cbfb 100644 --- a/setup.py +++ b/setup.py @@ -119,7 +119,8 @@ def write_version_file(): 'Click', 'natsort', 'requests', - 'SQLAlchemy' + 'SQLAlchemy', + 'jsbeautifier' ], extras_require={ 'testing': ['codecov', 'cov-core', 'nose2', 'pycodestyle'] diff --git a/tests/test_scraper_mangahere.py b/tests/test_scraper_mangahere.py index e3bcb14..dc11bf9 100644 --- a/tests/test_scraper_mangahere.py +++ b/tests/test_scraper_mangahere.py @@ -80,11 +80,11 @@ def test_chapter_information_normal(self): self.assertEqual(chapter.alias, 'ramen-daisuki-koizumi-san') self.assertTrue(chapter.available()) self.assertEqual(chapter.chapter, '18') - self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi-san') - self.assertEqual(chapter.title, 'C.18') + self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi san') + self.assertEqual(chapter.title, 'Ch.018') path = os.path.join(self.directory.name, - 'Ramen Daisuki Koizumi-san', - 'Ramen Daisuki Koizumi-san - c018 [Unknown].zip') + 'Ramen Daisuki Koizumi san', + 'Ramen Daisuki Koizumi san - c018 [Unknown].zip') self.assertEqual(chapter.filename, path) chapter.download() self.assertTrue(os.path.isfile(path)) @@ -100,7 +100,7 @@ def test_chapter_information_multidigit(self): self.assertTrue(chapter.available()) self.assertEqual(chapter.chapter, '192') self.assertEqual(chapter.name, 'Tsurezure Children') - self.assertEqual(chapter.title, 'C.192') + self.assertEqual(chapter.title, 'Ch.192') path = os.path.join(self.directory.name, 'Tsurezure Children', 'Tsurezure Children - c192 [Unknown].zip') @@ -118,7 +118,7 @@ def test_chapter_information_chapterzero(self): self.assertEqual(chapter.alias, 'inu-to-hasami-wa-tsukaiyou') self.assertEqual(chapter.chapter, '0') self.assertEqual(chapter.name, 'Inu to Hasami wa Tsukaiyou') - self.assertEqual(chapter.title, 'C.0') + self.assertEqual(chapter.title, 'Ch.000') path = os.path.join( self.directory.name, 'Inu to Hasami wa Tsukaiyou', 'Inu to Hasami wa Tsukaiyou - c000 [Unknown].zip') @@ -134,12 +134,12 @@ def test_chapter_information_volume(self): "full_metal_alchemist/v026/c107/1.html" chapter = mangahere.MangahereChapter.from_url(URL) self.assertEqual(chapter.alias, 'full-metal-alchemist') - self.assertEqual(chapter.chapter, '26.107') + self.assertEqual(chapter.chapter, '107') self.assertEqual(chapter.name, 'Full Metal Alchemist') - self.assertEqual(chapter.title, 'V.26 C.107') + self.assertEqual(chapter.title, 'Vol.026 Ch.107 - The Final Battle') path = os.path.join( self.directory.name, 'Full Metal Alchemist', - 'Full Metal Alchemist - c026 x107 [Unknown].zip') + 'Full Metal Alchemist - c107 [Unknown].zip') self.assertEqual(chapter.filename, path) chapter.download() self.assertTrue(os.path.isfile(path)) @@ -152,12 +152,12 @@ def test_chapter_information_volume_decimal(self): "ai_yori_aoshi/v16/c133.5/1.html" chapter = mangahere.MangahereChapter.from_url(URL) 
self.assertEqual(chapter.alias, 'ai-yori-aoshi') - self.assertEqual(chapter.chapter, '16.133.5') + self.assertEqual(chapter.chapter, '133.5') self.assertEqual(chapter.name, 'Ai Yori Aoshi') - self.assertEqual(chapter.title, 'V.16 C.133.5') + self.assertEqual(chapter.title, 'Vol.16 Ch.133.5 - Special Chapter - Hanakotoba - Language of Flower') path = os.path.join( self.directory.name, 'Ai Yori Aoshi', - 'Ai Yori Aoshi - c016 x133.5 [Unknown].zip') + 'Ai Yori Aoshi - c133 x5 [Unknown].zip') self.assertEqual(chapter.filename, path) chapter.download() self.assertTrue(os.path.isfile(path)) @@ -202,16 +202,16 @@ def test_series_flatchapters(self): def test_series_volumes(self): data = {'alias': 'prunus-girl', - 'chapters': ['1.001', '1.001.5', '1.002', '1.003', '1.004', - '1.005', '1.005.5', '1.006', '1.007', '1.008', - '1.009', '1.010', '1.011', '1.011.5', '2.012', - '2.013', '2.014', '2.015', '3.014', '3.015', - '3.016', '3.017', '3.018', '3.019', '3.020', - '3.021', '3.022', '3.023', '3.024', '3.025', - '3.026', '3.027', '5.028', '5.029', '5.030', - '5.031', '5.032', '5.032.5', '5.033', '5.034', - '5.035', '5.036', '5.037', '5.038', '5.039', - '5.040', '5.041', '5.042', '5.042.5'], + 'chapters': ['1', '1.5', '2', '3', '4', + '5', '5.5', '6', '7', '8', + '9', '10', '11', '11.5', '12', + '13', '14', '15', '14', '15', + '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', + '26', '27', '28', '29', '30', + '31', '32', '32.5', '33', '34', + '35', '36', '37', '38', '39', + '40', '41', '42', '42.5'], 'name': 'Prunus Girl', 'url': 'https://www.mangahere.cc/manga/prunus_girl'} self.series_information_tester(data) From 0c42254d7a232d11b914211bf689db83cda63052 Mon Sep 17 00:00:00 2001 From: matoro Date: Tue, 21 Apr 2020 14:39:33 -0500 Subject: [PATCH 17/19] mangahere: mobile site changed again, move completely to desktop site also added more flexibility as ad scripts are added/removed from the site --- cum/scrapers/mangahere.py | 31 ++++++++++++++++++------------- tests/test_scraper_mangahere.py | 4 ++-- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/cum/scrapers/mangahere.py b/cum/scrapers/mangahere.py index b76c0f9..485dfcb 100644 --- a/cum/scrapers/mangahere.py +++ b/cum/scrapers/mangahere.py @@ -32,7 +32,7 @@ def __init__(self, url, **kwargs): super().__init__(url, **kwargs) # convert desktop link to mobile # bypasses adult content warning js - spage = requests.get(url.replace("www.", "m."), cookies = { "isAdult": "1" }) + spage = requests.get(url.replace("m.", "www."), cookies = { "isAdult": "1" }) if spage.status_code == 404: raise exceptions.ScrapingError self.soup = BeautifulSoup(spage.text, config.get().html_parser) @@ -43,7 +43,7 @@ def get_chapters(self): # broken 2020/04/04 # rows = self.soup.find("div", class_="manga-chapters")\ # .find("ul").find_all("li") - rows = self.soup.find("div", class_="detail-chapters-list")\ + rows = self.soup.find("ul", class_="detail-main-list")\ .find_all("a") except AttributeError: raise exceptions.ScrapingError() @@ -64,7 +64,7 @@ def get_chapters(self): # convert mobile link to desktop chap_url = "https://www.mangahere.cc" + row.get("href")\ .replace("/roll_manga/", "/manga/") - chap_name = row.text.strip() + chap_name = row.find("p").text result = MangahereChapter(name=self.name, alias=self.alias, chapter=chap_num, @@ -77,7 +77,7 @@ def get_chapters(self): @property def name(self): try: - return re.match(r"(.+) - MangaHere Mobile$", + return re.match(r"(.+) Manga - Read .+ Online at MangaHere", 
self.soup.find("title").text).groups()[0] except AttributeError: raise exceptions.ScrapingError @@ -90,10 +90,10 @@ class MangahereChapter(BaseChapter): uses_pages = True def _request_pages(self, mid, cid, pages): - base_url = re.search(r"(.+/)[0-9]\.html", self.url.replace("www.", "m.")).groups()[0] + base_url = re.search(r"(.+/)[0-9]\.html", self.url.replace("m.", "www.")).groups()[0] data_url = base_url + "chapterfun.ashx?cid=" + str(cid) + "&page=" + str(len(pages) + 1) + "&key=" chrome_headers["accept"] = "*/*" - chrome_headers["referer"] = self.url.replace("www.", "m.") + chrome_headers["referer"] = self.url.replace("m.", "www.") chrome_headers["x-requested-with"] = "XMLHttpRequest" data = self.session.get(data_url, headers = chrome_headers) if data.text == "": @@ -118,7 +118,7 @@ def download(self): self.session = requests.Session() if not getattr(self, "cpage", None): - self.cpage = self.session.get(self.url.replace("www.", "m."), headers = chrome_headers) + self.cpage = self.session.get(self.url.replace("m.", "www."), headers = chrome_headers) if self.cpage.status_code == 404: raise exceptions.ScrapingError @@ -136,7 +136,8 @@ def download(self): pages = [] (mid, cid) = (None, None) # index of script with ids may vary - for f in [ 5, 6 ]: + # it may also change as ads are added/removed from the site + for f in range(0, len(self.soup.find_all("script"))): try: mid = re.search("var comicid = ([0-9]+)", self.soup.find_all("script")[f].text).groups()[0] cid = re.search("var chapterid =([0-9]+)", self.soup.find_all("script")[f].text).groups()[0] @@ -151,7 +152,13 @@ def download(self): # some titles (seems to be ones with low page counts like webtoons) # don't use progressively-loaded pages. for these, the image list # can be extracted directly off the main page - pages = loads(re.search("var newImgs = (.+);var newImginfos", beautify(self.soup.find_all("script")[6].text).replace("\\", "").replace("'", "\"")).groups()[0]) + for g in range(0, len(self.soup.find_all("script"))): + try: + pages = loads(re.search("var newImgs = (.+);var newImginfos", beautify(self.soup.find_all("script")[g].text).replace("\\", "").replace("'", "\"")).groups()[0]) + except AttributeError: + pass + if not len(pages): + raise ScrapingError for i, page in enumerate(pages): pages[i] = "https:" + page @@ -212,7 +219,5 @@ def from_url(url): def available(self): if not getattr(self, "cpage", None): - self.cpage = requests.get(self.url.replace("www.", "m.")) - if not getattr(self, "soup", None): - self.soup = BeautifulSoup(self.cpage.text, config.get().html_parser) - return self.soup.find("title").text != "Error - MangaHere Mobile" + self.cpage = requests.get(self.url.replace("m.", "www.")) + return self.cpage.status_code == 200 diff --git a/tests/test_scraper_mangahere.py b/tests/test_scraper_mangahere.py index dc11bf9..f32cbf4 100644 --- a/tests/test_scraper_mangahere.py +++ b/tests/test_scraper_mangahere.py @@ -81,7 +81,7 @@ def test_chapter_information_normal(self): self.assertTrue(chapter.available()) self.assertEqual(chapter.chapter, '18') self.assertEqual(chapter.name, 'Ramen Daisuki Koizumi san') - self.assertEqual(chapter.title, 'Ch.018') + self.assertEqual(chapter.title, 'Ch.018 - Eighteenth Bowl: Strange-flavored Ramen') path = os.path.join(self.directory.name, 'Ramen Daisuki Koizumi san', 'Ramen Daisuki Koizumi san - c018 [Unknown].zip') @@ -100,7 +100,7 @@ def test_chapter_information_multidigit(self): self.assertTrue(chapter.available()) self.assertEqual(chapter.chapter, '192') 
self.assertEqual(chapter.name, 'Tsurezure Children') - self.assertEqual(chapter.title, 'Ch.192') + self.assertEqual(chapter.title, 'Ch.192 - There\'s Nothing Tying Us Together (Shibasaki/Ubukata)') path = os.path.join(self.directory.name, 'Tsurezure Children', 'Tsurezure Children - c192 [Unknown].zip') From 2c3fcb8895061746994e30d42993357aeb61760c Mon Sep 17 00:00:00 2001 From: matoro Date: Wed, 22 Apr 2020 10:19:53 -0500 Subject: [PATCH 18/19] mangasee: move from elem.text to elem.contents I don't know if something changed in BeautifulSoup to cause this, but this fixes the issue --- cum/scrapers/mangasee.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cum/scrapers/mangasee.py b/cum/scrapers/mangasee.py index 0dfc7da..60e2d73 100644 --- a/cum/scrapers/mangasee.py +++ b/cum/scrapers/mangasee.py @@ -90,8 +90,8 @@ def download(self): config.get().html_parser) for script in self.soup.find_all("script"): - if re.match("\n\tChapterArr=.+", script.text): - image_list = script.text + if len(script.contents) and re.match("\n\tChapterArr=.+", script.contents[0]): + image_list = script.contents[0] continue image_list = re.sub("\n\tChapterArr=", "", image_list) @@ -127,13 +127,14 @@ def download(self): if r.status_code != 200: output.error('Failed to fetch page with status {}, giving up' .format(str(r.status_code))) - raise ValueError + raise exceptions.ScrapingError fut = download_pool.submit(self.page_download_task, i, r) fut.add_done_callback(partial(self.page_download_finish, bar, files)) futures.append(fut) concurrent.futures.wait(futures) self.create_zip(files) + req_session.close() def from_url(url): cpage = requests.get(url) From 81bbd5b8c3225f1c4e829b2143556f41a7ef6e96 Mon Sep 17 00:00:00 2001 From: matoro Date: Wed, 6 May 2020 20:37:18 -0500 Subject: [PATCH 19/19] mangahere: update to use .contents instead of .text also, update scraper --- cum/scrapers/mangahere.py | 19 +++++++++++++++---- tests/test_scraper_mangahere.py | 10 +++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/cum/scrapers/mangahere.py b/cum/scrapers/mangahere.py index 485dfcb..07e0073 100644 --- a/cum/scrapers/mangahere.py +++ b/cum/scrapers/mangahere.py @@ -77,8 +77,18 @@ def get_chapters(self): @property def name(self): try: - return re.match(r"(.+) Manga - Read .+ Online at MangaHere", + # so I'm not sure if this is an anti-scraping measure or not, but + # sometimes the name of the series returned in the raw page text + # has a space as the first character. my measurements put it + # occurring ~30% of the time. 
if that's the case, then we need
+            # to replace the first letter with the capitalized first letter
+            # of the series name from the url
+            tentative_name = re.match(r"(.+) Manga - Read .+ Online at MangaHere",
                             self.soup.find("title").text).groups()[0]
+            if tentative_name.startswith(" "):
+                first_letter = self.url.replace("m.", "www.")[31].upper()
+                tentative_name = first_letter + tentative_name[1:]
+            return tentative_name
         except AttributeError:
             raise exceptions.ScrapingError
 
@@ -139,8 +149,9 @@ def download(self):
         # it may also change as ads are added/removed from the site
         for f in range(0, len(self.soup.find_all("script"))):
             try:
-                mid = re.search("var comicid = ([0-9]+)", self.soup.find_all("script")[f].text).groups()[0]
-                cid = re.search("var chapterid =([0-9]+)", self.soup.find_all("script")[f].text).groups()[0]
+                if len(self.soup.find_all("script")[f].contents):
+                    mid = re.search("var comicid = ([0-9]+)", self.soup.find_all("script")[f].contents[0]).groups()[0]
+                    cid = re.search("var chapterid =([0-9]+)", self.soup.find_all("script")[f].contents[0]).groups()[0]
             except AttributeError:
                 pass
             if mid and cid:
@@ -158,7 +169,7 @@ def download(self):
                 except AttributeError:
                     pass
             if not len(pages):
-                raise ScrapingError
+                raise exceptions.ScrapingError
         for i, page in enumerate(pages):
             pages[i] = "https:" + page
 
diff --git a/tests/test_scraper_mangahere.py b/tests/test_scraper_mangahere.py
index f32cbf4..cba8dfd 100644
--- a/tests/test_scraper_mangahere.py
+++ b/tests/test_scraper_mangahere.py
@@ -90,7 +90,7 @@ def test_chapter_information_normal(self):
         self.assertTrue(os.path.isfile(path))
         with zipfile.ZipFile(path) as chapter_zip:
             files = chapter_zip.infolist()
-            self.assertEqual(len(files), 8)
+            self.assertEqual(len(files), 9)
 
     def test_chapter_information_multidigit(self):
         URL = "https://www.mangahere.cc/manga/" + \
@@ -109,7 +109,7 @@ def test_chapter_information_multidigit(self):
         self.assertTrue(os.path.isfile(path))
         with zipfile.ZipFile(path) as chapter_zip:
             files = chapter_zip.infolist()
-            self.assertEqual(len(files), 6)
+            self.assertEqual(len(files), 7)
 
     def test_chapter_information_chapterzero(self):
         URL = "https://www.mangahere.cc/manga/" + \
@@ -127,7 +127,7 @@ def test_chapter_information_chapterzero(self):
         self.assertTrue(os.path.isfile(path))
         with zipfile.ZipFile(path) as chapter_zip:
             files = chapter_zip.infolist()
-            self.assertEqual(len(files), 32)
+            self.assertEqual(len(files), 33)
 
     def test_chapter_information_volume(self):
         URL = "https://www.mangahere.cc/manga/" + \
@@ -145,7 +145,7 @@ def test_chapter_information_volume(self):
         self.assertTrue(os.path.isfile(path))
         with zipfile.ZipFile(path) as chapter_zip:
             files = chapter_zip.infolist()
-            self.assertEqual(len(files), 69)
+            self.assertEqual(len(files), 70)
 
     def test_chapter_information_volume_decimal(self):
         URL = "https://www.mangahere.cc/manga/" + \
@@ -163,7 +163,7 @@ def test_chapter_information_volume_decimal(self):
         self.assertTrue(os.path.isfile(path))
         with zipfile.ZipFile(path) as chapter_zip:
             files = chapter_zip.infolist()
-            self.assertEqual(len(files), 14)
+            self.assertEqual(len(files), 15)
 
     def test_series_invalid(self):
         URL = "https://www.mangahere.cc/manga/not_a_manga"