From 774b5f979e4e7a4fb6c245f641cf8a42582a9bff Mon Sep 17 00:00:00 2001
From: erhannis
Date: Thu, 29 Jan 2026 03:55:49 -0500
Subject: [PATCH 1/6] Added url scraping to webtoons. It works, but some of
 the design decisions feel questionable.

---
 dosagelib/director.py         |  2 +-
 dosagelib/plugins/webtoons.py | 21 +++++++++++++++++++++
 dosagelib/scraper.py          | 23 +++++++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/dosagelib/director.py b/dosagelib/director.py
index 7402650c83..96f5606b1e 100644
--- a/dosagelib/director.py
+++ b/dosagelib/director.py
@@ -210,7 +210,7 @@ def getScrapers(comics: Collection[str], basepath: str, adult=True, listing=Fals
                 # make the following command work:
                 # find Comics -type d | xargs -n1 -P10 dosage -b Comics
                 comic = comic[len(basepath) + 1:].lstrip(os.sep)
-            if ':' in comic:
+            if ':' in comic and not (comic.startswith("http:") or comic.startswith("https:")):
                 name, index = comic.split(':', 1)
                 indexes = index.split(',')
             else:
diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py
index c62726e959..3a8a600ef6 100644
--- a/dosagelib/plugins/webtoons.py
+++ b/dosagelib/plugins/webtoons.py
@@ -47,6 +47,27 @@ def namer(self, imageUrl, pageUrl):
         imageExt = pageUrl.rsplit('.', 1)[-1].split('?', 1)[0]
         return "%s-%03d.%s" % (episodeNum, imageNum, imageExt)
 
+    #DUMMY Redundant
+    @classmethod
+    def handlesurl(cls, url) -> bool:
+        import re
+        m = re.match(r"^http.*webtoons.com/.+?/(.+?/.+?)/.+title_no=(\d+)", url.lower())
+        if m != None:
+            path = m.group(1)
+            number = m.group(2)
+            return True
+        return False
+
+    @classmethod
+    def handleurl(cls, url) -> list[ParserScraper]:
+        import re
+        m = re.match(r"^http.*webtoons.com/.+?/(.+?/.+?)/.+title_no=(\d+)", url.lower())
+        if m != None:
+            path = m.group(1)
+            number = m.group(2)
+            return [cls(path, path, number)]
+        return []
+
     @classmethod
     def getmodules(cls):
         return (
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 648ff5fdb9..b30c3a9792 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -95,6 +95,14 @@ class Scraper:
     # HTTP session for configuration & cookies
     session: http.Session = http.default_session
 
+    @classmethod
+    def handlesurl(cls, url) -> bool:
+        return False
+
+    @classmethod
+    def handleurl(cls, url) -> list[Scraper]:
+        return []
+
     @classmethod
     def getmodules(cls) -> Collection[Scraper]:
         if cls.url is None:
@@ -537,6 +545,7 @@ class Cache:
     """
     def __init__(self) -> None:
         self.data: List[Scraper] = []
+        self.scrapers: List[Scraper] = []
         self.userdirs: set[pathlib.Path] = set()
 
     def find(self, comic: str) -> Scraper:
@@ -547,6 +556,12 @@ def find(self, comic: str) -> Scraper:
         if not comic:
             raise ValueError("empty comic name")
         candidates = []
+
+        for scraper in self.scrapers:
+            if scraper.handlesurl(comic):
+                candidates.extend(scraper.handleurl(comic))
+        #THINK Maybe just return?
+
         cname = comic.lower()
         for scraper in self.all(include_removed=True):
             lname = scraper.name.lower()
@@ -600,6 +615,7 @@ def addmodule(self, module) -> int:
         classes = 0
         for plugin in loader.get_module_plugins(module, Scraper):
             classes += 1
+            self.scrapers.append(plugin)
             self.data.extend(plugin.getmodules())
         return classes
 
@@ -615,6 +631,13 @@ def all(self, include_removed=False) -> list[Scraper]:
         else:
             return [x for x in self.data if x.url]
 
+    def getbyurl(self, url) -> list[Scraper]:
+        res = []
+        for plugin in self.scrapers:
+            if plugin.handlesurl(url):
+                res.extend(plugin.handleurl(url))
+        return res
+
     def validate(self) -> None:
         """Check for duplicate scraper names."""
         d: Dict[str, Scraper] = {}

From b979274061306c697e96e483a548e48c6e0fbf67 Mon Sep 17 00:00:00 2001
From: erhannis
Date: Thu, 29 Jan 2026 04:33:32 -0500
Subject: [PATCH 2/6] Some cleanup

---
 dosagelib/plugins/webtoons.py | 11 -----------
 dosagelib/scraper.py          | 19 ++++++-------------
 2 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py
index 3a8a600ef6..272a3e8a77 100644
--- a/dosagelib/plugins/webtoons.py
+++ b/dosagelib/plugins/webtoons.py
@@ -47,17 +47,6 @@ def namer(self, imageUrl, pageUrl):
         imageExt = pageUrl.rsplit('.', 1)[-1].split('?', 1)[0]
         return "%s-%03d.%s" % (episodeNum, imageNum, imageExt)
 
-    #DUMMY Redundant
-    @classmethod
-    def handlesurl(cls, url) -> bool:
-        import re
-        m = re.match(r"^http.*webtoons.com/.+?/(.+?/.+?)/.+title_no=(\d+)", url.lower())
-        if m != None:
-            path = m.group(1)
-            number = m.group(2)
-            return True
-        return False
-
     @classmethod
     def handleurl(cls, url) -> list[ParserScraper]:
         import re
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index b30c3a9792..352cb332c6 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -95,10 +95,6 @@ class Scraper:
     # HTTP session for configuration & cookies
     session: http.Session = http.default_session
 
-    @classmethod
-    def handlesurl(cls, url) -> bool:
-        return False
-
     @classmethod
     def handleurl(cls, url) -> list[Scraper]:
         return []
@@ -545,7 +541,7 @@ class Cache:
     """
     def __init__(self) -> None:
         self.data: List[Scraper] = []
-        self.scrapers: List[Scraper] = []
+        self.plugins: List[Scraper] = []
         self.userdirs: set[pathlib.Path] = set()
 
     def find(self, comic: str) -> Scraper:
@@ -557,10 +553,8 @@ def find(self, comic: str) -> Scraper:
             raise ValueError("empty comic name")
         candidates = []
 
-        for scraper in self.scrapers:
-            if scraper.handlesurl(comic):
-                candidates.extend(scraper.handleurl(comic))
-        #THINK Maybe just return?
+        candidates.extend(self.getbyurl(comic))
+        #THINK Maybe just return if not empty?
 
         cname = comic.lower()
         for scraper in self.all(include_removed=True):
@@ -615,7 +609,7 @@ def addmodule(self, module) -> int:
         classes = 0
         for plugin in loader.get_module_plugins(module, Scraper):
             classes += 1
-            self.scrapers.append(plugin)
+            self.plugins.append(plugin)
             self.data.extend(plugin.getmodules())
         return classes
 
@@ -633,9 +627,8 @@ def all(self, include_removed=False) -> list[Scraper]:
 
     def getbyurl(self, url) -> list[Scraper]:
         res = []
-        for plugin in self.scrapers:
-            if plugin.handlesurl(url):
-                res.extend(plugin.handleurl(url))
+        for plugin in self.plugins:
+            res.extend(plugin.handleurl(url))
         return res
 
     def validate(self) -> None:

From 614cd17395456e42362ed726d5504205c53a2b74 Mon Sep 17 00:00:00 2001
From: erhannis
Date: Thu, 29 Jan 2026 04:37:47 -0500
Subject: [PATCH 3/6] Cleanup. This might be fit to print.
---
 dosagelib/director.py | 16 ++++++++++------
 dosagelib/scraper.py  | 17 ++++++++++-------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/dosagelib/director.py b/dosagelib/director.py
index 96f5606b1e..4dfe083813 100644
--- a/dosagelib/director.py
+++ b/dosagelib/director.py
@@ -210,13 +210,17 @@ def getScrapers(comics: Collection[str], basepath: str, adult=True, listing=Fals
                 # make the following command work:
                 # find Comics -type d | xargs -n1 -P10 dosage -b Comics
                 comic = comic[len(basepath) + 1:].lstrip(os.sep)
-            if ':' in comic and not (comic.startswith("http:") or comic.startswith("https:")):
-                name, index = comic.split(':', 1)
-                indexes = index.split(',')
+            if comic.startswith("http:") or comic.startswith("https:"):
+                scraper = scrapercache.findbyurl(comic)
+                indexes = None
             else:
-                name = comic
-                indexes = None
-            scraper = scrapercache.find(name)
+                if ':' in comic:
+                    name, index = comic.split(':', 1)
+                    indexes = index.split(',')
+                else:
+                    name = comic
+                    indexes = None
+                scraper = scrapercache.find(name)
             if shouldRunScraper(scraper, adult, listing):
                 # FIXME: Find a better way to work with indexes
                 scraper.indexes = indexes
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 352cb332c6..04465f54b4 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -553,9 +553,6 @@ def find(self, comic: str) -> Scraper:
             raise ValueError("empty comic name")
         candidates = []
 
-        candidates.extend(self.getbyurl(comic))
-        #THINK Maybe just return if not empty?
-
         cname = comic.lower()
         for scraper in self.all(include_removed=True):
             lname = scraper.name.lower()
@@ -625,11 +622,17 @@ def all(self, include_removed=False) -> list[Scraper]:
         else:
             return [x for x in self.data if x.url]
 
-    def getbyurl(self, url) -> list[Scraper]:
-        res = []
+    def findbyurl(self, url) -> Scraper:
+        candidates = []
         for plugin in self.plugins:
-            res.extend(plugin.handleurl(url))
-        return res
+            candidates.extend(plugin.handleurl(url))
+
+        if len(candidates) > 1:
+            comics = ", ".join(x.name for x in candidates)
+            raise ValueError('multiple comics found: %s' % comics)
+        elif not candidates:
+            raise ValueError('comic %r not found' % url)
+        return candidates[0]
 
     def validate(self) -> None:
         """Check for duplicate scraper names."""
         d: Dict[str, Scraper] = {}

From 4978ae529f72140926cc8fabdace305ce1a85c09 Mon Sep 17 00:00:00 2001
From: erhannis
Date: Thu, 29 Jan 2026 05:23:23 -0500
Subject: [PATCH 4/6] Added comicfury url scraper

---
 dosagelib/plugins/comicfury.py | 13 +++++++++++++
 dosagelib/plugins/webtoons.py  |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/dosagelib/plugins/comicfury.py b/dosagelib/plugins/comicfury.py
index d7e514ead5..93fd0507bc 100644
--- a/dosagelib/plugins/comicfury.py
+++ b/dosagelib/plugins/comicfury.py
@@ -88,6 +88,19 @@ def shouldSkipUrl(self, url, data):
         return (self.match(data, '//div[@id="comicimagewrap"]//video') and
                 not self.match(data, '//div[@id="comicimagewrap"]//img'))
 
+    @classmethod
+    def handleurl(cls, url) -> list[ParserScraper]:
+        import re
+        m = re.match(r"^http.*comicfury\.com/read/([^/]+)/?.*", url.lower())
+        if m != None:
+            name = m.group(1)
+            return [cls(name, name)]
+        m = re.match(r"^http.*://(.+?)\.thecomicseries\.com/?.*", url.lower())
+        if m != None:
+            name = m.group(1)
+            return [cls(name, name)]
+        return []
+
     @classmethod
     def getmodules(cls):
         return (
diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py
index 272a3e8a77..8f836c36d4 100644
--- a/dosagelib/plugins/webtoons.py
+++ b/dosagelib/plugins/webtoons.py
@@ -50,7 +50,7 @@ def namer(self, imageUrl, pageUrl):
     @classmethod
     def handleurl(cls, url) -> list[ParserScraper]:
         import re
-        m = re.match(r"^http.*webtoons.com/.+?/(.+?/.+?)/.+title_no=(\d+)", url.lower())
+        m = re.match(r"^http.*webtoons\.com/.+?/(.+?/.+?)/.+title_no=(\d+)", url.lower())
         if m != None:
             path = m.group(1)
             number = m.group(2)

From 8031dec360ac0328bf4c1a83d28a581346593838 Mon Sep 17 00:00:00 2001
From: erhannis
Date: Thu, 29 Jan 2026 05:29:32 -0500
Subject: [PATCH 5/6] Added mention of url in the help documentation

---
 dosagelib/cmd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dosagelib/cmd.py b/dosagelib/cmd.py
index 48f774b4e3..877541d238 100644
--- a/dosagelib/cmd.py
+++ b/dosagelib/cmd.py
@@ -115,7 +115,7 @@ def setup_options(console: console.Console) -> ArgumentParser:
     parser.add_argument('--list-all', action='store_true',
         help=argparse.SUPPRESS)
     comic_arg = parser.add_argument('comic', nargs='*',
-        help='comic module name (including case insensitive substrings)')
+        help='comic module name (including case insensitive substrings). Also accepts URLs from certain websites (ComicFury, WebToons).')
     comic_arg.completer = scraper_completion
     with contextlib.suppress(ImportError):
         completers = importlib.import_module('argcomplete.completers')

From c489e6c78bd4aeb46b736683ace3637ca41cbab3 Mon Sep 17 00:00:00 2001
From: erhannis
Date: Fri, 30 Jan 2026 00:48:35 -0500
Subject: [PATCH 6/6] Added more aliases for ComicFury

---
 dosagelib/plugins/comicfury.py | 23 +++++++++++++++--------
 dosagelib/scraper.py           |  1 +
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/dosagelib/plugins/comicfury.py b/dosagelib/plugins/comicfury.py
index 93fd0507bc..797a42fb55 100644
--- a/dosagelib/plugins/comicfury.py
+++ b/dosagelib/plugins/comicfury.py
@@ -91,14 +91,21 @@ def shouldSkipUrl(self, url, data):
     @classmethod
     def handleurl(cls, url) -> list[ParserScraper]:
         import re
-        m = re.match(r"^http.*comicfury\.com/read/([^/]+)/?.*", url.lower())
-        if m != None:
-            name = m.group(1)
-            return [cls(name, name)]
-        m = re.match(r"^http.*://(.+?)\.thecomicseries\.com/?.*", url.lower())
-        if m != None:
-            name = m.group(1)
-            return [cls(name, name)]
+        rs = [
+            r"^http.*comicfury\.com/read/([^/]+)/?.*",
+            r"^http.*://(.+?)\.thecomicseries\.com/?.*",
+            r"^http.*://(.+?)\.the-comic\.org/?.*",
+            r"^http.*://(.+?)\.thecomicstrip\.org/?.*",
+            r"^http.*://(.+?)\.cfw\.me/?.*",
+            r"^http.*://(.+?)\.webcomic\.ws/?.*"
+        ]
+        for r in rs:
+            m = re.match(r, url.lower())
+            if m != None:
+                name = m.group(1)
+                ps = cls(name, name)
+                ps.multipleImagesPerStrip = True
+                return [ps]
         return []
 
     @classmethod
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 04465f54b4..e5c47bc0cf 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -54,6 +54,7 @@ class Scraper:
     # Stop search for previous URLs at this URL
     firstStripUrl: Optional[str] = None
 
+    #THINK Is there harm in defaulting this to True?
     # if more than one image per URL is expected
     multipleImagesPerStrip: bool = False
 
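
Usage note: with the series applied, the 'comic' positional argument accepts a URL as
well as a module name. PATCH 3/6 routes anything starting with "http:" or "https:" to
Cache.findbyurl(), which raises ValueError when no plugin (or more than one) claims the
URL, and PATCH 5/6 documents this in the CLI help. A sketch of the resulting
invocations, reusing the "dosage -b Comics" form from the director.py comment; the URLs
below are illustrative, not taken from the patches:

    dosage -b Comics "https://www.webtoons.com/en/fantasy/some-series/list?title_no=1234"
    dosage -b Comics "https://some-comic.thecomicseries.com/"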
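
Each plugin's handleurl() is a plain classmethod: it pattern-matches the URL and returns
a list of scraper instances, empty when it does not claim the URL. A minimal standalone
sketch of the WebToons match as escaped in PATCH 4/6, using an illustrative URL; group 1
supplies the path and group 2 the title number passed to cls(path, path, number):

    import re

    # Pattern copied from PATCH 4/6; group 1 = genre/series path, group 2 = title_no.
    WEBTOONS_URL = r"^http.*webtoons\.com/.+?/(.+?/.+?)/.+title_no=(\d+)"

    url = "https://www.webtoons.com/en/fantasy/some-series/list?title_no=1234"  # illustrative
    m = re.match(WEBTOONS_URL, url.lower())
    if m is not None:
        print(m.group(1))  # -> fantasy/some-series
        print(m.group(2))  # -> 1234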