From 8961ed38b2f8febd2656f4aacc61808bd0b692bd Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sat, 26 Aug 2023 20:59:22 -0400 Subject: [PATCH 01/35] fix selenium webdriver after updates --- crawler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crawler.py b/crawler.py index d94b763c..b75e294f 100644 --- a/crawler.py +++ b/crawler.py @@ -14,6 +14,7 @@ from urllib.parse import unquote, quote from selenium import webdriver +from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By import requests from concurrent import futures @@ -355,7 +356,11 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, chrome_options.add_argument("headless") if proxy is not None and proxy_type is not None: chrome_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy)) - driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options) + + # driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options) + service = Service(executable_path=chrome_path) + options = webdriver.ChromeOptions() + driver = webdriver.Chrome(service=service, options=options) if engine == "Google": driver.set_window_size(1920, 1080) From 9ac58f760d5b1863df2437415993a5c76dc44282 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Tue, 14 Nov 2023 16:49:46 -0500 Subject: [PATCH 02/35] change file prefix change file prefix to keywords used, retain search engine so google and bing results can co-exist From 747e889803a09b2674c7f9e895cf26ee469eb1c2 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Tue, 14 Nov 2023 17:03:00 -0500 Subject: [PATCH 03/35] see previous --- image_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/image_downloader.py b/image_downloader.py index 21a50315..5e362491 100644 --- a/image_downloader.py +++ b/image_downloader.py @@ -68,7 +68,7 @@ def main(argv): downloader.download_images(image_urls=crawled_urls, dst_dir=args.output, concurrency=args.num_threads, timeout=args.timeout, proxy_type=proxy_type, proxy=proxy, - file_prefix=args.engine) + file_prefix=args.keywords + "_" + args.engine) print("Finished.") From a226ce8affee5707497030ee647b9d9afe0b3800 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sun, 26 Nov 2023 20:04:08 -0500 Subject: [PATCH 04/35] minor fixes from https://github.com/RapDoodle (#2) minor fixes from https://github.com/RapDoodle/Image-Downloader/ minor tweaks from me --- crawler.py | 12 +++++++++--- downloader.py | 16 +++++++++++++++- image_downloader.py | 2 +- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/crawler.py b/crawler.py index b75e294f..3bb2fa69 100644 --- a/crawler.py +++ b/crawler.py @@ -325,6 +325,10 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, :return: list of scraped image urls """ +# Validate engine name + if engine not in ['Google', 'Baidu', 'Bing']: + raise Exception(f'Unknown engine name: {engine}') + my_print("\nScraping From {} Image Search ...\n".format(engine), quiet) my_print("Keywords: " + keywords, quiet) if max_number <= 0: @@ -357,10 +361,11 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, if proxy is not None and proxy_type is not None: chrome_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy)) + 
chrome_options.add_argument('--ignore-certificate-errors') + # driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options) service = Service(executable_path=chrome_path) - options = webdriver.ChromeOptions() - driver = webdriver.Chrome(service=service, options=options) + driver = webdriver.Chrome(service=service, options=chrome_options) if engine == "Google": driver.set_window_size(1920, 1080) @@ -370,10 +375,11 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, driver.set_window_size(1920, 1080) driver.get(query_url) image_urls = bing_image_url_from_webpage(driver) - else: # Baidu + elif engine == "Baidu": driver.set_window_size(10000, 7500) driver.get(query_url) image_urls = baidu_image_url_from_webpage(driver) + driver.close() else: # api if engine == "Baidu": diff --git a/downloader.py b/downloader.py index abe69f6b..61817e09 100644 --- a/downloader.py +++ b/downloader.py @@ -35,12 +35,16 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p while True: try: try_times += 1 + #= image_url = image_url.split('&')[0] # https://github.com/pablobots/Image-Downloader/commit/5bdbe076589459b9d0c41a563b92993cac1a892e response = requests.get( image_url, headers=headers, timeout=timeout, proxies=proxies) with open(file_path, 'wb') as f: f.write(response.content) response.close() file_type = imghdr.what(file_path) + if file_type == 'jpeg': + file_type = 'jpg' + # if file_type is not None: if file_type in ["jpg", "jpeg", "png", "bmp", "webp"]: new_file_name = "{}.{}".format(file_name, file_type) @@ -50,6 +54,7 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p else: os.remove(file_path) print("## Err: TYPE({}) {}".format(file_type, image_url)) + return False break except Exception as e: if try_times < 3: @@ -57,6 +62,7 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p if response: response.close() print("## Fail: {} {}".format(image_url, e.args)) + return False break @@ -70,12 +76,13 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, time :param dst_dir: output the downloaded images to dst_dir :param file_prefix: if set to "img", files will be in format "img_xxx.jpg" :param concurrency: number of requests process simultaneously - :return: none + :return: the number of successful downloads """ socket.setdefaulttimeout(timeout) with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor: + count = 1 future_list = list() count = 0 if not os.path.exists(dst_dir): @@ -86,3 +93,10 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, time download_image, image_url, dst_dir, file_name, timeout, proxy_type, proxy)) count += 1 concurrent.futures.wait(future_list, timeout=180) + + # Count the number of successful downloads + for future in future_list: + if future.result(): + success_downloads += 1 + + return success_downloads diff --git a/image_downloader.py b/image_downloader.py index 5e362491..f53e3f00 100644 --- a/image_downloader.py +++ b/image_downloader.py @@ -30,7 +30,7 @@ def main(argv): parser.add_argument("--safe-mode", "-S", action="store_true", default=False, help="Turn on safe search mode. (Only effective in Google)") parser.add_argument("--face-only", "-F", action="store_true", default=False, - help="Only search for ") + help="Only search for faces (only available in Google)")") parser.add_argument("--proxy_http", "-ph", type=str, default=None, help="Set http proxy (e.g. 
192.168.0.2:8080)") parser.add_argument("--proxy_socks5", "-ps", type=str, default=None, From 9450f18828bcce38bbf1aa045a7f6a502e1bad6f Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Mon, 27 Nov 2023 00:34:52 -0500 Subject: [PATCH 05/35] tweak the fixes (#3) --- downloader.py | 55 ++++++++++++++++++++++++++++++++++++++++----- image_downloader.py | 2 +- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/downloader.py b/downloader.py index 61817e09..559f6f9e 100644 --- a/downloader.py +++ b/downloader.py @@ -21,6 +21,14 @@ # 'Connection': 'close', } +# additional check for imghdr.what() +def test_html(h, f): + if b' Date: Mon, 27 Nov 2023 00:44:55 -0500 Subject: [PATCH 06/35] delete unknown or html files (#4) --- downloader.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/downloader.py b/downloader.py index 559f6f9e..82c054c2 100644 --- a/downloader.py +++ b/downloader.py @@ -58,8 +58,11 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p if file_type == 'jpeg': file_type = 'jpg' - # if file_type in ["jpg", "jpeg", "png", "bmp", "webp"]: - if file_type is not None: + if file_type is None or file_type == 'html': + os.remove(file_path) + print("## Err: TYPE({}) {}".format(file_type, image_url)) + return False + elif file_type in ["jpg", "jpeg", "png", "bmp", "webp"]: if file_name.endswith("." + file_type): new_file_name = file_name else: @@ -69,11 +72,12 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p shutil.move(file_path, new_file_path) print("## OK: {} {}".format(new_file_name, image_url)) return True - elif file_type == 'html': + else: os.remove(file_path) print("## Err: TYPE({}) {}".format(file_type, image_url)) return False break + except Exception as e: if try_times < 3: file_name = file_name + "a" From 37afb90e359f5554ecbd2c0bd6023f065462c2fa Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Mon, 27 Nov 2023 18:52:27 -0500 Subject: [PATCH 07/35] Updates from forks (#5) Parts of changes from: https://github.com/tungalbert99/ https://github.com/ubadly/ --- crawler.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/crawler.py b/crawler.py index 3bb2fa69..12efc858 100644 --- a/crawler.py +++ b/crawler.py @@ -22,12 +22,14 @@ g_headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Proxy-Connection": "keep-alive", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "Accept-Encoding": "gzip, deflate, sdch", # 'Connection': 'close', } +session = requests.Session() +session.headers = g_headers + if getattr(sys, 'frozen', False): bundle_dir = sys._MEIPASS else: @@ -82,7 +84,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): break thumb_elements_old = thumb_elements driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - time.sleep(2) + time.sleep(5) show_more = driver.find_elements(By.CLASS_NAME, "mye4qd") if len(show_more) == 1 and show_more[0].is_displayed() and show_more[0].is_enabled(): my_print("Click show_more button.", quiet) @@ -121,7 +123,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): image_elements = 
driver.find_elements(By.CLASS_NAME, "islib") image_urls = list() - url_pattern = r"imgurl=\S*&imgrefurl" + url_pattern = r"imgurl=\S*&imgrefurl" # url_pattern = r"imgurl=(.*?)&" for image_element in image_elements[:max_number]: outer_html = image_element.get_attribute("outerHTML") @@ -189,7 +191,7 @@ def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False, image_urls = [] while start <= max_number: url = 'https://www.bing.com/images/async?q={}&first={}&count=35'.format(keywords, start) - res = requests.get(url, proxies=proxies, headers=g_headers) + res = session.get(url, proxies=proxies, headers=g_headers) res.encoding = "utf-8" image_urls_batch = re.findall('murl":"(.*?)"', res.text) if len(image_urls) > 0 and image_urls_batch[-1] == image_urls[-1]: @@ -253,7 +255,7 @@ def decode_url(url): proxies = {"http": "{}://{}".format(proxy_type, proxy), "https": "{}://{}".format(proxy_type, proxy)} - res = requests.get(init_url, proxies=proxies, headers=g_headers) + res = session.get(init_url, proxies=proxies, headers=g_headers) init_json = json.loads(res.text.replace(r"\'", "").encode("utf-8"), strict=False) total_num = init_json['listNum'] @@ -273,7 +275,7 @@ def process_batch(batch_no, batch_size): try_time = 0 while True: try: - response = requests.get(url, proxies=proxies, headers=g_headers) + response = session.get(url, proxies=proxies, headers=g_headers) break except Exception as e: try_time += 1 @@ -380,7 +382,8 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, driver.get(query_url) image_urls = baidu_image_url_from_webpage(driver) - driver.close() + # driver.close() # just closes the window. quit() does much more cleanup + driver.quit() else: # api if engine == "Baidu": image_urls = baidu_get_image_url_using_api(keywords, max_number=max_number, face_only=face_only, From 72834344680709c7faab21b083e208faf484009d Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Mon, 27 Nov 2023 20:51:25 -0500 Subject: [PATCH 08/35] fix filename and filetype issues (#6) --- downloader.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/downloader.py b/downloader.py index 82c054c2..c90be4f7 100644 --- a/downloader.py +++ b/downloader.py @@ -21,7 +21,7 @@ # 'Connection': 'close', } -# additional check for imghdr.what() +# additional checks for imghdr.what() def test_html(h, f): if b'= 200: + print("Truncating: {}".format(file_name)) + file_name = file_name[:200] + if file_name.endswith("." 
+ file_type): new_file_name = file_name else: @@ -73,7 +88,7 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p print("## OK: {} {}".format(new_file_name, image_url)) return True else: - os.remove(file_path) + # os.remove(file_path) print("## Err: TYPE({}) {}".format(file_type, image_url)) return False break From ea6c62167f2af7b1cc495a098d7c5438b6e92760 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Mon, 27 Nov 2023 21:05:23 -0500 Subject: [PATCH 09/35] PEP 8 formatting (#7) PEP 8 formatting via https://formatter.org/python-formatter --- downloader.py | 76 +++++++++++++++-------- image_downloader.py | 147 +++++++++++++++++++++++++++++++++----------- 2 files changed, 160 insertions(+), 63 deletions(-) diff --git a/downloader.py b/downloader.py index c90be4f7..dd157300 100644 --- a/downloader.py +++ b/downloader.py @@ -23,25 +23,29 @@ # additional checks for imghdr.what() def test_html(h, f): - if b'= 200: print("Truncating: {}".format(file_name)) file_name = file_name[:200] - + if file_name.endswith("." + file_type): new_file_name = file_name - else: + else: new_file_name = "{}.{}".format(file_name, file_type) - + new_file_path = os.path.join(dst_dir, new_file_name) shutil.move(file_path, new_file_path) print("## OK: {} {}".format(new_file_name, image_url)) @@ -92,7 +97,7 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p print("## Err: TYPE({}) {}".format(file_type, image_url)) return False break - + except Exception as e: if try_times < 3: file_name = file_name + "a" @@ -104,7 +109,15 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, p break -def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None): +def download_images( + image_urls, + dst_dir, + file_prefix="img", + concurrency=50, + timeout=20, + proxy_type=None, + proxy=None, +): """ Download image according to given urls and automatically rename them in order. 
:param timeout: @@ -123,19 +136,28 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, time future_list = list() count = 0 success_downloads = 0 - + if not os.path.exists(dst_dir): os.makedirs(dst_dir) for image_url in image_urls: # file_name = file_prefix + "_" + "%04d" % count print("## URL : {}".format(image_url)) file_name = image_url - file_name = split_string(file_name, '?', 0) - file_name = split_string(file_name, '&', 0) - file_name = split_string(file_name, '/', -1) + file_name = split_string(file_name, "?", 0) + file_name = split_string(file_name, "&", 0) + file_name = split_string(file_name, "/", -1) print("## FILE: {}".format(file_name)) - future_list.append(executor.submit( - download_image, image_url, dst_dir, file_name, timeout, proxy_type, proxy)) + future_list.append( + executor.submit( + download_image, + image_url, + dst_dir, + file_name, + timeout, + proxy_type, + proxy, + ) + ) count += 1 concurrent.futures.wait(future_list, timeout=180) @@ -153,12 +175,12 @@ def split_string(str, delimiter, index): s, _, t = s.partition(delimiter) if index == 0: break - if t == '': + if t == "": break index = index - 1 s = t - if s == '': + if s == "": s = str return s diff --git a/image_downloader.py b/image_downloader.py index 3c58a7bf..99cfbf6e 100644 --- a/image_downloader.py +++ b/image_downloader.py @@ -11,39 +11,102 @@ import downloader import utils + def main(argv): parser = argparse.ArgumentParser(description="Image Downloader") - parser.add_argument("keywords", type=str, - help='Keywords to search. ("in quotes")') - parser.add_argument("--engine", "-e", type=str, default="Google", - help="Image search engine.", choices=["Google", "Bing", "Baidu"]) - parser.add_argument("--driver", "-d", type=str, default="chrome_headless", - help="Image search engine.", choices=["chrome_headless", "chrome", "api"]) - parser.add_argument("--max-number", "-n", type=int, default=100, - help="Max number of images download for the keywords.") - parser.add_argument("--num-threads", "-j", type=int, default=50, - help="Number of threads to concurrently download images.") - parser.add_argument("--timeout", "-t", type=int, default=10, - help="Seconds to timeout when download an image.") - parser.add_argument("--output", "-o", type=str, default="./download_images", - help="Output directory to save downloaded images.") - parser.add_argument("--safe-mode", "-S", action="store_true", default=False, - help="Turn on safe search mode. (Only effective in Google)") - parser.add_argument("--face-only", "-F", action="store_true", default=False, - help="Only search for faces (only available in Google)") - parser.add_argument("--proxy_http", "-ph", type=str, default=None, - help="Set http proxy (e.g. 192.168.0.2:8080)") - parser.add_argument("--proxy_socks5", "-ps", type=str, default=None, - help="Set socks5 proxy (e.g. 192.168.0.2:1080)") + parser.add_argument("keywords", type=str, help='Keywords to search. 
("in quotes")') + parser.add_argument( + "--engine", + "-e", + type=str, + default="Google", + help="Image search engine.", + choices=["Google", "Bing", "Baidu"], + ) + parser.add_argument( + "--driver", + "-d", + type=str, + default="chrome_headless", + help="Image search engine.", + choices=["chrome_headless", "chrome", "api"], + ) + parser.add_argument( + "--max-number", + "-n", + type=int, + default=100, + help="Max number of images download for the keywords.", + ) + parser.add_argument( + "--num-threads", + "-j", + type=int, + default=50, + help="Number of threads to concurrently download images.", + ) + parser.add_argument( + "--timeout", + "-t", + type=int, + default=10, + help="Seconds to timeout when download an image.", + ) + parser.add_argument( + "--output", + "-o", + type=str, + default="./download_images", + help="Output directory to save downloaded images.", + ) + parser.add_argument( + "--safe-mode", + "-S", + action="store_true", + default=False, + help="Turn on safe search mode. (Only effective in Google)", + ) + parser.add_argument( + "--face-only", + "-F", + action="store_true", + default=False, + help="Only search for faces (only available in Google)", + ) + parser.add_argument( + "--proxy_http", + "-ph", + type=str, + default=None, + help="Set http proxy (e.g. 192.168.0.2:8080)", + ) + parser.add_argument( + "--proxy_socks5", + "-ps", + type=str, + default=None, + help="Set socks5 proxy (e.g. 192.168.0.2:1080)", + ) # type is not supported for Baidu - parser.add_argument("--type", "-ty", type=str, default=None, - help="What kinds of images to download.", choices=["clipart", "linedrawing", "photograph"]) + parser.add_argument( + "--type", + "-ty", + type=str, + default=None, + help="What kinds of images to download.", + choices=["clipart", "linedrawing", "photograph"], + ) # Bing: color for colored images, bw for black&white images, other color contains Red, orange, yellow, green # Teal, Blue, Purple, Pink, Brown, Black, Gray, White # Baidu: white, bw, black, pink, blue, red, yellow, purple, green, teal, orange, brown # Google: bw, red, orange, yellow, green, teal, blue, purple, pink, white, gray, black, brown - parser.add_argument("--color", "-cl", type=str, default=None, - help="Specify the color of desired images.") + parser.add_argument( + "--color", + "-cl", + type=str, + default=None, + help="Specify the color of desired images.", + ) args = parser.parse_args(args=argv) @@ -60,18 +123,30 @@ def main(argv): print("Dependencies not resolved, exit.") return - crawled_urls = crawler.crawl_image_urls(args.keywords, - engine=args.engine, max_number=args.max_number, - face_only=args.face_only, safe_mode=args.safe_mode, - proxy_type=proxy_type, proxy=proxy, - browser=args.driver, image_type=args.type, color=args.color) - downloader.download_images(image_urls=crawled_urls, dst_dir=args.output, - concurrency=args.num_threads, timeout=args.timeout, - proxy_type=proxy_type, proxy=proxy, - file_prefix=args.keywords + "_" + args.engine) + crawled_urls = crawler.crawl_image_urls( + args.keywords, + engine=args.engine, + max_number=args.max_number, + face_only=args.face_only, + safe_mode=args.safe_mode, + proxy_type=proxy_type, + proxy=proxy, + browser=args.driver, + image_type=args.type, + color=args.color, + ) + downloader.download_images( + image_urls=crawled_urls, + dst_dir=args.output, + concurrency=args.num_threads, + timeout=args.timeout, + proxy_type=proxy_type, + proxy=proxy, + file_prefix=args.keywords + "_" + args.engine, + ) print("Finished.") -if __name__ == 
'__main__': +if __name__ == "__main__": main(sys.argv[1:]) From 5b9746bd3df61b1d1c1cb150b295b7d20ac41928 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Thu, 30 Nov 2023 12:57:24 -0500 Subject: [PATCH 10/35] fixes for file type detection (#8) --- downloader.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/downloader.py b/downloader.py index dd157300..4e0bf8da 100644 --- a/downloader.py +++ b/downloader.py @@ -25,6 +25,12 @@ def test_html(h, f): if b"= 200: print("Truncating: {}".format(file_name)) file_name = file_name[:200] From c8c7512b90811dc7f80643ab5da5e1a16c33ce78 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Thu, 30 Nov 2023 15:21:47 -0500 Subject: [PATCH 11/35] Check HTTP status codes (#9) --- downloader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/downloader.py b/downloader.py index 4e0bf8da..431593f5 100644 --- a/downloader.py +++ b/downloader.py @@ -67,9 +67,15 @@ def download_image( image_url, headers=headers, timeout=timeout, proxies=proxies ) - # TODO: handle 429 Too Many Requests - # TODO: handle 404 not found (don't even save the content) - # TODO: handle 403 Forbidden (don't even save the content) + # TODO: handle 429 Too Many Requests, set a timer to slow down request frequency + # handle 401 Unauthorized (don't even save the content) + # handle 404 not found (don't even save the content) + # handle 403 Forbidden (don't even save the content) + + if response.status_code in [ 404,403,401 ]: + print("## Err: STATUS CODE({}) {}".format(response.status_code, image_url)) + return False + with open(file_path, "wb") as f: f.write(response.content) response.close() From 5928872b00378955ec9134917fcd831ae2bd4258 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 10 Dec 2023 18:12:55 -0500 Subject: [PATCH 12/35] Bump pyinstaller from 5.9.0 to 5.13.1 (#10) Bumps [pyinstaller](https://github.com/pyinstaller/pyinstaller) from 5.9.0 to 5.13.1. - [Release notes](https://github.com/pyinstaller/pyinstaller/releases) - [Changelog](https://github.com/pyinstaller/pyinstaller/blob/develop/doc/CHANGES.rst) - [Commits](https://github.com/pyinstaller/pyinstaller/compare/v5.9.0...v5.13.1) --- updated-dependencies: - dependency-name: pyinstaller dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fa29d1ec..37bfbb0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ chromedriver-autoinstaller==0.4.0 -pyinstaller==5.9.0 +pyinstaller==5.13.1 PyQt5==5.15.9 requests==2.31.0 selenium==4.8.3 From 6abbc1556ae20fd1400bab06ff24ecad5c85971a Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sun, 10 Dec 2023 18:14:40 -0500 Subject: [PATCH 13/35] tbd --- downloader.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/downloader.py b/downloader.py index 431593f5..ce1a82c2 100644 --- a/downloader.py +++ b/downloader.py @@ -4,6 +4,7 @@ # Email: sczhengyabin@hotmail.com from __future__ import print_function +from urllib.parse import unquote import shutil import imghdr @@ -22,6 +23,7 @@ } # additional checks for imghdr.what() + def test_html(h, f): if b" Date: Wed, 3 Jan 2024 19:39:28 -0500 Subject: [PATCH 14/35] Update README.md Add pointer to iCrawler --- README.md | 54 ++---------------------------------------------------- 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index a027e67a..dc308d5c 100644 --- a/README.md +++ b/README.md @@ -1,53 +1,3 @@ -# Image Downloader +# Not Maintained -[![996.icu](https://img.shields.io/badge/link-996.icu-red.svg)](https://996.icu) - -## [中文说明](https://github.com/QianyanTech/Image-Downloader/blob/master/README_zh.md) - -## 1. Introdoction - -Crawl and download images using Selenium or API -Using python3 and PyQt5 - -## 2. Key features - -+ Supported Search Engine: Google, Bing, Baidu -+ Keywords input from keyboard, or input from line seperated keywords list file for batch process. -+ Download image using customizable number of threads. -+ Fully supported conditional search (eg. filetype:, site:). -+ Switch for Google safe mode. -+ Proxy configuration (socks, http). -+ CMD and GUI ways of using are provided. - -## 3. Usage - -### 3.1 GUI - -Run `image_downloader_gui.py` script to yank GUI: -```bash -python image_downloader_gui.py -``` - -![GUI](/GUI.png) - -### 3.2 CMD - -```bash -usage: image_downloader.py [-h] [--engine {Google,Bing,Baidu}] - [--driver {chrome_headless,chrome,api}] - [--max-number MAX_NUMBER] - [--num-threads NUM_THREADS] [--timeout TIMEOUT] - [--output OUTPUT] [--safe-mode] [--face-only] - [--proxy_http PROXY_HTTP] - [--proxy_socks5 PROXY_SOCKS5] - keywords -``` - -## Star History - -[![Star History Chart](https://api.star-history.com/svg?repos=QianyanTech/Image-Downloader&type=Date)](https://star-history.com/#QianyanTech/Image-Downloader&Date) - -## License - -+ MIT License -+ 996ICU License +I have moved to using [https://github.com/hellock/icrawler] iCrawler. It has some bugs but the code seems more mature. From dc83112e77e3f13ef7bdec1ff0b9c4d0b116f55b Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sun, 11 Feb 2024 05:20:17 -0500 Subject: [PATCH 15/35] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dc308d5c..cf66e971 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ -# Not Maintained -I have moved to using [https://github.com/hellock/icrawler] iCrawler. It has some bugs but the code seems more mature. 
+I have moved to using [https://github.com/hellock/icrawler] icrawler. It has some bugs but the code seems more mature. + +Image-Downloader executes a browser with JavaScript, using chromedriver or similar, which seems to be the 2024 way to understand the web. But the code is more difficult to re-use. + +icrawler uses BeautifulSoup to parse the text response, which still works. It is a text-based library, but easy to subclass and modify just the relevant parts. From e808cbb2f69c43a5c0fb5abc9afe1f1c6e4e06ba Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sat, 23 Mar 2024 00:07:29 -0400 Subject: [PATCH 16/35] Update README.md (#12) From 3c7fe5762655ae54dc79b0b5300fbbdb7434a141 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sat, 23 Mar 2024 15:45:50 -0400 Subject: [PATCH 17/35] Update README.md restore original --- README.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cf66e971..a027e67a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,53 @@ +# Image Downloader -I have moved to using [https://github.com/hellock/icrawler] icrawler. It has some bugs but the code seems more mature. +[![996.icu](https://img.shields.io/badge/link-996.icu-red.svg)](https://996.icu) -Image-Downloader executes a browser with JavaScript, using chromedriver or similar, which seems to be the 2024 way to understand the web. But the code is more difficult to re-use. +## [中文说明](https://github.com/QianyanTech/Image-Downloader/blob/master/README_zh.md) -icrawler uses BeautifulSoup to parse the text response, which still works. It is a text-based library, but easy to subclass and modify just the relevant parts. +## 1. Introdoction + +Crawl and download images using Selenium or API +Using python3 and PyQt5 + +## 2. Key features + ++ Supported Search Engine: Google, Bing, Baidu ++ Keywords input from keyboard, or input from line seperated keywords list file for batch process. ++ Download image using customizable number of threads. ++ Fully supported conditional search (eg. filetype:, site:). ++ Switch for Google safe mode. ++ Proxy configuration (socks, http). ++ CMD and GUI ways of using are provided. + +## 3. Usage + +### 3.1 GUI + +Run `image_downloader_gui.py` script to yank GUI: +```bash +python image_downloader_gui.py +``` + +![GUI](/GUI.png) + +### 3.2 CMD + +```bash +usage: image_downloader.py [-h] [--engine {Google,Bing,Baidu}] + [--driver {chrome_headless,chrome,api}] + [--max-number MAX_NUMBER] + [--num-threads NUM_THREADS] [--timeout TIMEOUT] + [--output OUTPUT] [--safe-mode] [--face-only] + [--proxy_http PROXY_HTTP] + [--proxy_socks5 PROXY_SOCKS5] + keywords +``` + +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=QianyanTech/Image-Downloader&type=Date)](https://star-history.com/#QianyanTech/Image-Downloader&Date) + +## License + ++ MIT License ++ 996ICU License From b3ea6b353f31d9eaba59a6351c94404dc17980e1 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sun, 24 Mar 2024 22:54:22 -0400 Subject: [PATCH 18/35] PRP8 --- crawler.py | 199 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 130 insertions(+), 69 deletions(-) diff --git a/crawler.py b/crawler.py index 12efc858..2ed4cc43 100644 --- a/crawler.py +++ b/crawler.py @@ -1,4 +1,5 @@ """ Crawl image urls from image search engine. 
""" + # -*- coding: utf-8 -*- # author: Yabin Zheng # Email: sczhengyabin@hotmail.com @@ -30,7 +31,7 @@ session = requests.Session() session.headers = g_headers -if getattr(sys, 'frozen', False): +if getattr(sys, "frozen", False): bundle_dir = sys._MEIPASS else: bundle_dir = os.path.dirname(os.path.abspath(__file__)) @@ -41,16 +42,18 @@ def my_print(msg, quiet=False): print(msg) -def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=None, color=None): +def google_gen_query_url( + keywords, face_only=False, safe_mode=False, image_type=None, color=None +): base_url = "https://www.google.com/search?tbm=isch&hl=en" keywords_str = "&q=" + quote(keywords) query_url = base_url + keywords_str - + if safe_mode is True: query_url += "&safe=on" else: query_url += "&safe=off" - + filter_url = "&tbs=" if color is not None: @@ -58,12 +61,12 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type= filter_url += "ic:gray%2C" else: filter_url += "ic:specific%2Cisc:{}%2C".format(color.lower()) - + if image_type is not None: if image_type.lower() == "linedrawing": image_type = "lineart" filter_url += "itp:{}".format(image_type) - + if face_only is True: filter_url += "itp:face" @@ -86,18 +89,24 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(5) show_more = driver.find_elements(By.CLASS_NAME, "mye4qd") - if len(show_more) == 1 and show_more[0].is_displayed() and show_more[0].is_enabled(): + if ( + len(show_more) == 1 + and show_more[0].is_displayed() + and show_more[0].is_enabled() + ): my_print("Click show_more button.", quiet) show_more[0].click() time.sleep(3) except Exception as e: print("Exception ", e) pass - + if len(thumb_elements) == 0: return [] - my_print("Click on each thumbnail image to get image url, may take a moment ...", quiet) + my_print( + "Click on each thumbnail image to get image url, may take a moment ...", quiet + ) retry_click = [] for i, elem in enumerate(thumb_elements): @@ -112,7 +121,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): print("Error while clicking in thumbnail:", e) retry_click.append(elem) - if len(retry_click) > 0: + if len(retry_click) > 0: my_print("Retry some failed clicks ...", quiet) for elem in retry_click: try: @@ -120,10 +129,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): elem.click() except Exception as e: print("Error while retrying click:", e) - + image_elements = driver.find_elements(By.CLASS_NAME, "islib") image_urls = list() - url_pattern = r"imgurl=\S*&imgrefurl" # url_pattern = r"imgurl=(.*?)&" + url_pattern = r"imgurl=\S*&imgrefurl" # url_pattern = r"imgurl=(.*?)&" for image_element in image_elements[:max_number]: outer_html = image_element.get_attribute("outerHTML") @@ -134,17 +143,19 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): return image_urls -def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=None, color=None): +def bing_gen_query_url( + keywords, face_only=False, safe_mode=False, image_type=None, color=None +): base_url = "https://www.bing.com/images/search?" 
keywords_str = "&q=" + quote(keywords) query_url = base_url + keywords_str filter_url = "&qft=" if face_only is True: filter_url += "+filterui:face-face" - + if image_type is not None: filter_url += "+filterui:photo-{}".format(image_type) - + if color is not None: if color == "bw" or color == "color": filter_url += "+filterui:color2-{}".format(color.lower()) @@ -166,8 +177,7 @@ def bing_image_url_from_webpage(driver): image_elements = driver.find_elements(By.CLASS_NAME, "iusc") if len(image_elements) > img_count: img_count = len(image_elements) - driver.execute_script( - "window.scrollTo(0, document.body.scrollHeight);") + driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") else: smb = driver.find_elements(By.CLASS_NAME, "btn_seemore") if len(smb) > 0 and smb[0].is_displayed(): @@ -181,30 +191,48 @@ def bing_image_url_from_webpage(driver): image_urls.append(m_json["murl"]) return image_urls -def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False, - proxy=None, proxy_type=None): + +def bing_get_image_url_using_api( + keywords, max_number=10000, face_only=False, proxy=None, proxy_type=None +): proxies = None if proxy and proxy_type: - proxies = {"http": "{}://{}".format(proxy_type, proxy), - "https": "{}://{}".format(proxy_type, proxy)} + proxies = { + "http": "{}://{}".format(proxy_type, proxy), + "https": "{}://{}".format(proxy_type, proxy), + } start = 1 image_urls = [] while start <= max_number: - url = 'https://www.bing.com/images/async?q={}&first={}&count=35'.format(keywords, start) + url = "https://www.bing.com/images/async?q={}&first={}&count=35".format( + keywords, start + ) res = session.get(url, proxies=proxies, headers=g_headers) res.encoding = "utf-8" - image_urls_batch = re.findall('murl":"(.*?)"', res.text) + image_urls_batch = re.findall("murl":"(.*?)"", res.text) if len(image_urls) > 0 and image_urls_batch[-1] == image_urls[-1]: break image_urls += image_urls_batch start += len(image_urls_batch) return image_urls + baidu_color_code = { - "white": 1024, "bw": 2048, "black": 512, "pink": 64, "blue": 16, "red": 1, - "yellow": 2, "purple": 32, "green": 4, "teal": 8, "orange": 256, "brown": 128 + "white": 1024, + "bw": 2048, + "black": 512, + "pink": 64, + "blue": 16, + "red": 1, + "yellow": 2, + "purple": 32, + "green": 4, + "teal": 8, + "orange": 256, + "brown": 128, } + def baidu_gen_query_url(keywords, face_only=False, safe_mode=False, color=None): base_url = "https://image.baidu.com/search/index?tn=baiduimage" keywords_str = "&word=" + quote(keywords) @@ -230,21 +258,23 @@ def baidu_image_url_from_webpage(driver): return image_urls -def baidu_get_image_url_using_api(keywords, max_number=10000, face_only=False, - proxy=None, proxy_type=None): +def baidu_get_image_url_using_api( + keywords, max_number=10000, face_only=False, proxy=None, proxy_type=None +): def decode_url(url): - in_table = '0123456789abcdefghijklmnopqrstuvw' - out_table = '7dgjmoru140852vsnkheb963wtqplifca' + in_table = "0123456789abcdefghijklmnopqrstuvw" + out_table = "7dgjmoru140852vsnkheb963wtqplifca" translate_table = str.maketrans(in_table, out_table) - mapping = {'_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'} + mapping = {"_z2C$q": ":", "_z&e3B": ".", "AzdH3F": "/"} for k, v in mapping.items(): url = url.replace(k, v) return url.translate(translate_table) - base_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592"\ - "&lm=7&fp=result&ie=utf-8&oe=utf-8&st=-1" - keywords_str = "&word={}&queryWord={}".format( - quote(keywords), 
quote(keywords)) + base_url = ( + "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592" + "&lm=7&fp=result&ie=utf-8&oe=utf-8&st=-1" + ) + keywords_str = "&word={}&queryWord={}".format(quote(keywords), quote(keywords)) query_url = base_url + keywords_str query_url += "&face={}".format(1 if face_only else 0) @@ -252,12 +282,14 @@ def decode_url(url): proxies = None if proxy and proxy_type: - proxies = {"http": "{}://{}".format(proxy_type, proxy), - "https": "{}://{}".format(proxy_type, proxy)} + proxies = { + "http": "{}://{}".format(proxy_type, proxy), + "https": "{}://{}".format(proxy_type, proxy), + } res = session.get(init_url, proxies=proxies, headers=g_headers) init_json = json.loads(res.text.replace(r"\'", "").encode("utf-8"), strict=False) - total_num = init_json['listNum'] + total_num = init_json["listNum"] target_num = min(max_number, total_num) crawl_num = min(target_num * 2, total_num) @@ -270,8 +302,7 @@ def decode_url(url): def process_batch(batch_no, batch_size): image_urls = list() - url = query_url + \ - "&pn={}&rn={}".format(batch_no * batch_size, batch_size) + url = query_url + "&pn={}&rn={}".format(batch_no * batch_size, batch_size) try_time = 0 while True: try: @@ -282,21 +313,21 @@ def process_batch(batch_no, batch_size): if try_time > 3: print(e) return image_urls - response.encoding = 'utf-8' + response.encoding = "utf-8" res_json = json.loads(response.text.replace(r"\'", ""), strict=False) - for data in res_json['data']: + for data in res_json["data"]: # if 'middleURL' in data.keys(): # url = data['middleURL'] # image_urls.append(url) - if 'objURL' in data.keys(): - url = unquote(decode_url(data['objURL'])) - if 'src=' in url: - url_p1 = url.split('src=')[1] - url = url_p1.split('&refer=')[0] + if "objURL" in data.keys(): + url = unquote(decode_url(data["objURL"])) + if "src=" in url: + url_p1 = url.split("src=")[1] + url = url_p1.split("&refer=")[0] image_urls.append(url) # print(url) - elif 'replaceUrl' in data.keys() and len(data['replaceUrl']) == 2: - image_urls.append(data['replaceUrl'][1]['ObjURL']) + elif "replaceUrl" in data.keys() and len(data["replaceUrl"]) == 2: + image_urls.append(data["replaceUrl"][1]["ObjURL"]) return image_urls @@ -308,12 +339,22 @@ def process_batch(batch_no, batch_size): else: print(future.exception()) - return crawled_urls[:min(len(crawled_urls), target_num)] - - -def crawl_image_urls(keywords, engine="Google", max_number=10000, - face_only=False, safe_mode=False, proxy=None, - proxy_type="http", quiet=False, browser="chrome_headless", image_type=None, color=None): + return crawled_urls[: min(len(crawled_urls), target_num)] + + +def crawl_image_urls( + keywords, + engine="Google", + max_number=10000, + face_only=False, + safe_mode=False, + proxy=None, + proxy_type="http", + quiet=False, + browser="chrome_headless", + image_type=None, + color=None, +): """ Scrape image urls of keywords from Google Image Search :param keywords: keywords you want to search @@ -327,9 +368,9 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, :return: list of scraped image urls """ -# Validate engine name - if engine not in ['Google', 'Baidu', 'Bing']: - raise Exception(f'Unknown engine name: {engine}') + # Validate engine name + if engine not in ["Google", "Baidu", "Bing"]: + raise Exception(f"Unknown engine name: {engine}") my_print("\nScraping From {} Image Search ...\n".format(engine), quiet) my_print("Keywords: " + keywords, quiet) @@ -342,9 +383,13 @@ def crawl_image_urls(keywords, engine="Google", 
max_number=10000, my_print("Safe Mode: {}".format(str(safe_mode)), quiet) if engine == "Google": - query_url = google_gen_query_url(keywords, face_only, safe_mode, image_type, color) + query_url = google_gen_query_url( + keywords, face_only, safe_mode, image_type, color + ) elif engine == "Bing": - query_url = bing_gen_query_url(keywords, face_only, safe_mode, image_type, color) + query_url = bing_gen_query_url( + keywords, face_only, safe_mode, image_type, color + ) elif engine == "Baidu": query_url = baidu_gen_query_url(keywords, face_only, safe_mode, color) else: @@ -361,9 +406,11 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, if "headless" in browser: chrome_options.add_argument("headless") if proxy is not None and proxy_type is not None: - chrome_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy)) - - chrome_options.add_argument('--ignore-certificate-errors') + chrome_options.add_argument( + "--proxy-server={}://{}".format(proxy_type, proxy) + ) + + chrome_options.add_argument("--ignore-certificate-errors") # driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options) service = Service(executable_path=chrome_path) @@ -384,13 +431,23 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, # driver.close() # just closes the window. quit() does much more cleanup driver.quit() - else: # api + else: # api if engine == "Baidu": - image_urls = baidu_get_image_url_using_api(keywords, max_number=max_number, face_only=face_only, - proxy=proxy, proxy_type=proxy_type) + image_urls = baidu_get_image_url_using_api( + keywords, + max_number=max_number, + face_only=face_only, + proxy=proxy, + proxy_type=proxy_type, + ) elif engine == "Bing": - image_urls = bing_get_image_url_using_api(keywords, max_number=max_number, face_only=face_only, - proxy=proxy, proxy_type=proxy_type) + image_urls = bing_get_image_url_using_api( + keywords, + max_number=max_number, + face_only=face_only, + proxy=proxy, + proxy_type=proxy_type, + ) else: my_print("Engine {} is not supported on API mode.".format(engine)) @@ -399,7 +456,11 @@ def crawl_image_urls(keywords, engine="Google", max_number=10000, else: output_num = max_number - my_print("\n== {0} out of {1} crawled images urls will be used.\n".format( - output_num, len(image_urls)), quiet) + my_print( + "\n== {0} out of {1} crawled images urls will be used.\n".format( + output_num, len(image_urls) + ), + quiet, + ) return image_urls[0:output_num] From 2eca32d2bf6e389a70511efd94ac9fdaef767660 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Mon, 25 Mar 2024 01:26:56 -0400 Subject: [PATCH 19/35] fix the broken crap (#14) Co-authored-by: Albert Tung --- .gitignore | 3 ++- crawler.py | 18 +++++++----------- image_downloader.py | 8 ++++---- requirements.txt | 2 +- utils.py | 35 +++++++++++++++-------------------- 5 files changed, 29 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index 6fc332ca..6f093a9e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ images .vscode download_images build -dist \ No newline at end of file +dist +.DS_Store diff --git a/crawler.py b/crawler.py index 2ed4cc43..ff8275c6 100644 --- a/crawler.py +++ b/crawler.py @@ -6,19 +6,17 @@ from __future__ import print_function +import json +import os import re -import time import sys -import os -import json -import shutil +import time +from concurrent import futures +from urllib.parse import quote, unquote -from urllib.parse import unquote, quote +import 
requests from selenium import webdriver -from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By -import requests -from concurrent import futures g_headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", @@ -401,7 +399,6 @@ def crawl_image_urls( if browser != "api": browser = str.lower(browser) - chrome_path = shutil.which("chromedriver") chrome_options = webdriver.ChromeOptions() if "headless" in browser: chrome_options.add_argument("headless") @@ -413,9 +410,8 @@ def crawl_image_urls( chrome_options.add_argument("--ignore-certificate-errors") # driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options) - service = Service(executable_path=chrome_path) + service = webdriver.ChromeService() driver = webdriver.Chrome(service=service, options=chrome_options) - if engine == "Google": driver.set_window_size(1920, 1080) driver.get(query_url) diff --git a/image_downloader.py b/image_downloader.py index 99cfbf6e..bb2b923a 100644 --- a/image_downloader.py +++ b/image_downloader.py @@ -9,7 +9,7 @@ import crawler import downloader -import utils + def main(argv): @@ -119,9 +119,9 @@ def main(argv): proxy_type = "socks5" proxy = args.proxy_socks5 - if not utils.resolve_dependencies(args.driver): - print("Dependencies not resolved, exit.") - return +# if not utils.resolve_dependencies(args.driver): +# print("Dependencies not resolved, exit.") +# return crawled_urls = crawler.crawl_image_urls( args.keywords, diff --git a/requirements.txt b/requirements.txt index 37bfbb0d..d594f7b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ chromedriver-autoinstaller==0.4.0 pyinstaller==5.13.1 PyQt5==5.15.9 requests==2.31.0 -selenium==4.8.3 +selenium==4.11.0 diff --git a/utils.py b/utils.py index ba4f7038..be2adbf6 100644 --- a/utils.py +++ b/utils.py @@ -2,7 +2,6 @@ # author: Yabin Zheng # Email: sczhengyabin@hotmail.com -import chromedriver_autoinstaller def gen_valid_dir_name_for_keywords(keywords): keep = ["-", "_", "."] @@ -13,7 +12,7 @@ def gen_valid_dir_name_for_keywords(keywords): class AppConfig(object): def __init__(self): self.engine = "Google" - + self.driver = "chrome_headless" self.keywords = "" @@ -33,23 +32,28 @@ def __init__(self): def to_command_paras(self): str_paras = "" - - str_paras += ' -e ' + self.engine - str_paras += ' -d ' + self.driver + str_paras += " -e " + self.engine + + str_paras += " -d " + self.driver - str_paras += ' -n ' + str(self.max_number) + str_paras += " -n " + str(self.max_number) - str_paras += ' -j ' + str(self.num_threads) + str_paras += " -j " + str(self.num_threads) - str_paras += ' -o "' + self.output_dir + '/' + \ - gen_valid_dir_name_for_keywords(self.keywords) + '"' + str_paras += ( + ' -o "' + + self.output_dir + + "/" + + gen_valid_dir_name_for_keywords(self.keywords) + + '"' + ) if self.face_only: - str_paras += ' -F ' + str_paras += " -F " if self.safe_mode: - str_paras += ' -S ' + str_paras += " -S " if self.proxy_type == "http": str_paras += ' -ph "' + self.proxy + '"' @@ -68,12 +72,3 @@ def gen_keywords_list_from_str(keywords_str, sep=","): def gen_keywords_list_from_file(filepath): with open(filepath, "r", encoding="utf-8") as f: return f.readlines() - -def resolve_dependencies(driver=str): - if "chrome" in driver: - print("Checking Google Chrome and chromedriver ...") - driver_path = chromedriver_autoinstaller.install() - if not driver_path: - return False - print("OK.") - return True From 86ee0c0ca0092138eb4c4115210c1f3bd80a75e7 Mon Sep 17 
00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:12:26 -0400 Subject: [PATCH 20/35] Tungalbert99 changes merged with master fixes (#15) * tungalbert99 fork as-is Merge in github desktop/git was not successful. So f it I'll do it brute force * force merge master Hopefully force merge master changes over tungalbert99 patch * restore chromedriver autoinstall * actually restore chromedriver autoinstall * restore missed lines * order matters --- crawler.py | 13 +++++-------- downloader.py | 24 ++++++++---------------- image_downloader.py | 14 +++++--------- utils.py | 9 +++++++++ 4 files changed, 27 insertions(+), 33 deletions(-) diff --git a/crawler.py b/crawler.py index ff8275c6..4d32c86e 100644 --- a/crawler.py +++ b/crawler.py @@ -1,5 +1,4 @@ """ Crawl image urls from image search engine. """ - # -*- coding: utf-8 -*- # author: Yabin Zheng # Email: sczhengyabin@hotmail.com @@ -21,7 +20,8 @@ g_headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Proxy-Connection": "keep-alive", - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Accept-Encoding": "gzip, deflate, sdch", # 'Connection': 'close', } @@ -130,7 +130,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): image_elements = driver.find_elements(By.CLASS_NAME, "islib") image_urls = list() - url_pattern = r"imgurl=\S*&imgrefurl" # url_pattern = r"imgurl=(.*?)&" + url_pattern = r"imgurl=\S*&imgrefurl" for image_element in image_elements[:max_number]: outer_html = image_element.get_attribute("outerHTML") @@ -365,7 +365,6 @@ def crawl_image_urls( :param browser: browser to use when crawl image urls :return: list of scraped image urls """ - # Validate engine name if engine not in ["Google", "Baidu", "Bing"]: raise Exception(f"Unknown engine name: {engine}") @@ -400,6 +399,7 @@ def crawl_image_urls( if browser != "api": browser = str.lower(browser) chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument("--ignore-certificate-errors") if "headless" in browser: chrome_options.add_argument("headless") if proxy is not None and proxy_type is not None: @@ -407,8 +407,6 @@ def crawl_image_urls( "--proxy-server={}://{}".format(proxy_type, proxy) ) - chrome_options.add_argument("--ignore-certificate-errors") - # driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options) service = webdriver.ChromeService() driver = webdriver.Chrome(service=service, options=chrome_options) @@ -424,8 +422,7 @@ def crawl_image_urls( driver.set_window_size(10000, 7500) driver.get(query_url) image_urls = baidu_image_url_from_webpage(driver) - - # driver.close() # just closes the window. quit() does much more cleanup + # driver.close() just closes the window. 
quit() does much more cleanup driver.quit() else: # api if engine == "Baidu": diff --git a/downloader.py b/downloader.py index ce1a82c2..5cd06856 100644 --- a/downloader.py +++ b/downloader.py @@ -48,22 +48,20 @@ def test_xml(h, f): imghdr.tests.append(test_xml) # imghdr checks for JFIF specifically, ignoring optional markers including metadata -def test_jpg(h, f): +def test_jpg2(h, f): if (h[:3] == "\xff\xd8\xff"): return "jpg" -imghdr.tests.append(test_jpg) +imghdr.tests.append(test_jpg2) -def download_image( - image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None -): +def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None): proxies = None if proxy_type is not None: proxies = { "http": proxy_type + "://" + proxy, - "https": proxy_type + "://" + proxy, + "https": proxy_type + "://" + proxy } file_name = unquote(file_name) @@ -73,8 +71,10 @@ def download_image( while True: try: try_times += 1 + # https://github.com/pablobots/Image-Downloader/commit/5bdbe076589459b9d0c41a563b92993cac1a892e - image_url = image_url.split('&')[0] + image_url = image_url.split('&')[0] + response = requests.get( image_url, headers=headers, timeout=timeout, proxies=proxies ) @@ -139,15 +139,7 @@ def download_image( break -def download_images( - image_urls, - dst_dir, - file_prefix="img", - concurrency=50, - timeout=20, - proxy_type=None, - proxy=None, -): +def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None): """ Download image according to given urls and automatically rename them in order. :param timeout: diff --git a/image_downloader.py b/image_downloader.py index bb2b923a..67e37057 100644 --- a/image_downloader.py +++ b/image_downloader.py @@ -9,7 +9,7 @@ import crawler import downloader - +import utils def main(argv): @@ -67,11 +67,7 @@ def main(argv): help="Turn on safe search mode. 
(Only effective in Google)", ) parser.add_argument( - "--face-only", - "-F", - action="store_true", - default=False, - help="Only search for faces (only available in Google)", + "--face-only", "-F", action="store_true", default=False, help="Only search for faces (only available in Google)" ) parser.add_argument( "--proxy_http", @@ -119,9 +115,9 @@ def main(argv): proxy_type = "socks5" proxy = args.proxy_socks5 -# if not utils.resolve_dependencies(args.driver): -# print("Dependencies not resolved, exit.") -# return + if not utils.resolve_dependencies(args.driver): + print("Dependencies not resolved, exit.") + return crawled_urls = crawler.crawl_image_urls( args.keywords, diff --git a/utils.py b/utils.py index be2adbf6..ad24904f 100644 --- a/utils.py +++ b/utils.py @@ -72,3 +72,12 @@ def gen_keywords_list_from_str(keywords_str, sep=","): def gen_keywords_list_from_file(filepath): with open(filepath, "r", encoding="utf-8") as f: return f.readlines() + +def resolve_dependencies(driver=str): + if "chrome" in driver: + print("Checking Google Chrome and chromedriver ...") + driver_path = chromedriver_autoinstaller.install() + if not driver_path: + return False + print("OK.") + return True From e31b44b060e3508ff6283e25d1c925e320d0bfaa Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:22:37 -0400 Subject: [PATCH 21/35] all important import --- utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils.py b/utils.py index ad24904f..4c8723a8 100644 --- a/utils.py +++ b/utils.py @@ -2,6 +2,7 @@ # author: Yabin Zheng # Email: sczhengyabin@hotmail.com +import chromedriver_autoinstaller def gen_valid_dir_name_for_keywords(keywords): keep = ["-", "_", "."] From f443453078ea436b6bd0d215257283018aa6e8a3 Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Fri, 19 Apr 2024 22:08:36 -0400 Subject: [PATCH 22/35] fixes from google updates I forgot who, some fork on github, but without the formatting changes From d9906d0b8dc5a1f34bd31853525aa79909bc70ab Mon Sep 17 00:00:00 2001 From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com> Date: Sat, 20 Apr 2024 11:02:12 -0400 Subject: [PATCH 23/35] feat: Adapt to Google page updates from MuLoo/Image-Downloader --- crawler.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/crawler.py b/crawler.py index 4d32c86e..cc9fba80 100644 --- a/crawler.py +++ b/crawler.py @@ -77,7 +77,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): thumb_elements = [] while True: try: - thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i") + # old way to get thumb_elements + # thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i") + # Adapt to the updated Google image search page + thumb_elements = driver.find_elements(By.CSS_SELECTOR, ".H8Rx8c > g-img > img") my_print("Find {} images.".format(len(thumb_elements)), quiet) if len(thumb_elements) >= max_number: break @@ -85,7 +88,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): break thumb_elements_old = thumb_elements driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - time.sleep(5) + time.sleep(10) show_more = driver.find_elements(By.CLASS_NAME, "mye4qd") if ( len(show_more) == 1 @@ -128,7 +131,9 @@ def google_image_url_from_webpage(driver, max_number, quiet=False): except Exception as e: print("Error while retrying click:", e) - image_elements = 
From d9906d0b8dc5a1f34bd31853525aa79909bc70ab Mon Sep 17 00:00:00 2001
From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com>
Date: Sat, 20 Apr 2024 11:02:12 -0400
Subject: [PATCH 23/35] feat: Adapt to Google page updates from
 MuLoo/Image-Downloader

---
 crawler.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/crawler.py b/crawler.py
index 4d32c86e..cc9fba80 100644
--- a/crawler.py
+++ b/crawler.py
@@ -77,7 +77,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
     thumb_elements = []
     while True:
         try:
-            thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
+            # old way to get thumb_elements
+            # thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
+            # Adapt to the updated Google image search page
+            thumb_elements = driver.find_elements(By.CSS_SELECTOR, ".H8Rx8c > g-img > img")
             my_print("Find {} images.".format(len(thumb_elements)), quiet)
             if len(thumb_elements) >= max_number:
                 break
@@ -85,7 +88,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
             break
         thumb_elements_old = thumb_elements
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-        time.sleep(5)
+        time.sleep(10)
         show_more = driver.find_elements(By.CLASS_NAME, "mye4qd")
         if (
             len(show_more) == 1
@@ -128,7 +131,9 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
         except Exception as e:
             print("Error while retrying click:", e)

-    image_elements = driver.find_elements(By.CLASS_NAME, "islib")
+
+    # image_elements = driver.find_elements(By.CLASS_NAME, "islib")
+    image_elements = driver.find_elements(By.CSS_SELECTOR, ".ob5Hkd > a")
     image_urls = list()
     url_pattern = r"imgurl=\S*&imgrefurl"

From d759f8a44656c915297f25470eebda10869af7f9 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 7 Jul 2024 00:01:41 -0400
Subject: [PATCH 24/35] Bump requests from 2.31.0 to 2.32.0 (#16)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d594f7b4..d3db9b64 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 chromedriver-autoinstaller==0.4.0
 pyinstaller==5.13.1
 PyQt5==5.15.9
-requests==2.31.0
+requests==2.32.0
 selenium==4.11.0
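The selector churn in PATCH 23 is the core maintenance burden of scraping Google Images: class names rotate, and each rotation breaks the crawler. A standalone sketch of the pattern, with the caveat that both CSS selectors are point-in-time values taken from these patches and will rot again (editor's illustration; --headless=new assumes a recent Chrome):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://www.google.com/search?tbm=isch&q=cats")
    # Thumbnail images and result anchors, per the selectors in PATCH 23;
    # the anchors' outerHTML carries the imgurl=... target used later.
    thumbs = driver.find_elements(By.CSS_SELECTOR, ".H8Rx8c > g-img > img")
    anchors = driver.find_elements(By.CSS_SELECTOR, ".ob5Hkd > a")
    print(len(thumbs), "thumbnails,", len(anchors), "result anchors")
finally:
    driver.quit()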
From cc1e5a6179d5fb21d887077aa7d866b5d0c28d3c Mon Sep 17 00:00:00 2001
From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com>
Date: Sat, 14 Sep 2024 22:38:36 -0400
Subject: [PATCH 25/35] fix jpeg tests

Identify jpg/jpeg files better, not just JFIF.

---
 downloader.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/downloader.py b/downloader.py
index 5cd06856..c7993853 100644
--- a/downloader.py
+++ b/downloader.py
@@ -56,6 +56,19 @@ def test_jpg2(h, f):
 imghdr.tests.append(test_jpg2)

+
+# https://stackoverflow.com/questions/8032642/how-can-i-obtain-the-image-size-using-a-standard-python-class-without-using-an
+def test_jpeg2(h, f):
+    # SOI APP2 + ICC_PROFILE
+    if h[0:4] == '\xff\xd8\xff\xe2' and h[6:17] == b'ICC_PROFILE':
+        return 'jpeg'
+    # SOI APP14 + Adobe
+    if h[0:4] == '\xff\xd8\xff\xee' and h[6:11] == b'Adobe':
+        return 'jpeg'
+    # SOI DQT
+    if h[0:4] == '\xff\xd8\xff\xdb':
+        return 'jpeg'
+imghdr.tests.append(test_jpeg2)
+
 def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
     proxies = None
     if proxy_type is not None:
@@ -109,9 +122,9 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
             print("## Err: TYPE({}) {}".format(file_type, image_url))
             return False
         elif file_type in ["jpg", "jpeg", "png", "bmp", "webp", 'gif']:
-            if len(file_name) >= 200:
+            if len(file_name) >= 150:
                 print("Truncating: {}".format(file_name))
-                file_name = file_name[:200]
+                file_name = file_name[:150]

             if file_name.endswith("." + file_type):
                 new_file_name = file_name
@@ -181,7 +194,7 @@ def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, time
             )
         )
         count += 1
-    concurrent.futures.wait(future_list, timeout=180)
+    concurrent.futures.wait(future_list, timeout=90)

     # Count the number of successful downloads
     for future in future_list:
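The same bytes-versus-str pitfall flagged earlier applies to test_jpeg2 above: h[0:4] is bytes, while '\xff\xd8\xff\xe2' is a str, so those branches never fire as written in the patch. A sketch of the three signatures with bytes literals throughout (editor's illustration, not the patch's code):

def sniff_jpeg(header: bytes):
    if header[0:4] == b'\xff\xd8\xff\xe2' and header[6:17] == b'ICC_PROFILE':
        return 'jpeg'  # SOI + APP2 carrying an embedded ICC profile
    if header[0:4] == b'\xff\xd8\xff\xee' and header[6:11] == b'Adobe':
        return 'jpeg'  # SOI + APP14 written by Adobe tools
    if header[0:4] == b'\xff\xd8\xff\xdb':
        return 'jpeg'  # SOI followed directly by a quantization table
    return None

assert sniff_jpeg(b'\xff\xd8\xff\xdb' + b'\x00' * 28) == 'jpeg'
assert sniff_jpeg(b'\x89PNG\r\n\x1a\n' + b'\x00' * 24) is None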
From e5246007a1997ae20c61b9c457e19dace3acf6df Mon Sep 17 00:00:00 2001
From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com>
Date: Sun, 22 Sep 2024 17:56:10 -0400
Subject: [PATCH 26/35] fix filename logic

Fix filename logic; add VS project.

---
 Image-Downloader-master.pyproj | 11835 +++++++++++++++++++++++++++++++
 downloader.py                  |    92 +-
 requirements.txt               |     4 +-
 3 files changed, 11887 insertions(+), 44 deletions(-)
 create mode 100644 Image-Downloader-master.pyproj

diff --git a/Image-Downloader-master.pyproj b/Image-Downloader-master.pyproj
new file mode 100644
index 00000000..7fc6d0e7
--- /dev/null
+++ b/Image-Downloader-master.pyproj
@@ -0,0 +1,11835 @@
+  Debug
+  2.0
+  {df3acfb9-3979-40d6-aaf4-912a01ecb210}
+  image_downloader_gui.py
+  .
+  .
+  {888888a0-9f3d-457c-b088-3a5042f75d52}
+  Standard Python launcher
+  MSBuild|envImageDownloaderEnv|$(MSBuildProjectFullPath)
+  10.0
[... roughly 11,800 added lines of Visual Studio .pyproj XML omitted: the markup did not survive extraction, and only the element values above and the interpreter settings below remain ...]
+  envImageDownloaderEnv
+  3.10
+  envImageDownloaderEnv (Python 3.10 (64-bit))
+  Scripts\python.exe
+  Scripts\pythonw.exe
+  PYTHONPATH
+  X64
+
+
+
\ No newline at end of file
diff --git a/downloader.py b/downloader.py
index c7993853..1c610fcc 100644
--- a/downloader.py
+++ b/downloader.py
@@ -5,6 +5,7 @@
 from __future__ import print_function

 from urllib.parse import unquote
+from pathlib import Path

 import shutil
 import imghdr
@@ -101,55 +102,62 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
                 print("## Err: STATUS CODE({}) {}".format(response.status_code, image_url))
                 return False

-            with open(file_path, "wb") as f:
-                f.write(response.content)
-            response.close()
-
-            file_type = imghdr.what(file_path)
-
-            if file_name.endswith(".jpeg"):
-                file_name = file_name.replace(".jpeg", ".jpg")
-
-            if file_type == "jpeg":
-                file_type = "jpg"
-
-            if file_type is None:
-                # os.remove(file_path)
-                print("## Err: TYPE({}) {}".format(file_type, file_name))
-                return False
-            elif file_type == "html" or file_type == "xml":
-                os.remove(file_path)
-                print("## Err: TYPE({}) {}".format(file_type, image_url))
-                return False
-            elif file_type in ["jpg", "jpeg", "png", "bmp", "webp", 'gif']:
-                if len(file_name) >= 150:
-                    print("Truncating: {}".format(file_name))
-                    file_name = file_name[:150]
-
-                if file_name.endswith("." + file_type):
-                    new_file_name = file_name
-                else:
-                    new_file_name = "{}.{}".format(file_name, file_type)
-
-                new_file_path = os.path.join(dst_dir, new_file_name)
-                shutil.move(file_path, new_file_path)
-                print("## OK: {} {}".format(new_file_name, image_url))
-                return True
-            else:
-                # os.remove(file_path)
-                print("## Err: TYPE({}) {}".format(file_type, image_url))
-                return False
-            break
+            file_name = get_filename(file_name, response.content)
+            file_path = os.path.join(dst_dir, file_name)
+
+            file_attempts = 5
+            while file_attempts > 0:
+                try:
+                    with open(file_path, "wb") as f:
+                        f.write(response.content)
+                    response.close()
+                    break
+                except Exception as e:
+                    file_attempts -= 1
+                    file_name = file_name = Path(file_name).stem + "_" + Path(file_name).suffix
         except Exception as e:
             if try_times < 3:
-                file_name = file_name + "a"
                 continue
             if response:
                 response.close()
             print("## Fail: {} {}".format(image_url, e.args))
-            return False
-            break
+            break
+
+
+def get_filename(file_name, content):
+
+    #TODO: use python-magic
+    if file_name.endswith(".jpeg"):
+        file_name = file_name.replace(".jpeg", ".jpg")
+
+    file_type = imghdr.what('', content)
+
+    if file_type == "jpeg":
+        file_type = "jpg"
+
+    if file_type is None:
+        # os.remove(file_path)
+        print("## Err: TYPE({}) {}".format(file_type, file_name))
+        return file_name
+    elif file_type == "html" or file_type == "xml":
+        # os.remove(file_path)
+        print("## Err: TYPE({}) {}".format(file_type, file_name))
+        return file_name
+    elif file_type in ["jpg", "jpeg", "png", "bmp", "webp", 'gif']:
+        if file_name.endswith("." + file_type):
+            new_file_name = file_name
+            print("## OK: {}".format(new_file_name))
+        else:
+            file_name = Path(file_name).stem
+            new_file_name = "{}.{}".format(file_name, file_type)
+            print("## OK: {} => {}".format(file_name, new_file_name))
+        return new_file_name
+    else:
+        # os.remove(file_path)
+        print("## Err: TYPE({}) {}".format(file_type, file_name))
+        return file_name


 def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None):
diff --git a/requirements.txt b/requirements.txt
index d3db9b64..358d2e45 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-chromedriver-autoinstaller==0.4.0
+chromedriver-autoinstaller==0.6.2
 pyinstaller==5.13.1
 PyQt5==5.15.9
-requests==2.32.0
+requests==2.31.0
 selenium==4.11.0
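The get_filename refactor above boils down to: sniff the real type from the downloaded bytes, then force the extension to agree. A compact sketch of that idea (editor's illustration, not the patch's code; it uses imghdr to match the project, although imghdr is deprecated since Python 3.11 and removed in 3.13, which is what the #TODO about python-magic anticipates):

from pathlib import Path
import imghdr

def normalized_name(file_name: str, content: bytes) -> str:
    file_type = imghdr.what(None, h=content)  # sniff from bytes, not the name
    if file_type == "jpeg":
        file_type = "jpg"
    if file_type not in {"jpg", "png", "bmp", "webp", "gif"}:
        return file_name  # unknown type: keep the name, let the caller decide
    stem = Path(file_name).stem
    return "{}.{}".format(stem, file_type)

# A real JFIF header: SOI + APP0 with the "JFIF" tag at offset 6.
print(normalized_name("cat.jpeg", b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01"))  # -> cat.jpg

Note also that the replacement-name line in the patch, file_name = file_name = Path(file_name).stem + "_" + Path(file_name).suffix, carries a doubled assignment; PATCH 27 below rewrites that retry logic anyway.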
From 848cd374b021c0b6ff6bdf1593917c414eef3b11 Mon Sep 17 00:00:00 2001
From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com>
Date: Sun, 22 Sep 2024 18:55:02 -0400
Subject: [PATCH 27/35] fix file naming more

---
 downloader.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/downloader.py b/downloader.py
index 1c610fcc..6a9b06bb 100644
--- a/downloader.py
+++ b/downloader.py
@@ -104,17 +104,24 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):

             file_name = get_filename(file_name, response.content)
             file_path = os.path.join(dst_dir, file_name)
+            base_file_path = file_path

-            file_attempts = 5
-            while file_attempts > 0:
+            file_attempts = 0
+            while file_attempts < 50:
                 try:
-                    with open(file_path, "wb") as f:
+                    # open for exclusive creation, failing if the file already exists
+                    with open(file_path, "xb") as f:
                         f.write(response.content)
                     response.close()
                     break
+                except FileExistsError:
+                    file_attempts += 1
+                    file_name = "{}_{}{}".format(Path(base_file_path).stem, file_attempts, Path(base_file_path).suffix)
+                    file_path = os.path.join(dst_dir, file_name)
                 except Exception as e:
-                    file_attempts -= 1
-                    file_name = file_name = Path(file_name).stem + "_" + Path(file_name).suffix
+                    file_attempts += 1
+                    file_name = "unknown" + Path(file_name).suffix
+                    file_path = os.path.join(dst_dir, file_name)
         except Exception as e:
             if try_times < 3:
@@ -127,6 +134,11 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
 def get_filename(file_name, content):

     #TODO: use python-magic
+
+    # just in case
+    if "/" in file_name:
+        file_name = split_string(file_name, "/", -1)
+
     if file_name.endswith(".jpeg"):
         file_name = file_name.replace(".jpeg", ".jpg")

From 641f7ac98f4d714821ef98a705f4eaf26a9cfee0 Mon Sep 17 00:00:00 2001
From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com>
Date: Sun, 22 Sep 2024 21:29:47 -0400
Subject: [PATCH 28/35] more fixing

---
 downloader.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/downloader.py b/downloader.py
index 6a9b06bb..68a6a484 100644
--- a/downloader.py
+++ b/downloader.py
@@ -24,6 +24,20 @@
 }

 # additional checks for imghdr.what()
+# default tests:
+# test_bmp
+# test_exr
+# test_gif
+# test_jpeg
+# test_pbm
+# test_pgm
+# test_png
+# test_ppm
+# test_rast
+# test_rgb
+# test_tiff
+# test_webp
+# test_xbm
 def test_html(h, f):
     if b"
[... the remainder of this hunk was lost in extraction ...]
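The switch to mode "xb" in PATCH 27 is the standard race-free way to claim a filename: open() raises FileExistsError instead of silently truncating an existing file, so two concurrent downloads can never overwrite each other. A standalone sketch of the same pattern (editor's illustration; the names are hypothetical, not the patch's):

import os
from pathlib import Path

def claim_path(dst_dir: str, file_name: str, content: bytes, attempts: int = 50) -> str:
    base = Path(file_name)
    for i in range(attempts):
        candidate = file_name if i == 0 else "{}_{}{}".format(base.stem, i, base.suffix)
        path = os.path.join(dst_dir, candidate)
        try:
            with open(path, "xb") as f:  # exclusive creation
                f.write(content)
            return path
        except FileExistsError:
            continue  # name taken; try the next numbered variant
    raise RuntimeError("no free filename after {} attempts".format(attempts))

print(claim_path(".", "example.jpg", b"\xff\xd8\xff"))

One caveat visible in the diff itself: the "just in case" branch calls split_string(file_name, "/", -1), which is not defined anywhere in these patches; file_name.split("/")[-1] is presumably what was meant.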
[... the mail header of the next patch was partially lost in extraction ...]
Date: Mon, 23 Sep 2024 02:08:06 -0400
Subject: [PATCH 29/35] fix jpg again

---
 downloader.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/downloader.py b/downloader.py
index 68a6a484..129042b5 100644
--- a/downloader.py
+++ b/downloader.py
@@ -6,6 +6,7 @@
 from __future__ import print_function

 from urllib.parse import unquote
 from pathlib import Path
+from hashlib import sha256

 import shutil
 import imghdr
@@ -65,7 +66,7 @@ def test_xml(h, f):

 # imghdr checks for JFIF specifically, ignoring optional markers including metadata
 def test_jpg(h, f):
-    if (h[:3] == "\xff\xd8\xff"):
+    if h.startswith(b"\xff\xd8\xff"):
         return "jpg"
     return None

@@ -123,6 +124,8 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
             if len(response.content) < 1:
                 break;

+            hash_string = sha256(response.content).hexdigest()
+
             file_name = get_filename(file_name, response.content)
             file_path = os.path.join(dst_dir, file_name)
             base_file_path = file_path

From 547f84c8eaba03062c64f93ab5364e01f768a4b2 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 1 Oct 2024 21:40:25 -0400
Subject: [PATCH 30/35] Bump requests from 2.31.0 to 2.32.2 (#17)

Bumps [requests](https://github.com/psf/requests) from 2.31.0 to 2.32.2.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.31.0...v2.32.2)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 358d2e45..3f2e15f0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 chromedriver-autoinstaller==0.6.2
 pyinstaller==5.13.1
 PyQt5==5.15.9
-requests==2.31.0
+requests==2.32.2
 selenium==4.11.0
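hash_string is computed in PATCH 29 but never used in the visible hunk; a content hash like this is the usual hook for skipping byte-identical images that arrive under different URLs or names. A sketch of that use (editor's illustration; the seen_hashes bookkeeping is this sketch's own, not the patch's):

from hashlib import sha256

seen_hashes = set()

def is_duplicate(content: bytes) -> bool:
    digest = sha256(content).hexdigest()
    if digest in seen_hashes:
        return True
    seen_hashes.add(digest)
    return False

assert not is_duplicate(b"\xff\xd8\xff first image")
assert is_duplicate(b"\xff\xd8\xff first image")  # same bytes, second time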
         res.encoding = "utf-8"
         image_urls_batch = re.findall('"murl":"(.*?)"', res.text)
-        if len(image_urls) > 0 and image_urls_batch[-1] == image_urls[-1]:
+        if len(image_urls) > 0 and len(image_urls_batch) > 0 and image_urls_batch[-1] == image_urls[-1]:
             break
         image_urls += image_urls_batch
         start += len(image_urls_batch)
@@ -406,7 +414,8 @@ def crawl_image_urls(
         chrome_options = webdriver.ChromeOptions()
         chrome_options.add_argument("--ignore-certificate-errors")
         if "headless" in browser:
-            chrome_options.add_argument("headless")
+            chrome_options.add_argument("--headless=old")  # headless for v < 129
+            # https://chromium-review.googlesource.com/c/chromium/src/+/5789117
         if proxy is not None and proxy_type is not None:
             chrome_options.add_argument(
                 "--proxy-server={}://{}".format(proxy_type, proxy)
             )
diff --git a/downloader.py b/downloader.py
index 129042b5..81c7d572 100644
--- a/downloader.py
+++ b/downloader.py
@@ -132,18 +132,16 @@ def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):

             file_attempts = 0
             while file_attempts < 50:
+                file_attempts += 1
                 try:
                     # open for exclusive creation, failing if the file already exists
                     with open(file_path, "xb") as f:
                         f.write(response.content)
-                    response.close()
                     break
                 except FileExistsError:
-                    file_attempts += 1
                     file_name = "{}_{}{}".format(Path(base_file_path).stem, file_attempts, Path(base_file_path).suffix)
                     file_path = os.path.join(dst_dir, file_name)
                 except Exception as e:
-                    file_attempts += 1
                     file_name = "unknown" + Path(file_name).suffix
                     file_path = os.path.join(dst_dir, file_name)

From fcb21c4a0999b70ccdae0299f4a336930a027e28 Mon Sep 17 00:00:00 2001
From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com>
Date: Sun, 13 Oct 2024 21:20:01 -0400
Subject: [PATCH 32/35] return everything

---
 crawler.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crawler.py b/crawler.py
index 1c0e1535..7e23eee2 100644
--- a/crawler.py
+++ b/crawler.py
@@ -470,4 +470,5 @@ def crawl_image_urls(
         quiet,
     )

-    return image_urls[0:output_num]
+#   return image_urls[0:output_num]
+    return image_urls

From 81d69ecc866d634e6b11c84d4d9e759d14edecf7 Mon Sep 17 00:00:00 2001
From: Patty-OFurniture <127074553+Patty-OFurniture@users.noreply.github.com>
Date: Tue, 19 Nov 2024 18:30:02 -0500
Subject: [PATCH 33/35] Update README.md

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index a027e67a..c7cb471c 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,7 @@
+Commit 848cd37 uses exclusive mode, which fails if a file already exists. That seems like the best way to check without a race condition, and it avoids overwriting any existing file.
+
+The remainder of this readme is the original.
+
 # Image Downloader

 [![996.icu](https://img.shields.io/badge/link-996.icu-red.svg)](https://996.icu)
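For reference, the "murl" pattern in bing_get_image_url_using_api above pulls full-resolution image URLs out of the JSON records that Bing embeds in its results page. A standalone sketch of that extraction (editor's illustration; the sample payload is fabricated):

import re

sample = '{"murl":"https://example.com/a.jpg","turl":"..."},{"murl":"https://example.com/b.png"}'
batch = re.findall(r'"murl":"(.*?)"', sample)
print(batch)  # ['https://example.com/a.jpg', 'https://example.com/b.png']

The added len(image_urls_batch) > 0 guard in the patch matters for the same reason: on a page with no matches, findall returns an empty list and the old image_urls_batch[-1] lookup would raise IndexError.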
From fdc43e5d7583f4cce1ce78c3dade0f1e71e44b0f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 30 Nov 2025 01:16:13 -0500
Subject: [PATCH 34/35] Bump pyinstaller from 5.13.1 to 6.0.0 (#21)

Bumps [pyinstaller](https://github.com/pyinstaller/pyinstaller) from 5.13.1 to 6.0.0.
- [Release notes](https://github.com/pyinstaller/pyinstaller/releases)
- [Changelog](https://github.com/pyinstaller/pyinstaller/blob/develop/doc/CHANGES.rst)
- [Commits](https://github.com/pyinstaller/pyinstaller/compare/v5.13.1...v6.0.0)

---
updated-dependencies:
- dependency-name: pyinstaller
  dependency-version: 6.0.0
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 3f2e15f0..a962c34a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 chromedriver-autoinstaller==0.6.2
-pyinstaller==5.13.1
+pyinstaller==6.0.0
 PyQt5==5.15.9
 requests==2.32.2
 selenium==4.11.0

From f40f26922af451e35d4e835c8e296aba31a847ff Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 30 Nov 2025 01:16:50 -0500
Subject: [PATCH 35/35] Bump requests from 2.32.2 to 2.32.4 (#20)

Bumps [requests](https://github.com/psf/requests) from 2.32.2 to 2.32.4.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.32.2...v2.32.4)

---
updated-dependencies:
- dependency-name: requests
  dependency-version: 2.32.4
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a962c34a..4be9cf42 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 chromedriver-autoinstaller==0.6.2
 pyinstaller==6.0.0
 PyQt5==5.15.9
-requests==2.32.2
+requests==2.32.4
 selenium==4.11.0