From 8aa41b8e5a83aa666ffd9f736883c9f73c14cad1 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 30 Aug 2024 03:46:20 +0900 Subject: [PATCH 01/31] fix : dockeroperator bug by adding proxy --- .../dags/viral/twitter_crawler.py | 9 ++++++--- docker-compose.yaml | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/brickstudy_ingestion/dags/viral/twitter_crawler.py b/brickstudy_ingestion/dags/viral/twitter_crawler.py index 90970f9..8efd411 100644 --- a/brickstudy_ingestion/dags/viral/twitter_crawler.py +++ b/brickstudy_ingestion/dags/viral/twitter_crawler.py @@ -1,4 +1,5 @@ from datetime import datetime +import os from airflow import DAG from airflow.models import Variable @@ -20,11 +21,11 @@ } # ========================================= - OUTPUT_FILENAME = "test.csv" SEARCH_KEYWORD = "enhypen" LIMIT = 10 TOKEN = Variable.get("TWITTER_CRAWLER_AUTH_TOKEN_PASSWORD") +HOST_BASE_PATH = '/Users/seoyeongkim/Documents/ETL' with DAG( dag_id=DAG_ID, @@ -36,15 +37,17 @@ task_id='t_docker', image='brickstudy/twitter_crawler:latest', container_name='twitter_crawler', + api_version='1.37', auto_remove=True, mount_tmp_dir=False, mounts=[ - Mount(source="/opt/airflow/logs/tweets-data", target="/app/tweets-data", type="bind"), + Mount(source=f"{HOST_BASE_PATH}/logs", target="/app/tweets-data", type="bind"), ], command=[ + "bash", "-c", f"npx --yes tweet-harvest@latest -o {OUTPUT_FILENAME} -s {SEARCH_KEYWORD} -l {LIMIT} --token {TOKEN}" ], - docker_url='unix://var/run/docker.sock', + docker_url='tcp://docker-socket-proxy:2375', network_mode='bridge', ) diff --git a/docker-compose.yaml b/docker-compose.yaml index 0912597..8b4abb5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -69,6 +69,7 @@ x-airflow-common: # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server # yamllint enable rule:line-length AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' + AIRFLOW__CORE__ENABLE_XCOM_PICKLING: 'true' # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks # for other purpose (development, test and especially production usage) build/extend Airflow image. 
_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-apache-airflow[postgres,virtualenv,apache-airflow-providers-mysql]} @@ -79,7 +80,8 @@ x-airflow-common: - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins - user: "${AIRFLOW_UID:-50000}:0" + # user: "${AIRFLOW_UID:-50000}:0" + user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" depends_on: &airflow-common-depends-on redis: @@ -343,6 +345,20 @@ services: airflow-init: condition: service_completed_successfully + # Proxy container for docker socket + # Forward DockerOperator TCP connection to Host docker daemon + docker-socket-proxy: + image: tecnativa/docker-socket-proxy:0.1.1 + environment: + CONTAINERS: 1 + IMAGES: 1 + AUTH: 1 + POST: 1 + privileged: true + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro # read only + restart: always + # ======================================== # Kafka infra zookeeper: From 89233f234b09ecb30799b081a1daed252f2e46b3 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 13:44:53 +0900 Subject: [PATCH 02/31] =?UTF-8?q?feature=20:=20get=5Fsoup=20=EC=9C=A0?= =?UTF-8?q?=ED=8B=B8=EC=AA=BD=EC=9C=BC=EB=A1=9C=20=EC=9D=B4=EB=8F=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/__init__.py | 34 ---------------- .../src/scrapper/oliveyoung.py | 4 +- brickstudy_ingestion/src/scrapper/utils.py | 39 +++++++++++++++++++ 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/__init__.py b/brickstudy_ingestion/src/scrapper/__init__.py index 673266f..fd40910 100644 --- a/brickstudy_ingestion/src/scrapper/__init__.py +++ b/brickstudy_ingestion/src/scrapper/__init__.py @@ -1,38 +1,4 @@ -import urllib -from urllib.request import urlopen -from urllib.error import HTTPError, URLError -from bs4 import BeautifulSoup -import random -import time -from src.common.exception import ExtractError -def get_soup(url: str = None): - user_agent_lst = ['Googlebot', 'Yeti', 'Daumoa', 'Twitterbot'] - user_agent = user_agent_lst[random.randint(0, len(user_agent_lst) - 1)] - headers = {'User-Agent': user_agent} - try: - req = urllib.request.Request(url, headers=headers) - page = urlopen(req) - html = page.read().decode("utf-8") - soup = BeautifulSoup(html, "html.parser") - except (HTTPError, URLError) as e: - err = ExtractError( - code=000, - message=f"**{url}** HTTPError/URLError. Sleep 5 and continue.", - log=e - ) - time.sleep(5) # TODO 이 경우 해당 url에 대해 재실행 필요 - except (ValueError) as e: - err = ExtractError( - code=000, - message=f"**{url}** ValueError. Ignore this url parameter.", - log=e - ) - print(err) - soup = None # TODO 해당 url 무시 - else: - time.sleep(random.random()) - return soup diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 5dd94bd..f912a12 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -1,7 +1,7 @@ from collections import defaultdict from datetime import datetime -from . 
import get_soup +from src.scrapper.utils import get_soup from src.scrapper.models import brand_generator @@ -101,7 +101,7 @@ def _get_items(self) -> None: """ for brand in self.brand_metadata.keys(): brand_url = self.brand_metadata[brand].brand_shop_detail_url - brand_url_soup = get_soup(brand_url) + brand_url_soup = get_soup(brand_url)==----------0-0 if brand_url_soup is None: continue item_dic = {} diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index ca9d141..5d151f4 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -1,3 +1,42 @@ +def get_soup(url: str = None): + import urllib + from urllib.request import urlopen + from urllib.error import HTTPError, URLError + from bs4 import BeautifulSoup + import random + import time + + from src.common.exception import ExtractError + + user_agent_lst = ['Googlebot', 'Yeti', 'Daumoa', 'Twitterbot'] + user_agent = user_agent_lst[random.randint(0, len(user_agent_lst) - 1)] + headers = {'User-Agent': user_agent} + + try: + req = urllib.request.Request(url, headers=headers) + page = urlopen(req) + html = page.read().decode("utf-8") + soup = BeautifulSoup(html, "html.parser") + except (HTTPError, URLError) as e: + err = ExtractError( + code=000, + message=f"**{url}** HTTPError/URLError. Sleep 5 and continue.", + log=e + ) + time.sleep(5) # TODO 이 경우 해당 url에 대해 재실행 필요 + except (ValueError) as e: + err = ExtractError( + code=000, + message=f"**{url}** ValueError. Ignore this url parameter.", + log=e + ) + print(err) + soup = None # TODO 해당 url 무시 + else: + time.sleep(random.random()) + return soup + + def dict_partitioner(data: dict, level: int): total_n = len(data) partition_n = total_n // level From fb0f20cdb6cfe60f892ee15fab58ae21fd31d2b6 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 13:45:13 +0900 Subject: [PATCH 03/31] fix : readme --- brickstudy_ingestion/src/scrapper/readme.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/readme.md b/brickstudy_ingestion/src/scrapper/readme.md index 4448800..65d9d98 100644 --- a/brickstudy_ingestion/src/scrapper/readme.md +++ b/brickstudy_ingestion/src/scrapper/readme.md @@ -2,11 +2,13 @@ ``` brickstudy_ingestion/src/scrapper -├── browser.py # selenium으로 크롤링에 필요한 configs, utils 정의 모듈 -├── inscrawler.py # instagram crawler main 모듈 -├── models.py -├── oliveyoung.py # oliveyoung scrapper main 모듈 -└── utils.py # 공통 메소드 +├── __init__.py # scrapper 모듈 entrypoint +├── Dockerfile # [Twitter] twitter crawler 동작 환경 +├── browser.py # [Instagram] selenium으로 크롤링하는데에 사용되는 로직 정의 모듈 +├── inscrawler.py # [Instagram]instagram crawler main 모듈 +├── models.py # [Oliveyoung] 올리브영 브랜드 수집 데이터 구조 +├── oliveyoung.py # [Oliveyoung] scrapper main 모듈 +└── utils.py # 공통 유틸리티 메소드 ``` From b763d17210a688a50209150b326cbbb8e1596d1e Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 16:02:47 +0900 Subject: [PATCH 04/31] fix : bug- typo --- brickstudy_ingestion/src/scrapper/oliveyoung.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index f912a12..1606637 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -101,7 +101,7 @@ def _get_items(self) -> None: """ for brand in self.brand_metadata.keys(): brand_url = self.brand_metadata[brand].brand_shop_detail_url - 
brand_url_soup = get_soup(brand_url)==----------0-0 + brand_url_soup = get_soup(brand_url) if brand_url_soup is None: continue item_dic = {} From aba10541cf83b2cba5036c211f8ce3c5621fc2ea Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 17:48:11 +0900 Subject: [PATCH 05/31] =?UTF-8?q?ADD=20:=20=EC=9D=B8=EC=8A=A4=ED=83=80?= =?UTF-8?q?=EA=B7=B8=EB=9E=A8=20url=20=EC=88=98=EC=A7=91=20=EB=AA=A8?= =?UTF-8?q?=EB=93=88=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_url.py | 66 +++++++++++++++++ .../src/scrapper/inscrawler.py | 71 +++++++++---------- brickstudy_ingestion/src/scrapper/models.py | 17 +++++ 3 files changed, 117 insertions(+), 37 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/ins_url.py diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py new file mode 100644 index 0000000..2181631 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -0,0 +1,66 @@ +import time +from bs4 import BeautifulSoup +from selenium.webdriver.common.by import By +import urllib +import re + +from src.scrapper.inscrawler import InsCrawler + + +class InsURLCrawler(InsCrawler): + def __init__(self): + super().__init__() + + def get_urls(self, keyword: str = None): + if keyword is not None: # execute with given keyword + self._fetch_url_data(keyword) + else: # execute with entire keywords + for keyword in self.keywords: + self._fetch_url_data(keyword) + + def _fetch_url_data(self, keyword): + word = urllib.parse.quote(keyword) + word_url = f'https://www.instagram.com/explore/tags/{word}/' + self.driver.get(word_url) + + try: + time.sleep(5) + js = 'window.scrollBy(0,1000)' + self.driver.execute_script(js) + html = self.driver.page_source + soup = BeautifulSoup(html, 'lxml') + + divimg = soup.find_all('img', {'class': 'x5yr21d xu96u03 x10l6tqk x13vifvy x87ps6o xh8yej3'}) + if not divimg: + print('이미지를 찾을 수 없습니다.') + raise Exception + + for div in divimg: + content = div.get('alt') + if not content: + print('내용이 없습니다.') + continue + + a = div.find_parent('a') + if a is None: + print('게시물 링크가 잘못되었습니다.') + continue + urlto = a.get('href') + if urlto is None: + print('게시물 링크가 없습니다.') + continue + totalurl = 'https://www.instagram.com' + urlto + self.data[urlto].brand = keyword + self.data[urlto].post_url = totalurl + + modified_content = re.sub(r'\s*\n\s*', ' ', content) + self.data[urlto].full_text = modified_content + + print(f'페이지 {keyword}에서 데이터를 가져오는 중...') + time.sleep(5) + + except Exception as e: + print(e) + print('오류 발생') + + print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 25de1e0..6ffaee4 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,43 +1,40 @@ import os - -from src.common.exception import RetryException - -from browser import Browser -from utils import retry - - -class InsCrawler(): - URL = "https://www.instagram.com" - RETRY_LIMIT = 10 - - def __init__(self, has_screen=False): - super(InsCrawler, self).__init__() - self.browser = Browser(has_screen) - self.page_height = 0 +import time +from selenium import webdriver +from selenium.webdriver.common.by import By +import json +from collections import defaultdict + +from src.scrapper.models import inst_generator + + +class InsCrawler: + def 
__init__(self): + #TODO base_dir 받는 부분 수정 + self.base_dir = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper' + self.user_id, self.password, self.keywords, self.iter = self.load_config() + self.data = defaultdict(inst_generator) + self.driver = webdriver.Chrome() self.login() - def login(self): - browser = self.browser - url = "%s/accounts/login/" % (InsCrawler.URL) - browser.get(url) - u_input = browser.find_one('input[name="username"]') - u_input.send_keys(os.getenv('INSTAGRAM_ID')) - p_input = browser.find_one('input[name="password"]') - p_input.send_keys(os.getenv('INSTAGRAM_PWD')) - - login_btn = browser.find_one('button[type="submit"]') - login_btn.click() + def load_config(self): + with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: + config = json.load(f) - @retry() - def check_login(): - if browser.find_one('input[name="username"]'): - raise RetryException() + username = config['login']['username'] + password = config['login']['password'] + keywords = config['keywords'] + iter = config['iter'] - check_login() + return username, password, keywords, iter - def get_latest_posts_by_tag(self, tag, num): - tag = 'enhypen' - url = f"{InsCrawler.URL}/explore/search/keyword/?q=%23{tag}" - self.browser.get(url) - self.browser.scroll_down() - #TODO 게시물 클릭, 컨텐츠 가져오기 \ No newline at end of file + def login(self): + # Instagram 접속 및 로그인 + url = 'https://www.instagram.com/' + self.driver.get(url) + time.sleep(6) + user = self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input') + user.send_keys(self.user_id) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() + time.sleep(80) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/models.py b/brickstudy_ingestion/src/scrapper/models.py index 7c5954e..a8eca32 100644 --- a/brickstudy_ingestion/src/scrapper/models.py +++ b/brickstudy_ingestion/src/scrapper/models.py @@ -19,3 +19,20 @@ def brand_generator(): [], '' ) + + +@dataclass +class InstagramData: + brand: str + post_url: str + full_text: str + username: str + like: int + saved_imgs: str + date: str + + +def inst_generator(): + return InstagramData( + '', '', '', '', 0, '', '' + ) From 94e2d0dc5109905a48d89dbd9ddb780d75a29f55 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:26:44 +0900 Subject: [PATCH 06/31] fix : close driver after url method returned --- brickstudy_ingestion/src/scrapper/ins_url.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 2181631..830b1a3 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -1,6 +1,5 @@ import time from bs4 import BeautifulSoup -from selenium.webdriver.common.by import By import urllib import re @@ -63,4 +62,5 @@ def _fetch_url_data(self, keyword): print(e) print('오류 발생') - print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file + print(f'키워드 {keyword}의 URL 정보 수집 완료.') + self.driver.close() \ No newline at end of file From f299ae2384fe989b28ec65f85cca1026b8ff68ed Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:27:11 +0900 Subject: [PATCH 07/31] add : materialize method in inscrawler --- .../src/scrapper/inscrawler.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) 
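[Reviewer note, not part of the patch.] The materialize() method added in this commit dumps self.data.values() through a single csv.writer.writerow() call, so every crawled post ends up on one CSV row. For comparison, a minimal per-record sketch is shown below. It assumes the InstagramData dataclass introduced in src/scrapper/models.py; write_posts_csv and out_path are illustrative names only, not code from this series.

import csv
from dataclasses import asdict, fields

from src.scrapper.models import InstagramData


def write_posts_csv(data: dict, out_path: str) -> None:
    """Write one header row plus one CSV row per crawled InstagramData record."""
    fieldnames = [f.name for f in fields(InstagramData)]
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for record in data.values():
            writer.writerow(asdict(record))

csv.DictWriter keeps the columns tied to the dataclass fields, so the header stays consistent if InstagramData later gains or loses attributes.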
diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 6ffaee4..f563beb 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,4 +1,3 @@ -import os import time from selenium import webdriver from selenium.webdriver.common.by import By @@ -10,8 +9,9 @@ class InsCrawler: def __init__(self): - #TODO base_dir 받는 부분 수정 - self.base_dir = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper' + # TODO proj_path(실행환경의 project 절대경로) 받는 부분 수정 환경변수 설정 필요 + proj_path = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion' + self.base_path = f'{proj_path}/src/scrapper' self.user_id, self.password, self.keywords, self.iter = self.load_config() self.data = defaultdict(inst_generator) self.driver = webdriver.Chrome() @@ -37,4 +37,15 @@ def login(self): user.send_keys(self.user_id) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(80) \ No newline at end of file + time.sleep(80) + + def materialize(self): + """ + self.data to csv file + """ + from src.scrapper.utils import current_datetime_getter + import csv + + with open(f"{self.base_path}/insdata_{current_datetime_getter()}.csv", 'w') as f: + w = csv.writer(f) + w.writerow(self.data.values()) \ No newline at end of file From 9ea3206260952ac8ef91a17c9bd4a92c894c83d5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:30:24 +0900 Subject: [PATCH 08/31] add : generate uid based on current datetime --- brickstudy_ingestion/src/scrapper/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index 5d151f4..5afbd2a 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -110,4 +110,13 @@ def wrapped_f(*args, **kwargs): return wrapped_f - return wrap \ No newline at end of file + return wrap + + +def current_datetime_getter(): + import pytz + from datetime import datetime + kst = pytz.timezone('Asia/Seoul') + current_time = datetime.now(kst) + current_datetime = current_time.strftime("%Y%m%d_%H%M%S") + return current_datetime \ No newline at end of file From 639e49fc408d8f619bb757e6c1e2e276d19c39d5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:31:21 +0900 Subject: [PATCH 09/31] add : crawling post data based on url(tested) --- .gitignore | 4 +- brickstudy_ingestion/src/scrapper/ins_data.py | 138 ++++++++++++++++++ .../tests/scrapper/test_instagram.py | 13 ++ 3 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 brickstudy_ingestion/src/scrapper/ins_data.py create mode 100644 brickstudy_ingestion/tests/scrapper/test_instagram.py diff --git a/.gitignore b/.gitignore index ace90b6..4c011e0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ log # kafka data kafka-data -aws_credentials \ No newline at end of file +aws_credentials +brickstudy_ingestion/dags/viral/tmp +brickstudy_ingestion/src/scrapper/results \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py new file mode 100644 index 0000000..2a5ecb6 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -0,0 +1,138 @@ +import time +import pandas as pd +from bs4 import BeautifulSoup +from 
selenium.webdriver.common.by import By +import requests +import re + +from src.scrapper.inscrawler import InsCrawler + + +class InsDataCrawler(InsCrawler): + def __init__(self, data): + super().__init__() + self.data = data + self.headers = { + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36' + } + + def get_post_data(self): + results = f'{self.base_path}/results/data.txt' + with open(results, 'r') as f: + post_crawled_data = {line.strip() for line in f} + + for idx, (key, val) in enumerate(self.data.items()): + + post_url = val.post_url + + if post_url in post_crawled_data: + continue + + self.driver.get(post_url) + print(idx, '. ' + post_url) + + try: + time.sleep(5) + html = self.driver.page_source + soup = BeautifulSoup(html, 'lxml') + + # 작성자 + username = soup.find('span', {'class': '_ap3a _aaco _aacw _aacx _aad7 _aade'}).text + print(username, end=' ') + self.data[key].username = username + + # 작성일자 + date = soup.find_all('time')[-1]['datetime'][:10] + print(date, end=' ') + self.data[key].date = date + + # like 개수 + try: + like = soup.find('span', {'class': 'html-span xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x1hl2dhg x16tdsg8 x1vvkbs'}).text + except Exception: + like = 'no data' # ~~외 여러 명이 좋아합니다. 같은 경우 + print(like) + self.data[key].like = like + + # 이미지 저장 + images = [] + img_urls = set() + images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) + + for i in range(len(images)): + for j in range(len(images[i])): + + if j >= 3: # 4번째부터 타 게시물의 썸네일 이미지 + break + + alt = images[i][j].get_attribute('alt') + check = re.findall(r'by .+? on', alt) # 타 게시물인지 아닌지 검사 + + if check != []: + img_urls.add(images[i][j].get_attribute('src')) + + # 이미지 끝까지 넘기면서 url 추출 + try: + while True: + time.sleep(3) + + self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 + images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) + + for i in range(len(images)): + for j in range(len(images[i])): + + if j >= 3: # 4번째부터 타 게시물의 썸네일 이미지 + break + + alt = images[i][j].get_attribute('alt') + check = re.findall(r'by .+? on', alt) # 타 게시물인지 아닌지 검사 + + if check != []: + img_urls.add(images[i][j].get_attribute('src')) + + images.clear() + + except Exception: + print('더 이상 넘길 이미지 없음') + + img_urls = list(img_urls) + print(img_urls) + images.clear() + + saved_imgs = set() + for img_url in img_urls: + # 이미지만 고려. 우선 비디오 타입은 고려하지 않음. + pattern = r'\/v\/[^\/]+\/([^\/\?]+)\.(jpg|png|webp|heic)' + match = re.search(pattern, img_url) + if match: + img_name = match.group(1) + '.' 
+ match.group(2) + else: + print('파일을 찾을 수 없거나 jpg 혹은 png, webp, heic 파일이 아님.') + continue + + if img_name not in saved_imgs: + response = requests.get(img_url, headers=self.headers, timeout=20) + + with open(f'{self.base_path}/results/images/' + img_name, 'wb') as f: + f.write(response.content) + + saved_imgs.add(img_name) + + time.sleep(.5) + + print(f"총 {len(saved_imgs)} 장의 이미지 저장") + self.data[key].saved_imgs = str(list(saved_imgs)) + + time.sleep(5) + + except Exception as e: + print(e) + print('오류 발생') + + # 수집 완료된 데이터 키값(post url unique id) 저장 + with open(results, 'a') as f: + for key in self.data.keys(): + f.write(key + '\n') + + self.driver.close() diff --git a/brickstudy_ingestion/tests/scrapper/test_instagram.py b/brickstudy_ingestion/tests/scrapper/test_instagram.py new file mode 100644 index 0000000..a1fee10 --- /dev/null +++ b/brickstudy_ingestion/tests/scrapper/test_instagram.py @@ -0,0 +1,13 @@ +from src.scrapper.ins_url import InsURLCrawler +from src.scrapper.ins_data import InsDataCrawler + + +def test_get_urls(): + keyword = '올리브영' + url_crawler = InsURLCrawler() + url_crawler.get_urls(keyword) + url_crawler.materialize() + + crawler = InsDataCrawler(url_crawler.data) + crawler.get_post_data() + crawler.materialize() \ No newline at end of file From ddeeb4570f7b804d4e210914516313cc4f6d7e64 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:32:49 +0900 Subject: [PATCH 10/31] delete : browser class --- brickstudy_ingestion/src/scrapper/browser.py | 103 ------------------- 1 file changed, 103 deletions(-) delete mode 100644 brickstudy_ingestion/src/scrapper/browser.py diff --git a/brickstudy_ingestion/src/scrapper/browser.py b/brickstudy_ingestion/src/scrapper/browser.py deleted file mode 100644 index d2bc7bf..0000000 --- a/brickstudy_ingestion/src/scrapper/browser.py +++ /dev/null @@ -1,103 +0,0 @@ -import os - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.common.exceptions import TimeoutException -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait -# from selenium.webdriver.common.keys import Keys -from fake_useragent import UserAgent - -from utils import randmized_sleep - - -class Browser: - def __init__(self, has_screen): - dir_path = os.path.dirname(os.path.realpath(__file__)) - service_args = ["--ignore-ssl-errors=true"] - chrome_options = Options() - if not has_screen: - chrome_options.add_argument("--headless") - chrome_options.add_argument("--start-maximized") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("user-agent=" + UserAgent().random) - self.driver = webdriver.Chrome( - executable_path=f"{dir_path}/bin/chromedriver", - service_args=service_args, - chrome_options=chrome_options, - ) - self.driver.implicitly_wait(5) - - @property - def page_height(self): - return self.driver.execute_script("return document.body.scrollHeight") - - def get(self, url): - self.driver.get(url) - - @property - def current_url(self): - return self.driver.current_url - - def implicitly_wait(self, t): - self.driver.implicitly_wait(t) - - def find_one(self, css_selector, elem=None, waittime=0): - obj = elem or self.driver - - if waittime: - WebDriverWait(obj, waittime).until( - EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)) - ) - - try: - return obj.find_element(By.CSS_SELECTOR, css_selector) - 
except NoSuchElementException: - return None - - def find(self, css_selector, elem=None, waittime=0): - obj = elem or self.driver - - try: - if waittime: - WebDriverWait(obj, waittime).until( - EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)) - ) - except TimeoutException: - return None - - try: - return obj.find_elements(By.CSS_SELECTOR, css_selector) - except NoSuchElementException: - return None - - def scroll_down(self, wait=0.3): - self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)") - randmized_sleep(wait) - - def scroll_up(self, offset=-1, wait=2): - if offset == -1: - self.driver.execute_script("window.scrollTo(0, 0)") - else: - self.driver.execute_script("window.scrollBy(0, -%s)" % offset) - randmized_sleep(wait) - - def js_click(self, elem): - self.driver.execute_script("arguments[0].click();", elem) - - def open_new_tab(self, url): - self.driver.execute_script("window.open('%s');" % url) - self.driver.switch_to.window(self.driver.window_handles[1]) - - def close_current_tab(self): - self.driver.close() - - self.driver.switch_to.window(self.driver.window_handles[0]) - - def __del__(self): - try: - self.driver.quit() - except Exception: - pass From 87248de663b6addb59bf0c656f93e8e75623ad17 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 22:56:17 +0900 Subject: [PATCH 11/31] =?UTF-8?q?fix=20:=20inscralwer=20=EC=83=9D=EC=84=B1?= =?UTF-8?q?=EC=9E=90=20dev,=20prod=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_data.py | 1 - .../src/scrapper/inscrawler.py | 33 +++++++++++-------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index 2a5ecb6..e4ec1b9 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -1,5 +1,4 @@ import time -import pandas as pd from bs4 import BeautifulSoup from selenium.webdriver.common.by import By import requests diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index f563beb..6ae11c0 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,3 +1,4 @@ +import os import time from selenium import webdriver from selenium.webdriver.common.by import By @@ -8,25 +9,29 @@ class InsCrawler: - def __init__(self): - # TODO proj_path(실행환경의 project 절대경로) 받는 부분 수정 환경변수 설정 필요 - proj_path = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion' + def __init__(self, dev: bool = False): + if dev: + proj_path = f'{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('brickstudy_ingestion')])}/brickstudy_ingestion' + else: + proj_path = '/opt/airflow/brickstudy_ingestion' self.base_path = f'{proj_path}/src/scrapper' - self.user_id, self.password, self.keywords, self.iter = self.load_config() + + self.user_id, self.password = self.load_config(dev=dev) self.data = defaultdict(inst_generator) self.driver = webdriver.Chrome() self.login() - def load_config(self): - with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: - config = json.load(f) - - username = config['login']['username'] - password = config['login']['password'] - keywords = config['keywords'] - iter = config['iter'] - - return username, password, keywords, iter + def load_config(self, dev: bool = False): + if dev: + with 
open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: + config = json.load(f) + + username = config['login']['username'] + password = config['login']['password'] + else: + username = os.environ('INSTAGRAM_ID') + password = os.environ('INSTAGRAM_PASSWORD') + return (username, password) def login(self): # Instagram 접속 및 로그인 From 0ab1ba264ecb7f24e2f76614dac66c4ebcb575ef Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 22:56:48 +0900 Subject: [PATCH 12/31] =?UTF-8?q?add=20:=20=EC=A0=95=ED=95=B4=EC=A7=84=20?= =?UTF-8?q?=EC=B9=B4=ED=85=8C=EA=B3=A0=EB=A6=AC=EC=97=90=20=ED=95=B4?= =?UTF-8?q?=EB=8B=B9=EB=90=98=EB=8A=94=20=EB=B8=8C=EB=9E=9C=EB=93=9C?= =?UTF-8?q?=EB=AA=85=EB=A7=8C=20=EC=B6=94=EC=B6=9C=ED=95=98=EB=8A=94=20?= =?UTF-8?q?=EB=AA=A8=EB=93=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/brand_name_getter.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 brickstudy_ingestion/src/scrapper/brand_name_getter.py diff --git a/brickstudy_ingestion/src/scrapper/brand_name_getter.py b/brickstudy_ingestion/src/scrapper/brand_name_getter.py new file mode 100644 index 0000000..8ff8e40 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/brand_name_getter.py @@ -0,0 +1,62 @@ +import json + +from src.common.aws.s3_uploader import S3Uploader +from src.scrapper.models import OliveyoungBrand + + +def get_latest_dt(): + return '2024-08-20' + + +def category_checker(category: list) -> bool: + """ + standard 기준 카테고리에 하나라도 속해있으면 True 반환, 아니라면 False 반환 + """ + compare = set([c.split('_')[0] for c in category]) + standard = {'메이크업', '스킨케어', '향수', '헤어케어', '바디케어', '마스크팩', + '클렌징', '선케어', '더모코스메틱', '맨즈케어'} + if len(compare & standard) > 0: + return True + return False + + +def filter_brand(file_content: str) -> list: + filtered = [] + for line in file_content.split('\n'): + if line == '': + break + print(f"line: {line}") + for brandname, brandinfo in json.loads(line).items(): + brandinfo_dic = OliveyoungBrand(**brandinfo) + if category_checker(brandinfo_dic.category): + filtered.append(brandname) + return filtered + + +def get_brand_list_fr_s3(): + s3_client = S3Uploader().s3_client + bucket = 'brickstudy' + + def file_keys_getter(): + paginator = s3_client.get_paginator('list_objects_v2') + prefix = f"bronze/viral/oliveyoung/{get_latest_dt()}" + file_key_lst = [] + for page in paginator.paginate( + Bucket=bucket, + Prefix=prefix + ): + if 'Contents' in page: + for obj in page['Contents']: + file_key_lst.append(obj['Key']) + return file_key_lst + + file_key_lst = file_keys_getter() + filtered_brand_lst = [] + for filekey in file_key_lst: + response = s3_client.get_object( + Bucket=bucket, + Key=filekey + ) + file_content = response['Body'].read().decode('utf-8') + filtered_brand_lst += filter_brand(file_content) + return filtered_brand_lst \ No newline at end of file From a66b2c5e4ba6d1533d7f33da9302dbd49041fad5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 23:49:55 +0900 Subject: [PATCH 13/31] ADD : instagram crawling dag --- .../dags/viral/instagram_crawler.py | 69 +++++++++++++++++++ .../src/scrapper/inscrawler.py | 3 +- 2 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 brickstudy_ingestion/dags/viral/instagram_crawler.py diff --git a/brickstudy_ingestion/dags/viral/instagram_crawler.py b/brickstudy_ingestion/dags/viral/instagram_crawler.py new file mode 100644 index 0000000..1672b18 --- /dev/null +++ 
b/brickstudy_ingestion/dags/viral/instagram_crawler.py @@ -0,0 +1,69 @@ +from datetime import timedelta + +from airflow import DAG +from airflow.utils.dates import days_ago +from airflow.decorators import task +from airflow.operators.python import PythonVirtualenvOperator + +from src.scrapper.brand_name_getter import get_brand_list_fr_s3 + +# ========================================= +# Change parameter +DAG_ID = "bronze_viral_instagram" +TARGET_PLATFORM = 'instagram' + +# Set aiflow setting +default_args = { + 'owner': 'brickstudy', + 'start_date': days_ago(0), + 'retries': 1, + 'retry_delay': timedelta(minutes=1), + # 'on_failure_callback': on_failure_callback, +} +# ========================================= + + +def entrypoint(): + import logging + import multiprocess + from src.common.kafka.utils import Kafka + from src.scrapper.inscrawler import InsCrawler + from src.scrapper.ins_url import InsURLCrawler + from src.scrapper.ins_data import InsDataCrawler + + brand_lst = get_brand_list_fr_s3 + CONCURRENCY_LEVEL = multiprocess.cpu_count() + + def crawl_instagram(keywords: tuple): + crawler = InsURLCrawler(InsCrawler(keywords=keywords)).get_urls() + post_crawler = InsDataCrawler(crawler.data) + post_crawler.get_post_data() + producer.send_data_to_kafka( + kafka_topic='instagram', + data=post_crawler.data + ) + + try: + producer = Kafka() + with multiprocess.Pool(CONCURRENCY_LEVEL) as p: + p.map(crawl_instagram, brand_lst) + except Exception as e: + logging.error("***entrypoint error***", e) + raise + +with DAG( + dag_id=DAG_ID, + default_args=default_args, + schedule_interval='@daily', + catchup=False +): + t_crawl_ins = PythonVirtualenvOperator( + task_id='crawl_instagram_based_on_keyword', + python_version='3.10', + system_site_packages=False, + requirements=['selenium==4.24.0', 'webdriver-manager==4.0.2', + 'bs4==0.0.2', 'beautifulsoup4==4.12.3', + 'lxml==5.3.0', 'pytz==2024.1', + "python-dotenv==0.19.0", "multiprocess", "kafka-python"], + python_callable=entrypoint + ) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 6ae11c0..537e947 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -9,7 +9,7 @@ class InsCrawler: - def __init__(self, dev: bool = False): + def __init__(self, keywords: list = None, dev: bool = False): if dev: proj_path = f'{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('brickstudy_ingestion')])}/brickstudy_ingestion' else: @@ -17,6 +17,7 @@ def __init__(self, dev: bool = False): self.base_path = f'{proj_path}/src/scrapper' self.user_id, self.password = self.load_config(dev=dev) + self.keywords = keywords self.data = defaultdict(inst_generator) self.driver = webdriver.Chrome() self.login() From 932b294c987ed6c41f428646c398e2a1b5631e95 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Wed, 4 Sep 2024 12:53:11 +0900 Subject: [PATCH 14/31] =?UTF-8?q?fix=20:=20instagram=20=EB=AA=A8=EB=93=88?= =?UTF-8?q?=20=EC=83=9D=EC=84=B1=EC=9E=90=EC=97=90=EC=84=9C=20=EB=B0=9B?= =?UTF-8?q?=EB=8A=94=20arg=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_data.py | 6 ++- brickstudy_ingestion/src/scrapper/ins_url.py | 7 ++-- .../src/scrapper/inscrawler.py | 39 ++++++++++++++----- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py 
b/brickstudy_ingestion/src/scrapper/ins_data.py index e4ec1b9..d00cef0 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -8,8 +8,10 @@ class InsDataCrawler(InsCrawler): - def __init__(self, data): - super().__init__() + def __init__(self, + driver, data, + dev: bool = False): + super().__init__(dev=dev, driver=driver) self.data = data self.headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36' diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 830b1a3..e87c1af 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -7,8 +7,8 @@ class InsURLCrawler(InsCrawler): - def __init__(self): - super().__init__() + def __init__(self, keywords: list = None, dev: bool = False): + super().__init__(keywords, dev) def get_urls(self, keyword: str = None): if keyword is not None: # execute with given keyword @@ -62,5 +62,4 @@ def _fetch_url_data(self, keyword): print(e) print('오류 발생') - print(f'키워드 {keyword}의 URL 정보 수집 완료.') - self.driver.close() \ No newline at end of file + print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 537e947..1c6922d 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,6 +1,7 @@ import os import time from selenium import webdriver +from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By import json from collections import defaultdict @@ -9,18 +10,38 @@ class InsCrawler: - def __init__(self, keywords: list = None, dev: bool = False): + def __init__(self, + keywords: list = None, + dev: bool = False, + driver: webdriver = None): if dev: - proj_path = f'{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('brickstudy_ingestion')])}/brickstudy_ingestion' + proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" else: proj_path = '/opt/airflow/brickstudy_ingestion' - self.base_path = f'{proj_path}/src/scrapper' + self.base_path = f"{proj_path}/src/scrapper" self.user_id, self.password = self.load_config(dev=dev) self.keywords = keywords self.data = defaultdict(inst_generator) - self.driver = webdriver.Chrome() - self.login() + + if driver is None: + self.load_driver(dev) + self.login() + else: + self.driver = driver + + def load_driver(self, dev: bool = False): + if dev: + self.driver = webdriver.Chrome() + else: + options = webdriver.ChromeOptions() + options.add_argument("--headless") + options.add_argument("--no-sandbox") + options.add_argument("--disable-dev-shm-usage") + self.driver = webdriver.Chrome( + executable_path=ChromeDriverManager().install(), + options=options + ) def load_config(self, dev: bool = False): if dev: @@ -30,8 +51,8 @@ def load_config(self, dev: bool = False): username = config['login']['username'] password = config['login']['password'] else: - username = os.environ('INSTAGRAM_ID') - password = os.environ('INSTAGRAM_PASSWORD') + username = os.getenv('INSTAGRAM_CLIENT_ID') + password = os.getenv('INSTAGRAM_CLIENT_PASSWORD') return (username, password) def login(self): @@ -43,7 +64,7 @@ def login(self): user.send_keys(self.user_id) self.driver.find_element(By.XPATH, 
'//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(80) + time.sleep(40) def materialize(self): """ @@ -52,6 +73,6 @@ def materialize(self): from src.scrapper.utils import current_datetime_getter import csv - with open(f"{self.base_path}/insdata_{current_datetime_getter()}.csv", 'w') as f: + with open(f"{self.base_path}/results/insdata_{current_datetime_getter()}.csv", 'w') as f: w = csv.writer(f) w.writerow(self.data.values()) \ No newline at end of file From f7a56ea9953276b28e1adf42096053f97e446981 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Wed, 4 Sep 2024 12:59:54 +0900 Subject: [PATCH 15/31] =?UTF-8?q?fix=20:=20instagram=20dag=20=EB=B8=8C?= =?UTF-8?q?=EB=9E=9C=EB=93=9C=EB=A6=AC=EC=8A=A4=ED=8A=B8=20=EB=B0=9B?= =?UTF-8?q?=EB=8A=94=20=EB=B6=80=EB=B6=84=20task=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/dags/utils/config.py | 4 +- .../dags/viral/instagram_crawler.py | 42 ++++++++++++++----- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/brickstudy_ingestion/dags/utils/config.py b/brickstudy_ingestion/dags/utils/config.py index bd1c384..a2f96a6 100644 --- a/brickstudy_ingestion/dags/utils/config.py +++ b/brickstudy_ingestion/dags/utils/config.py @@ -24,7 +24,9 @@ def set_env_variables(): "TWITTER_CLIENT_ID", "TWITTER_CLIENT_PASSWORD", "TWITTER_TOKEN" - "TWITTER_CRAWLER_AUTH_TOKEN_PASSWORD" + # Instagram + "INSTAGRAM_CLIENT_ID", + "INSTAGRAM_CLIENT_PASSWORD" ] for ENV_VARIABLE in ALL_ENV_VARIABLES: os.environ[ENV_VARIABLE] = Variable.get(ENV_VARIABLE, "") diff --git a/brickstudy_ingestion/dags/viral/instagram_crawler.py b/brickstudy_ingestion/dags/viral/instagram_crawler.py index 1672b18..e8ca249 100644 --- a/brickstudy_ingestion/dags/viral/instagram_crawler.py +++ b/brickstudy_ingestion/dags/viral/instagram_crawler.py @@ -2,8 +2,8 @@ from airflow import DAG from airflow.utils.dates import days_ago -from airflow.decorators import task -from airflow.operators.python import PythonVirtualenvOperator +from airflow.operators.python import PythonVirtualenvOperator, PythonOperator +from airflow.models import Variable from src.scrapper.brand_name_getter import get_brand_list_fr_s3 @@ -23,16 +23,23 @@ # ========================================= -def entrypoint(): +def get_brand_list(): + import os + for ENV_VARIABLE in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY']: + os.environ[ENV_VARIABLE] = Variable.get(ENV_VARIABLE, "") + return get_brand_list_fr_s3() + + +def instagram_crawling(brand_lst, id, pwd): + import os import logging - import multiprocess from src.common.kafka.utils import Kafka from src.scrapper.inscrawler import InsCrawler from src.scrapper.ins_url import InsURLCrawler from src.scrapper.ins_data import InsDataCrawler - brand_lst = get_brand_list_fr_s3 - CONCURRENCY_LEVEL = multiprocess.cpu_count() + os.environ['INSTAGRAM_CLIENT_ID'] = id + os.environ['INSTAGRAM_CLIENT_PASSWORD'] = pwd def crawl_instagram(keywords: tuple): crawler = InsURLCrawler(InsCrawler(keywords=keywords)).get_urls() @@ -45,25 +52,38 @@ def crawl_instagram(keywords: tuple): try: producer = Kafka() - with multiprocess.Pool(CONCURRENCY_LEVEL) as p: - p.map(crawl_instagram, brand_lst) + crawl_instagram(brand_lst) except Exception as e: logging.error("***entrypoint error***", e) raise + with DAG( dag_id=DAG_ID, default_args=default_args, schedule_interval='@daily', catchup=False 
): - t_crawl_ins = PythonVirtualenvOperator( + t1 = PythonOperator( + task_id='get_brand_list_from_s3', + python_callable=get_brand_list + ) + + t2 = PythonVirtualenvOperator( task_id='crawl_instagram_based_on_keyword', + system_site_packages=False, + op_kwargs={ + 'brand_lst': "{{ ti.xcom_pull(task_ids='get_brand_list_from_s3') }}", + 'id': Variable.get('INSTAGRAM_CLIENT_ID'), + 'pwd': Variable.get('INSTAGRAM_CLIENT_PASSWORD') + }, python_version='3.10', system_site_packages=False, requirements=['selenium==4.24.0', 'webdriver-manager==4.0.2', 'bs4==0.0.2', 'beautifulsoup4==4.12.3', 'lxml==5.3.0', 'pytz==2024.1', "python-dotenv==0.19.0", "multiprocess", "kafka-python"], - python_callable=entrypoint - ) \ No newline at end of file + python_callable=instagram_crawling + ) + + t1 >> t2 \ No newline at end of file From 405b7e434d87dab77a0c829d467da6771d6c8b7d Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 13:34:01 +0900 Subject: [PATCH 16/31] =?UTF-8?q?ADD=20:=20=EC=9E=84=EC=8B=9C=20=ED=81=AC?= =?UTF-8?q?=EB=A1=A4=EB=9F=AC=20=EC=8A=A4=ED=81=AC=EB=A6=BD=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/brand_name_getter.py | 1 - .../src/scrapper/ins_runner.py | 107 ++++++++++++++++++ .../src/scrapper/inscrawler.py | 15 +-- 3 files changed, 109 insertions(+), 14 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/ins_runner.py diff --git a/brickstudy_ingestion/src/scrapper/brand_name_getter.py b/brickstudy_ingestion/src/scrapper/brand_name_getter.py index 8ff8e40..8b045ff 100644 --- a/brickstudy_ingestion/src/scrapper/brand_name_getter.py +++ b/brickstudy_ingestion/src/scrapper/brand_name_getter.py @@ -25,7 +25,6 @@ def filter_brand(file_content: str) -> list: for line in file_content.split('\n'): if line == '': break - print(f"line: {line}") for brandname, brandinfo in json.loads(line).items(): brandinfo_dic = OliveyoungBrand(**brandinfo) if category_checker(brandinfo_dic.category): diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py new file mode 100644 index 0000000..01a7a56 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -0,0 +1,107 @@ +from src.scrapper.brand_name_getter import get_brand_list_fr_s3 +from src.scrapper.ins_url import InsURLCrawler +from src.scrapper.ins_data import InsDataCrawler +from src.common.aws.s3_uploader import S3Uploader +from src.scrapper.utils import write_local_as_json +from src.scrapper.utils import current_datetime_getter +import logging +import os + +logger = logging.getLogger('insrunner') +logger.setLevel(logging.ERROR) + + +def crawl_data(): + brand_lst = get_brand_list_fr_s3() + for brand in brand_lst: + try: + crawler = InsURLCrawler(dev=True) + crawler.get_urls(keyword=brand) + crawler.materialize() + except Exception as e: + logging.error( + "{} url 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) + ) + finally: + pass + + try: + post_crawler = InsDataCrawler( + driver=crawler.driver, + data=crawler.data, + dev=True + ) + post_crawler.get_post_data() + except Exception as e: + logging.error( + "{} post data 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) + ) + finally: + pass + + try: + write_local_as_json( + data=post_crawler.data, + file_path=f"{post_crawler.base_path}/results/data", + file_name=f"instagram_{current_datetime_getter}" + ) + except Exception as e: + logging.error( + "{} data write 과정에서 오류 발생. 
\nerror message: {}".format(brand, e) + ) + finally: + pass + + return f"{post_crawler.base_path}/results/data" + + +def s3_upload(local_path): + s3 = S3Uploader().s3_client + s3_path = "bronze/viral/instagram", + bucket_name = "brickstudy" + + for root, _, files in os.walk(local_path): + for file in files: + local_file_path = os.path.join(root, file) + # S3 파일 경로 설정 + s3_file_path = os.path.join(s3_path, os.path.relpath(local_file_path, local_path)) + + try: + s3.upload_file(local_file_path, bucket_name, s3_file_path) + print(f"File {local_file_path} uploaded to {bucket_name}/{s3_file_path}") + except FileNotFoundError: + print(f"File not found: {local_file_path}") + except Exception as e: + print(f"Failed to upload {local_file_path}: {str(e)}") + + +if __name__ =='__main__': + local_path = crawl_data() + s3_upload(local_path) + +""" +curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ + "connector.class": "io.confluent.connect.s3.S3SinkConnector", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter.schemas.enable": "false", + "tasks.max": 1, + "topics": "instagram", + "aws.signing_region": "ap-northeast-2", + "s3.part.size": 5242880, + "s3.region": "ap-northeast-2", + "s3.bucket.name": "brickstudy", + "s3.credentials.provider.class": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain", + "topics.dir": "bronze/viral", + "partitioner.class": "io.confluent.connect.storage.partitioner.TimeBasedPartitioner", + "partition.duration.ms": "86400000", + "timestamp.extractor": "Record", + "path.format": "yyyy-MM-dd", + "flush.size": 100, + "rotate.interval.ms": 60000, + "storage.class": "io.confluent.connect.s3.storage.S3Storage", + "format.class": "io.confluent.connect.s3.format.json.JsonFormat", + "locale": "ko_KR", + "timezone": "Asia/Seoul" +}' +""" \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 1c6922d..8da325a 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -16,8 +16,10 @@ def __init__(self, driver: webdriver = None): if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" + self.driver = webdriver.Chrome() else: proj_path = '/opt/airflow/brickstudy_ingestion' + self.driver = driver self.base_path = f"{proj_path}/src/scrapper" self.user_id, self.password = self.load_config(dev=dev) @@ -29,19 +31,6 @@ def __init__(self, self.login() else: self.driver = driver - - def load_driver(self, dev: bool = False): - if dev: - self.driver = webdriver.Chrome() - else: - options = webdriver.ChromeOptions() - options.add_argument("--headless") - options.add_argument("--no-sandbox") - options.add_argument("--disable-dev-shm-usage") - self.driver = webdriver.Chrome( - executable_path=ChromeDriverManager().install(), - options=options - ) def load_config(self, dev: bool = False): if dev: From 42ef0859166563da497f81819b38c917d0f10e4a Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 16:35:59 +0900 Subject: [PATCH 17/31] =?UTF-8?q?fix=20:=20=EC=9E=84=EC=8B=9C=20=ED=83=9C?= =?UTF-8?q?=EC=8A=A4=ED=81=AC=20=EB=B2=84=EA=B7=B8=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/ins_runner.py | 
18 ++++++++++-------- .../src/scrapper/inscrawler.py | 9 ++------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 01a7a56..9949c17 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -40,10 +40,11 @@ def crawl_data(): pass try: + cur_date = current_datetime_getter() write_local_as_json( data=post_crawler.data, file_path=f"{post_crawler.base_path}/results/data", - file_name=f"instagram_{current_datetime_getter}" + file_name=f"instagram_{cur_date}" ) except Exception as e: logging.error( @@ -51,21 +52,21 @@ def crawl_data(): ) finally: pass - return f"{post_crawler.base_path}/results/data" def s3_upload(local_path): + dt = current_datetime_getter() + dt = dt.split('_')[0] s3 = S3Uploader().s3_client - s3_path = "bronze/viral/instagram", + s3_path = f"bronze/viral/instagram/{dt[:4]}-{dt[4:6]}-{dt[6:]}" bucket_name = "brickstudy" for root, _, files in os.walk(local_path): for file in files: local_file_path = os.path.join(root, file) - # S3 파일 경로 설정 - s3_file_path = os.path.join(s3_path, os.path.relpath(local_file_path, local_path)) - + s3_file_path = os.path.join(s3_path, file) + print(local_file_path) try: s3.upload_file(local_file_path, bucket_name, s3_file_path) print(f"File {local_file_path} uploaded to {bucket_name}/{s3_file_path}") @@ -75,8 +76,9 @@ def s3_upload(local_path): print(f"Failed to upload {local_file_path}: {str(e)}") -if __name__ =='__main__': - local_path = crawl_data() +if __name__ == '__main__': + # local_path = crawl_data() + local_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/results/data" s3_upload(local_path) """ diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 8da325a..e0a3e13 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,7 +1,6 @@ import os import time from selenium import webdriver -from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By import json from collections import defaultdict @@ -26,11 +25,7 @@ def __init__(self, self.keywords = keywords self.data = defaultdict(inst_generator) - if driver is None: - self.load_driver(dev) - self.login() - else: - self.driver = driver + self.login() def load_config(self, dev: bool = False): if dev: @@ -53,7 +48,7 @@ def login(self): user.send_keys(self.user_id) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(40) + time.sleep(10) def materialize(self): """ From 484400384e6cf874c974e82c5cbf912b67e701e5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 17:08:38 +0900 Subject: [PATCH 18/31] fix : refactor temporal task --- .../src/scrapper/ins_runner.py | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 9949c17..9ae569b 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -1,11 +1,11 @@ from src.scrapper.brand_name_getter import get_brand_list_fr_s3 from src.scrapper.ins_url import InsURLCrawler from src.scrapper.ins_data import InsDataCrawler -from 
src.common.aws.s3_uploader import S3Uploader from src.scrapper.utils import write_local_as_json from src.scrapper.utils import current_datetime_getter -import logging import os +import logging +import subprocess logger = logging.getLogger('insrunner') logger.setLevel(logging.ERROR) @@ -52,34 +52,29 @@ def crawl_data(): ) finally: pass - return f"{post_crawler.base_path}/results/data" + return f"{post_crawler.base_path}/results" -def s3_upload(local_path): +def s3_upload(local_path: str, target: str = 'data'): + local_folder = os.path.join(local_path, target) dt = current_datetime_getter() dt = dt.split('_')[0] - s3 = S3Uploader().s3_client - s3_path = f"bronze/viral/instagram/{dt[:4]}-{dt[4:6]}-{dt[6:]}" + s3_folder = f"bronze/viral/instagram/{target}/{dt[:4]}-{dt[4:6]}-{dt[6:]}" bucket_name = "brickstudy" - - for root, _, files in os.walk(local_path): - for file in files: - local_file_path = os.path.join(root, file) - s3_file_path = os.path.join(s3_path, file) - print(local_file_path) - try: - s3.upload_file(local_file_path, bucket_name, s3_file_path) - print(f"File {local_file_path} uploaded to {bucket_name}/{s3_file_path}") - except FileNotFoundError: - print(f"File not found: {local_file_path}") - except Exception as e: - print(f"Failed to upload {local_file_path}: {str(e)}") + try: + subprocess.run( + ['aws', 's3', 'cp', local_folder, f's3://{bucket_name}/{s3_folder}/', '--recursive'], + check=True + ) + print(f"Folder {local_folder} uploaded to s3://{bucket_name}/{s3_folder}/") + except subprocess.CalledProcessError as e: + print(f"Failed to upload folder: {str(e)}") if __name__ == '__main__': - # local_path = crawl_data() - local_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/results/data" - s3_upload(local_path) + local_path = crawl_data() + s3_upload(local_path, 'data') + s3_upload(local_path, 'images') """ curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ From 6f6da8bc839400d4343e88ae49f24e6b61905bf9 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 17:46:08 +0900 Subject: [PATCH 19/31] fix : s3 path --- brickstudy_ingestion/src/scrapper/ins_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 9ae569b..f4b4cdd 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -59,7 +59,7 @@ def s3_upload(local_path: str, target: str = 'data'): local_folder = os.path.join(local_path, target) dt = current_datetime_getter() dt = dt.split('_')[0] - s3_folder = f"bronze/viral/instagram/{target}/{dt[:4]}-{dt[4:6]}-{dt[6:]}" + s3_folder = f"bronze/viral/instagram/{dt[:4]}-{dt[4:6]}-{dt[6:]}/{target}" bucket_name = "brickstudy" try: subprocess.run( From 09ffaf4abf368e66761ec35520826ec2ddc3c07c Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sun, 8 Sep 2024 22:44:27 +0900 Subject: [PATCH 20/31] fix : crawler avoid account blocked --- brickstudy_ingestion/src/scrapper/ins_data.py | 10 +++-- .../src/scrapper/ins_runner.py | 42 +++++++------------ brickstudy_ingestion/src/scrapper/ins_url.py | 1 + .../src/scrapper/inscrawler.py | 37 ++++++++++++---- 4 files changed, 52 insertions(+), 38 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index d00cef0..56b70f2 100644 --- 
a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -1,8 +1,10 @@ import time from bs4 import BeautifulSoup +from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.common.by import By import requests import re +import random from src.scrapper.inscrawler import InsCrawler @@ -23,17 +25,18 @@ def get_post_data(self): post_crawled_data = {line.strip() for line in f} for idx, (key, val) in enumerate(self.data.items()): + if self.numof_error > 10: break post_url = val.post_url if post_url in post_crawled_data: continue + time.sleep(random.randrange(2, 5)) self.driver.get(post_url) print(idx, '. ' + post_url) try: - time.sleep(5) html = self.driver.page_source soup = BeautifulSoup(html, 'lxml') @@ -75,8 +78,8 @@ def get_post_data(self): # 이미지 끝까지 넘기면서 url 추출 try: while True: - time.sleep(3) - + time.sleep(random.randrange(1, 3)) + WebDriverWait(self.driver, random.randrange(1, 4)) self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) @@ -129,6 +132,7 @@ def get_post_data(self): except Exception as e: print(e) + self.numof_error += 1 print('오류 발생') # 수집 완료된 데이터 키값(post url unique id) 저장 diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index f4b4cdd..5ac77d9 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -13,31 +13,22 @@ def crawl_data(): brand_lst = get_brand_list_fr_s3() - for brand in brand_lst: - try: - crawler = InsURLCrawler(dev=True) - crawler.get_urls(keyword=brand) - crawler.materialize() - except Exception as e: - logging.error( - "{} url 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) - ) - finally: - pass + err = 0 + for brand in brand_lst[13:]: + if err > 10: + break - try: - post_crawler = InsDataCrawler( - driver=crawler.driver, - data=crawler.data, - dev=True - ) - post_crawler.get_post_data() - except Exception as e: - logging.error( - "{} post data 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) - ) - finally: - pass + crawler = InsURLCrawler(dev=True) + crawler.get_urls(keyword=brand) + err += crawler.numof_error + + post_crawler = InsDataCrawler( + driver=crawler.driver, + data=crawler.data, + dev=True + ) + post_crawler.get_post_data() + err += post_crawler.numof_error try: cur_date = current_datetime_getter() @@ -50,8 +41,7 @@ def crawl_data(): logging.error( "{} data write 과정에서 오류 발생. 
\nerror message: {}".format(brand, e) ) - finally: - pass + return f"{post_crawler.base_path}/results" diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index e87c1af..62e3a54 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -59,6 +59,7 @@ def _fetch_url_data(self, keyword): time.sleep(5) except Exception as e: + self.numof_error += 1 print(e) print('오류 발생') diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index e0a3e13..5f45eb7 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,9 +1,10 @@ import os import time -from selenium import webdriver -from selenium.webdriver.common.by import By import json from collections import defaultdict +import random +from selenium import webdriver +from selenium.webdriver.common.by import By from src.scrapper.models import inst_generator @@ -12,28 +13,46 @@ class InsCrawler: def __init__(self, keywords: list = None, dev: bool = False, - driver: webdriver = None): + driver=None): if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" - self.driver = webdriver.Chrome() + self.driver = self.make_driver() else: proj_path = '/opt/airflow/brickstudy_ingestion' self.driver = driver - self.base_path = f"{proj_path}/src/scrapper" + self.base_path = f"{proj_path}/src/scrapper" self.user_id, self.password = self.load_config(dev=dev) self.keywords = keywords self.data = defaultdict(inst_generator) + self.numof_error = 0 self.login() + @staticmethod + def make_driver(): + user_agent_lst = [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" + ] + options = webdriver.ChromeOptions() + options.add_argument("--disable-blink-features=AutomationControlled") + options.add_experimental_option("excludeSwitches", ["enable-automation"]) + options.add_experimental_option("useAutomationExtension", False) + driver = webdriver.Chrome(options=options) + driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") + driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[0]}) + return driver + def load_config(self, dev: bool = False): if dev: with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: config = json.load(f) - username = config['login']['username'] - password = config['login']['password'] + x = random.randrange(0, 2) + username = config['login']['username'][x] + password = config['login']['password'][x] else: username = os.getenv('INSTAGRAM_CLIENT_ID') password = os.getenv('INSTAGRAM_CLIENT_PASSWORD') @@ -43,12 +62,12 @@ def login(self): # Instagram 접속 및 로그인 url = 'https://www.instagram.com/' self.driver.get(url) - time.sleep(6) + time.sleep(random.randrange(4, 6)) user = self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input') user.send_keys(self.user_id) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - 
time.sleep(10) + time.sleep(random.randrange(5, 11)) def materialize(self): """ From 5ba7f3834622c1030d045703ee9d01aa3236c48b Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Mon, 9 Sep 2024 03:41:19 +0900 Subject: [PATCH 21/31] =?UTF-8?q?fix=20:=20crawler=20-=20sleep=20=EC=A1=B0?= =?UTF-8?q?=EC=A0=95,=20=EC=8A=A4=ED=81=AC=EB=A1=A4=EB=8B=A4=EC=9A=B4=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_data.py | 4 +-- .../src/scrapper/ins_runner.py | 30 ++++++++++++++++--- brickstudy_ingestion/src/scrapper/ins_url.py | 12 +++++--- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index 56b70f2..3a283d6 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -78,7 +78,7 @@ def get_post_data(self): # 이미지 끝까지 넘기면서 url 추출 try: while True: - time.sleep(random.randrange(1, 3)) + time.sleep(1) WebDriverWait(self.driver, random.randrange(1, 4)) self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) @@ -128,7 +128,7 @@ def get_post_data(self): print(f"총 {len(saved_imgs)} 장의 이미지 저장") self.data[key].saved_imgs = str(list(saved_imgs)) - time.sleep(5) + time.sleep(random.randrange(3, 5)) except Exception as e: print(e) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 5ac77d9..6feeb37 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -6,17 +6,32 @@ import os import logging import subprocess +import shutil logger = logging.getLogger('insrunner') logger.setLevel(logging.ERROR) +scrapped = [ + "포엘리에", "아워글래스", + "휴캄", "아이레놀", "루트리", "일소", + "유니크미", "본트리", "메디필", "OOTD", "앤디얼", + "아크네스", "그레이멜린", "제로앱솔루", "리쥬란", "폴라초이스", "메이크프렘", + "제로이드", "원데이즈유", "숌", "어뮤즈", "프랭클리", "네오젠", "제이엠솔루션", "리터뉴", + "아크웰", "아이레시피", "제이준", "글로오아시스", "어반디케이", + "닥터방기원", "유리피부", "콤마나인", + "라운드어라운드", "미구하라", "주미소", + "에이지투웨니스", "프리메라", "애즈이즈투비", "투쿨포스쿨" +] + def crawl_data(): brand_lst = get_brand_list_fr_s3() err = 0 - for brand in brand_lst[13:]: + for brand in brand_lst[30:]: if err > 10: break + if brand in scrapped: + continue crawler = InsURLCrawler(dev=True) crawler.get_urls(keyword=brand) @@ -41,6 +56,7 @@ def crawl_data(): logging.error( "{} data write 과정에서 오류 발생. 
\nerror message: {}".format(brand, e) ) + break return f"{post_crawler.base_path}/results" @@ -62,9 +78,15 @@ def s3_upload(local_path: str, target: str = 'data'): if __name__ == '__main__': - local_path = crawl_data() - s3_upload(local_path, 'data') - s3_upload(local_path, 'images') + base_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/" + + # shutil.copytree(base_path + "template", base_path + "results") + + crawl_data() + s3_upload(base_path + "results", 'data') + s3_upload(base_path + "results", 'images') + + # shutil.rmtree(base_path + "results") """ curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 62e3a54..8422d07 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -2,6 +2,7 @@ from bs4 import BeautifulSoup import urllib import re +import random from src.scrapper.inscrawler import InsCrawler @@ -23,9 +24,11 @@ def _fetch_url_data(self, keyword): self.driver.get(word_url) try: - time.sleep(5) - js = 'window.scrollBy(0,1000)' - self.driver.execute_script(js) + for _ in range(10): # 스크롤 10회 + time.sleep(random.randrange(3, 5)) + js = 'window.scrollBy(0,5000)' + self.driver.execute_script(js) + time.sleep(5) html = self.driver.page_source soup = BeautifulSoup(html, 'lxml') @@ -63,4 +66,5 @@ def _fetch_url_data(self, keyword): print(e) print('오류 발생') - print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file + print(f'키워드 {keyword}의 URL 정보 수집 완료.') + self.driver.close() \ No newline at end of file From c62a82cef0caa63c846766b45834093e9a373f72 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sat, 28 Sep 2024 17:42:59 +0900 Subject: [PATCH 22/31] add : humanize feature --- .../src/scrapper/http_429_handler.py | 58 +++++++++++++++++ brickstudy_ingestion/src/scrapper/ins_data.py | 9 ++- .../src/scrapper/ins_runner.py | 60 ++++++++++------- brickstudy_ingestion/src/scrapper/ins_url.py | 8 ++- .../src/scrapper/inscrawler.py | 64 ++++++++++++++----- 5 files changed, 152 insertions(+), 47 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/http_429_handler.py diff --git a/brickstudy_ingestion/src/scrapper/http_429_handler.py b/brickstudy_ingestion/src/scrapper/http_429_handler.py new file mode 100644 index 0000000..d1768ff --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/http_429_handler.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 + +import urllib.error +import urllib.request + +from tenacity import retry, retry_if_exception, stop_after_attempt, wait_fixed +from tenacity.wait import wait_base + + +class retry_if_http_429_error(retry_if_exception): + """Retry strategy that retries if the exception is an ``HTTPError`` with + a 429 status code. + + """ + + def __init__(self): + def is_http_429_error(exception): + return ( + isinstance(exception, urllib.error.HTTPError) and + exception.getcode() == 429 + ) + + super().__init__(predicate=is_http_429_error) + + +class wait_for_retry_after_header(wait_base): + """Wait strategy that tries to wait for the length specified by + the Retry-After header, or the underlying wait strategy if not. + See RFC 6585 § 4. + + Otherwise, wait according to the fallback strategy. 
+ """ + def __init__(self, fallback): + self.fallback = fallback + + def __call__(self, retry_state): + # retry_state is an instance of tenacity.RetryCallState. The .outcome + # property is the result/exception that came from the underlying function. + exc = retry_state.outcome.exception() + if isinstance(exc, urllib.error.HTTPError): + retry_after = exc.headers.get("Retry-After") + + try: + return 3600 if retry_after is None else int(retry_after) + except (TypeError, ValueError): + pass + + return self.fallback(retry_state) + + +@retry( + retry=retry_if_http_429_error(), + wait=wait_for_retry_after_header(fallback=wait_fixed(1)), + stop=stop_after_attempt(3) +) +def get_url_with_tenacity_(url): + return urllib.request.urlopen(url) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index 3a283d6..d44ed08 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -7,6 +7,7 @@ import random from src.scrapper.inscrawler import InsCrawler +from src.scrapper.http_429_handler import get_url_with_tenacity_ class InsDataCrawler(InsCrawler): @@ -25,14 +26,15 @@ def get_post_data(self): post_crawled_data = {line.strip() for line in f} for idx, (key, val) in enumerate(self.data.items()): - if self.numof_error > 10: break + if self.numof_error > 5: break post_url = val.post_url if post_url in post_crawled_data: continue - time.sleep(random.randrange(2, 5)) + time.sleep(random.randrange(2, 10) + random.random()) + get_url_with_tenacity_(post_url) self.driver.get(post_url) print(idx, '. ' + post_url) @@ -78,7 +80,7 @@ def get_post_data(self): # 이미지 끝까지 넘기면서 url 추출 try: while True: - time.sleep(1) + time.sleep(random.randrange(3, 6) + random.random()) WebDriverWait(self.driver, random.randrange(1, 4)) self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) @@ -134,6 +136,7 @@ def get_post_data(self): print(e) self.numof_error += 1 print('오류 발생') + time.sleep(20 + random.random()) # 수집 완료된 데이터 키값(post url unique id) 저장 with open(results, 'a') as f: diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 6feeb37..073cbb2 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -3,38 +3,40 @@ from src.scrapper.ins_data import InsDataCrawler from src.scrapper.utils import write_local_as_json from src.scrapper.utils import current_datetime_getter + import os import logging import subprocess -import shutil logger = logging.getLogger('insrunner') logger.setLevel(logging.ERROR) -scrapped = [ - "포엘리에", "아워글래스", - "휴캄", "아이레놀", "루트리", "일소", - "유니크미", "본트리", "메디필", "OOTD", "앤디얼", - "아크네스", "그레이멜린", "제로앱솔루", "리쥬란", "폴라초이스", "메이크프렘", - "제로이드", "원데이즈유", "숌", "어뮤즈", "프랭클리", "네오젠", "제이엠솔루션", "리터뉴", - "아크웰", "아이레시피", "제이준", "글로오아시스", "어반디케이", - "닥터방기원", "유리피부", "콤마나인", - "라운드어라운드", "미구하라", "주미소", - "에이지투웨니스", "프리메라", "애즈이즈투비", "투쿨포스쿨" +twitter_keyword = [ + "닥터지", "아이소이", "에뛰드", "에스트라", "유세린", "토리든" ] -def crawl_data(): +def get_brand_lst_wo_ingested_list(): brand_lst = get_brand_list_fr_s3() - err = 0 - for brand in brand_lst[30:]: - if err > 10: + with open(f"{base_path}/results/finished_brand.txt", "r") as f: + skip = f.read() + return list(set(brand_lst) - set(skip[:-1].split('\n'))) + + +def crawl_data(brand_lst: list, err: 
int): + """ + brand_lst 에 속한 brand이 언급된 데이터를 인스타그램으로부터 수집하여 + ./results/data, ./results/images에 저장하는 함수 + :brand_lst: 크롤링할 서치 키워드가 담긴 리스트 + :err: 크롤링 진행 과정에서 발생한 오류 횟수 + """ + for brand in brand_lst: + if err > 10: break - if brand in scrapped: - continue crawler = InsURLCrawler(dev=True) crawler.get_urls(keyword=brand) + crawler.materialize() err += crawler.numof_error post_crawler = InsDataCrawler( @@ -52,13 +54,14 @@ def crawl_data(): file_path=f"{post_crawler.base_path}/results/data", file_name=f"instagram_{cur_date}" ) + with open(f"{post_crawler.base_path}/results/finished_brand.txt", "a") as f: + f.write(f"{brand}\n") except Exception as e: logging.error( "{} data write 과정에서 오류 발생. \nerror message: {}".format(brand, e) ) - break - return f"{post_crawler.base_path}/results" + return err def s3_upload(local_path: str, target: str = 'data'): @@ -79,14 +82,23 @@ def s3_upload(local_path: str, target: str = 'data'): if __name__ == '__main__': base_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/" + # shutil.rmtree(base_path + "results/data") + # shutil.rmtree(base_path + "results/images") + # os.mkdir(base_path + "results/data") + # os.mkdir(base_path + "results/images") + + err = 0 + # brand_lst = get_brand_lst_wo_ingested_list() - # shutil.copytree(base_path + "template", base_path + "results") + brand_lst = twitter_keyword + for block_s in range(0, len(brand_lst), 10): + partitioned = brand_lst[block_s:block_s + 10] + print(f"**** start crawling {partitioned} ****") + err += crawl_data(brand_lst[block_s:block_s + 10], err) - crawl_data() - s3_upload(base_path + "results", 'data') - s3_upload(base_path + "results", 'images') + # s3_upload(base_path + "results", 'data') + # s3_upload(base_path + "results", 'images') - # shutil.rmtree(base_path + "results") """ curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 8422d07..7cdb1cb 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -5,6 +5,7 @@ import random from src.scrapper.inscrawler import InsCrawler +from src.scrapper.http_429_handler import get_url_with_tenacity_ class InsURLCrawler(InsCrawler): @@ -21,14 +22,15 @@ def get_urls(self, keyword: str = None): def _fetch_url_data(self, keyword): word = urllib.parse.quote(keyword) word_url = f'https://www.instagram.com/explore/tags/{word}/' + get_url_with_tenacity_(word_url) self.driver.get(word_url) try: for _ in range(10): # 스크롤 10회 - time.sleep(random.randrange(3, 5)) - js = 'window.scrollBy(0,5000)' + time.sleep(random.randrange(3, 4) + random.random()) + js = 'window.scrollBy(0,7000)' self.driver.execute_script(js) - time.sleep(5) + time.sleep(random.randrange(3, 4) + random.random()) html = self.driver.page_source soup = BeautifulSoup(html, 'lxml') diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 5f45eb7..ae95372 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -4,6 +4,8 @@ from collections import defaultdict import random from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By from src.scrapper.models import inst_generator @@ -14,35 
+16,60 @@ def __init__(self, keywords: list = None, dev: bool = False, driver=None): + self.account_x = random.randrange(0, 2) if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" self.driver = self.make_driver() else: proj_path = '/opt/airflow/brickstudy_ingestion' self.driver = driver - self.base_path = f"{proj_path}/src/scrapper" - self.user_id, self.password = self.load_config(dev=dev) + + user_id, password = self.load_config(dev=dev) self.keywords = keywords self.data = defaultdict(inst_generator) self.numof_error = 0 - self.login() + self.login(user_id, password) - @staticmethod - def make_driver(): + def make_driver(self): + proxies = [ + ["211.223.89.176:51147", + "121.66.105.19:51080", + "121.66.105.19:51080", + "8.213.128.6:8080"], + ["8.213.129.20:8090", + "8.213.129.20:5566", + "8.213.137.155:8090", + "8.220.204.215:808"], + ["8.220.205.172:9098", + "211.223.89.176:51147", + "8.213.128.90:2019", + "8.213.128.90:444"] + ] user_agent_lst = [ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1636.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" ] options = webdriver.ChromeOptions() + # options.add_argument("--headless") + proxy = proxies[self.account_x][random.randrange(0, 4)] + # webdriver.DesiredCapabilities.CHROME['proxy'] = { + # "socksProxy": proxy, + # "socksVersion": 4, + # "proxyType": "MANUAL", + # } + options.add_argument("--disable-blink-features=AutomationControlled") options.add_experimental_option("excludeSwitches", ["enable-automation"]) options.add_experimental_option("useAutomationExtension", False) - driver = webdriver.Chrome(options=options) + driver = webdriver.Chrome( + options=options + ) driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") - driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[0]}) + driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[self.account_x]}) return driver def load_config(self, dev: bool = False): @@ -50,24 +77,24 @@ def load_config(self, dev: bool = False): with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: config = json.load(f) - x = random.randrange(0, 2) - username = config['login']['username'][x] - password = config['login']['password'][x] + username = config['login']['username'][self.account_x] + password = config['login']['password'][self.account_x] else: username = os.getenv('INSTAGRAM_CLIENT_ID') password = os.getenv('INSTAGRAM_CLIENT_PASSWORD') return (username, password) - def login(self): + def login(self, user_id: str, password: str): # Instagram 접속 및 로그인 url = 'https://www.instagram.com/' self.driver.get(url) - time.sleep(random.randrange(4, 6)) - user = self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input') - user.send_keys(self.user_id) - self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) + time.sleep(random.randrange(4, 6) + random.random()) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input').send_keys(user_id) + 
time.sleep(random.randrange(1, 3) + random.random()) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(password) + time.sleep(random.randrange(1, 3) + random.random()) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(random.randrange(5, 11)) + time.sleep(random.randrange(5, 11) + random.random()) def materialize(self): """ @@ -78,4 +105,7 @@ def materialize(self): with open(f"{self.base_path}/results/insdata_{current_datetime_getter()}.csv", 'w') as f: w = csv.writer(f) - w.writerow(self.data.values()) \ No newline at end of file + w.writerow(self.data.values()) + +if __name__ == "__main__": + test = InsCrawler(keywords='엔하이픈', dev=True) \ No newline at end of file From 0147d9a29930e7a666ea6c4272bc3114870aa79a Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sat, 28 Sep 2024 19:39:58 +0900 Subject: [PATCH 23/31] add : catch suspicious account popup --- .../src/scrapper/inscrawler.py | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index ae95372..c95ea95 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -7,6 +7,8 @@ from selenium.webdriver.chrome.service import Service from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC from src.scrapper.models import inst_generator @@ -32,6 +34,11 @@ def __init__(self, self.login(user_id, password) + if self.suspicous_check(): + #TODO 계정 사용비율 낮추기 + print("return True in suspicious check") + time.sleep() + def make_driver(self): proxies = [ ["211.223.89.176:51147", @@ -56,11 +63,11 @@ def make_driver(self): options = webdriver.ChromeOptions() # options.add_argument("--headless") proxy = proxies[self.account_x][random.randrange(0, 4)] - # webdriver.DesiredCapabilities.CHROME['proxy'] = { - # "socksProxy": proxy, - # "socksVersion": 4, - # "proxyType": "MANUAL", - # } + print(proxy) + webdriver.DesiredCapabilities.CHROME['proxy'] = { + "socksProxy": proxy, + "socksVersion": 4, + } options.add_argument("--disable-blink-features=AutomationControlled") options.add_experimental_option("excludeSwitches", ["enable-automation"]) @@ -107,5 +114,26 @@ def materialize(self): w = csv.writer(f) w.writerow(self.data.values()) + def suspicous_check(self): + """ 현재 자동화 행동 의심받는지 확인 """ + try: + if 'wbloks_1' in self.driver.page_source: + print("자동화된 활동 경고가 나타났습니다.") + + close_button = self.driver.find_element(By.XPATH, '//div[@aria-label="Dismiss"]') + self.driver.execute_script("arguments[0].dispatchEvent(new MouseEvent('click', {bubbles: true}));", close_button) + + # # 닫기 버튼 클릭, 계정 사용 일시 중지 + # close_button = WebDriverWait(self.driver, 5).until( + # EC.element_to_be_clickable((By.XPATH, '//div[@aria-label="Dismiss"]')) + # ) + # close_button.click() + return True + return False + except Exception: + self.numof_error += 1 + return False + + if __name__ == "__main__": test = InsCrawler(keywords='엔하이픈', dev=True) \ No newline at end of file From 6e1666f098e7148feb22addd6e21fb23c747dc50 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sat, 28 Sep 2024 20:05:20 +0900 Subject: [PATCH 24/31] fix : dummy --- brickstudy_ingestion/src/scrapper/inscrawler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) 
diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index c95ea95..7a7bdeb 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -37,7 +37,7 @@ def __init__(self, if self.suspicous_check(): #TODO 계정 사용비율 낮추기 print("return True in suspicious check") - time.sleep() + time.sleep(300) def make_driver(self): proxies = [ @@ -134,6 +134,3 @@ def suspicous_check(self): self.numof_error += 1 return False - -if __name__ == "__main__": - test = InsCrawler(keywords='엔하이픈', dev=True) \ No newline at end of file From a31ea73d1771b2a97a93d8534ab3b489e423380e Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Mon, 11 Nov 2024 18:20:33 +0900 Subject: [PATCH 25/31] refactor : move driver_making component to utils --- .../src/scrapper/inscrawler.py | 48 +----------------- brickstudy_ingestion/src/scrapper/utils.py | 49 +++++++++++++++++++ 2 files changed, 51 insertions(+), 46 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 7a7bdeb..ec2dc48 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -3,14 +3,9 @@ import json from collections import defaultdict import random -from selenium import webdriver -from selenium.webdriver.chrome.service import Service -from webdriver_manager.chrome import ChromeDriverManager -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from src.scrapper.models import inst_generator +from src.scrapper.utils import get_driver class InsCrawler: @@ -21,7 +16,7 @@ def __init__(self, self.account_x = random.randrange(0, 2) if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" - self.driver = self.make_driver() + self.driver = get_driver() else: proj_path = '/opt/airflow/brickstudy_ingestion' self.driver = driver @@ -39,45 +34,6 @@ def __init__(self, print("return True in suspicious check") time.sleep(300) - def make_driver(self): - proxies = [ - ["211.223.89.176:51147", - "121.66.105.19:51080", - "121.66.105.19:51080", - "8.213.128.6:8080"], - ["8.213.129.20:8090", - "8.213.129.20:5566", - "8.213.137.155:8090", - "8.220.204.215:808"], - ["8.220.205.172:9098", - "211.223.89.176:51147", - "8.213.128.90:2019", - "8.213.128.90:444"] - ] - user_agent_lst = [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1636.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" - ] - options = webdriver.ChromeOptions() - # options.add_argument("--headless") - proxy = proxies[self.account_x][random.randrange(0, 4)] - print(proxy) - webdriver.DesiredCapabilities.CHROME['proxy'] = { - "socksProxy": proxy, - "socksVersion": 4, - } - - options.add_argument("--disable-blink-features=AutomationControlled") - options.add_experimental_option("excludeSwitches", ["enable-automation"]) - options.add_experimental_option("useAutomationExtension", False) - driver = webdriver.Chrome( - options=options - ) - 
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") - driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[self.account_x]}) - return driver def load_config(self, dev: bool = False): if dev: diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index 5afbd2a..935290c 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -1,3 +1,52 @@ +def get_driver(): + """ + return selenium driver + """ + from selenium import webdriver + from selenium.webdriver.chrome.service import Service + from webdriver_manager.chrome import ChromeDriverManager + from selenium.webdriver.common.by import By + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + proxies = [ + ["211.223.89.176:51147", + "121.66.105.19:51080", + "121.66.105.19:51080", + "8.213.128.6:8080"], + ["8.213.129.20:8090", + "8.213.129.20:5566", + "8.213.137.155:8090", + "8.220.204.215:808"], + ["8.220.205.172:9098", + "211.223.89.176:51147", + "8.213.128.90:2019", + "8.213.128.90:444"] + ] + user_agent_lst = [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1636.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" + ] + options = webdriver.ChromeOptions() + # options.add_argument("--headless") + proxy = proxies[self.account_x][random.randrange(0, 4)] + webdriver.DesiredCapabilities.CHROME['proxy'] = { + "socksProxy": proxy, + "socksVersion": 4, + } + + options.add_argument("--disable-blink-features=AutomationControlled") + options.add_experimental_option("excludeSwitches", ["enable-automation"]) + options.add_experimental_option("useAutomationExtension", False) + driver = webdriver.Chrome( + options=options + ) + driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") + driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[self.account_x]}) + return driver + + def get_soup(url: str = None): import urllib from urllib.request import urlopen From 6f8aab7948405eb61370cc908bfb264516f184e0 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 15 Nov 2024 21:47:03 +0900 Subject: [PATCH 26/31] =?UTF-8?q?refactor=20:=20brand,=20items=20class=20?= =?UTF-8?q?=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/oliveyoung.py | 19 ------- .../src/scrapper/oliveyoung_items.py | 57 +++++++++++++++++++ 2 files changed, 57 insertions(+), 19 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/oliveyoung_items.py diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 1606637..9508204 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -18,9 +18,6 @@ def crawl_brand_metadata(self): ) self._get_brand_shop_url() - def crawl_items(self): - self._get_items() - @staticmethod def _get_oliveyoung_category_urls() -> list: """ @@ -94,19 +91,3 @@ def _get_brand_shop_url(self) -> 
None: self.brand_metadata[kor_brand_name].query_keyword.append(brand_name) except Exception: pass - - def _get_items(self) -> None: - """ - 각 브랜드의 제품 리스트, 해당 제품의 프로모션 여부 추가 - """ - for brand in self.brand_metadata.keys(): - brand_url = self.brand_metadata[brand].brand_shop_detail_url - brand_url_soup = get_soup(brand_url) - if brand_url_soup is None: - continue - item_dic = {} - for div in brand_url_soup.find_all('div', class_='prod-info'): - item_name = div.find('a').get('data-attr') - is_in_promotion = div.find('div', class_="discount") is not None - item_dic[item_name] = is_in_promotion - self.brand_metadata[brand].items = item_dic diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py new file mode 100644 index 0000000..6a00b75 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -0,0 +1,57 @@ +from selenium import webdriver +from selenium.webdriver.common.by import By +import requests +import time +import random + +from src.scrapper.models import brand_generator + +class Items: + def __init__(self): + pass + + def crawl_items(self): + self._get_items() + + def _get_items(self) -> None: + """ + 각 브랜드의 제품 정보 추가 - 제품ID, 제품명, url, 프로모션여부 + """ + for brand in self.brand_metadata.keys(): + brand_url = self.brand_metadata[brand].brand_shop_detail_url + driver = webdriver.Chrome() + driver.get(brand_url) + + # 1페이지 상품 정보 수집 + self._get_products(driver, brand) + + # 다음 페이지 버튼 찾기 + next_pages = driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + if next_pages: + for next_page in next_pages: + try: + driver.execute_script("arguments[0].click();", next_page) + time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 + response = requests.get(brand_url) + if response.status_code != 200: + time.sleep(10) + except: + time.sleep(10) + + self._get_products(driver, brand) + + def _get_products(self, driver, brand) -> None: + """ + 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 + """ + products = driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') + for product in products: + href = product.get_attribute('href') + data_ref_goodsno = product.get_attribute('data-ref-goodsno') + data_attr = product.get_attribute('data-attr') + is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > 0 + self.brand_metadata[brand].items[data_ref_goodsno] = { + 'item_name': data_attr, + 'href': href, + 'is_in_promotion': is_in_promotion + } From f5ebd9d7621ba2828b46cb5dba90e26a8d732ec3 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 15 Nov 2024 23:13:12 +0900 Subject: [PATCH 27/31] =?UTF-8?q?refactor=20:=20brand=20class=20=EB=B3=80?= =?UTF-8?q?=EC=88=98=EB=AA=85=20=EB=8B=A4=EB=93=AC=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/oliveyoung.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 9508204..57bb7ba 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -6,11 +6,8 @@ class Brand: - def __init__(self, brand_metadata=None) -> None: - if brand_metadata: - self.brand_metadata = brand_metadata - else: - self.brand_metadata = defaultdict(brand_generator) + def __init__(self) -> None: + self.brand_metadata = defaultdict(brand_generator) def 
crawl_brand_metadata(self): self._get_brand_in_each_category( @@ -81,13 +78,13 @@ def _get_brand_shop_url(self) -> None: for a_tag in total_brand_list_soup.find_all('a'): brand_code = a_tag.get('data-ref-onlbrndcd') if brand_code: - brand_name = a_tag.text - if brand_name in self.brand_metadata.keys(): # Kor brand name - self.brand_metadata[brand_name].brand_shop_detail_url = brand_base_url + brand_code - code_name[brand_code] = brand_name + brand = a_tag.text + if brand in self.brand_metadata.keys(): # Kor brand name + self.brand_metadata[brand].brand_shop_detail_url = brand_base_url + brand_code + code_name[brand_code] = brand else: # Eng brand name try: kor_brand_name = code_name[brand_code] - self.brand_metadata[kor_brand_name].query_keyword.append(brand_name) + self.brand_metadata[kor_brand_name].query_keyword.append(brand) except Exception: pass From dae319109a50a32b2cd6874edd632148ea5a4f64 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 15 Nov 2024 23:13:54 +0900 Subject: [PATCH 28/31] =?UTF-8?q?fix=20:=20item=20=EA=B8=B0=EB=B3=B8?= =?UTF-8?q?=EC=A0=95=EB=B3=B4=20=EC=88=98=EC=A7=91=EA=B8=B0=20=EB=8F=99?= =?UTF-8?q?=EC=9E=91=20=EC=98=A4=EB=A5=98=20=EC=88=98=EC=A0=95,=20dataclas?= =?UTF-8?q?s=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/models.py | 17 ++++- .../src/scrapper/oliveyoung_items.py | 68 +++++++++---------- 2 files changed, 48 insertions(+), 37 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/models.py b/brickstudy_ingestion/src/scrapper/models.py index a8eca32..5cbeda9 100644 --- a/brickstudy_ingestion/src/scrapper/models.py +++ b/brickstudy_ingestion/src/scrapper/models.py @@ -6,7 +6,6 @@ class OliveyoungBrand: query_keyword: List[str] # api 쿼리 키워드 - 브랜드 영문이름 등 brand_shop_detail_url: str # 브랜드관 url - items: Dict[str, bool] # 브랜드 제품 리스트 {제품명:할인여부} category: List[str] # 브랜드가 속한 카테고리 released_date: str = field(default_factory='2024/08/05') # 신제품 출시 일자 @@ -15,11 +14,25 @@ def brand_generator(): return OliveyoungBrand( [], '', - {}, [], '' ) +@dataclass +class OliveyoungItem: + item_name: str + item_detail_url: str + is_in_promotion: bool + reviews: List[str] + + +def oliveyoung_item_generator(): + return OliveyoungItem( + '', + '', + False, + [] + ) @dataclass class InstagramData: diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py index 6a00b75..139cb12 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -1,57 +1,55 @@ from selenium import webdriver from selenium.webdriver.common.by import By +from collections import defaultdict import requests import time import random -from src.scrapper.models import brand_generator +from src.scrapper.models import oliveyoung_item_generator class Items: - def __init__(self): - pass + def __init__(self, brand_name: str, brand_url: str): + self.brand = brand_name + self.brand_url = brand_url + self.data = defaultdict(oliveyoung_item_generator) + self.driver = webdriver.Chrome() def crawl_items(self): self._get_items() def _get_items(self) -> None: """ - 각 브랜드의 제품 정보 추가 - 제품ID, 제품명, url, 프로모션여부 + 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 """ - for brand in self.brand_metadata.keys(): - brand_url = self.brand_metadata[brand].brand_shop_detail_url - driver = webdriver.Chrome() - driver.get(brand_url) - - # 1페이지 상품 정보 수집 - self._get_products(driver, brand) - - 
# 다음 페이지 버튼 찾기 - next_pages = driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') - if next_pages: - for next_page in next_pages: - try: - driver.execute_script("arguments[0].click();", next_page) - time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 - response = requests.get(brand_url) - if response.status_code != 200: - time.sleep(10) - except: + self.driver.get(self.brand_url) + + # 1페이지 상품 정보 수집 + self._get_products() + + # 다음 페이지 버튼 찾기 + next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + if next_pages: + for next_page in next_pages: + try: + self.driver.execute_script("arguments[0].click();", next_page) + time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 + response = requests.get(self.brand_url) + if response.status_code != 200: time.sleep(10) + except: + time.sleep(10) - self._get_products(driver, brand) + self._get_products() - def _get_products(self, driver, brand) -> None: - """ - 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 - """ - products = driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') + def _get_products(self) -> None: + products = self.driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') for product in products: href = product.get_attribute('href') data_ref_goodsno = product.get_attribute('data-ref-goodsno') data_attr = product.get_attribute('data-attr') - is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > 0 - self.brand_metadata[brand].items[data_ref_goodsno] = { - 'item_name': data_attr, - 'href': href, - 'is_in_promotion': is_in_promotion - } + is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > + item_id = f"{self.brand}_{data_ref_goodsno}" + + self.data[item_id].item_name = data_attr + self.data[item_id].item_detail_url = href + self.data[item_id].is_in_promotion = is_in_promotion From 38a8c2ee8a657ca9eaf333c644361ed30f7a7049 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 19 Nov 2024 18:21:41 +0900 Subject: [PATCH 29/31] =?UTF-8?q?add=20:=20=EA=B0=81=20=EC=95=84=EC=9D=B4?= =?UTF-8?q?=ED=85=9C=20url=20=EB=93=A4=EC=96=B4=EA=B0=80=EC=84=9C=20?= =?UTF-8?q?=EB=A6=AC=EB=B7=B0=20=EC=88=98=EC=A7=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/oliveyoung_items.py | 87 +++++++++++++++++-- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py index 139cb12..9de3cdb 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -1,55 +1,126 @@ from selenium import webdriver from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +from datetime import datetime, timedelta from collections import defaultdict import requests import time import random from src.scrapper.models import oliveyoung_item_generator +from src.scrapper.utils import write_local_as_json class Items: def __init__(self, brand_name: str, brand_url: str): self.brand = brand_name self.brand_url = brand_url self.data = defaultdict(oliveyoung_item_generator) + self.item_id = None self.driver = webdriver.Chrome() def crawl_items(self): + # brand 페이지에서 전체 item 정보들 수집 + self.driver.get(self.brand_url) self._get_items() + # 각 item 페이지에서 리뷰 수집 + for item_id in 
self.data.keys(): + self.item_id = item_id + self.driver.get(self.data[item_id].item_detail_url) + self._get_reviews() + def _get_items(self) -> None: """ 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 """ - self.driver.get(self.brand_url) - - # 1페이지 상품 정보 수집 + # 최초 1페이지 상품 정보 수집 self._get_products() - - # 다음 페이지 버튼 찾기 + # 페이지 넘기면서 상품 정보 수집 next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') if next_pages: for next_page in next_pages: try: self.driver.execute_script("arguments[0].click();", next_page) - time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 + time.sleep(random.randrange(5, 10) + random.random()) response = requests.get(self.brand_url) if response.status_code != 200: time.sleep(10) except: time.sleep(10) - self._get_products() def _get_products(self) -> None: + """ + 아이템 element 찾아서 실제 수집 동작 + """ products = self.driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') for product in products: href = product.get_attribute('href') data_ref_goodsno = product.get_attribute('data-ref-goodsno') data_attr = product.get_attribute('data-attr') - is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > + is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > 0 item_id = f"{self.brand}_{data_ref_goodsno}" self.data[item_id].item_name = data_attr self.data[item_id].item_detail_url = href self.data[item_id].is_in_promotion = is_in_promotion + + def _get_reviews(self): + self.__click_review_button() + self.__click_latest_button() + + self.__get_reviews_with_page_moving() + + def __click_review_button(self) -> None: + try: + review_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a.goods_reputation[data-attr="상품상세^상품상세_SortingTab^리뷰"]') + self.driver.execute_script("arguments[0].scrollIntoView(true);", review_button_element) + self.driver.execute_script("arguments[0].click();", review_button_element) + time.sleep(random.randint(2, 4)) + except Exception as e: + print(f"리뷰 버튼 클릭 실패: {e}") + + def __click_latest_button(self) -> None: + try: + latest_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a[data-sort-type-code="latest"][data-attr="상품상세^리뷰정렬^최신순"]') + self.driver.execute_script("arguments[0].scrollIntoView(true);", latest_button_element) + self.driver.execute_script("arguments[0].click();", latest_button_element) + time.sleep(random.randint(2, 4)) + except Exception as e: + print(f"최신순 버튼 클릭 실패: {e}") + + def __get_reviews_with_page_moving(self): + self.__get_reviews_in_each_page() + next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + flag = True + while flag: + flag = len(next_pages) == 10 # next page가 있으면 계속 클릭하면서 수집 + for i in range(len(next_pages)): + try: + self.driver.execute_script("arguments[0].click();", next_pages[i]) + time.sleep(random.randrange(5, 10) + random.random()) + except: + print("exception block in page moving") + time.sleep(3) + self.__get_reviews_in_each_page() + next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + + def __get_reviews_in_each_page(self): + """ + 리뷰 element 찾아서 실제 수집 동작 + """ + review_elements = self.driver.find_elements(By.CLASS_NAME, 'txt_inner') + date_elements = self.driver.find_elements(By.CLASS_NAME, 'date') + for rev_elem, date_elem in zip(review_elements, date_elements): + self.data[self.item_id].reviews.append((rev_elem.text, date_elem.text)) + + +if __name__ == "__main__": + brand_name = "토리든" + brand_url = 
"https://www.oliveyoung.co.kr/store/display/getBrandShopDetail.do?onlBrndCd=A002820&t_page=%EC%83%81%ED%92%88%EC%83%81%EC%84%B8&t_click=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B4%80_%EC%83%81%EB%8B%A8&t_brand_name=%ED%86%A0%EB%A6%AC%EB%93%A0" + item_x = Items(brand_name, brand_url) + item_x.crawl_items() + + write_local_as_json(item_x.data, './', 'toridn') From d1d30773e25a6a19440c05c60250762d029ee29c Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 19 Nov 2024 18:23:45 +0900 Subject: [PATCH 30/31] =?UTF-8?q?fix=20:=20dummy=20-=20main=20=ED=95=A8?= =?UTF-8?q?=EC=88=98,=20=ED=95=A8=EC=88=98=20=EC=84=A4=EB=AA=85=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++- brickstudy_ingestion/src/scrapper/oliveyoung.py | 5 +++++ brickstudy_ingestion/src/scrapper/utils.py | 7 ++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 4c011e0..021a48c 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ log kafka-data aws_credentials brickstudy_ingestion/dags/viral/tmp -brickstudy_ingestion/src/scrapper/results \ No newline at end of file +brickstudy_ingestion/src/scrapper/results +.DS_Store \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 57bb7ba..90ed7b7 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -88,3 +88,8 @@ def _get_brand_shop_url(self) -> None: self.brand_metadata[kor_brand_name].query_keyword.append(brand) except Exception: pass + +if __name__ == "__main__": + brand = Brand() + brand.crawl_brand_metadata() + print(brand.brand_metadata) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index 935290c..22dcec7 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -100,7 +100,12 @@ def dict_partitioner(data: dict, level: int): start = end -def write_local_as_json(data, file_path, file_name): +def write_local_as_json(data: dict, file_path: str, file_name: str): + """ + data : dictionary with the dataclass value + file_path : directory string where the json file created + file_name : file name without extension + """ from dataclasses import asdict import json import os From 79675029bafd0a265ea70c01e04b02570c9303cf Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Wed, 20 Nov 2024 18:04:29 +0900 Subject: [PATCH 31/31] =?UTF-8?q?bug=20:=20oliveyoung=20item=20=EC=88=98?= =?UTF-8?q?=EC=A7=91=20=EC=BD=94=EB=93=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/oliveyoung_items.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py index 9de3cdb..ea8e29f 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -20,16 +20,17 @@ def __init__(self, brand_name: str, brand_url: str): self.item_id = None self.driver = webdriver.Chrome() - def crawl_items(self): + def crawl_total_items(self): # brand 페이지에서 전체 item 정보들 수집 self.driver.get(self.brand_url) self._get_items() + def crawl_reviews_in_each_items(self, item_id: str): # 각 item 페이지에서 리뷰 수집 - for item_id in self.data.keys(): - self.item_id = 
item_id - self.driver.get(self.data[item_id].item_detail_url) - self._get_reviews() + self.item_id = item_id + item_url = self.data[item_id].item_detail_url + self.driver.get(item_url) + self._get_reviews() def _get_items(self) -> None: """ @@ -43,12 +44,9 @@ def _get_items(self) -> None: for next_page in next_pages: try: self.driver.execute_script("arguments[0].click();", next_page) - time.sleep(random.randrange(5, 10) + random.random()) - response = requests.get(self.brand_url) - if response.status_code != 200: - time.sleep(10) + time.sleep(random.randrange(5, 7) + random.random()) except: - time.sleep(10) + time.sleep(2) self._get_products() def _get_products(self) -> None: @@ -78,7 +76,7 @@ def __click_review_button(self) -> None: review_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a.goods_reputation[data-attr="상품상세^상품상세_SortingTab^리뷰"]') self.driver.execute_script("arguments[0].scrollIntoView(true);", review_button_element) self.driver.execute_script("arguments[0].click();", review_button_element) - time.sleep(random.randint(2, 4)) + time.sleep(random.randint(1, 3)) except Exception as e: print(f"리뷰 버튼 클릭 실패: {e}") @@ -87,7 +85,7 @@ def __click_latest_button(self) -> None: latest_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a[data-sort-type-code="latest"][data-attr="상품상세^리뷰정렬^최신순"]') self.driver.execute_script("arguments[0].scrollIntoView(true);", latest_button_element) self.driver.execute_script("arguments[0].click();", latest_button_element) - time.sleep(random.randint(2, 4)) + time.sleep(random.randint(1, 3)) except Exception as e: print(f"최신순 버튼 클릭 실패: {e}") @@ -100,7 +98,7 @@ def __get_reviews_with_page_moving(self): for i in range(len(next_pages)): try: self.driver.execute_script("arguments[0].click();", next_pages[i]) - time.sleep(random.randrange(5, 10) + random.random()) + time.sleep(random.randrange(3, 5) + random.random()) except: print("exception block in page moving") time.sleep(3) @@ -121,6 +119,12 @@ def __get_reviews_in_each_page(self): brand_name = "토리든" brand_url = "https://www.oliveyoung.co.kr/store/display/getBrandShopDetail.do?onlBrndCd=A002820&t_page=%EC%83%81%ED%92%88%EC%83%81%EC%84%B8&t_click=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B4%80_%EC%83%81%EB%8B%A8&t_brand_name=%ED%86%A0%EB%A6%AC%EB%93%A0" item_x = Items(brand_name, brand_url) - item_x.crawl_items() + item_x.crawl_total_items() + print("crawl total item is done") + print(item_x.data) - write_local_as_json(item_x.data, './', 'toridn') + item_list = item_x.data.keys() + for idx, test_item in enumerate(item_list): + item_x.crawl_reviews_in_each_items(item_id=test_item) + if idx % 4 == 0: + write_local_as_json(item_x.data, './logs', f"{brand_name}_{idx}")
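
A usage note on the retry helper introduced in PATCH 22 (http_429_handler.py): get_url_with_tenacity_ retries only on HTTP 429, honours the Retry-After header when the server sends one, and raises tenacity.RetryError once its three attempts are exhausted, so any caller still needs its own fallback. The sketch below is a minimal, hypothetical wrapper assuming the module path used in the patches; probe_url is not part of the repository.

# Minimal usage sketch (hypothetical wrapper around the helper added in PATCH 22).
import urllib.error

from tenacity import RetryError

from src.scrapper.http_429_handler import get_url_with_tenacity_


def probe_url(url: str) -> bool:
    """Return True when the URL answers 200, waiting out HTTP 429 responses."""
    try:
        response = get_url_with_tenacity_(url)  # retries up to 3 times on 429
        return response.getcode() == 200
    except RetryError:
        # All three attempts hit 429; let the caller slow the crawl down instead.
        return False
    except urllib.error.URLError:
        # Non-429 network errors (404, DNS failure, ...) are not retried by the helper.
        return False


if __name__ == "__main__":
    print(probe_url("https://www.instagram.com/"))

A wrapper like this could gate driver.get calls the same way the ins_url.py and ins_data.py patches do before loading tag and post URLs, keeping rate-limit backoff in one place instead of scattering sleeps through the crawler.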