From 8aa41b8e5a83aa666ffd9f736883c9f73c14cad1 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 30 Aug 2024 03:46:20 +0900 Subject: [PATCH 01/31] fix : dockeroperator bug by adding proxy --- .../dags/viral/twitter_crawler.py | 9 ++++++--- docker-compose.yaml | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/brickstudy_ingestion/dags/viral/twitter_crawler.py b/brickstudy_ingestion/dags/viral/twitter_crawler.py index 90970f9..8efd411 100644 --- a/brickstudy_ingestion/dags/viral/twitter_crawler.py +++ b/brickstudy_ingestion/dags/viral/twitter_crawler.py @@ -1,4 +1,5 @@ from datetime import datetime +import os from airflow import DAG from airflow.models import Variable @@ -20,11 +21,11 @@ } # ========================================= - OUTPUT_FILENAME = "test.csv" SEARCH_KEYWORD = "enhypen" LIMIT = 10 TOKEN = Variable.get("TWITTER_CRAWLER_AUTH_TOKEN_PASSWORD") +HOST_BASE_PATH = '/Users/seoyeongkim/Documents/ETL' with DAG( dag_id=DAG_ID, @@ -36,15 +37,17 @@ task_id='t_docker', image='brickstudy/twitter_crawler:latest', container_name='twitter_crawler', + api_version='1.37', auto_remove=True, mount_tmp_dir=False, mounts=[ - Mount(source="/opt/airflow/logs/tweets-data", target="/app/tweets-data", type="bind"), + Mount(source=f"{HOST_BASE_PATH}/logs", target="/app/tweets-data", type="bind"), ], command=[ + "bash", "-c", f"npx --yes tweet-harvest@latest -o {OUTPUT_FILENAME} -s {SEARCH_KEYWORD} -l {LIMIT} --token {TOKEN}" ], - docker_url='unix://var/run/docker.sock', + docker_url='tcp://docker-socket-proxy:2375', network_mode='bridge', ) diff --git a/docker-compose.yaml b/docker-compose.yaml index 0912597..8b4abb5 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -69,6 +69,7 @@ x-airflow-common: # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server # yamllint enable rule:line-length AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' + AIRFLOW__CORE__ENABLE_XCOM_PICKLING: 'true' # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks # for other purpose (development, test and especially production usage) build/extend Airflow image. 
_PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-apache-airflow[postgres,virtualenv,apache-airflow-providers-mysql]} @@ -79,7 +80,8 @@ x-airflow-common: - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins - user: "${AIRFLOW_UID:-50000}:0" + # user: "${AIRFLOW_UID:-50000}:0" + user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" depends_on: &airflow-common-depends-on redis: @@ -343,6 +345,20 @@ services: airflow-init: condition: service_completed_successfully + # Proxy container for docker socket + # Forward DockerOperator TCP connection to Host docker daemon + docker-socket-proxy: + image: tecnativa/docker-socket-proxy:0.1.1 + environment: + CONTAINERS: 1 + IMAGES: 1 + AUTH: 1 + POST: 1 + privileged: true + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro # read only + restart: always + # ======================================== # Kafka infra zookeeper: From 89233f234b09ecb30799b081a1daed252f2e46b3 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 13:44:53 +0900 Subject: [PATCH 02/31] =?UTF-8?q?feature=20:=20get=5Fsoup=20=EC=9C=A0?= =?UTF-8?q?=ED=8B=B8=EC=AA=BD=EC=9C=BC=EB=A1=9C=20=EC=9D=B4=EB=8F=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/__init__.py | 34 ---------------- .../src/scrapper/oliveyoung.py | 4 +- brickstudy_ingestion/src/scrapper/utils.py | 39 +++++++++++++++++++ 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/__init__.py b/brickstudy_ingestion/src/scrapper/__init__.py index 673266f..fd40910 100644 --- a/brickstudy_ingestion/src/scrapper/__init__.py +++ b/brickstudy_ingestion/src/scrapper/__init__.py @@ -1,38 +1,4 @@ -import urllib -from urllib.request import urlopen -from urllib.error import HTTPError, URLError -from bs4 import BeautifulSoup -import random -import time -from src.common.exception import ExtractError -def get_soup(url: str = None): - user_agent_lst = ['Googlebot', 'Yeti', 'Daumoa', 'Twitterbot'] - user_agent = user_agent_lst[random.randint(0, len(user_agent_lst) - 1)] - headers = {'User-Agent': user_agent} - try: - req = urllib.request.Request(url, headers=headers) - page = urlopen(req) - html = page.read().decode("utf-8") - soup = BeautifulSoup(html, "html.parser") - except (HTTPError, URLError) as e: - err = ExtractError( - code=000, - message=f"**{url}** HTTPError/URLError. Sleep 5 and continue.", - log=e - ) - time.sleep(5) # TODO 이 경우 해당 url에 대해 재실행 필요 - except (ValueError) as e: - err = ExtractError( - code=000, - message=f"**{url}** ValueError. Ignore this url parameter.", - log=e - ) - print(err) - soup = None # TODO 해당 url 무시 - else: - time.sleep(random.random()) - return soup diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 5dd94bd..f912a12 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -1,7 +1,7 @@ from collections import defaultdict from datetime import datetime -from . 
import get_soup +from src.scrapper.utils import get_soup from src.scrapper.models import brand_generator @@ -101,7 +101,7 @@ def _get_items(self) -> None: """ for brand in self.brand_metadata.keys(): brand_url = self.brand_metadata[brand].brand_shop_detail_url - brand_url_soup = get_soup(brand_url) + brand_url_soup = get_soup(brand_url)==----------0-0 if brand_url_soup is None: continue item_dic = {} diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index ca9d141..5d151f4 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -1,3 +1,42 @@ +def get_soup(url: str = None): + import urllib + from urllib.request import urlopen + from urllib.error import HTTPError, URLError + from bs4 import BeautifulSoup + import random + import time + + from src.common.exception import ExtractError + + user_agent_lst = ['Googlebot', 'Yeti', 'Daumoa', 'Twitterbot'] + user_agent = user_agent_lst[random.randint(0, len(user_agent_lst) - 1)] + headers = {'User-Agent': user_agent} + + try: + req = urllib.request.Request(url, headers=headers) + page = urlopen(req) + html = page.read().decode("utf-8") + soup = BeautifulSoup(html, "html.parser") + except (HTTPError, URLError) as e: + err = ExtractError( + code=000, + message=f"**{url}** HTTPError/URLError. Sleep 5 and continue.", + log=e + ) + time.sleep(5) # TODO 이 경우 해당 url에 대해 재실행 필요 + except (ValueError) as e: + err = ExtractError( + code=000, + message=f"**{url}** ValueError. Ignore this url parameter.", + log=e + ) + print(err) + soup = None # TODO 해당 url 무시 + else: + time.sleep(random.random()) + return soup + + def dict_partitioner(data: dict, level: int): total_n = len(data) partition_n = total_n // level From fb0f20cdb6cfe60f892ee15fab58ae21fd31d2b6 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 13:45:13 +0900 Subject: [PATCH 03/31] fix : readme --- brickstudy_ingestion/src/scrapper/readme.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/readme.md b/brickstudy_ingestion/src/scrapper/readme.md index 4448800..65d9d98 100644 --- a/brickstudy_ingestion/src/scrapper/readme.md +++ b/brickstudy_ingestion/src/scrapper/readme.md @@ -2,11 +2,13 @@ ``` brickstudy_ingestion/src/scrapper -├── browser.py # selenium으로 크롤링에 필요한 configs, utils 정의 모듈 -├── inscrawler.py # instagram crawler main 모듈 -├── models.py -├── oliveyoung.py # oliveyoung scrapper main 모듈 -└── utils.py # 공통 메소드 +├── __init__.py # scrapper 모듈 entrypoint +├── Dockerfile # [Twitter] twitter crawler 동작 환경 +├── browser.py # [Instagram] selenium으로 크롤링하는데에 사용되는 로직 정의 모듈 +├── inscrawler.py # [Instagram]instagram crawler main 모듈 +├── models.py # [Oliveyoung] 올리브영 브랜드 수집 데이터 구조 +├── oliveyoung.py # [Oliveyoung] scrapper main 모듈 +└── utils.py # 공통 유틸리티 메소드 ``` From b763d17210a688a50209150b326cbbb8e1596d1e Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 16:02:47 +0900 Subject: [PATCH 04/31] fix : bug- typo --- brickstudy_ingestion/src/scrapper/oliveyoung.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index f912a12..1606637 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -101,7 +101,7 @@ def _get_items(self) -> None: """ for brand in self.brand_metadata.keys(): brand_url = self.brand_metadata[brand].brand_shop_detail_url - 
brand_url_soup = get_soup(brand_url)==----------0-0 + brand_url_soup = get_soup(brand_url) if brand_url_soup is None: continue item_dic = {} From aba10541cf83b2cba5036c211f8ce3c5621fc2ea Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 17:48:11 +0900 Subject: [PATCH 05/31] =?UTF-8?q?ADD=20:=20=EC=9D=B8=EC=8A=A4=ED=83=80?= =?UTF-8?q?=EA=B7=B8=EB=9E=A8=20url=20=EC=88=98=EC=A7=91=20=EB=AA=A8?= =?UTF-8?q?=EB=93=88=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_url.py | 66 +++++++++++++++++ .../src/scrapper/inscrawler.py | 71 +++++++++---------- brickstudy_ingestion/src/scrapper/models.py | 17 +++++ 3 files changed, 117 insertions(+), 37 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/ins_url.py diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py new file mode 100644 index 0000000..2181631 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -0,0 +1,66 @@ +import time +from bs4 import BeautifulSoup +from selenium.webdriver.common.by import By +import urllib +import re + +from src.scrapper.inscrawler import InsCrawler + + +class InsURLCrawler(InsCrawler): + def __init__(self): + super().__init__() + + def get_urls(self, keyword: str = None): + if keyword is not None: # execute with given keyword + self._fetch_url_data(keyword) + else: # execute with entire keywords + for keyword in self.keywords: + self._fetch_url_data(keyword) + + def _fetch_url_data(self, keyword): + word = urllib.parse.quote(keyword) + word_url = f'https://www.instagram.com/explore/tags/{word}/' + self.driver.get(word_url) + + try: + time.sleep(5) + js = 'window.scrollBy(0,1000)' + self.driver.execute_script(js) + html = self.driver.page_source + soup = BeautifulSoup(html, 'lxml') + + divimg = soup.find_all('img', {'class': 'x5yr21d xu96u03 x10l6tqk x13vifvy x87ps6o xh8yej3'}) + if not divimg: + print('이미지를 찾을 수 없습니다.') + raise Exception + + for div in divimg: + content = div.get('alt') + if not content: + print('내용이 없습니다.') + continue + + a = div.find_parent('a') + if a is None: + print('게시물 링크가 잘못되었습니다.') + continue + urlto = a.get('href') + if urlto is None: + print('게시물 링크가 없습니다.') + continue + totalurl = 'https://www.instagram.com' + urlto + self.data[urlto].brand = keyword + self.data[urlto].post_url = totalurl + + modified_content = re.sub(r'\s*\n\s*', ' ', content) + self.data[urlto].full_text = modified_content + + print(f'페이지 {keyword}에서 데이터를 가져오는 중...') + time.sleep(5) + + except Exception as e: + print(e) + print('오류 발생') + + print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 25de1e0..6ffaee4 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,43 +1,40 @@ import os - -from src.common.exception import RetryException - -from browser import Browser -from utils import retry - - -class InsCrawler(): - URL = "https://www.instagram.com" - RETRY_LIMIT = 10 - - def __init__(self, has_screen=False): - super(InsCrawler, self).__init__() - self.browser = Browser(has_screen) - self.page_height = 0 +import time +from selenium import webdriver +from selenium.webdriver.common.by import By +import json +from collections import defaultdict + +from src.scrapper.models import inst_generator + + +class InsCrawler: + def 
__init__(self): + #TODO base_dir 받는 부분 수정 + self.base_dir = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper' + self.user_id, self.password, self.keywords, self.iter = self.load_config() + self.data = defaultdict(inst_generator) + self.driver = webdriver.Chrome() self.login() - def login(self): - browser = self.browser - url = "%s/accounts/login/" % (InsCrawler.URL) - browser.get(url) - u_input = browser.find_one('input[name="username"]') - u_input.send_keys(os.getenv('INSTAGRAM_ID')) - p_input = browser.find_one('input[name="password"]') - p_input.send_keys(os.getenv('INSTAGRAM_PWD')) - - login_btn = browser.find_one('button[type="submit"]') - login_btn.click() + def load_config(self): + with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: + config = json.load(f) - @retry() - def check_login(): - if browser.find_one('input[name="username"]'): - raise RetryException() + username = config['login']['username'] + password = config['login']['password'] + keywords = config['keywords'] + iter = config['iter'] - check_login() + return username, password, keywords, iter - def get_latest_posts_by_tag(self, tag, num): - tag = 'enhypen' - url = f"{InsCrawler.URL}/explore/search/keyword/?q=%23{tag}" - self.browser.get(url) - self.browser.scroll_down() - #TODO 게시물 클릭, 컨텐츠 가져오기 \ No newline at end of file + def login(self): + # Instagram 접속 및 로그인 + url = 'https://www.instagram.com/' + self.driver.get(url) + time.sleep(6) + user = self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input') + user.send_keys(self.user_id) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() + time.sleep(80) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/models.py b/brickstudy_ingestion/src/scrapper/models.py index 7c5954e..a8eca32 100644 --- a/brickstudy_ingestion/src/scrapper/models.py +++ b/brickstudy_ingestion/src/scrapper/models.py @@ -19,3 +19,20 @@ def brand_generator(): [], '' ) + + +@dataclass +class InstagramData: + brand: str + post_url: str + full_text: str + username: str + like: int + saved_imgs: str + date: str + + +def inst_generator(): + return InstagramData( + '', '', '', '', 0, '', '' + ) From 94e2d0dc5109905a48d89dbd9ddb780d75a29f55 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:26:44 +0900 Subject: [PATCH 06/31] fix : close driver after url method returned --- brickstudy_ingestion/src/scrapper/ins_url.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 2181631..830b1a3 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -1,6 +1,5 @@ import time from bs4 import BeautifulSoup -from selenium.webdriver.common.by import By import urllib import re @@ -63,4 +62,5 @@ def _fetch_url_data(self, keyword): print(e) print('오류 발생') - print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file + print(f'키워드 {keyword}의 URL 정보 수집 완료.') + self.driver.close() \ No newline at end of file From f299ae2384fe989b28ec65f85cca1026b8ff68ed Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:27:11 +0900 Subject: [PATCH 07/31] add : materialize method in inscrawler --- .../src/scrapper/inscrawler.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) 
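[Reviewer note, not part of the patch.] The materialize() method added in this commit dumps self.data.values() through a single csv.writer.writerow() call, so every crawled post ends up on one CSV row. For comparison, a minimal per-record sketch is shown below. It assumes the InstagramData dataclass introduced in src/scrapper/models.py; write_posts_csv and out_path are illustrative names only, not code from this series.

import csv
from dataclasses import asdict, fields

from src.scrapper.models import InstagramData


def write_posts_csv(data: dict, out_path: str) -> None:
    """Write one header row plus one CSV row per crawled InstagramData record."""
    fieldnames = [f.name for f in fields(InstagramData)]
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for record in data.values():
            writer.writerow(asdict(record))

csv.DictWriter keeps the columns tied to the dataclass fields, so the header stays consistent if InstagramData later gains or loses attributes.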
diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 6ffaee4..f563beb 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,4 +1,3 @@ -import os import time from selenium import webdriver from selenium.webdriver.common.by import By @@ -10,8 +9,9 @@ class InsCrawler: def __init__(self): - #TODO base_dir 받는 부분 수정 - self.base_dir = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper' + # TODO proj_path(실행환경의 project 절대경로) 받는 부분 수정 환경변수 설정 필요 + proj_path = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion' + self.base_path = f'{proj_path}/src/scrapper' self.user_id, self.password, self.keywords, self.iter = self.load_config() self.data = defaultdict(inst_generator) self.driver = webdriver.Chrome() @@ -37,4 +37,15 @@ def login(self): user.send_keys(self.user_id) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(80) \ No newline at end of file + time.sleep(80) + + def materialize(self): + """ + self.data to csv file + """ + from src.scrapper.utils import current_datetime_getter + import csv + + with open(f"{self.base_path}/insdata_{current_datetime_getter()}.csv", 'w') as f: + w = csv.writer(f) + w.writerow(self.data.values()) \ No newline at end of file From 9ea3206260952ac8ef91a17c9bd4a92c894c83d5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:30:24 +0900 Subject: [PATCH 08/31] add : generate uid based on current datetime --- brickstudy_ingestion/src/scrapper/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index 5d151f4..5afbd2a 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -110,4 +110,13 @@ def wrapped_f(*args, **kwargs): return wrapped_f - return wrap \ No newline at end of file + return wrap + + +def current_datetime_getter(): + import pytz + from datetime import datetime + kst = pytz.timezone('Asia/Seoul') + current_time = datetime.now(kst) + current_datetime = current_time.strftime("%Y%m%d_%H%M%S") + return current_datetime \ No newline at end of file From 639e49fc408d8f619bb757e6c1e2e276d19c39d5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:31:21 +0900 Subject: [PATCH 09/31] add : crawling post data based on url(tested) --- .gitignore | 4 +- brickstudy_ingestion/src/scrapper/ins_data.py | 138 ++++++++++++++++++ .../tests/scrapper/test_instagram.py | 13 ++ 3 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 brickstudy_ingestion/src/scrapper/ins_data.py create mode 100644 brickstudy_ingestion/tests/scrapper/test_instagram.py diff --git a/.gitignore b/.gitignore index ace90b6..4c011e0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ log # kafka data kafka-data -aws_credentials \ No newline at end of file +aws_credentials +brickstudy_ingestion/dags/viral/tmp +brickstudy_ingestion/src/scrapper/results \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py new file mode 100644 index 0000000..2a5ecb6 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -0,0 +1,138 @@ +import time +import pandas as pd +from bs4 import BeautifulSoup +from 
selenium.webdriver.common.by import By +import requests +import re + +from src.scrapper.inscrawler import InsCrawler + + +class InsDataCrawler(InsCrawler): + def __init__(self, data): + super().__init__() + self.data = data + self.headers = { + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36' + } + + def get_post_data(self): + results = f'{self.base_path}/results/data.txt' + with open(results, 'r') as f: + post_crawled_data = {line.strip() for line in f} + + for idx, (key, val) in enumerate(self.data.items()): + + post_url = val.post_url + + if post_url in post_crawled_data: + continue + + self.driver.get(post_url) + print(idx, '. ' + post_url) + + try: + time.sleep(5) + html = self.driver.page_source + soup = BeautifulSoup(html, 'lxml') + + # 작성자 + username = soup.find('span', {'class': '_ap3a _aaco _aacw _aacx _aad7 _aade'}).text + print(username, end=' ') + self.data[key].username = username + + # 작성일자 + date = soup.find_all('time')[-1]['datetime'][:10] + print(date, end=' ') + self.data[key].date = date + + # like 개수 + try: + like = soup.find('span', {'class': 'html-span xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x1hl2dhg x16tdsg8 x1vvkbs'}).text + except Exception: + like = 'no data' # ~~외 여러 명이 좋아합니다. 같은 경우 + print(like) + self.data[key].like = like + + # 이미지 저장 + images = [] + img_urls = set() + images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) + + for i in range(len(images)): + for j in range(len(images[i])): + + if j >= 3: # 4번째부터 타 게시물의 썸네일 이미지 + break + + alt = images[i][j].get_attribute('alt') + check = re.findall(r'by .+? on', alt) # 타 게시물인지 아닌지 검사 + + if check != []: + img_urls.add(images[i][j].get_attribute('src')) + + # 이미지 끝까지 넘기면서 url 추출 + try: + while True: + time.sleep(3) + + self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 + images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) + + for i in range(len(images)): + for j in range(len(images[i])): + + if j >= 3: # 4번째부터 타 게시물의 썸네일 이미지 + break + + alt = images[i][j].get_attribute('alt') + check = re.findall(r'by .+? on', alt) # 타 게시물인지 아닌지 검사 + + if check != []: + img_urls.add(images[i][j].get_attribute('src')) + + images.clear() + + except Exception: + print('더 이상 넘길 이미지 없음') + + img_urls = list(img_urls) + print(img_urls) + images.clear() + + saved_imgs = set() + for img_url in img_urls: + # 이미지만 고려. 우선 비디오 타입은 고려하지 않음. + pattern = r'\/v\/[^\/]+\/([^\/\?]+)\.(jpg|png|webp|heic)' + match = re.search(pattern, img_url) + if match: + img_name = match.group(1) + '.' 
+ match.group(2) + else: + print('파일을 찾을 수 없거나 jpg 혹은 png, webp, heic 파일이 아님.') + continue + + if img_name not in saved_imgs: + response = requests.get(img_url, headers=self.headers, timeout=20) + + with open(f'{self.base_path}/results/images/' + img_name, 'wb') as f: + f.write(response.content) + + saved_imgs.add(img_name) + + time.sleep(.5) + + print(f"총 {len(saved_imgs)} 장의 이미지 저장") + self.data[key].saved_imgs = str(list(saved_imgs)) + + time.sleep(5) + + except Exception as e: + print(e) + print('오류 발생') + + # 수집 완료된 데이터 키값(post url unique id) 저장 + with open(results, 'a') as f: + for key in self.data.keys(): + f.write(key + '\n') + + self.driver.close() diff --git a/brickstudy_ingestion/tests/scrapper/test_instagram.py b/brickstudy_ingestion/tests/scrapper/test_instagram.py new file mode 100644 index 0000000..a1fee10 --- /dev/null +++ b/brickstudy_ingestion/tests/scrapper/test_instagram.py @@ -0,0 +1,13 @@ +from src.scrapper.ins_url import InsURLCrawler +from src.scrapper.ins_data import InsDataCrawler + + +def test_get_urls(): + keyword = '올리브영' + url_crawler = InsURLCrawler() + url_crawler.get_urls(keyword) + url_crawler.materialize() + + crawler = InsDataCrawler(url_crawler.data) + crawler.get_post_data() + crawler.materialize() \ No newline at end of file From ddeeb4570f7b804d4e210914516313cc4f6d7e64 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 19:32:49 +0900 Subject: [PATCH 10/31] delete : browser class --- brickstudy_ingestion/src/scrapper/browser.py | 103 ------------------- 1 file changed, 103 deletions(-) delete mode 100644 brickstudy_ingestion/src/scrapper/browser.py diff --git a/brickstudy_ingestion/src/scrapper/browser.py b/brickstudy_ingestion/src/scrapper/browser.py deleted file mode 100644 index d2bc7bf..0000000 --- a/brickstudy_ingestion/src/scrapper/browser.py +++ /dev/null @@ -1,103 +0,0 @@ -import os - -from selenium import webdriver -from selenium.common.exceptions import NoSuchElementException -from selenium.common.exceptions import TimeoutException -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait -# from selenium.webdriver.common.keys import Keys -from fake_useragent import UserAgent - -from utils import randmized_sleep - - -class Browser: - def __init__(self, has_screen): - dir_path = os.path.dirname(os.path.realpath(__file__)) - service_args = ["--ignore-ssl-errors=true"] - chrome_options = Options() - if not has_screen: - chrome_options.add_argument("--headless") - chrome_options.add_argument("--start-maximized") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("user-agent=" + UserAgent().random) - self.driver = webdriver.Chrome( - executable_path=f"{dir_path}/bin/chromedriver", - service_args=service_args, - chrome_options=chrome_options, - ) - self.driver.implicitly_wait(5) - - @property - def page_height(self): - return self.driver.execute_script("return document.body.scrollHeight") - - def get(self, url): - self.driver.get(url) - - @property - def current_url(self): - return self.driver.current_url - - def implicitly_wait(self, t): - self.driver.implicitly_wait(t) - - def find_one(self, css_selector, elem=None, waittime=0): - obj = elem or self.driver - - if waittime: - WebDriverWait(obj, waittime).until( - EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)) - ) - - try: - return obj.find_element(By.CSS_SELECTOR, css_selector) - 
except NoSuchElementException: - return None - - def find(self, css_selector, elem=None, waittime=0): - obj = elem or self.driver - - try: - if waittime: - WebDriverWait(obj, waittime).until( - EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)) - ) - except TimeoutException: - return None - - try: - return obj.find_elements(By.CSS_SELECTOR, css_selector) - except NoSuchElementException: - return None - - def scroll_down(self, wait=0.3): - self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)") - randmized_sleep(wait) - - def scroll_up(self, offset=-1, wait=2): - if offset == -1: - self.driver.execute_script("window.scrollTo(0, 0)") - else: - self.driver.execute_script("window.scrollBy(0, -%s)" % offset) - randmized_sleep(wait) - - def js_click(self, elem): - self.driver.execute_script("arguments[0].click();", elem) - - def open_new_tab(self, url): - self.driver.execute_script("window.open('%s');" % url) - self.driver.switch_to.window(self.driver.window_handles[1]) - - def close_current_tab(self): - self.driver.close() - - self.driver.switch_to.window(self.driver.window_handles[0]) - - def __del__(self): - try: - self.driver.quit() - except Exception: - pass From 87248de663b6addb59bf0c656f93e8e75623ad17 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 22:56:17 +0900 Subject: [PATCH 11/31] =?UTF-8?q?fix=20:=20inscralwer=20=EC=83=9D=EC=84=B1?= =?UTF-8?q?=EC=9E=90=20dev,=20prod=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_data.py | 1 - .../src/scrapper/inscrawler.py | 33 +++++++++++-------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index 2a5ecb6..e4ec1b9 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -1,5 +1,4 @@ import time -import pandas as pd from bs4 import BeautifulSoup from selenium.webdriver.common.by import By import requests diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index f563beb..6ae11c0 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,3 +1,4 @@ +import os import time from selenium import webdriver from selenium.webdriver.common.by import By @@ -8,25 +9,29 @@ class InsCrawler: - def __init__(self): - # TODO proj_path(실행환경의 project 절대경로) 받는 부분 수정 환경변수 설정 필요 - proj_path = '/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion' + def __init__(self, dev: bool = False): + if dev: + proj_path = f'{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('brickstudy_ingestion')])}/brickstudy_ingestion' + else: + proj_path = '/opt/airflow/brickstudy_ingestion' self.base_path = f'{proj_path}/src/scrapper' - self.user_id, self.password, self.keywords, self.iter = self.load_config() + + self.user_id, self.password = self.load_config(dev=dev) self.data = defaultdict(inst_generator) self.driver = webdriver.Chrome() self.login() - def load_config(self): - with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: - config = json.load(f) - - username = config['login']['username'] - password = config['login']['password'] - keywords = config['keywords'] - iter = config['iter'] - - return username, password, keywords, iter + def load_config(self, dev: bool = False): + if dev: + with 
open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: + config = json.load(f) + + username = config['login']['username'] + password = config['login']['password'] + else: + username = os.environ('INSTAGRAM_ID') + password = os.environ('INSTAGRAM_PASSWORD') + return (username, password) def login(self): # Instagram 접속 및 로그인 From 0ab1ba264ecb7f24e2f76614dac66c4ebcb575ef Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 22:56:48 +0900 Subject: [PATCH 12/31] =?UTF-8?q?add=20:=20=EC=A0=95=ED=95=B4=EC=A7=84=20?= =?UTF-8?q?=EC=B9=B4=ED=85=8C=EA=B3=A0=EB=A6=AC=EC=97=90=20=ED=95=B4?= =?UTF-8?q?=EB=8B=B9=EB=90=98=EB=8A=94=20=EB=B8=8C=EB=9E=9C=EB=93=9C?= =?UTF-8?q?=EB=AA=85=EB=A7=8C=20=EC=B6=94=EC=B6=9C=ED=95=98=EB=8A=94=20?= =?UTF-8?q?=EB=AA=A8=EB=93=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/brand_name_getter.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 brickstudy_ingestion/src/scrapper/brand_name_getter.py diff --git a/brickstudy_ingestion/src/scrapper/brand_name_getter.py b/brickstudy_ingestion/src/scrapper/brand_name_getter.py new file mode 100644 index 0000000..8ff8e40 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/brand_name_getter.py @@ -0,0 +1,62 @@ +import json + +from src.common.aws.s3_uploader import S3Uploader +from src.scrapper.models import OliveyoungBrand + + +def get_latest_dt(): + return '2024-08-20' + + +def category_checker(category: list) -> bool: + """ + standard 기준 카테고리에 하나라도 속해있으면 True 반환, 아니라면 False 반환 + """ + compare = set([c.split('_')[0] for c in category]) + standard = {'메이크업', '스킨케어', '향수', '헤어케어', '바디케어', '마스크팩', + '클렌징', '선케어', '더모코스메틱', '맨즈케어'} + if len(compare & standard) > 0: + return True + return False + + +def filter_brand(file_content: str) -> list: + filtered = [] + for line in file_content.split('\n'): + if line == '': + break + print(f"line: {line}") + for brandname, brandinfo in json.loads(line).items(): + brandinfo_dic = OliveyoungBrand(**brandinfo) + if category_checker(brandinfo_dic.category): + filtered.append(brandname) + return filtered + + +def get_brand_list_fr_s3(): + s3_client = S3Uploader().s3_client + bucket = 'brickstudy' + + def file_keys_getter(): + paginator = s3_client.get_paginator('list_objects_v2') + prefix = f"bronze/viral/oliveyoung/{get_latest_dt()}" + file_key_lst = [] + for page in paginator.paginate( + Bucket=bucket, + Prefix=prefix + ): + if 'Contents' in page: + for obj in page['Contents']: + file_key_lst.append(obj['Key']) + return file_key_lst + + file_key_lst = file_keys_getter() + filtered_brand_lst = [] + for filekey in file_key_lst: + response = s3_client.get_object( + Bucket=bucket, + Key=filekey + ) + file_content = response['Body'].read().decode('utf-8') + filtered_brand_lst += filter_brand(file_content) + return filtered_brand_lst \ No newline at end of file From a66b2c5e4ba6d1533d7f33da9302dbd49041fad5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 3 Sep 2024 23:49:55 +0900 Subject: [PATCH 13/31] ADD : instagram crawling dag --- .../dags/viral/instagram_crawler.py | 69 +++++++++++++++++++ .../src/scrapper/inscrawler.py | 3 +- 2 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 brickstudy_ingestion/dags/viral/instagram_crawler.py diff --git a/brickstudy_ingestion/dags/viral/instagram_crawler.py b/brickstudy_ingestion/dags/viral/instagram_crawler.py new file mode 100644 index 0000000..1672b18 --- /dev/null +++ 
b/brickstudy_ingestion/dags/viral/instagram_crawler.py @@ -0,0 +1,69 @@ +from datetime import timedelta + +from airflow import DAG +from airflow.utils.dates import days_ago +from airflow.decorators import task +from airflow.operators.python import PythonVirtualenvOperator + +from src.scrapper.brand_name_getter import get_brand_list_fr_s3 + +# ========================================= +# Change parameter +DAG_ID = "bronze_viral_instagram" +TARGET_PLATFORM = 'instagram' + +# Set aiflow setting +default_args = { + 'owner': 'brickstudy', + 'start_date': days_ago(0), + 'retries': 1, + 'retry_delay': timedelta(minutes=1), + # 'on_failure_callback': on_failure_callback, +} +# ========================================= + + +def entrypoint(): + import logging + import multiprocess + from src.common.kafka.utils import Kafka + from src.scrapper.inscrawler import InsCrawler + from src.scrapper.ins_url import InsURLCrawler + from src.scrapper.ins_data import InsDataCrawler + + brand_lst = get_brand_list_fr_s3 + CONCURRENCY_LEVEL = multiprocess.cpu_count() + + def crawl_instagram(keywords: tuple): + crawler = InsURLCrawler(InsCrawler(keywords=keywords)).get_urls() + post_crawler = InsDataCrawler(crawler.data) + post_crawler.get_post_data() + producer.send_data_to_kafka( + kafka_topic='instagram', + data=post_crawler.data + ) + + try: + producer = Kafka() + with multiprocess.Pool(CONCURRENCY_LEVEL) as p: + p.map(crawl_instagram, brand_lst) + except Exception as e: + logging.error("***entrypoint error***", e) + raise + +with DAG( + dag_id=DAG_ID, + default_args=default_args, + schedule_interval='@daily', + catchup=False +): + t_crawl_ins = PythonVirtualenvOperator( + task_id='crawl_instagram_based_on_keyword', + python_version='3.10', + system_site_packages=False, + requirements=['selenium==4.24.0', 'webdriver-manager==4.0.2', + 'bs4==0.0.2', 'beautifulsoup4==4.12.3', + 'lxml==5.3.0', 'pytz==2024.1', + "python-dotenv==0.19.0", "multiprocess", "kafka-python"], + python_callable=entrypoint + ) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 6ae11c0..537e947 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -9,7 +9,7 @@ class InsCrawler: - def __init__(self, dev: bool = False): + def __init__(self, keywords: list = None, dev: bool = False): if dev: proj_path = f'{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('brickstudy_ingestion')])}/brickstudy_ingestion' else: @@ -17,6 +17,7 @@ def __init__(self, dev: bool = False): self.base_path = f'{proj_path}/src/scrapper' self.user_id, self.password = self.load_config(dev=dev) + self.keywords = keywords self.data = defaultdict(inst_generator) self.driver = webdriver.Chrome() self.login() From 932b294c987ed6c41f428646c398e2a1b5631e95 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Wed, 4 Sep 2024 12:53:11 +0900 Subject: [PATCH 14/31] =?UTF-8?q?fix=20:=20instagram=20=EB=AA=A8=EB=93=88?= =?UTF-8?q?=20=EC=83=9D=EC=84=B1=EC=9E=90=EC=97=90=EC=84=9C=20=EB=B0=9B?= =?UTF-8?q?=EB=8A=94=20arg=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_data.py | 6 ++- brickstudy_ingestion/src/scrapper/ins_url.py | 7 ++-- .../src/scrapper/inscrawler.py | 39 ++++++++++++++----- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py 
b/brickstudy_ingestion/src/scrapper/ins_data.py index e4ec1b9..d00cef0 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -8,8 +8,10 @@ class InsDataCrawler(InsCrawler): - def __init__(self, data): - super().__init__() + def __init__(self, + driver, data, + dev: bool = False): + super().__init__(dev=dev, driver=driver) self.data = data self.headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36' diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 830b1a3..e87c1af 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -7,8 +7,8 @@ class InsURLCrawler(InsCrawler): - def __init__(self): - super().__init__() + def __init__(self, keywords: list = None, dev: bool = False): + super().__init__(keywords, dev) def get_urls(self, keyword: str = None): if keyword is not None: # execute with given keyword @@ -62,5 +62,4 @@ def _fetch_url_data(self, keyword): print(e) print('오류 발생') - print(f'키워드 {keyword}의 URL 정보 수집 완료.') - self.driver.close() \ No newline at end of file + print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 537e947..1c6922d 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,6 +1,7 @@ import os import time from selenium import webdriver +from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By import json from collections import defaultdict @@ -9,18 +10,38 @@ class InsCrawler: - def __init__(self, keywords: list = None, dev: bool = False): + def __init__(self, + keywords: list = None, + dev: bool = False, + driver: webdriver = None): if dev: - proj_path = f'{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('brickstudy_ingestion')])}/brickstudy_ingestion' + proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" else: proj_path = '/opt/airflow/brickstudy_ingestion' - self.base_path = f'{proj_path}/src/scrapper' + self.base_path = f"{proj_path}/src/scrapper" self.user_id, self.password = self.load_config(dev=dev) self.keywords = keywords self.data = defaultdict(inst_generator) - self.driver = webdriver.Chrome() - self.login() + + if driver is None: + self.load_driver(dev) + self.login() + else: + self.driver = driver + + def load_driver(self, dev: bool = False): + if dev: + self.driver = webdriver.Chrome() + else: + options = webdriver.ChromeOptions() + options.add_argument("--headless") + options.add_argument("--no-sandbox") + options.add_argument("--disable-dev-shm-usage") + self.driver = webdriver.Chrome( + executable_path=ChromeDriverManager().install(), + options=options + ) def load_config(self, dev: bool = False): if dev: @@ -30,8 +51,8 @@ def load_config(self, dev: bool = False): username = config['login']['username'] password = config['login']['password'] else: - username = os.environ('INSTAGRAM_ID') - password = os.environ('INSTAGRAM_PASSWORD') + username = os.getenv('INSTAGRAM_CLIENT_ID') + password = os.getenv('INSTAGRAM_CLIENT_PASSWORD') return (username, password) def login(self): @@ -43,7 +64,7 @@ def login(self): user.send_keys(self.user_id) self.driver.find_element(By.XPATH, 
'//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(80) + time.sleep(40) def materialize(self): """ @@ -52,6 +73,6 @@ def materialize(self): from src.scrapper.utils import current_datetime_getter import csv - with open(f"{self.base_path}/insdata_{current_datetime_getter()}.csv", 'w') as f: + with open(f"{self.base_path}/results/insdata_{current_datetime_getter()}.csv", 'w') as f: w = csv.writer(f) w.writerow(self.data.values()) \ No newline at end of file From f7a56ea9953276b28e1adf42096053f97e446981 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Wed, 4 Sep 2024 12:59:54 +0900 Subject: [PATCH 15/31] =?UTF-8?q?fix=20:=20instagram=20dag=20=EB=B8=8C?= =?UTF-8?q?=EB=9E=9C=EB=93=9C=EB=A6=AC=EC=8A=A4=ED=8A=B8=20=EB=B0=9B?= =?UTF-8?q?=EB=8A=94=20=EB=B6=80=EB=B6=84=20task=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/dags/utils/config.py | 4 +- .../dags/viral/instagram_crawler.py | 42 ++++++++++++++----- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/brickstudy_ingestion/dags/utils/config.py b/brickstudy_ingestion/dags/utils/config.py index bd1c384..a2f96a6 100644 --- a/brickstudy_ingestion/dags/utils/config.py +++ b/brickstudy_ingestion/dags/utils/config.py @@ -24,7 +24,9 @@ def set_env_variables(): "TWITTER_CLIENT_ID", "TWITTER_CLIENT_PASSWORD", "TWITTER_TOKEN" - "TWITTER_CRAWLER_AUTH_TOKEN_PASSWORD" + # Instagram + "INSTAGRAM_CLIENT_ID", + "INSTAGRAM_CLIENT_PASSWORD" ] for ENV_VARIABLE in ALL_ENV_VARIABLES: os.environ[ENV_VARIABLE] = Variable.get(ENV_VARIABLE, "") diff --git a/brickstudy_ingestion/dags/viral/instagram_crawler.py b/brickstudy_ingestion/dags/viral/instagram_crawler.py index 1672b18..e8ca249 100644 --- a/brickstudy_ingestion/dags/viral/instagram_crawler.py +++ b/brickstudy_ingestion/dags/viral/instagram_crawler.py @@ -2,8 +2,8 @@ from airflow import DAG from airflow.utils.dates import days_ago -from airflow.decorators import task -from airflow.operators.python import PythonVirtualenvOperator +from airflow.operators.python import PythonVirtualenvOperator, PythonOperator +from airflow.models import Variable from src.scrapper.brand_name_getter import get_brand_list_fr_s3 @@ -23,16 +23,23 @@ # ========================================= -def entrypoint(): +def get_brand_list(): + import os + for ENV_VARIABLE in ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY']: + os.environ[ENV_VARIABLE] = Variable.get(ENV_VARIABLE, "") + return get_brand_list_fr_s3() + + +def instagram_crawling(brand_lst, id, pwd): + import os import logging - import multiprocess from src.common.kafka.utils import Kafka from src.scrapper.inscrawler import InsCrawler from src.scrapper.ins_url import InsURLCrawler from src.scrapper.ins_data import InsDataCrawler - brand_lst = get_brand_list_fr_s3 - CONCURRENCY_LEVEL = multiprocess.cpu_count() + os.environ['INSTAGRAM_CLIENT_ID'] = id + os.environ['INSTAGRAM_CLIENT_PASSWORD'] = pwd def crawl_instagram(keywords: tuple): crawler = InsURLCrawler(InsCrawler(keywords=keywords)).get_urls() @@ -45,25 +52,38 @@ def crawl_instagram(keywords: tuple): try: producer = Kafka() - with multiprocess.Pool(CONCURRENCY_LEVEL) as p: - p.map(crawl_instagram, brand_lst) + crawl_instagram(brand_lst) except Exception as e: logging.error("***entrypoint error***", e) raise + with DAG( dag_id=DAG_ID, default_args=default_args, schedule_interval='@daily', catchup=False 
): - t_crawl_ins = PythonVirtualenvOperator( + t1 = PythonOperator( + task_id='get_brand_list_from_s3', + python_callable=get_brand_list + ) + + t2 = PythonVirtualenvOperator( task_id='crawl_instagram_based_on_keyword', + system_site_packages=False, + op_kwargs={ + 'brand_lst': "{{ ti.xcom_pull(task_ids='get_brand_list_from_s3') }}", + 'id': Variable.get('INSTAGRAM_CLIENT_ID'), + 'pwd': Variable.get('INSTAGRAM_CLIENT_PASSWORD') + }, python_version='3.10', system_site_packages=False, requirements=['selenium==4.24.0', 'webdriver-manager==4.0.2', 'bs4==0.0.2', 'beautifulsoup4==4.12.3', 'lxml==5.3.0', 'pytz==2024.1', "python-dotenv==0.19.0", "multiprocess", "kafka-python"], - python_callable=entrypoint - ) \ No newline at end of file + python_callable=instagram_crawling + ) + + t1 >> t2 \ No newline at end of file From 405b7e434d87dab77a0c829d467da6771d6c8b7d Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 13:34:01 +0900 Subject: [PATCH 16/31] =?UTF-8?q?ADD=20:=20=EC=9E=84=EC=8B=9C=20=ED=81=AC?= =?UTF-8?q?=EB=A1=A4=EB=9F=AC=20=EC=8A=A4=ED=81=AC=EB=A6=BD=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/brand_name_getter.py | 1 - .../src/scrapper/ins_runner.py | 107 ++++++++++++++++++ .../src/scrapper/inscrawler.py | 15 +-- 3 files changed, 109 insertions(+), 14 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/ins_runner.py diff --git a/brickstudy_ingestion/src/scrapper/brand_name_getter.py b/brickstudy_ingestion/src/scrapper/brand_name_getter.py index 8ff8e40..8b045ff 100644 --- a/brickstudy_ingestion/src/scrapper/brand_name_getter.py +++ b/brickstudy_ingestion/src/scrapper/brand_name_getter.py @@ -25,7 +25,6 @@ def filter_brand(file_content: str) -> list: for line in file_content.split('\n'): if line == '': break - print(f"line: {line}") for brandname, brandinfo in json.loads(line).items(): brandinfo_dic = OliveyoungBrand(**brandinfo) if category_checker(brandinfo_dic.category): diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py new file mode 100644 index 0000000..01a7a56 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -0,0 +1,107 @@ +from src.scrapper.brand_name_getter import get_brand_list_fr_s3 +from src.scrapper.ins_url import InsURLCrawler +from src.scrapper.ins_data import InsDataCrawler +from src.common.aws.s3_uploader import S3Uploader +from src.scrapper.utils import write_local_as_json +from src.scrapper.utils import current_datetime_getter +import logging +import os + +logger = logging.getLogger('insrunner') +logger.setLevel(logging.ERROR) + + +def crawl_data(): + brand_lst = get_brand_list_fr_s3() + for brand in brand_lst: + try: + crawler = InsURLCrawler(dev=True) + crawler.get_urls(keyword=brand) + crawler.materialize() + except Exception as e: + logging.error( + "{} url 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) + ) + finally: + pass + + try: + post_crawler = InsDataCrawler( + driver=crawler.driver, + data=crawler.data, + dev=True + ) + post_crawler.get_post_data() + except Exception as e: + logging.error( + "{} post data 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) + ) + finally: + pass + + try: + write_local_as_json( + data=post_crawler.data, + file_path=f"{post_crawler.base_path}/results/data", + file_name=f"instagram_{current_datetime_getter}" + ) + except Exception as e: + logging.error( + "{} data write 과정에서 오류 발생. 
\nerror message: {}".format(brand, e) + ) + finally: + pass + + return f"{post_crawler.base_path}/results/data" + + +def s3_upload(local_path): + s3 = S3Uploader().s3_client + s3_path = "bronze/viral/instagram", + bucket_name = "brickstudy" + + for root, _, files in os.walk(local_path): + for file in files: + local_file_path = os.path.join(root, file) + # S3 파일 경로 설정 + s3_file_path = os.path.join(s3_path, os.path.relpath(local_file_path, local_path)) + + try: + s3.upload_file(local_file_path, bucket_name, s3_file_path) + print(f"File {local_file_path} uploaded to {bucket_name}/{s3_file_path}") + except FileNotFoundError: + print(f"File not found: {local_file_path}") + except Exception as e: + print(f"Failed to upload {local_file_path}: {str(e)}") + + +if __name__ =='__main__': + local_path = crawl_data() + s3_upload(local_path) + +""" +curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ + "connector.class": "io.confluent.connect.s3.S3SinkConnector", + "key.converter": "org.apache.kafka.connect.storage.StringConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter.schemas.enable": "false", + "tasks.max": 1, + "topics": "instagram", + "aws.signing_region": "ap-northeast-2", + "s3.part.size": 5242880, + "s3.region": "ap-northeast-2", + "s3.bucket.name": "brickstudy", + "s3.credentials.provider.class": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain", + "topics.dir": "bronze/viral", + "partitioner.class": "io.confluent.connect.storage.partitioner.TimeBasedPartitioner", + "partition.duration.ms": "86400000", + "timestamp.extractor": "Record", + "path.format": "yyyy-MM-dd", + "flush.size": 100, + "rotate.interval.ms": 60000, + "storage.class": "io.confluent.connect.s3.storage.S3Storage", + "format.class": "io.confluent.connect.s3.format.json.JsonFormat", + "locale": "ko_KR", + "timezone": "Asia/Seoul" +}' +""" \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 1c6922d..8da325a 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -16,8 +16,10 @@ def __init__(self, driver: webdriver = None): if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" + self.driver = webdriver.Chrome() else: proj_path = '/opt/airflow/brickstudy_ingestion' + self.driver = driver self.base_path = f"{proj_path}/src/scrapper" self.user_id, self.password = self.load_config(dev=dev) @@ -29,19 +31,6 @@ def __init__(self, self.login() else: self.driver = driver - - def load_driver(self, dev: bool = False): - if dev: - self.driver = webdriver.Chrome() - else: - options = webdriver.ChromeOptions() - options.add_argument("--headless") - options.add_argument("--no-sandbox") - options.add_argument("--disable-dev-shm-usage") - self.driver = webdriver.Chrome( - executable_path=ChromeDriverManager().install(), - options=options - ) def load_config(self, dev: bool = False): if dev: From 42ef0859166563da497f81819b38c917d0f10e4a Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 16:35:59 +0900 Subject: [PATCH 17/31] =?UTF-8?q?fix=20:=20=EC=9E=84=EC=8B=9C=20=ED=83=9C?= =?UTF-8?q?=EC=8A=A4=ED=81=AC=20=EB=B2=84=EA=B7=B8=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/ins_runner.py | 
18 ++++++++++-------- .../src/scrapper/inscrawler.py | 9 ++------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 01a7a56..9949c17 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -40,10 +40,11 @@ def crawl_data(): pass try: + cur_date = current_datetime_getter() write_local_as_json( data=post_crawler.data, file_path=f"{post_crawler.base_path}/results/data", - file_name=f"instagram_{current_datetime_getter}" + file_name=f"instagram_{cur_date}" ) except Exception as e: logging.error( @@ -51,21 +52,21 @@ def crawl_data(): ) finally: pass - return f"{post_crawler.base_path}/results/data" def s3_upload(local_path): + dt = current_datetime_getter() + dt = dt.split('_')[0] s3 = S3Uploader().s3_client - s3_path = "bronze/viral/instagram", + s3_path = f"bronze/viral/instagram/{dt[:4]}-{dt[4:6]}-{dt[6:]}" bucket_name = "brickstudy" for root, _, files in os.walk(local_path): for file in files: local_file_path = os.path.join(root, file) - # S3 파일 경로 설정 - s3_file_path = os.path.join(s3_path, os.path.relpath(local_file_path, local_path)) - + s3_file_path = os.path.join(s3_path, file) + print(local_file_path) try: s3.upload_file(local_file_path, bucket_name, s3_file_path) print(f"File {local_file_path} uploaded to {bucket_name}/{s3_file_path}") @@ -75,8 +76,9 @@ def s3_upload(local_path): print(f"Failed to upload {local_file_path}: {str(e)}") -if __name__ =='__main__': - local_path = crawl_data() +if __name__ == '__main__': + # local_path = crawl_data() + local_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/results/data" s3_upload(local_path) """ diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 8da325a..e0a3e13 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,7 +1,6 @@ import os import time from selenium import webdriver -from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By import json from collections import defaultdict @@ -26,11 +25,7 @@ def __init__(self, self.keywords = keywords self.data = defaultdict(inst_generator) - if driver is None: - self.load_driver(dev) - self.login() - else: - self.driver = driver + self.login() def load_config(self, dev: bool = False): if dev: @@ -53,7 +48,7 @@ def login(self): user.send_keys(self.user_id) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(40) + time.sleep(10) def materialize(self): """ From 484400384e6cf874c974e82c5cbf912b67e701e5 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 17:08:38 +0900 Subject: [PATCH 18/31] fix : refactor temporal task --- .../src/scrapper/ins_runner.py | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 9949c17..9ae569b 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -1,11 +1,11 @@ from src.scrapper.brand_name_getter import get_brand_list_fr_s3 from src.scrapper.ins_url import InsURLCrawler from src.scrapper.ins_data import InsDataCrawler -from 
src.common.aws.s3_uploader import S3Uploader from src.scrapper.utils import write_local_as_json from src.scrapper.utils import current_datetime_getter -import logging import os +import logging +import subprocess logger = logging.getLogger('insrunner') logger.setLevel(logging.ERROR) @@ -52,34 +52,29 @@ def crawl_data(): ) finally: pass - return f"{post_crawler.base_path}/results/data" + return f"{post_crawler.base_path}/results" -def s3_upload(local_path): +def s3_upload(local_path: str, target: str = 'data'): + local_folder = os.path.join(local_path, target) dt = current_datetime_getter() dt = dt.split('_')[0] - s3 = S3Uploader().s3_client - s3_path = f"bronze/viral/instagram/{dt[:4]}-{dt[4:6]}-{dt[6:]}" + s3_folder = f"bronze/viral/instagram/{target}/{dt[:4]}-{dt[4:6]}-{dt[6:]}" bucket_name = "brickstudy" - - for root, _, files in os.walk(local_path): - for file in files: - local_file_path = os.path.join(root, file) - s3_file_path = os.path.join(s3_path, file) - print(local_file_path) - try: - s3.upload_file(local_file_path, bucket_name, s3_file_path) - print(f"File {local_file_path} uploaded to {bucket_name}/{s3_file_path}") - except FileNotFoundError: - print(f"File not found: {local_file_path}") - except Exception as e: - print(f"Failed to upload {local_file_path}: {str(e)}") + try: + subprocess.run( + ['aws', 's3', 'cp', local_folder, f's3://{bucket_name}/{s3_folder}/', '--recursive'], + check=True + ) + print(f"Folder {local_folder} uploaded to s3://{bucket_name}/{s3_folder}/") + except subprocess.CalledProcessError as e: + print(f"Failed to upload folder: {str(e)}") if __name__ == '__main__': - # local_path = crawl_data() - local_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/results/data" - s3_upload(local_path) + local_path = crawl_data() + s3_upload(local_path, 'data') + s3_upload(local_path, 'images') """ curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ From 6f6da8bc839400d4343e88ae49f24e6b61905bf9 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Thu, 5 Sep 2024 17:46:08 +0900 Subject: [PATCH 19/31] fix : s3 path --- brickstudy_ingestion/src/scrapper/ins_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 9ae569b..f4b4cdd 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -59,7 +59,7 @@ def s3_upload(local_path: str, target: str = 'data'): local_folder = os.path.join(local_path, target) dt = current_datetime_getter() dt = dt.split('_')[0] - s3_folder = f"bronze/viral/instagram/{target}/{dt[:4]}-{dt[4:6]}-{dt[6:]}" + s3_folder = f"bronze/viral/instagram/{dt[:4]}-{dt[4:6]}-{dt[6:]}/{target}" bucket_name = "brickstudy" try: subprocess.run( From 09ffaf4abf368e66761ec35520826ec2ddc3c07c Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sun, 8 Sep 2024 22:44:27 +0900 Subject: [PATCH 20/31] fix : crawler avoid account blocked --- brickstudy_ingestion/src/scrapper/ins_data.py | 10 +++-- .../src/scrapper/ins_runner.py | 42 +++++++------------ brickstudy_ingestion/src/scrapper/ins_url.py | 1 + .../src/scrapper/inscrawler.py | 37 ++++++++++++---- 4 files changed, 52 insertions(+), 38 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index d00cef0..56b70f2 100644 --- 
a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -1,8 +1,10 @@ import time from bs4 import BeautifulSoup +from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.common.by import By import requests import re +import random from src.scrapper.inscrawler import InsCrawler @@ -23,17 +25,18 @@ def get_post_data(self): post_crawled_data = {line.strip() for line in f} for idx, (key, val) in enumerate(self.data.items()): + if self.numof_error > 10: break post_url = val.post_url if post_url in post_crawled_data: continue + time.sleep(random.randrange(2, 5)) self.driver.get(post_url) print(idx, '. ' + post_url) try: - time.sleep(5) html = self.driver.page_source soup = BeautifulSoup(html, 'lxml') @@ -75,8 +78,8 @@ def get_post_data(self): # 이미지 끝까지 넘기면서 url 추출 try: while True: - time.sleep(3) - + time.sleep(random.randrange(1, 3)) + WebDriverWait(self.driver, random.randrange(1, 4)) self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) @@ -129,6 +132,7 @@ def get_post_data(self): except Exception as e: print(e) + self.numof_error += 1 print('오류 발생') # 수집 완료된 데이터 키값(post url unique id) 저장 diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index f4b4cdd..5ac77d9 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -13,31 +13,22 @@ def crawl_data(): brand_lst = get_brand_list_fr_s3() - for brand in brand_lst: - try: - crawler = InsURLCrawler(dev=True) - crawler.get_urls(keyword=brand) - crawler.materialize() - except Exception as e: - logging.error( - "{} url 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) - ) - finally: - pass + err = 0 + for brand in brand_lst[13:]: + if err > 10: + break - try: - post_crawler = InsDataCrawler( - driver=crawler.driver, - data=crawler.data, - dev=True - ) - post_crawler.get_post_data() - except Exception as e: - logging.error( - "{} post data 수집 과정에서 오류 발생. \nerror message: {}".format(brand, e) - ) - finally: - pass + crawler = InsURLCrawler(dev=True) + crawler.get_urls(keyword=brand) + err += crawler.numof_error + + post_crawler = InsDataCrawler( + driver=crawler.driver, + data=crawler.data, + dev=True + ) + post_crawler.get_post_data() + err += post_crawler.numof_error try: cur_date = current_datetime_getter() @@ -50,8 +41,7 @@ def crawl_data(): logging.error( "{} data write 과정에서 오류 발생. 
\nerror message: {}".format(brand, e) ) - finally: - pass + return f"{post_crawler.base_path}/results" diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index e87c1af..62e3a54 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -59,6 +59,7 @@ def _fetch_url_data(self, keyword): time.sleep(5) except Exception as e: + self.numof_error += 1 print(e) print('오류 발생') diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index e0a3e13..5f45eb7 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -1,9 +1,10 @@ import os import time -from selenium import webdriver -from selenium.webdriver.common.by import By import json from collections import defaultdict +import random +from selenium import webdriver +from selenium.webdriver.common.by import By from src.scrapper.models import inst_generator @@ -12,28 +13,46 @@ class InsCrawler: def __init__(self, keywords: list = None, dev: bool = False, - driver: webdriver = None): + driver=None): if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" - self.driver = webdriver.Chrome() + self.driver = self.make_driver() else: proj_path = '/opt/airflow/brickstudy_ingestion' self.driver = driver - self.base_path = f"{proj_path}/src/scrapper" + self.base_path = f"{proj_path}/src/scrapper" self.user_id, self.password = self.load_config(dev=dev) self.keywords = keywords self.data = defaultdict(inst_generator) + self.numof_error = 0 self.login() + @staticmethod + def make_driver(): + user_agent_lst = [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" + ] + options = webdriver.ChromeOptions() + options.add_argument("--disable-blink-features=AutomationControlled") + options.add_experimental_option("excludeSwitches", ["enable-automation"]) + options.add_experimental_option("useAutomationExtension", False) + driver = webdriver.Chrome(options=options) + driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") + driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[0]}) + return driver + def load_config(self, dev: bool = False): if dev: with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: config = json.load(f) - username = config['login']['username'] - password = config['login']['password'] + x = random.randrange(0, 2) + username = config['login']['username'][x] + password = config['login']['password'][x] else: username = os.getenv('INSTAGRAM_CLIENT_ID') password = os.getenv('INSTAGRAM_CLIENT_PASSWORD') @@ -43,12 +62,12 @@ def login(self): # Instagram 접속 및 로그인 url = 'https://www.instagram.com/' self.driver.get(url) - time.sleep(6) + time.sleep(random.randrange(4, 6)) user = self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input') user.send_keys(self.user_id) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - 
time.sleep(10) + time.sleep(random.randrange(5, 11)) def materialize(self): """ From 5ba7f3834622c1030d045703ee9d01aa3236c48b Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Mon, 9 Sep 2024 03:41:19 +0900 Subject: [PATCH 21/31] =?UTF-8?q?fix=20:=20crawler=20-=20sleep=20=EC=A1=B0?= =?UTF-8?q?=EC=A0=95,=20=EC=8A=A4=ED=81=AC=EB=A1=A4=EB=8B=A4=EC=9A=B4=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/ins_data.py | 4 +-- .../src/scrapper/ins_runner.py | 30 ++++++++++++++++--- brickstudy_ingestion/src/scrapper/ins_url.py | 12 +++++--- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index 56b70f2..3a283d6 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -78,7 +78,7 @@ def get_post_data(self): # 이미지 끝까지 넘기면서 url 추출 try: while True: - time.sleep(random.randrange(1, 3)) + time.sleep(1) WebDriverWait(self.driver, random.randrange(1, 4)) self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) @@ -128,7 +128,7 @@ def get_post_data(self): print(f"총 {len(saved_imgs)} 장의 이미지 저장") self.data[key].saved_imgs = str(list(saved_imgs)) - time.sleep(5) + time.sleep(random.randrange(3, 5)) except Exception as e: print(e) diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 5ac77d9..6feeb37 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -6,17 +6,32 @@ import os import logging import subprocess +import shutil logger = logging.getLogger('insrunner') logger.setLevel(logging.ERROR) +scrapped = [ + "포엘리에", "아워글래스", + "휴캄", "아이레놀", "루트리", "일소", + "유니크미", "본트리", "메디필", "OOTD", "앤디얼", + "아크네스", "그레이멜린", "제로앱솔루", "리쥬란", "폴라초이스", "메이크프렘", + "제로이드", "원데이즈유", "숌", "어뮤즈", "프랭클리", "네오젠", "제이엠솔루션", "리터뉴", + "아크웰", "아이레시피", "제이준", "글로오아시스", "어반디케이", + "닥터방기원", "유리피부", "콤마나인", + "라운드어라운드", "미구하라", "주미소", + "에이지투웨니스", "프리메라", "애즈이즈투비", "투쿨포스쿨" +] + def crawl_data(): brand_lst = get_brand_list_fr_s3() err = 0 - for brand in brand_lst[13:]: + for brand in brand_lst[30:]: if err > 10: break + if brand in scrapped: + continue crawler = InsURLCrawler(dev=True) crawler.get_urls(keyword=brand) @@ -41,6 +56,7 @@ def crawl_data(): logging.error( "{} data write 과정에서 오류 발생. 
\nerror message: {}".format(brand, e) ) + break return f"{post_crawler.base_path}/results" @@ -62,9 +78,15 @@ def s3_upload(local_path: str, target: str = 'data'): if __name__ == '__main__': - local_path = crawl_data() - s3_upload(local_path, 'data') - s3_upload(local_path, 'images') + base_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/" + + # shutil.copytree(base_path + "template", base_path + "results") + + crawl_data() + s3_upload(base_path + "results", 'data') + s3_upload(base_path + "results", 'images') + + # shutil.rmtree(base_path + "results") """ curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 62e3a54..8422d07 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -2,6 +2,7 @@ from bs4 import BeautifulSoup import urllib import re +import random from src.scrapper.inscrawler import InsCrawler @@ -23,9 +24,11 @@ def _fetch_url_data(self, keyword): self.driver.get(word_url) try: - time.sleep(5) - js = 'window.scrollBy(0,1000)' - self.driver.execute_script(js) + for _ in range(10): # 스크롤 10회 + time.sleep(random.randrange(3, 5)) + js = 'window.scrollBy(0,5000)' + self.driver.execute_script(js) + time.sleep(5) html = self.driver.page_source soup = BeautifulSoup(html, 'lxml') @@ -63,4 +66,5 @@ def _fetch_url_data(self, keyword): print(e) print('오류 발생') - print(f'키워드 {keyword}의 URL 정보 수집 완료.') \ No newline at end of file + print(f'키워드 {keyword}의 URL 정보 수집 완료.') + self.driver.close() \ No newline at end of file From c62a82cef0caa63c846766b45834093e9a373f72 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sat, 28 Sep 2024 17:42:59 +0900 Subject: [PATCH 22/31] add : humanize feature --- .../src/scrapper/http_429_handler.py | 58 +++++++++++++++++ brickstudy_ingestion/src/scrapper/ins_data.py | 9 ++- .../src/scrapper/ins_runner.py | 60 ++++++++++------- brickstudy_ingestion/src/scrapper/ins_url.py | 8 ++- .../src/scrapper/inscrawler.py | 64 ++++++++++++++----- 5 files changed, 152 insertions(+), 47 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/http_429_handler.py diff --git a/brickstudy_ingestion/src/scrapper/http_429_handler.py b/brickstudy_ingestion/src/scrapper/http_429_handler.py new file mode 100644 index 0000000..d1768ff --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/http_429_handler.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 + +import urllib.error +import urllib.request + +from tenacity import retry, retry_if_exception, stop_after_attempt, wait_fixed +from tenacity.wait import wait_base + + +class retry_if_http_429_error(retry_if_exception): + """Retry strategy that retries if the exception is an ``HTTPError`` with + a 429 status code. + + """ + + def __init__(self): + def is_http_429_error(exception): + return ( + isinstance(exception, urllib.error.HTTPError) and + exception.getcode() == 429 + ) + + super().__init__(predicate=is_http_429_error) + + +class wait_for_retry_after_header(wait_base): + """Wait strategy that tries to wait for the length specified by + the Retry-After header, or the underlying wait strategy if not. + See RFC 6585 § 4. + + Otherwise, wait according to the fallback strategy. 
+ """ + def __init__(self, fallback): + self.fallback = fallback + + def __call__(self, retry_state): + # retry_state is an instance of tenacity.RetryCallState. The .outcome + # property is the result/exception that came from the underlying function. + exc = retry_state.outcome.exception() + if isinstance(exc, urllib.error.HTTPError): + retry_after = exc.headers.get("Retry-After") + + try: + return 3600 if retry_after is None else int(retry_after) + except (TypeError, ValueError): + pass + + return self.fallback(retry_state) + + +@retry( + retry=retry_if_http_429_error(), + wait=wait_for_retry_after_header(fallback=wait_fixed(1)), + stop=stop_after_attempt(3) +) +def get_url_with_tenacity_(url): + return urllib.request.urlopen(url) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/ins_data.py b/brickstudy_ingestion/src/scrapper/ins_data.py index 3a283d6..d44ed08 100644 --- a/brickstudy_ingestion/src/scrapper/ins_data.py +++ b/brickstudy_ingestion/src/scrapper/ins_data.py @@ -7,6 +7,7 @@ import random from src.scrapper.inscrawler import InsCrawler +from src.scrapper.http_429_handler import get_url_with_tenacity_ class InsDataCrawler(InsCrawler): @@ -25,14 +26,15 @@ def get_post_data(self): post_crawled_data = {line.strip() for line in f} for idx, (key, val) in enumerate(self.data.items()): - if self.numof_error > 10: break + if self.numof_error > 5: break post_url = val.post_url if post_url in post_crawled_data: continue - time.sleep(random.randrange(2, 5)) + time.sleep(random.randrange(2, 10) + random.random()) + get_url_with_tenacity_(post_url) self.driver.get(post_url) print(idx, '. ' + post_url) @@ -78,7 +80,7 @@ def get_post_data(self): # 이미지 끝까지 넘기면서 url 추출 try: while True: - time.sleep(1) + time.sleep(random.randrange(3, 6) + random.random()) WebDriverWait(self.driver, random.randrange(1, 4)) self.driver.find_element(By.CLASS_NAME, '_afxw._al46._al47').click() # 다음 이미지 버튼 클릭 images.append(self.driver.find_elements(By.CLASS_NAME, 'x5yr21d.xu96u03.x10l6tqk.x13vifvy.x87ps6o.xh8yej3')) @@ -134,6 +136,7 @@ def get_post_data(self): print(e) self.numof_error += 1 print('오류 발생') + time.sleep(20 + random.random()) # 수집 완료된 데이터 키값(post url unique id) 저장 with open(results, 'a') as f: diff --git a/brickstudy_ingestion/src/scrapper/ins_runner.py b/brickstudy_ingestion/src/scrapper/ins_runner.py index 6feeb37..073cbb2 100644 --- a/brickstudy_ingestion/src/scrapper/ins_runner.py +++ b/brickstudy_ingestion/src/scrapper/ins_runner.py @@ -3,38 +3,40 @@ from src.scrapper.ins_data import InsDataCrawler from src.scrapper.utils import write_local_as_json from src.scrapper.utils import current_datetime_getter + import os import logging import subprocess -import shutil logger = logging.getLogger('insrunner') logger.setLevel(logging.ERROR) -scrapped = [ - "포엘리에", "아워글래스", - "휴캄", "아이레놀", "루트리", "일소", - "유니크미", "본트리", "메디필", "OOTD", "앤디얼", - "아크네스", "그레이멜린", "제로앱솔루", "리쥬란", "폴라초이스", "메이크프렘", - "제로이드", "원데이즈유", "숌", "어뮤즈", "프랭클리", "네오젠", "제이엠솔루션", "리터뉴", - "아크웰", "아이레시피", "제이준", "글로오아시스", "어반디케이", - "닥터방기원", "유리피부", "콤마나인", - "라운드어라운드", "미구하라", "주미소", - "에이지투웨니스", "프리메라", "애즈이즈투비", "투쿨포스쿨" +twitter_keyword = [ + "닥터지", "아이소이", "에뛰드", "에스트라", "유세린", "토리든" ] -def crawl_data(): +def get_brand_lst_wo_ingested_list(): brand_lst = get_brand_list_fr_s3() - err = 0 - for brand in brand_lst[30:]: - if err > 10: + with open(f"{base_path}/results/finished_brand.txt", "r") as f: + skip = f.read() + return list(set(brand_lst) - set(skip[:-1].split('\n'))) + + +def crawl_data(brand_lst: list, err: 
int): + """ + brand_lst 에 속한 brand이 언급된 데이터를 인스타그램으로부터 수집하여 + ./results/data, ./results/images에 저장하는 함수 + :brand_lst: 크롤링할 서치 키워드가 담긴 리스트 + :err: 크롤링 진행 과정에서 발생한 오류 횟수 + """ + for brand in brand_lst: + if err > 10: break - if brand in scrapped: - continue crawler = InsURLCrawler(dev=True) crawler.get_urls(keyword=brand) + crawler.materialize() err += crawler.numof_error post_crawler = InsDataCrawler( @@ -52,13 +54,14 @@ def crawl_data(): file_path=f"{post_crawler.base_path}/results/data", file_name=f"instagram_{cur_date}" ) + with open(f"{post_crawler.base_path}/results/finished_brand.txt", "a") as f: + f.write(f"{brand}\n") except Exception as e: logging.error( "{} data write 과정에서 오류 발생. \nerror message: {}".format(brand, e) ) - break - return f"{post_crawler.base_path}/results" + return err def s3_upload(local_path: str, target: str = 'data'): @@ -79,14 +82,23 @@ def s3_upload(local_path: str, target: str = 'data'): if __name__ == '__main__': base_path = "/Users/seoyeongkim/Documents/ETL/brickstudy_ingestion/src/scrapper/" + # shutil.rmtree(base_path + "results/data") + # shutil.rmtree(base_path + "results/images") + # os.mkdir(base_path + "results/data") + # os.mkdir(base_path + "results/images") + + err = 0 + # brand_lst = get_brand_lst_wo_ingested_list() - # shutil.copytree(base_path + "template", base_path + "results") + brand_lst = twitter_keyword + for block_s in range(0, len(brand_lst), 10): + partitioned = brand_lst[block_s:block_s + 10] + print(f"**** start crawling {partitioned} ****") + err += crawl_data(brand_lst[block_s:block_s + 10], err) - crawl_data() - s3_upload(base_path + "results", 'data') - s3_upload(base_path + "results", 'images') + # s3_upload(base_path + "results", 'data') + # s3_upload(base_path + "results", 'images') - # shutil.rmtree(base_path + "results") """ curl -i -X PUT -H "Accept:application/json" -H "Content-Type:application/json" http://kafka-connect:8083/connectors/sink-s3-voluble/config -d '{ diff --git a/brickstudy_ingestion/src/scrapper/ins_url.py b/brickstudy_ingestion/src/scrapper/ins_url.py index 8422d07..7cdb1cb 100644 --- a/brickstudy_ingestion/src/scrapper/ins_url.py +++ b/brickstudy_ingestion/src/scrapper/ins_url.py @@ -5,6 +5,7 @@ import random from src.scrapper.inscrawler import InsCrawler +from src.scrapper.http_429_handler import get_url_with_tenacity_ class InsURLCrawler(InsCrawler): @@ -21,14 +22,15 @@ def get_urls(self, keyword: str = None): def _fetch_url_data(self, keyword): word = urllib.parse.quote(keyword) word_url = f'https://www.instagram.com/explore/tags/{word}/' + get_url_with_tenacity_(word_url) self.driver.get(word_url) try: for _ in range(10): # 스크롤 10회 - time.sleep(random.randrange(3, 5)) - js = 'window.scrollBy(0,5000)' + time.sleep(random.randrange(3, 4) + random.random()) + js = 'window.scrollBy(0,7000)' self.driver.execute_script(js) - time.sleep(5) + time.sleep(random.randrange(3, 4) + random.random()) html = self.driver.page_source soup = BeautifulSoup(html, 'lxml') diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 5f45eb7..ae95372 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -4,6 +4,8 @@ from collections import defaultdict import random from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By from src.scrapper.models import inst_generator @@ -14,35 
+16,60 @@ def __init__(self, keywords: list = None, dev: bool = False, driver=None): + self.account_x = random.randrange(0, 2) if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" self.driver = self.make_driver() else: proj_path = '/opt/airflow/brickstudy_ingestion' self.driver = driver - self.base_path = f"{proj_path}/src/scrapper" - self.user_id, self.password = self.load_config(dev=dev) + + user_id, password = self.load_config(dev=dev) self.keywords = keywords self.data = defaultdict(inst_generator) self.numof_error = 0 - self.login() + self.login(user_id, password) - @staticmethod - def make_driver(): + def make_driver(self): + proxies = [ + ["211.223.89.176:51147", + "121.66.105.19:51080", + "121.66.105.19:51080", + "8.213.128.6:8080"], + ["8.213.129.20:8090", + "8.213.129.20:5566", + "8.213.137.155:8090", + "8.220.204.215:808"], + ["8.220.205.172:9098", + "211.223.89.176:51147", + "8.213.128.90:2019", + "8.213.128.90:444"] + ] user_agent_lst = [ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1636.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" ] options = webdriver.ChromeOptions() + # options.add_argument("--headless") + proxy = proxies[self.account_x][random.randrange(0, 4)] + # webdriver.DesiredCapabilities.CHROME['proxy'] = { + # "socksProxy": proxy, + # "socksVersion": 4, + # "proxyType": "MANUAL", + # } + options.add_argument("--disable-blink-features=AutomationControlled") options.add_experimental_option("excludeSwitches", ["enable-automation"]) options.add_experimental_option("useAutomationExtension", False) - driver = webdriver.Chrome(options=options) + driver = webdriver.Chrome( + options=options + ) driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") - driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[0]}) + driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[self.account_x]}) return driver def load_config(self, dev: bool = False): @@ -50,24 +77,24 @@ def load_config(self, dev: bool = False): with open(f'{self.base_path}/config.json', 'r', encoding='utf-8') as f: config = json.load(f) - x = random.randrange(0, 2) - username = config['login']['username'][x] - password = config['login']['password'][x] + username = config['login']['username'][self.account_x] + password = config['login']['password'][self.account_x] else: username = os.getenv('INSTAGRAM_CLIENT_ID') password = os.getenv('INSTAGRAM_CLIENT_PASSWORD') return (username, password) - def login(self): + def login(self, user_id: str, password: str): # Instagram 접속 및 로그인 url = 'https://www.instagram.com/' self.driver.get(url) - time.sleep(random.randrange(4, 6)) - user = self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input') - user.send_keys(self.user_id) - self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(self.password) + time.sleep(random.randrange(4, 6) + random.random()) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[1]/div/label/input').send_keys(user_id) + 
time.sleep(random.randrange(1, 3) + random.random()) + self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[2]/div/label/input').send_keys(password) + time.sleep(random.randrange(1, 3) + random.random()) self.driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button/div').click() - time.sleep(random.randrange(5, 11)) + time.sleep(random.randrange(5, 11) + random.random()) def materialize(self): """ @@ -78,4 +105,7 @@ def materialize(self): with open(f"{self.base_path}/results/insdata_{current_datetime_getter()}.csv", 'w') as f: w = csv.writer(f) - w.writerow(self.data.values()) \ No newline at end of file + w.writerow(self.data.values()) + +if __name__ == "__main__": + test = InsCrawler(keywords='엔하이픈', dev=True) \ No newline at end of file From 0147d9a29930e7a666ea6c4272bc3114870aa79a Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sat, 28 Sep 2024 19:39:58 +0900 Subject: [PATCH 23/31] add : catch suspicious account popup --- .../src/scrapper/inscrawler.py | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index ae95372..c95ea95 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -7,6 +7,8 @@ from selenium.webdriver.chrome.service import Service from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC from src.scrapper.models import inst_generator @@ -32,6 +34,11 @@ def __init__(self, self.login(user_id, password) + if self.suspicous_check(): + #TODO 계정 사용비율 낮추기 + print("return True in suspicious check") + time.sleep() + def make_driver(self): proxies = [ ["211.223.89.176:51147", @@ -56,11 +63,11 @@ def make_driver(self): options = webdriver.ChromeOptions() # options.add_argument("--headless") proxy = proxies[self.account_x][random.randrange(0, 4)] - # webdriver.DesiredCapabilities.CHROME['proxy'] = { - # "socksProxy": proxy, - # "socksVersion": 4, - # "proxyType": "MANUAL", - # } + print(proxy) + webdriver.DesiredCapabilities.CHROME['proxy'] = { + "socksProxy": proxy, + "socksVersion": 4, + } options.add_argument("--disable-blink-features=AutomationControlled") options.add_experimental_option("excludeSwitches", ["enable-automation"]) @@ -107,5 +114,26 @@ def materialize(self): w = csv.writer(f) w.writerow(self.data.values()) + def suspicous_check(self): + """ 현재 자동화 행동 의심받는지 확인 """ + try: + if 'wbloks_1' in self.driver.page_source: + print("자동화된 활동 경고가 나타났습니다.") + + close_button = self.driver.find_element(By.XPATH, '//div[@aria-label="Dismiss"]') + self.driver.execute_script("arguments[0].dispatchEvent(new MouseEvent('click', {bubbles: true}));", close_button) + + # # 닫기 버튼 클릭, 계정 사용 일시 중지 + # close_button = WebDriverWait(self.driver, 5).until( + # EC.element_to_be_clickable((By.XPATH, '//div[@aria-label="Dismiss"]')) + # ) + # close_button.click() + return True + return False + except Exception: + self.numof_error += 1 + return False + + if __name__ == "__main__": test = InsCrawler(keywords='엔하이픈', dev=True) \ No newline at end of file From 6e1666f098e7148feb22addd6e21fb23c747dc50 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Sat, 28 Sep 2024 20:05:20 +0900 Subject: [PATCH 24/31] fix : dummy --- brickstudy_ingestion/src/scrapper/inscrawler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) 
diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index c95ea95..7a7bdeb 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -37,7 +37,7 @@ def __init__(self, if self.suspicous_check(): #TODO 계정 사용비율 낮추기 print("return True in suspicious check") - time.sleep() + time.sleep(300) def make_driver(self): proxies = [ @@ -134,6 +134,3 @@ def suspicous_check(self): self.numof_error += 1 return False - -if __name__ == "__main__": - test = InsCrawler(keywords='엔하이픈', dev=True) \ No newline at end of file From a31ea73d1771b2a97a93d8534ab3b489e423380e Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Mon, 11 Nov 2024 18:20:33 +0900 Subject: [PATCH 25/31] refactor : move driver_making component to utils --- .../src/scrapper/inscrawler.py | 48 +----------------- brickstudy_ingestion/src/scrapper/utils.py | 49 +++++++++++++++++++ 2 files changed, 51 insertions(+), 46 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/inscrawler.py b/brickstudy_ingestion/src/scrapper/inscrawler.py index 7a7bdeb..ec2dc48 100644 --- a/brickstudy_ingestion/src/scrapper/inscrawler.py +++ b/brickstudy_ingestion/src/scrapper/inscrawler.py @@ -3,14 +3,9 @@ import json from collections import defaultdict import random -from selenium import webdriver -from selenium.webdriver.chrome.service import Service -from webdriver_manager.chrome import ChromeDriverManager -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC from src.scrapper.models import inst_generator +from src.scrapper.utils import get_driver class InsCrawler: @@ -21,7 +16,7 @@ def __init__(self, self.account_x = random.randrange(0, 2) if dev: proj_path = f"{'/'.join(os.getcwd().split('/')[:os.getcwd().split('/').index('ETL') + 1])}/brickstudy_ingestion" - self.driver = self.make_driver() + self.driver = get_driver() else: proj_path = '/opt/airflow/brickstudy_ingestion' self.driver = driver @@ -39,45 +34,6 @@ def __init__(self, print("return True in suspicious check") time.sleep(300) - def make_driver(self): - proxies = [ - ["211.223.89.176:51147", - "121.66.105.19:51080", - "121.66.105.19:51080", - "8.213.128.6:8080"], - ["8.213.129.20:8090", - "8.213.129.20:5566", - "8.213.137.155:8090", - "8.220.204.215:808"], - ["8.220.205.172:9098", - "211.223.89.176:51147", - "8.213.128.90:2019", - "8.213.128.90:444"] - ] - user_agent_lst = [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1636.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" - ] - options = webdriver.ChromeOptions() - # options.add_argument("--headless") - proxy = proxies[self.account_x][random.randrange(0, 4)] - print(proxy) - webdriver.DesiredCapabilities.CHROME['proxy'] = { - "socksProxy": proxy, - "socksVersion": 4, - } - - options.add_argument("--disable-blink-features=AutomationControlled") - options.add_experimental_option("excludeSwitches", ["enable-automation"]) - options.add_experimental_option("useAutomationExtension", False) - driver = webdriver.Chrome( - options=options - ) - 
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") - driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[self.account_x]}) - return driver def load_config(self, dev: bool = False): if dev: diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index 5afbd2a..935290c 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -1,3 +1,52 @@ +def get_driver(): + """ + return selenium driver + """ + from selenium import webdriver + from selenium.webdriver.chrome.service import Service + from webdriver_manager.chrome import ChromeDriverManager + from selenium.webdriver.common.by import By + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + proxies = [ + ["211.223.89.176:51147", + "121.66.105.19:51080", + "121.66.105.19:51080", + "8.213.128.6:8080"], + ["8.213.129.20:8090", + "8.213.129.20:5566", + "8.213.137.155:8090", + "8.220.204.215:808"], + ["8.220.205.172:9098", + "211.223.89.176:51147", + "8.213.128.90:2019", + "8.213.128.90:444"] + ] + user_agent_lst = [ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1636.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" + ] + options = webdriver.ChromeOptions() + # options.add_argument("--headless") + proxy = proxies[self.account_x][random.randrange(0, 4)] + webdriver.DesiredCapabilities.CHROME['proxy'] = { + "socksProxy": proxy, + "socksVersion": 4, + } + + options.add_argument("--disable-blink-features=AutomationControlled") + options.add_experimental_option("excludeSwitches", ["enable-automation"]) + options.add_experimental_option("useAutomationExtension", False) + driver = webdriver.Chrome( + options=options + ) + driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") + driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_lst[self.account_x]}) + return driver + + def get_soup(url: str = None): import urllib from urllib.request import urlopen From 6f8aab7948405eb61370cc908bfb264516f184e0 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 15 Nov 2024 21:47:03 +0900 Subject: [PATCH 26/31] =?UTF-8?q?refactor=20:=20brand,=20items=20class=20?= =?UTF-8?q?=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/oliveyoung.py | 19 ------- .../src/scrapper/oliveyoung_items.py | 57 +++++++++++++++++++ 2 files changed, 57 insertions(+), 19 deletions(-) create mode 100644 brickstudy_ingestion/src/scrapper/oliveyoung_items.py diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 1606637..9508204 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -18,9 +18,6 @@ def crawl_brand_metadata(self): ) self._get_brand_shop_url() - def crawl_items(self): - self._get_items() - @staticmethod def _get_oliveyoung_category_urls() -> list: """ @@ -94,19 +91,3 @@ def _get_brand_shop_url(self) -> 
None: self.brand_metadata[kor_brand_name].query_keyword.append(brand_name) except Exception: pass - - def _get_items(self) -> None: - """ - 각 브랜드의 제품 리스트, 해당 제품의 프로모션 여부 추가 - """ - for brand in self.brand_metadata.keys(): - brand_url = self.brand_metadata[brand].brand_shop_detail_url - brand_url_soup = get_soup(brand_url) - if brand_url_soup is None: - continue - item_dic = {} - for div in brand_url_soup.find_all('div', class_='prod-info'): - item_name = div.find('a').get('data-attr') - is_in_promotion = div.find('div', class_="discount") is not None - item_dic[item_name] = is_in_promotion - self.brand_metadata[brand].items = item_dic diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py new file mode 100644 index 0000000..6a00b75 --- /dev/null +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -0,0 +1,57 @@ +from selenium import webdriver +from selenium.webdriver.common.by import By +import requests +import time +import random + +from src.scrapper.models import brand_generator + +class Items: + def __init__(self): + pass + + def crawl_items(self): + self._get_items() + + def _get_items(self) -> None: + """ + 각 브랜드의 제품 정보 추가 - 제품ID, 제품명, url, 프로모션여부 + """ + for brand in self.brand_metadata.keys(): + brand_url = self.brand_metadata[brand].brand_shop_detail_url + driver = webdriver.Chrome() + driver.get(brand_url) + + # 1페이지 상품 정보 수집 + self._get_products(driver, brand) + + # 다음 페이지 버튼 찾기 + next_pages = driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + if next_pages: + for next_page in next_pages: + try: + driver.execute_script("arguments[0].click();", next_page) + time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 + response = requests.get(brand_url) + if response.status_code != 200: + time.sleep(10) + except: + time.sleep(10) + + self._get_products(driver, brand) + + def _get_products(self, driver, brand) -> None: + """ + 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 + """ + products = driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') + for product in products: + href = product.get_attribute('href') + data_ref_goodsno = product.get_attribute('data-ref-goodsno') + data_attr = product.get_attribute('data-attr') + is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > 0 + self.brand_metadata[brand].items[data_ref_goodsno] = { + 'item_name': data_attr, + 'href': href, + 'is_in_promotion': is_in_promotion + } From f5ebd9d7621ba2828b46cb5dba90e26a8d732ec3 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 15 Nov 2024 23:13:12 +0900 Subject: [PATCH 27/31] =?UTF-8?q?refactor=20:=20brand=20class=20=EB=B3=80?= =?UTF-8?q?=EC=88=98=EB=AA=85=20=EB=8B=A4=EB=93=AC=EA=B8=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/oliveyoung.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 9508204..57bb7ba 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -6,11 +6,8 @@ class Brand: - def __init__(self, brand_metadata=None) -> None: - if brand_metadata: - self.brand_metadata = brand_metadata - else: - self.brand_metadata = defaultdict(brand_generator) + def __init__(self) -> None: + self.brand_metadata = defaultdict(brand_generator) def 
crawl_brand_metadata(self): self._get_brand_in_each_category( @@ -81,13 +78,13 @@ def _get_brand_shop_url(self) -> None: for a_tag in total_brand_list_soup.find_all('a'): brand_code = a_tag.get('data-ref-onlbrndcd') if brand_code: - brand_name = a_tag.text - if brand_name in self.brand_metadata.keys(): # Kor brand name - self.brand_metadata[brand_name].brand_shop_detail_url = brand_base_url + brand_code - code_name[brand_code] = brand_name + brand = a_tag.text + if brand in self.brand_metadata.keys(): # Kor brand name + self.brand_metadata[brand].brand_shop_detail_url = brand_base_url + brand_code + code_name[brand_code] = brand else: # Eng brand name try: kor_brand_name = code_name[brand_code] - self.brand_metadata[kor_brand_name].query_keyword.append(brand_name) + self.brand_metadata[kor_brand_name].query_keyword.append(brand) except Exception: pass From dae319109a50a32b2cd6874edd632148ea5a4f64 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Fri, 15 Nov 2024 23:13:54 +0900 Subject: [PATCH 28/31] =?UTF-8?q?fix=20:=20item=20=EA=B8=B0=EB=B3=B8?= =?UTF-8?q?=EC=A0=95=EB=B3=B4=20=EC=88=98=EC=A7=91=EA=B8=B0=20=EB=8F=99?= =?UTF-8?q?=EC=9E=91=20=EC=98=A4=EB=A5=98=20=EC=88=98=EC=A0=95,=20dataclas?= =?UTF-8?q?s=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- brickstudy_ingestion/src/scrapper/models.py | 17 ++++- .../src/scrapper/oliveyoung_items.py | 68 +++++++++---------- 2 files changed, 48 insertions(+), 37 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/models.py b/brickstudy_ingestion/src/scrapper/models.py index a8eca32..5cbeda9 100644 --- a/brickstudy_ingestion/src/scrapper/models.py +++ b/brickstudy_ingestion/src/scrapper/models.py @@ -6,7 +6,6 @@ class OliveyoungBrand: query_keyword: List[str] # api 쿼리 키워드 - 브랜드 영문이름 등 brand_shop_detail_url: str # 브랜드관 url - items: Dict[str, bool] # 브랜드 제품 리스트 {제품명:할인여부} category: List[str] # 브랜드가 속한 카테고리 released_date: str = field(default_factory='2024/08/05') # 신제품 출시 일자 @@ -15,11 +14,25 @@ def brand_generator(): return OliveyoungBrand( [], '', - {}, [], '' ) +@dataclass +class OliveyoungItem: + item_name: str + item_detail_url: str + is_in_promotion: bool + reviews: List[str] + + +def oliveyoung_item_generator(): + return OliveyoungItem( + '', + '', + False, + [] + ) @dataclass class InstagramData: diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py index 6a00b75..139cb12 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -1,57 +1,55 @@ from selenium import webdriver from selenium.webdriver.common.by import By +from collections import defaultdict import requests import time import random -from src.scrapper.models import brand_generator +from src.scrapper.models import oliveyoung_item_generator class Items: - def __init__(self): - pass + def __init__(self, brand_name: str, brand_url: str): + self.brand = brand_name + self.brand_url = brand_url + self.data = defaultdict(oliveyoung_item_generator) + self.driver = webdriver.Chrome() def crawl_items(self): self._get_items() def _get_items(self) -> None: """ - 각 브랜드의 제품 정보 추가 - 제품ID, 제품명, url, 프로모션여부 + 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 """ - for brand in self.brand_metadata.keys(): - brand_url = self.brand_metadata[brand].brand_shop_detail_url - driver = webdriver.Chrome() - driver.get(brand_url) - - # 1페이지 상품 정보 수집 - self._get_products(driver, brand) - - 
# 다음 페이지 버튼 찾기 - next_pages = driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') - if next_pages: - for next_page in next_pages: - try: - driver.execute_script("arguments[0].click();", next_page) - time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 - response = requests.get(brand_url) - if response.status_code != 200: - time.sleep(10) - except: + self.driver.get(self.brand_url) + + # 1페이지 상품 정보 수집 + self._get_products() + + # 다음 페이지 버튼 찾기 + next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + if next_pages: + for next_page in next_pages: + try: + self.driver.execute_script("arguments[0].click();", next_page) + time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 + response = requests.get(self.brand_url) + if response.status_code != 200: time.sleep(10) + except: + time.sleep(10) - self._get_products(driver, brand) + self._get_products() - def _get_products(self, driver, brand) -> None: - """ - 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 - """ - products = driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') + def _get_products(self) -> None: + products = self.driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') for product in products: href = product.get_attribute('href') data_ref_goodsno = product.get_attribute('data-ref-goodsno') data_attr = product.get_attribute('data-attr') - is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > 0 - self.brand_metadata[brand].items[data_ref_goodsno] = { - 'item_name': data_attr, - 'href': href, - 'is_in_promotion': is_in_promotion - } + is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > + item_id = f"{self.brand}_{data_ref_goodsno}" + + self.data[item_id].item_name = data_attr + self.data[item_id].item_detail_url = href + self.data[item_id].is_in_promotion = is_in_promotion From 38a8c2ee8a657ca9eaf333c644361ed30f7a7049 Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 19 Nov 2024 18:21:41 +0900 Subject: [PATCH 29/31] =?UTF-8?q?add=20:=20=EA=B0=81=20=EC=95=84=EC=9D=B4?= =?UTF-8?q?=ED=85=9C=20url=20=EB=93=A4=EC=96=B4=EA=B0=80=EC=84=9C=20?= =?UTF-8?q?=EB=A6=AC=EB=B7=B0=20=EC=88=98=EC=A7=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/oliveyoung_items.py | 87 +++++++++++++++++-- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py index 139cb12..9de3cdb 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -1,55 +1,126 @@ from selenium import webdriver from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +from datetime import datetime, timedelta from collections import defaultdict import requests import time import random from src.scrapper.models import oliveyoung_item_generator +from src.scrapper.utils import write_local_as_json class Items: def __init__(self, brand_name: str, brand_url: str): self.brand = brand_name self.brand_url = brand_url self.data = defaultdict(oliveyoung_item_generator) + self.item_id = None self.driver = webdriver.Chrome() def crawl_items(self): + # brand 페이지에서 전체 item 정보들 수집 + self.driver.get(self.brand_url) self._get_items() + # 각 item 페이지에서 리뷰 수집 + for item_id in 
self.data.keys(): + self.item_id = item_id + self.driver.get(self.data[item_id].item_detail_url) + self._get_reviews() + def _get_items(self) -> None: """ 하나의 brand page의 item page x에 있는 아이템정보(id, url, 상품명, 할인여부) 수집 """ - self.driver.get(self.brand_url) - - # 1페이지 상품 정보 수집 + # 최초 1페이지 상품 정보 수집 self._get_products() - - # 다음 페이지 버튼 찾기 + # 페이지 넘기면서 상품 정보 수집 next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') if next_pages: for next_page in next_pages: try: self.driver.execute_script("arguments[0].click();", next_page) - time.sleep(random.randrange(5, 10) + random.random()) # 페이지 로딩 대기 + time.sleep(random.randrange(5, 10) + random.random()) response = requests.get(self.brand_url) if response.status_code != 200: time.sleep(10) except: time.sleep(10) - self._get_products() def _get_products(self) -> None: + """ + 아이템 element 찾아서 실제 수집 동작 + """ products = self.driver.find_elements(By.CSS_SELECTOR, 'ul.prod-list.goodsProd div.prod a.thumb') for product in products: href = product.get_attribute('href') data_ref_goodsno = product.get_attribute('data-ref-goodsno') data_attr = product.get_attribute('data-attr') - is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > + is_in_promotion = len(product.find_elements(By.CLASS_NAME, 'discount')) > 0 item_id = f"{self.brand}_{data_ref_goodsno}" self.data[item_id].item_name = data_attr self.data[item_id].item_detail_url = href self.data[item_id].is_in_promotion = is_in_promotion + + def _get_reviews(self): + self.__click_review_button() + self.__click_latest_button() + + self.__get_reviews_with_page_moving() + + def __click_review_button(self) -> None: + try: + review_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a.goods_reputation[data-attr="상품상세^상품상세_SortingTab^리뷰"]') + self.driver.execute_script("arguments[0].scrollIntoView(true);", review_button_element) + self.driver.execute_script("arguments[0].click();", review_button_element) + time.sleep(random.randint(2, 4)) + except Exception as e: + print(f"리뷰 버튼 클릭 실패: {e}") + + def __click_latest_button(self) -> None: + try: + latest_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a[data-sort-type-code="latest"][data-attr="상품상세^리뷰정렬^최신순"]') + self.driver.execute_script("arguments[0].scrollIntoView(true);", latest_button_element) + self.driver.execute_script("arguments[0].click();", latest_button_element) + time.sleep(random.randint(2, 4)) + except Exception as e: + print(f"최신순 버튼 클릭 실패: {e}") + + def __get_reviews_with_page_moving(self): + self.__get_reviews_in_each_page() + next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + flag = True + while flag: + flag = len(next_pages) == 10 # next page가 있으면 계속 클릭하면서 수집 + for i in range(len(next_pages)): + try: + self.driver.execute_script("arguments[0].click();", next_pages[i]) + time.sleep(random.randrange(5, 10) + random.random()) + except: + print("exception block in page moving") + time.sleep(3) + self.__get_reviews_in_each_page() + next_pages = self.driver.find_elements(By.CSS_SELECTOR, '.pageing a[data-page-no]') + + def __get_reviews_in_each_page(self): + """ + 리뷰 element 찾아서 실제 수집 동작 + """ + review_elements = self.driver.find_elements(By.CLASS_NAME, 'txt_inner') + date_elements = self.driver.find_elements(By.CLASS_NAME, 'date') + for rev_elem, date_elem in zip(review_elements, date_elements): + self.data[self.item_id].reviews.append((rev_elem.text, date_elem.text)) + + +if __name__ == "__main__": + brand_name = "토리든" + brand_url = 
"https://www.oliveyoung.co.kr/store/display/getBrandShopDetail.do?onlBrndCd=A002820&t_page=%EC%83%81%ED%92%88%EC%83%81%EC%84%B8&t_click=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B4%80_%EC%83%81%EB%8B%A8&t_brand_name=%ED%86%A0%EB%A6%AC%EB%93%A0" + item_x = Items(brand_name, brand_url) + item_x.crawl_items() + + write_local_as_json(item_x.data, './', 'toridn') From d1d30773e25a6a19440c05c60250762d029ee29c Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Tue, 19 Nov 2024 18:23:45 +0900 Subject: [PATCH 30/31] =?UTF-8?q?fix=20:=20dummy=20-=20main=20=ED=95=A8?= =?UTF-8?q?=EC=88=98,=20=ED=95=A8=EC=88=98=20=EC=84=A4=EB=AA=85=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++- brickstudy_ingestion/src/scrapper/oliveyoung.py | 5 +++++ brickstudy_ingestion/src/scrapper/utils.py | 7 ++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 4c011e0..021a48c 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ log kafka-data aws_credentials brickstudy_ingestion/dags/viral/tmp -brickstudy_ingestion/src/scrapper/results \ No newline at end of file +brickstudy_ingestion/src/scrapper/results +.DS_Store \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung.py b/brickstudy_ingestion/src/scrapper/oliveyoung.py index 57bb7ba..90ed7b7 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung.py @@ -88,3 +88,8 @@ def _get_brand_shop_url(self) -> None: self.brand_metadata[kor_brand_name].query_keyword.append(brand) except Exception: pass + +if __name__ == "__main__": + brand = Brand() + brand.crawl_brand_metadata() + print(brand.brand_metadata) \ No newline at end of file diff --git a/brickstudy_ingestion/src/scrapper/utils.py b/brickstudy_ingestion/src/scrapper/utils.py index 935290c..22dcec7 100644 --- a/brickstudy_ingestion/src/scrapper/utils.py +++ b/brickstudy_ingestion/src/scrapper/utils.py @@ -100,7 +100,12 @@ def dict_partitioner(data: dict, level: int): start = end -def write_local_as_json(data, file_path, file_name): +def write_local_as_json(data: dict, file_path: str, file_name: str): + """ + data : dictionary with the dataclass value + file_path : directory string where the json file created + file_name : file name without extension + """ from dataclasses import asdict import json import os From 79675029bafd0a265ea70c01e04b02570c9303cf Mon Sep 17 00:00:00 2001 From: seoyeong200 Date: Wed, 20 Nov 2024 18:04:29 +0900 Subject: [PATCH 31/31] =?UTF-8?q?bug=20:=20oliveyoung=20item=20=EC=88=98?= =?UTF-8?q?=EC=A7=91=20=EC=BD=94=EB=93=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/scrapper/oliveyoung_items.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py index 9de3cdb..ea8e29f 100644 --- a/brickstudy_ingestion/src/scrapper/oliveyoung_items.py +++ b/brickstudy_ingestion/src/scrapper/oliveyoung_items.py @@ -20,16 +20,17 @@ def __init__(self, brand_name: str, brand_url: str): self.item_id = None self.driver = webdriver.Chrome() - def crawl_items(self): + def crawl_total_items(self): # brand 페이지에서 전체 item 정보들 수집 self.driver.get(self.brand_url) self._get_items() + def crawl_reviews_in_each_items(self, item_id: str): # 각 item 페이지에서 리뷰 수집 - for item_id in self.data.keys(): - self.item_id = 
item_id - self.driver.get(self.data[item_id].item_detail_url) - self._get_reviews() + self.item_id = item_id + item_url = self.data[item_id].item_detail_url + self.driver.get(item_url) + self._get_reviews() def _get_items(self) -> None: """ @@ -43,12 +44,9 @@ def _get_items(self) -> None: for next_page in next_pages: try: self.driver.execute_script("arguments[0].click();", next_page) - time.sleep(random.randrange(5, 10) + random.random()) - response = requests.get(self.brand_url) - if response.status_code != 200: - time.sleep(10) + time.sleep(random.randrange(5, 7) + random.random()) except: - time.sleep(10) + time.sleep(2) self._get_products() def _get_products(self) -> None: @@ -78,7 +76,7 @@ def __click_review_button(self) -> None: review_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a.goods_reputation[data-attr="상품상세^상품상세_SortingTab^리뷰"]') self.driver.execute_script("arguments[0].scrollIntoView(true);", review_button_element) self.driver.execute_script("arguments[0].click();", review_button_element) - time.sleep(random.randint(2, 4)) + time.sleep(random.randint(1, 3)) except Exception as e: print(f"리뷰 버튼 클릭 실패: {e}") @@ -87,7 +85,7 @@ def __click_latest_button(self) -> None: latest_button_element = self.driver.find_element(By.CSS_SELECTOR, 'a[data-sort-type-code="latest"][data-attr="상품상세^리뷰정렬^최신순"]') self.driver.execute_script("arguments[0].scrollIntoView(true);", latest_button_element) self.driver.execute_script("arguments[0].click();", latest_button_element) - time.sleep(random.randint(2, 4)) + time.sleep(random.randint(1, 3)) except Exception as e: print(f"최신순 버튼 클릭 실패: {e}") @@ -100,7 +98,7 @@ def __get_reviews_with_page_moving(self): for i in range(len(next_pages)): try: self.driver.execute_script("arguments[0].click();", next_pages[i]) - time.sleep(random.randrange(5, 10) + random.random()) + time.sleep(random.randrange(3, 5) + random.random()) except: print("exception block in page moving") time.sleep(3) @@ -121,6 +119,12 @@ def __get_reviews_in_each_page(self): brand_name = "토리든" brand_url = "https://www.oliveyoung.co.kr/store/display/getBrandShopDetail.do?onlBrndCd=A002820&t_page=%EC%83%81%ED%92%88%EC%83%81%EC%84%B8&t_click=%EB%B8%8C%EB%9E%9C%EB%93%9C%EA%B4%80_%EC%83%81%EB%8B%A8&t_brand_name=%ED%86%A0%EB%A6%AC%EB%93%A0" item_x = Items(brand_name, brand_url) - item_x.crawl_items() + item_x.crawl_total_items() + print("crawl total item is done") + print(item_x.data) - write_local_as_json(item_x.data, './', 'toridn') + item_list = item_x.data.keys() + for idx, test_item in enumerate(item_list): + item_x.crawl_reviews_in_each_items(item_id=test_item) + if idx % 4 == 0: + write_local_as_json(item_x.data, './logs', f"{brand_name}_{idx}")
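
A usage note on the retry helper introduced in PATCH 22 (http_429_handler.py): get_url_with_tenacity_ retries only on HTTP 429, honours the Retry-After header when the server sends one, and raises tenacity.RetryError once its three attempts are exhausted, so any caller still needs its own fallback. The sketch below is a minimal, hypothetical wrapper assuming the module path used in the patches; probe_url is not part of the repository.

# Minimal usage sketch (hypothetical wrapper around the helper added in PATCH 22).
import urllib.error

from tenacity import RetryError

from src.scrapper.http_429_handler import get_url_with_tenacity_


def probe_url(url: str) -> bool:
    """Return True when the URL answers 200, waiting out HTTP 429 responses."""
    try:
        response = get_url_with_tenacity_(url)  # retries up to 3 times on 429
        return response.getcode() == 200
    except RetryError:
        # All three attempts hit 429; let the caller slow the crawl down instead.
        return False
    except urllib.error.URLError:
        # Non-429 network errors (404, DNS failure, ...) are not retried by the helper.
        return False


if __name__ == "__main__":
    print(probe_url("https://www.instagram.com/"))

A wrapper like this could gate driver.get calls the same way the ins_url.py and ins_data.py patches do before loading tag and post URLs, keeping rate-limit backoff in one place instead of scattering sleeps through the crawler.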