85 commits
687dd3d
Do not use cached store_map second time
ddxv Nov 10, 2025
e5812f6
Merge branch 'main' of ssh://github.com/ddxv/adscrawler
ddxv Nov 10, 2025
dc94704
Fix delete and insert for keywords, needed an extra dimension
ddxv Nov 10, 2025
7ac310b
Dump schema changes for keywords
ddxv Nov 10, 2025
d275d72
Dump schema changes for keywords
ddxv Nov 10, 2025
a310355
store click urls into creative records table
ddxv Nov 11, 2025
29c44c2
store click urls into creative records table
ddxv Nov 11, 2025
8bab6ac
dont fail if url missing path
ddxv Nov 11, 2025
5e6d135
Small additions for stopwords, didnt rerun
ddxv Nov 11, 2025
f6e5092
clean waydroid media dir for leftover files
ddxv Nov 13, 2025
9096ba8
Fix keyword crawling
ddxv Nov 13, 2025
ebdc174
backoff for keyword failures
ddxv Nov 14, 2025
90301bf
Fix null values in apple developer columns causing incorrect develope…
ddxv Nov 14, 2025
62c36aa
Remove unused countries map
ddxv Nov 16, 2025
3ba7615
RotatingLogHandler inconsistently rotates files if multiple processes…
ddxv Nov 16, 2025
10132e7
Clarify search for keywords
ddxv Nov 16, 2025
2ba0f09
Add icon_url_100 to result to avoid needing to pass original df to pr…
ddxv Nov 16, 2025
a2c4064
Undo rename, function used by frontend
ddxv Nov 16, 2025
7d9586b
Clarify variable store should not be none
ddxv Nov 16, 2025
a9f0c6b
Clarify variable store should not be none
ddxv Nov 16, 2025
777a295
Add group to constantly process new apps rather than wait for long ru…
ddxv Nov 16, 2025
67ef812
Better log messages when querying ranks
ddxv Nov 16, 2025
4fd3d66
Refresh schema
Nov 24, 2025
4bd7742
Refresh schema
Nov 24, 2025
11ae739
Refresh schema
Nov 24, 2025
4796028
Drop some long b64 strings from ios insert
Dec 1, 2025
5daf3db
Reduce overhead memory in individual processes
Dec 1, 2025
a93ab86
Remove testing string
Dec 1, 2025
202b7ac
Add back in threadpoolexecutor but with the workers as well
Dec 2, 2025
a921f21
Add back in threadpoolexecutor but with the workers as well
Dec 2, 2025
677e56b
Try fewer default threads
Dec 3, 2025
db9915d
avoid hammering when using threads
Dec 3, 2025
18de49a
avoid hammering when using threads
Dec 3, 2025
a2692f2
Threading is causing issues with Google, which uses urllib rather tha…
Dec 3, 2025
64794ee
Merge branch 'appgoblin-dev:main' into main
ddxv Dec 4, 2025
2d4c3be
Organize store_scrape log messages
Dec 4, 2025
b9da731
Merge branch 'main' of github.com:ddxv/adscrawler
Dec 4, 2025
4eaf728
Organize store_scrape log messages
Dec 4, 2025
58da15e
Add slight delay and jitter
Dec 4, 2025
36d89dc
Skip missing base64s
Jan 21, 2026
9203d36
Stop tracking new_company
Jan 21, 2026
afc00eb
Add example
Jan 21, 2026
f142086
Use new fdw table
Jan 21, 2026
06bd503
Update apps to crawl more often
Jan 27, 2026
c9f62be
WIP: new global keywords logic
Jan 27, 2026
d169edf
WIP: new global keywords logic
Jan 27, 2026
7d5ae42
Update apps to crawl more often
Jan 27, 2026
57fcbb8
Dont retry apps that appear to have not updated recently
Jan 29, 2026
7a1700f
Use new logging table to cut down on query time for apps to keyword e…
Jan 30, 2026
f2782f1
Add optional dependencies related to keyword cleaning
Jan 30, 2026
458c607
Update query apps to download logic
Jan 30, 2026
ae42040
Working on new global keywords logic
Jan 30, 2026
b1b4578
Better processing of app keyword extraction
Jan 30, 2026
b04e019
Allow bad version strings
Jan 31, 2026
ba68121
Allow bad version strings
Jan 31, 2026
0fde485
Allow bad version strings
Jan 31, 2026
5d6a4f3
Clean weekly rank table after importing data
Feb 2, 2026
7afef48
Clean weekly rank table after importing data
Feb 2, 2026
cfa04a2
New schema dump with moving mvs to using sdk ids in place of company ids
Feb 3, 2026
b63bb1b
Merge pull request #3 from ddxv/db-use-sdk-id
ddxv Feb 3, 2026
9f7be15
interim schema
Feb 4, 2026
93bf717
interim schema
Feb 4, 2026
23f3cb7
interim schema
Feb 4, 2026
6bfb9f7
interim schema
Feb 4, 2026
6e16275
New schema
Feb 5, 2026
ab1d9d1
Prep Schema dump
Feb 6, 2026
d139bed
Scrape apple HTML for ad supported and in apps
Feb 6, 2026
a02e270
default to none
Feb 6, 2026
6cdfffd
Remove unused columns
Feb 6, 2026
db9912f
Fix apple global metrics by querying historical country data and comb…
Feb 11, 2026
f7c23b8
Fixes for pulling apple data
Feb 12, 2026
72c9864
Dump recent MV changes
Feb 12, 2026
72297e1
Dump recent MV changes
Feb 12, 2026
4d1c4a4
Fix incorrect passing of args
Feb 12, 2026
a4c49f1
Use search queries top priority for keyword searches
Feb 12, 2026
35545e1
Use search queries top priority for keyword searches
Feb 12, 2026
9a8641f
Schema checkin
Feb 13, 2026
bc9db17
new tables
Feb 14, 2026
7492a60
Rename table
Feb 14, 2026
1d92ca1
Calculate WAU
Feb 14, 2026
6f485fa
include mau and logging
Feb 14, 2026
b8b5d7f
Add spot for it to run daily
Feb 14, 2026
ae96f1f
speed up query
Feb 15, 2026
78178b6
Add WIP estimate for installs per country based on ratings / reviews …
Feb 17, 2026
42bca06
Estimate tier percentages based on installs
Feb 17, 2026
107 changes: 28 additions & 79 deletions adscrawler/app_stores/apple.py
@@ -183,25 +183,24 @@ def scrape_store_html(store_id: str, country: str) -> dict:
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)

html = response.text

if response.status_code != 200:
logger.error(f"Failed to retrieve the page: {response.status_code}")
return {}

soup = BeautifulSoup(response.text, "html.parser")

in_app_purchase_element = soup.find(
"li", class_="inline-list__item--bulleted", string="Offers In-App Purchases"
"p", class_=["attributes"], string=re.compile(r"Purchases", re.I)
)
has_in_app_purchases = in_app_purchase_element is not None

try:
privacy_details = get_privacy_details(html, country, store_id)
except Exception as e:
logger.error(f"Failed to get privacy details for {store_id=} {country=} {e}")
privacy_details = None
has_in_app_purchases = in_app_purchase_element is not None

has_third_party_advertising = "THIRD_PARTY_ADVERTISING" in str(privacy_details)
purpose_section = soup.find(
"section", class_=lambda classes: classes and "purpose-section" in classes
)
has_third_party_advertising = (
"third-party advertising" in purpose_section.get_text(strip=True).lower()
)

urls = get_urls_from_html(soup)
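
The new checks above lean on two BeautifulSoup matching features: a compiled regex for the tag's string, and a callable class filter. A minimal sketch against invented sample markup (the real App Store HTML is more complex):

```python
import re

from bs4 import BeautifulSoup

# Invented sample markup, standing in for the real App Store page.
sample_html = """
<p class="attributes">Offers In-App Purchases</p>
<section class="l-content purpose-section">Data Used for Third-Party Advertising</section>
"""
soup = BeautifulSoup(sample_html, "html.parser")

# string=re.compile(..., re.I) matches the tag's text case-insensitively,
# so "Offers In-App Purchases" is found via the substring "Purchases".
iap = soup.find("p", class_=["attributes"], string=re.compile(r"Purchases", re.I))
print(iap is not None)  # True

# A callable passed to class_ is called once per CSS class (and with None for
# tags that have no class attribute), so this matches any <section> whose
# class list contains "purpose-section", whatever other classes it carries.
purpose = soup.find(
    "section", class_=lambda classes: classes and "purpose-section" in classes
)
print("third-party advertising" in purpose.get_text(strip=True).lower())  # True
```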

@@ -229,70 +228,6 @@ def get_urls_from_html(soup: BeautifulSoup) -> dict:
return urls


def get_privacy_details(html: str, country: str, store_id: str) -> dict:
"""
Get privacy details for an iOS app from the App Store.

Args:
html: HTML of the App Store page
country: Country code (default: 'US')
store_id: App Store ID of the app

Returns:
True if the app has third-party advertising, False otherwise
"""

# Extract the token using regex
reg_exp = r"token%22%3A%22([^%]+)%22%7D"
match = re.search(reg_exp, html)
if not match:
raise ValueError("Could not extract token from App Store page")

token = match.group(1)

# Make request to the API for privacy details
api_url = f"https://amp-api-edge.apps.apple.com/v1/catalog/{country}/apps/{store_id}?platform=web&fields=privacyDetails"
api_headers = {
"Origin": "https://apps.apple.com",
"Authorization": f"Bearer {token}",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

response = requests.get(api_url, headers=api_headers)
response.raise_for_status()

data = response.json()

if not data.get("data") or len(data["data"]) == 0:
raise ValueError("App not found (404)")
privacy_details: dict = data["data"][0]["attributes"]["privacyDetails"]
return privacy_details


def find_privacy_policy_id(soup: BeautifulSoup) -> str | None:
privacy_learn_more = soup.find("p", class_="app-privacy__learn-more")

if privacy_learn_more:
# Get the anchor tag inside the paragraph
link = privacy_learn_more.find("a")

if link:
# Extract the full URL
url = link.get("href")
print(f"URL: {url}")

# Extract the ID from the URL
# The URL format is https://apps.apple.com/story/id1538632801
if "id" in url:
# Find the position where "id" starts and extract everything after it
id_position = url.find("id")
if id_position != -1:
id_value: str = url[id_position + 2 :] # +2 to skip the "id" prefix
print(f"ID: {id_value}") # This will print: 1538632801
return id_value
return None


def get_developer_url(result: dict, urls: dict) -> str:
"""
Decide if we should crawl the store html for the developer url.
@@ -328,12 +263,14 @@ def get_developer_url(result: dict, urls: dict) -> str:
return final_url


def scrape_app_ios(store_id: str, country: str, language: str) -> dict:
def scrape_app_ios(
store_id: str, country: str, language: str, scrape_html: bool = False
) -> dict:
"""Scrape iOS app details from the App Store.
yt_us = scrape_app_ios("544007664", "us", language="en")
yt_de = scrape_app_ios("544007664", "de", language="en")

# SAME FOR ALL COUNTRIES
## SAME FOR ALL COUNTRIES
yt_de['sellerName'] == yt_us['sellerName']
yt_us['currentVersionReleaseDate'] == yt_de['currentVersionReleaseDate']

@@ -344,7 +281,7 @@ def scrape_app_ios(store_id: str, country: str, language: str) -> dict:
yt_de['user_ratings'] != yt_us['user_ratings']
yt_de['description'] != yt_us['description']

# These vary by country but are also the same as each other?
### These vary by country but are also the same as each other?
yt_de['userRatingCount'] == yt_de['userRatingCountForCurrentVersion']


@@ -355,7 +292,9 @@ def scrape_app_ios(store_id: str, country: str, language: str) -> dict:
result_dict: dict = scraper.get_app_details(
store_id, country=country, add_ratings=True, timeout=10, lang=language
)
logger.info(f"store=2 {country=} {language=} {store_id=} ios store scraped")
if scrape_html:
result_dict = scrape_itunes_additional_html(result_dict, store_id, country)
logger.debug(f"store=2 {country=} {language=} {store_id=} ios store scraped")
return result_dict
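
A hedged usage sketch of the new opt-in flag; the module path is assumed from the file location and the store ID comes from the docstring above:

```python
from adscrawler.app_stores.apple import scrape_app_ios

# Default path: a single lookup-API call, no extra HTTP request.
yt_us = scrape_app_ios("544007664", country="us", language="en")

# Opt-in path: also scrapes the App Store HTML, filling in in_app_purchases,
# ad_supported, sellerUrl and additional_html_scraped_at on the result.
yt_full = scrape_app_ios("544007664", country="us", language="en", scrape_html=True)
```

Note that `additional_html_scraped_at` is now set outside the try/except, so the attempt is recorded even when the HTML scrape fails.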


Expand All @@ -366,9 +305,9 @@ def scrape_itunes_additional_html(result: dict, store_id: str, country: str) ->
result["in_app_purchases"] = html_res["in_app_purchases"]
result["ad_supported"] = html_res["ad_supported"]
result["sellerUrl"] = get_developer_url(result, html_res["urls"])
result["additional_html_scraped_at"] = datetime.datetime.now(tz=datetime.UTC)
except Exception as e:
logger.warning(f"Failed to get developer url for {store_id=} {country=} {e}")
result["additional_html_scraped_at"] = datetime.datetime.now(tz=datetime.UTC)
return result


@@ -477,6 +416,16 @@ def clean_ios_app_df(df: pd.DataFrame) -> pd.DataFrame:
df.loc[
df["store_language_code"].str.startswith("zh-"), "store_language_code"
] = "zh"
# Fix .0 int to string issues
# Mixing nulls into an int column causes .0 to be appended to the string
problem_rows = df["developer_id"].str.contains(".0")
if problem_rows.any():
logger.warning(
f'Found {problem_rows.sum()} developer_id with ".0" suffix, fixing'
)
df.loc[problem_rows, "developer_id"] = (
df.loc[problem_rows, "developer_id"].str.split(".").str[0]
)
return df
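
The guard above works around a pandas quirk: a single null in an integer column upcasts it to float64, so a later string conversion leaves a ".0" suffix on every value. A toy reproduction with invented data:

```python
import pandas as pd

# A single None upcasts the int column to float64 ...
df = pd.DataFrame({"developer_id": [544007664, None]})
# ... so converting to string keeps the float formatting.
df["developer_id"] = df["developer_id"].astype(str)
print(df["developer_id"].tolist())  # ['544007664.0', 'nan']

# The same fix as in clean_ios_app_df: split on the dot and keep the prefix.
problem_rows = df["developer_id"].str.contains(".0")
df.loc[problem_rows, "developer_id"] = (
    df.loc[problem_rows, "developer_id"].str.split(".").str[0]
)
print(df["developer_id"].tolist())  # ['544007664', 'nan']
```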


18 changes: 8 additions & 10 deletions adscrawler/app_stores/google.py
@@ -16,23 +16,21 @@ def scrape_app_gp(store_id: str, country: str, language: str = "en") -> dict:
yt_us = scrape_app_gp("com.google.android.youtube", "us", language="en")
yt_de = scrape_app_gp("com.google.android.youtube", "mx", language="en")

# SAME FOR ALL COUNTRIES
## SAME FOR ALL COUNTRIES
yt_us["ratings"] == yt_de["ratings"]
yt_us["realInstalls"] == yt_de["realInstalls"]
yt_us["updated"] == yt_de["updated"]

# MOSTLY SAME FOR ALL COUNTRIES
# Almost always lower for smaller countries
# looks more like delays and incomplete (0s)
## MOSTLY SAME FOR ALL COUNTRIES
## Almost always lower for smaller countries
## looks more like delays and incomplete (0s)
yt_us["histogram"] == yt_de["histogram"]


## UNIQUE PER COUNTRY
yt_us["reviews"] != yt_de["reviews"]
yt_us["score"] != yt_de["score"]



## UNIQUE PER LANGUAGE
yt_us["description"] == yt_de["description"]
yt_us["description"] == yt_de_en["description"]
@@ -43,7 +41,7 @@ def scrape_app_gp(store_id: str, country: str, language: str = "en") -> dict:
country=country,
timeout=10,
)
logger.info(f"store=1 {country=} {language=} {store_id=} play store scraped")
logger.debug(f"store=1 {country=} {language=} {store_id=} play store scraped")
return result_dict
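
The docstring's per-country invariants can be spot-checked with a quick sketch (store ID from the docstring; uses the scrape_app_gp wrapper defined above, so it needs network access):

```python
yt_us = scrape_app_gp("com.google.android.youtube", "us", language="en")
yt_mx = scrape_app_gp("com.google.android.youtube", "mx", language="en")

# Global fields should agree across storefronts ...
print(yt_us["realInstalls"] == yt_mx["realInstalls"])  # expected: True
# ... while per-country fields generally differ.
print(yt_us["score"], yt_mx["score"])
```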


@@ -161,14 +159,14 @@ def get_js_data(filepath: str, is_json: bool = True) -> list[dict] | list:


def scrape_google_ranks(country: str) -> list[dict]:
logger.info("Scrape Google ranks start")
logger.info(f"Scrape Google ranks {country=} start")
filepath = f"/tmp/googleplay_json_{country}.txt"
try:
call_js_to_update_file(filepath, country)
except Exception as error:
logger.exception(f"JS pull failed with {error=}")
logger.exception(f"JS pull failed with {country=} {error=}")
ranked_dicts = get_js_data(filepath)
logger.info(f"Scrape Google ranks finished: {len(ranked_dicts)}")
logger.info(f"Scrape Google ranks {country=} finished: {len(ranked_dicts)}")
return ranked_dicts

