85 commits
687dd3d
Do not use cached store_map second time
ddxv Nov 10, 2025
e5812f6
Merge branch 'main' of ssh://github.com/ddxv/adscrawler
ddxv Nov 10, 2025
dc94704
Fix delete and insert for keywords, needed an extra dimension
ddxv Nov 10, 2025
7ac310b
Dump schema changes for keywords
ddxv Nov 10, 2025
d275d72
Dump schema changes for keywords
ddxv Nov 10, 2025
a310355
store click urls into creative records table
ddxv Nov 11, 2025
29c44c2
store click urls into creative records table
ddxv Nov 11, 2025
8bab6ac
dont fail if url missing path
ddxv Nov 11, 2025
5e6d135
Small additions for stopwords, didnt rerun
ddxv Nov 11, 2025
f6e5092
clean waydroid media dir for leftover files
ddxv Nov 13, 2025
9096ba8
Fix keyword crawling
ddxv Nov 13, 2025
ebdc174
backoff for keyword failures
ddxv Nov 14, 2025
90301bf
Fix null values in apple developer columns causing incorrect develope…
ddxv Nov 14, 2025
62c36aa
Remove unused countries map
ddxv Nov 16, 2025
3ba7615
RotatingLogHandler inconsistently rotates files if multiple processes…
ddxv Nov 16, 2025
10132e7
Clarify search for keywords
ddxv Nov 16, 2025
2ba0f09
Add icon_url_100 to result to avoid needing to pass original df to pr…
ddxv Nov 16, 2025
a2c4064
Undo rename, function used by frontend
ddxv Nov 16, 2025
7d9586b
Clarify variable store should not be none
ddxv Nov 16, 2025
a9f0c6b
Clarify variable store should not be none
ddxv Nov 16, 2025
777a295
Add group to constantly process new apps rather than wait for long ru…
ddxv Nov 16, 2025
67ef812
Better log messages when querying ranks
ddxv Nov 16, 2025
4fd3d66
Refresh schema
Nov 24, 2025
4bd7742
Refresh schema
Nov 24, 2025
11ae739
Refresh schema
Nov 24, 2025
4796028
Drop some long b64 strings from ios insert
Dec 1, 2025
5daf3db
Reduce overhead memory in individual processes
Dec 1, 2025
a93ab86
Remove testing string
Dec 1, 2025
202b7ac
Add back in threadpoolexecutor but with the workers as well
Dec 2, 2025
a921f21
Add back in threadpoolexecutor but with the workers as well
Dec 2, 2025
677e56b
Try fewer default threads
Dec 3, 2025
db9915d
avoid hammering when using threads
Dec 3, 2025
18de49a
avoid hammering when using threads
Dec 3, 2025
a2692f2
Threading is causing issues with Google, which uses urllib rather tha…
Dec 3, 2025
64794ee
Merge branch 'appgoblin-dev:main' into main
ddxv Dec 4, 2025
2d4c3be
Organize store_scrape log messages
Dec 4, 2025
b9da731
Merge branch 'main' of github.com:ddxv/adscrawler
Dec 4, 2025
4eaf728
Organize store_scrape log messages
Dec 4, 2025
58da15e
Add slight delay and jitter
Dec 4, 2025
36d89dc
Skip missing base64s
Jan 21, 2026
9203d36
Stop tracking new_company
Jan 21, 2026
afc00eb
Add example
Jan 21, 2026
f142086
Use new fdw table
Jan 21, 2026
06bd503
Update apps to crawl more often
Jan 27, 2026
c9f62be
WIP: new global keywords logic
Jan 27, 2026
d169edf
WIP: new global keywords logic
Jan 27, 2026
7d5ae42
Update apps to crawl more often
Jan 27, 2026
57fcbb8
Dont retry apps that appear to have not updated recently
Jan 29, 2026
7a1700f
Use new logging table to cut down on query time for apps to keyword e…
Jan 30, 2026
f2782f1
Add optional dependencies related to keyword cleaning
Jan 30, 2026
458c607
Update query apps to download logic
Jan 30, 2026
ae42040
Working on new global keywords logic
Jan 30, 2026
b1b4578
Better processing of app keyword extraction
Jan 30, 2026
b04e019
Allow bad version strings
Jan 31, 2026
ba68121
Allow bad version strings
Jan 31, 2026
0fde485
Allow bad version strings
Jan 31, 2026
5d6a4f3
Clean weekly rank table after importing data
Feb 2, 2026
7afef48
Clean weekly rank table after importing data
Feb 2, 2026
cfa04a2
New schema dump with moving mvs to using sdk ids in place of company ids
Feb 3, 2026
b63bb1b
Merge pull request #3 from ddxv/db-use-sdk-id
ddxv Feb 3, 2026
9f7be15
interim schema
Feb 4, 2026
93bf717
interim schema
Feb 4, 2026
23f3cb7
interim schema
Feb 4, 2026
6bfb9f7
interim schema
Feb 4, 2026
6e16275
New schema
Feb 5, 2026
ab1d9d1
Prep Schema dump
Feb 6, 2026
d139bed
Scrape apple HTML for ad supported and in apps
Feb 6, 2026
a02e270
default to none
Feb 6, 2026
6cdfffd
Remove unused columns
Feb 6, 2026
db9912f
Fix apple global metrics by querying historical country data and comb…
Feb 11, 2026
f7c23b8
Fixes for pulling apple data
Feb 12, 2026
72c9864
Dump recent MV changes
Feb 12, 2026
72297e1
Dump recent MV changes
Feb 12, 2026
4d1c4a4
Fix incorrect passing of args
Feb 12, 2026
a4c49f1
Use search queries top priority for keyword searches
Feb 12, 2026
35545e1
Use search queries top priority for keyword searches
Feb 12, 2026
9a8641f
Schema checkin
Feb 13, 2026
bc9db17
new tables
Feb 14, 2026
7492a60
Rename table
Feb 14, 2026
1d92ca1
Calculate WAU
Feb 14, 2026
6f485fa
include mau and logging
Feb 14, 2026
b8b5d7f
Add spot for it to run daily
Feb 14, 2026
ae96f1f
speed up query
Feb 15, 2026
78178b6
Add WIP estimate for installs per country based on ratings / reviews …
Feb 17, 2026
42bca06
Estimate tier percentages based on installs
Feb 17, 2026
107 changes: 28 additions & 79 deletions adscrawler/app_stores/apple.py
@@ -183,25 +183,24 @@ def scrape_store_html(store_id: str, country: str) -> dict:
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)

html = response.text

if response.status_code != 200:
logger.error(f"Failed to retrieve the page: {response.status_code}")
return {}

soup = BeautifulSoup(response.text, "html.parser")

in_app_purchase_element = soup.find(
"li", class_="inline-list__item--bulleted", string="Offers In-App Purchases"
"p", class_=["attributes"], string=re.compile(r"Purchases", re.I)
)
has_in_app_purchases = in_app_purchase_element is not None

try:
privacy_details = get_privacy_details(html, country, store_id)
except Exception as e:
logger.error(f"Failed to get privacy details for {store_id=} {country=} {e}")
privacy_details = None
has_in_app_purchases = in_app_purchase_element is not None

has_third_party_advertising = "THIRD_PARTY_ADVERTISING" in str(privacy_details)
purpose_section = soup.find(
"section", class_=lambda classes: classes and "purpose-section" in classes
)
has_third_party_advertising = (
"third-party advertising" in purpose_section.get_text(strip=True).lower()
)

urls = get_urls_from_html(soup)
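
The new checks above lean on two BeautifulSoup matching features: a compiled regex for the tag's string, and a callable class filter. A minimal sketch against invented sample markup (the real App Store HTML is more complex):

```python
import re

from bs4 import BeautifulSoup

# Invented sample markup, standing in for the real App Store page.
sample_html = """
<p class="attributes">Offers In-App Purchases</p>
<section class="l-content purpose-section">Data Used for Third-Party Advertising</section>
"""
soup = BeautifulSoup(sample_html, "html.parser")

# string=re.compile(..., re.I) matches the tag's text case-insensitively,
# so "Offers In-App Purchases" is found via the substring "Purchases".
iap = soup.find("p", class_=["attributes"], string=re.compile(r"Purchases", re.I))
print(iap is not None)  # True

# A callable passed to class_ is called once per CSS class (and with None for
# tags that have no class attribute), so this matches any <section> whose
# class list contains "purpose-section", whatever other classes it carries.
purpose = soup.find(
    "section", class_=lambda classes: classes and "purpose-section" in classes
)
print("third-party advertising" in purpose.get_text(strip=True).lower())  # True
```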

@@ -229,70 +228,6 @@ def get_urls_from_html(soup: BeautifulSoup) -> dict:
return urls


def get_privacy_details(html: str, country: str, store_id: str) -> dict:
"""
Get privacy details for an iOS app from the App Store.

Args:
html: HTML of the App Store page
country: Country code (default: 'US')
store_id: App Store ID of the app

Returns:
True if the app has third-party advertising, False otherwise
"""

# Extract the token using regex
reg_exp = r"token%22%3A%22([^%]+)%22%7D"
match = re.search(reg_exp, html)
if not match:
raise ValueError("Could not extract token from App Store page")

token = match.group(1)

# Make request to the API for privacy details
api_url = f"https://amp-api-edge.apps.apple.com/v1/catalog/{country}/apps/{store_id}?platform=web&fields=privacyDetails"
api_headers = {
"Origin": "https://apps.apple.com",
"Authorization": f"Bearer {token}",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

response = requests.get(api_url, headers=api_headers)
response.raise_for_status()

data = response.json()

if not data.get("data") or len(data["data"]) == 0:
raise ValueError("App not found (404)")
privacy_details: dict = data["data"][0]["attributes"]["privacyDetails"]
return privacy_details


def find_privacy_policy_id(soup: BeautifulSoup) -> str | None:
privacy_learn_more = soup.find("p", class_="app-privacy__learn-more")

if privacy_learn_more:
# Get the anchor tag inside the paragraph
link = privacy_learn_more.find("a")

if link:
# Extract the full URL
url = link.get("href")
print(f"URL: {url}")

# Extract the ID from the URL
# The URL format is https://apps.apple.com/story/id1538632801
if "id" in url:
# Find the position where "id" starts and extract everything after it
id_position = url.find("id")
if id_position != -1:
id_value: str = url[id_position + 2 :] # +2 to skip the "id" prefix
print(f"ID: {id_value}") # This will print: 1538632801
return id_value
return None


def get_developer_url(result: dict, urls: dict) -> str:
"""
Decide if we should crawl the store html for the developer url.
@@ -328,12 +263,14 @@ def get_developer_url(result: dict, urls: dict) -> str:
return final_url


def scrape_app_ios(store_id: str, country: str, language: str) -> dict:
def scrape_app_ios(
store_id: str, country: str, language: str, scrape_html: bool = False
) -> dict:
"""Scrape iOS app details from the App Store.
yt_us = scrape_app_ios("544007664", "us", language="en")
yt_de = scrape_app_ios("544007664", "de", language="en")

# SAME FOR ALL COUNTRIES
## SAME FOR ALL COUNTRIES
yt_de['sellerName'] == yt_us['sellerName']
yt_us['currentVersionReleaseDate'] == yt_de['currentVersionReleaseDate']

@@ -344,7 +281,7 @@ def scrape_app_ios(store_id: str, country: str, language: str) -> dict:
yt_de['user_ratings'] != yt_us['user_ratings']
yt_de['description'] != yt_us['description']

# These vary by country but are also the same as each other?
### These vary by country but are also the same as each other?
yt_de['userRatingCount'] == yt_de['userRatingCountForCurrentVersion']


@@ -355,7 +292,9 @@ def scrape_app_ios(store_id: str, country: str, language: str) -> dict:
result_dict: dict = scraper.get_app_details(
store_id, country=country, add_ratings=True, timeout=10, lang=language
)
logger.info(f"store=2 {country=} {language=} {store_id=} ios store scraped")
if scrape_html:
result_dict = scrape_itunes_additional_html(result_dict, store_id, country)
logger.debug(f"store=2 {country=} {language=} {store_id=} ios store scraped")
return result_dict
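
A hedged usage sketch of the new opt-in flag; the module path is assumed from the file location and the store ID comes from the docstring above:

```python
from adscrawler.app_stores.apple import scrape_app_ios

# Default path: a single lookup-API call, no extra HTTP request.
yt_us = scrape_app_ios("544007664", country="us", language="en")

# Opt-in path: also scrapes the App Store HTML, filling in in_app_purchases,
# ad_supported, sellerUrl and additional_html_scraped_at on the result.
yt_full = scrape_app_ios("544007664", country="us", language="en", scrape_html=True)
```

Note that `additional_html_scraped_at` is now set outside the try/except, so the attempt is recorded even when the HTML scrape fails.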


Expand All @@ -366,9 +305,9 @@ def scrape_itunes_additional_html(result: dict, store_id: str, country: str) ->
result["in_app_purchases"] = html_res["in_app_purchases"]
result["ad_supported"] = html_res["ad_supported"]
result["sellerUrl"] = get_developer_url(result, html_res["urls"])
result["additional_html_scraped_at"] = datetime.datetime.now(tz=datetime.UTC)
except Exception as e:
logger.warning(f"Failed to get developer url for {store_id=} {country=} {e}")
result["additional_html_scraped_at"] = datetime.datetime.now(tz=datetime.UTC)
return result


@@ -477,6 +416,16 @@ def clean_ios_app_df(df: pd.DataFrame) -> pd.DataFrame:
df.loc[
df["store_language_code"].str.startswith("zh-"), "store_language_code"
] = "zh"
# Fix .0 int to string issues
# Mixing nulls into an int column causes .0 to be appended to the string
problem_rows = df["developer_id"].str.contains(".0")
if problem_rows.any():
logger.warning(
f'Found {problem_rows.sum()} developer_id with ".0" suffix, fixing'
)
df.loc[problem_rows, "developer_id"] = (
df.loc[problem_rows, "developer_id"].str.split(".").str[0]
)
return df
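
The guard above works around a pandas quirk: a single null in an integer column upcasts it to float64, so a later string conversion leaves a ".0" suffix on every value. A toy reproduction with invented data:

```python
import pandas as pd

# A single None upcasts the int column to float64 ...
df = pd.DataFrame({"developer_id": [544007664, None]})
# ... so converting to string keeps the float formatting.
df["developer_id"] = df["developer_id"].astype(str)
print(df["developer_id"].tolist())  # ['544007664.0', 'nan']

# The same fix as in clean_ios_app_df: split on the dot and keep the prefix.
problem_rows = df["developer_id"].str.contains(".0")
df.loc[problem_rows, "developer_id"] = (
    df.loc[problem_rows, "developer_id"].str.split(".").str[0]
)
print(df["developer_id"].tolist())  # ['544007664', 'nan']
```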


18 changes: 8 additions & 10 deletions adscrawler/app_stores/google.py
@@ -16,23 +16,21 @@ def scrape_app_gp(store_id: str, country: str, language: str = "en") -> dict:
yt_us = scrape_app_gp("com.google.android.youtube", "us", language="en")
yt_de = scrape_app_gp("com.google.android.youtube", "mx", language="en")

# SAME FOR ALL COUNTRIES
## SAME FOR ALL COUNTRIES
yt_us["ratings"] == yt_de["ratings"]
yt_us["realInstalls"] == yt_de["realInstalls"]
yt_us["updated"] == yt_de["updated"]

# MOSTLY SAME FOR ALL COUNTRIES
# Almost always lower for smaller countries
# looks more like delays and incomplete (0s)
## MOSTLY SAME FOR ALL COUNTRIES
## Almost always lower for smaller countries
## looks more like delays and incomplete (0s)
yt_us["histogram"] == yt_de["histogram"]


## UNIQUE PER COUNTRY
yt_us["reviews"] != yt_de["reviews"]
yt_us["score"] != yt_de["score"]



## UNIQUE PER LANGUAGE
yt_us["description"] == yt_de["description"]
yt_us["description"] == yt_de_en["description"]
@@ -43,7 +41,7 @@ def scrape_app_gp(store_id: str, country: str, language: str = "en") -> dict:
country=country,
timeout=10,
)
logger.info(f"store=1 {country=} {language=} {store_id=} play store scraped")
logger.debug(f"store=1 {country=} {language=} {store_id=} play store scraped")
return result_dict
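
The docstring's per-country invariants can be spot-checked with a quick sketch (store ID from the docstring; uses the scrape_app_gp wrapper defined above, so it needs network access):

```python
yt_us = scrape_app_gp("com.google.android.youtube", "us", language="en")
yt_mx = scrape_app_gp("com.google.android.youtube", "mx", language="en")

# Global fields should agree across storefronts ...
print(yt_us["realInstalls"] == yt_mx["realInstalls"])  # expected: True
# ... while per-country fields generally differ.
print(yt_us["score"], yt_mx["score"])
```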


@@ -161,14 +159,14 @@ def get_js_data(filepath: str, is_json: bool = True) -> list[dict] | list:


def scrape_google_ranks(country: str) -> list[dict]:
logger.info("Scrape Google ranks start")
logger.info(f"Scrape Google ranks {country=} start")
filepath = f"/tmp/googleplay_json_{country}.txt"
try:
call_js_to_update_file(filepath, country)
except Exception as error:
logger.exception(f"JS pull failed with {error=}")
logger.exception(f"JS pull failed with {country=} {error=}")
ranked_dicts = get_js_data(filepath)
logger.info(f"Scrape Google ranks finished: {len(ranked_dicts)}")
logger.info(f"Scrape Google ranks {country=} finished: {len(ranked_dicts)}")
return ranked_dicts

