From da9a16c85220b27cfa3ab75a97a3089911ad43ad Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 21 Nov 2025 22:00:24 +0000 Subject: [PATCH] Fix scraper errors by implementing cloudscraper to bypass Cloudflare protection Changes: - Replace requests with cloudscraper in all scraper modules (match.py, player.py, series.py, summary.py) - Add cloudscraper dependency to requirements.txt - Fix bug in player.py line 196: changed self.headers() to self.headers - Remove unused dateparser import from player.py - All modules now use cloudscraper.create_scraper() to handle Cloudflare anti-bot protection This update addresses 403 Access Denied errors caused by Cloudflare bot detection. The scraper will now work from environments where ESPN Cricinfo is accessible. --- espncricinfo/match.py | 10 ++++++---- espncricinfo/player.py | 18 +++++++++--------- espncricinfo/series.py | 5 +++-- espncricinfo/summary.py | 5 +++-- requirements.txt | 1 + 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/espncricinfo/match.py b/espncricinfo/match.py index e96bc8a..72aeaf2 100644 --- a/espncricinfo/match.py +++ b/espncricinfo/match.py @@ -1,5 +1,5 @@ import json -import requests +import cloudscraper from bs4 import BeautifulSoup from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError @@ -10,6 +10,7 @@ def __init__(self, match_id): self.match_url = "https://www.espncricinfo.com/matches/engine/match/{0}.html".format(str(match_id)) self.json_url = "https://www.espncricinfo.com/matches/engine/match/{0}.json".format(str(match_id)) self.headers = {'user-agent': 'Mozilla/5.0'} + self.scraper = cloudscraper.create_scraper() self.json = self.get_json() self.html = self.get_html() self.comms_json = self.get_comms_json() @@ -90,7 +91,7 @@ def __repr__(self): return (f'{self.__class__.__name__}('f'{self.match_id!r})') def get_json(self): - r = requests.get(self.json_url,headers=self.headers) + r = self.scraper.get(self.json_url, headers=self.headers) if r.status_code == 404: raise MatchNotFoundError elif 'Scorecard not yet available' in r.text: @@ -99,7 +100,7 @@ def get_json(self): return r.json() def get_html(self): - r = requests.get(self.match_url,headers=self.headers) + r = self.scraper.get(self.match_url, headers=self.headers) if r.status_code == 404: raise MatchNotFoundError else: @@ -432,6 +433,7 @@ def get_recent_matches(date=None): url = "https://www.espncricinfo.com/ci/engine/match/index.html?date=%sview=week" % date else: url = "https://www.espncricinfo.com/ci/engine/match/index.html?view=week" - r = requests.get(url,headers={'user-agent': 'Mozilla/5.0'}) + scraper = cloudscraper.create_scraper() + r = scraper.get(url, headers={'user-agent': 'Mozilla/5.0'}) soup = BeautifulSoup(r.text, 'html.parser') return [x['href'].split('/',4)[4].split('.')[0] for x in soup.findAll('a', href=True, text='Scorecard')] diff --git a/espncricinfo/player.py b/espncricinfo/player.py index 6b33b7e..6de11e2 100644 --- a/espncricinfo/player.py +++ b/espncricinfo/player.py @@ -1,6 +1,5 @@ -import requests +import cloudscraper from bs4 import BeautifulSoup -import dateparser from espncricinfo.exceptions import PlayerNotFoundError from espncricinfo.match import Match import csv @@ -13,6 +12,7 @@ def __init__(self, player_id): self.json_url = "http://core.espnuk.org/v2/sports/cricket/athletes/{0}".format(str(player_id)) self.new_json_url = "https://hs-consumer-api.espncricinfo.com/v1/pages/player/home?playerId={0}".format(str(player_id)) self.headers = {'user-agent': 'Mozilla/5.0'} + self.scraper = cloudscraper.create_scraper() self.parsed_html = self.get_html() self.json = self.get_json() self.new_json = self.get_new_json() @@ -29,21 +29,21 @@ def __init__(self, player_id): self.major_teams = self._major_teams() def get_html(self): - r = requests.get(self.url, headers=self.headers) + r = self.scraper.get(self.url, headers=self.headers) if r.status_code == 404: raise PlayerNotFoundError else: return BeautifulSoup(r.text, 'html.parser') def get_json(self): - r = requests.get(self.json_url, headers=self.headers) + r = self.scraper.get(self.json_url, headers=self.headers) if r.status_code == 404: raise PlayerNotFoundError else: return r.json() - + def get_new_json(self): - r = requests.get(self.new_json_url, headers=self.headers) + r = self.scraper.get(self.new_json_url, headers=self.headers) if r.status_code == 404: raise PlayerNotFoundError else: @@ -127,7 +127,7 @@ def get_career_averages(self, file_name=None, match_format=11, data_type='allrou self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_career_averages.csv" self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type}" - html_doc = requests.get(self.url, headers=self.headers) + html_doc = self.scraper.get(self.url, headers=self.headers) soup = BeautifulSoup(html_doc.text, 'html.parser') tables = soup.find_all("table")[2] table_rows = tables.find_all("tr") @@ -159,7 +159,7 @@ def get_career_summary(self, file_name=None, match_format=11, data_type='allroun self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_career_summary.csv" self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type}" - html_doc = requests.get(self.url, headers=self.headers) + html_doc = self.scraper.get(self.url, headers=self.headers) soup = BeautifulSoup(html_doc.text, 'html.parser') tables = soup.find_all("table")[3] table_rows = tables.find_all("tr") @@ -193,7 +193,7 @@ def get_data(self, file_name=None, match_format=11, data_type='allround', view=' self.file_name = f"{self.player_id}_{self.match_format}_{self.data_type}_{self.view}.csv" self.url=f"https://stats.espncricinfo.com/ci/engine/player/{self.player_id}.html?class={self.match_format};template=results;type={self.data_type};view={self.view}" - html_doc = requests.get(self.url, headers=self.headers()) + html_doc = self.scraper.get(self.url, headers=self.headers) soup = BeautifulSoup(html_doc.text, 'html.parser') tables = soup.find_all("table")[3] table_rows = tables.find_all("tr") diff --git a/espncricinfo/series.py b/espncricinfo/series.py index 7999d5e..444b920 100644 --- a/espncricinfo/series.py +++ b/espncricinfo/series.py @@ -1,4 +1,4 @@ -import requests +import cloudscraper from bs4 import BeautifulSoup from espncricinfo.exceptions import MatchNotFoundError, NoSeriesError @@ -10,6 +10,7 @@ def __init__(self, series_id): self.events_url = "http://core.espnuk.org/v2/sports/cricket/leagues/{0}/events".format(str(series_id)) self.seasons_url = "http://core.espnuk.org/v2/sports/cricket/leagues/{0}/seasons".format(str(series_id)) self.headers = {'user-agent': 'Mozilla/5.0'} + self.scraper = cloudscraper.create_scraper() self.json = self.get_json(self.json_url) self.seasons = self._get_seasons() self.years = self._get_years_from_seasons() @@ -26,7 +27,7 @@ def __init__(self, series_id): self.events = self._build_events() def get_json(self, url): - r = requests.get(url,headers=self.headers) + r = self.scraper.get(url, headers=self.headers) if r.status_code == 404: raise "Not Found" else: diff --git a/espncricinfo/summary.py b/espncricinfo/summary.py index eca242f..f90006d 100644 --- a/espncricinfo/summary.py +++ b/espncricinfo/summary.py @@ -1,4 +1,4 @@ -import requests +import cloudscraper from bs4 import BeautifulSoup from espncricinfo.match import Match @@ -7,12 +7,13 @@ class Summary(object): def __init__(self): self.url = "http://static.cricinfo.com/rss/livescores.xml" self.headers = {'user-agent': 'Mozilla/5.0'} + self.scraper = cloudscraper.create_scraper() self.xml = self.get_xml() self.match_ids = self._match_ids() self.matches = self._build_matches() def get_xml(self): - r = requests.get(self.url, headers=self.headers) + r = self.scraper.get(self.url, headers=self.headers) if r.status_code == 404: raise MatchNotFoundError else: diff --git a/requirements.txt b/requirements.txt index cf8a326..e060d23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ beautifulsoup4==4.9.1 +cloudscraper>=1.2.71 dateparser==1.1.6 jdatetime==3.6.2 python-dateutil==2.8.1