From 4dfecbb1e0154d273567ec4782a25cce9f3b8dd5 Mon Sep 17 00:00:00 2001 From: Alex Doytchinov Date: Sun, 7 Jun 2020 22:21:28 -0700 Subject: [PATCH 01/10] Create scraper.py --- scraper.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 scraper.py diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..7a23185 --- /dev/null +++ b/scraper.py @@ -0,0 +1,42 @@ +import numpy as np +import requests +import pandas as pd +from urllib.request import urlopen +from bs4 import BeautifulSoup +import re + +#columns we want to extract +features = {'player','season','team_id','pts_per_g'} + +#dataframe to store temp and final data +final_df = pd.DataFrame(columns = features) +stats = pd.DataFrame(columns = features) + +#numpy tool to iterate over for loop by 100 (8 pages in this example) +pages = np.arange(0,900,100) +for i in pages: + #get url, use .get() to pass into Beautiful soup parser + #allows us to look at metadata + url = "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=per_game&per_minute_base=36&per_poss_base=100&lg_id=NBA&is_playoffs=N&year_min=1985&year_max=2020&franch_id=&season_start=1&season_end=-1&age_min=0&age_max=99&shoot_hand=&height_min=0&height_max=99&birth_country_is=Y&birth_country=&birth_state=&college_id=&draft_year=&is_active=&debut_yr_nba_start=&debut_yr_nba_end=&is_hof=&is_as=&as_comp=gt&as_val=0&award=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&qual=&c1stat=pts_per_g&c1comp=gt&c1val=20&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&c5stat=&c5comp=&c6mult=&c6stat=&order_by=pts_per_g&order_by_asc=&offset="+str(i) + page = requests.get(url) + soup = BeautifulSoup(page.text,'html.parser') + + #based on tags, relevant info is in tbody, all the rows of data are in tr + table = soup.find('tbody') + rows = table.find_all('tr') + + #stats loops over the features we picked, matches them to headers + #If matched with headers, returns that column of data + stats = [[td.getText() for td in soup.findAll('td', {'data-stat': f})] for f in features] + + #convert to dataframe, rename columns and flip information to append easily + stats_df = pd.DataFrame(stats) + stats_df = stats_df.transpose() + stats_df.columns = features + final_df = final_df.append(stats_df) + +#extra code to showcase dropna function for dity data +new_df = final_df[final_df.team_id.isin(['CHI','LAL','BOS','NYK','PHO'])] +new_df = new_df.append(pd.Series(),ignore_index=True) +new_df = new_df.dropna() +final_df = final_df.dropna() From a0be5d6440490b334d3e1768b077eb8c155b52f9 Mon Sep 17 00:00:00 2001 From: Alex Doytchinov Date: Thu, 11 Jun 2020 20:28:12 -0700 Subject: [PATCH 02/10] Update scraper.py --- scraper.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/scraper.py b/scraper.py index 7a23185..6fe98bd 100644 --- a/scraper.py +++ b/scraper.py @@ -1,18 +1,15 @@ import numpy as np import requests import pandas as pd -from urllib.request import urlopen from bs4 import BeautifulSoup -import re #columns we want to extract features = {'player','season','team_id','pts_per_g'} #dataframe to store temp and final data final_df = pd.DataFrame(columns = features) -stats = pd.DataFrame(columns = features) -#numpy tool to iterate over for loop by 100 (8 pages in this example) +#numpy tool to iterate over for loop by 100 (9 pages in this example) pages = np.arange(0,900,100) for i in pages: #get url, use .get() to pass into Beautiful 
soup parser
@@ -21,10 +18,6 @@
     page = requests.get(url)
     soup = BeautifulSoup(page.text,'html.parser')
 
-    #based on tags, relevant info is in tbody, all the rows of data are in tr
-    table = soup.find('tbody')
-    rows = table.find_all('tr')
-
     #stats loops over the features we picked, matches them to headers
     #If matched with headers, returns that column of data
     stats = [[td.getText() for td in soup.findAll('td', {'data-stat': f})] for f in features]
@@ -35,8 +28,8 @@
     stats_df.columns = features
     final_df = final_df.append(stats_df)
 
-#extra code to showcase dropna function for dity data
+#isin, dropna function for dirty data
 new_df = final_df[final_df.team_id.isin(['CHI','LAL','BOS','NYK','PHO'])]
-new_df = new_df.append(pd.Series(),ignore_index=True)
+new_df = new_df.append(pd.Series(dtype='object'),ignore_index=True)
 new_df = new_df.dropna()
 final_df = final_df.dropna()

From 8528f62ce31051c0b6c6f4796062355d4a568239 Mon Sep 17 00:00:00 2001
From: Alex Doytchinov
Date: Thu, 11 Jun 2020 20:42:51 -0700
Subject: [PATCH 03/10] Blog Rough Draft

---
 blog.md | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 blog.md

diff --git a/blog.md b/blog.md
new file mode 100644
index 0000000..a65d5ec
--- /dev/null
+++ b/blog.md
@@ -0,0 +1,97 @@
# Scraping and Sanitizing Data in Python with Beautiful Soup

Sometimes, you come across some rankings that intrigue you, like the Billboard Hot 100. However, there's no simple download button that lets you look at the information as a database. That's where web scraping comes in, allowing you to extract information from the internet. Sometimes, this can yield unorganized data, but it is a key tool that vastly increases our access to information. We will use a few Python libraries, most notably BeautifulSoup and Pandas, to help us gather some information off of the internet.

# Pre-Req: HTML Basics

![Alt Text](https://i.imgur.com/kVOdBf7.png)

If you ever used inspect element to change around the title of a webpage to mess with your friends, you were programming HTML! HTML is a language used to affect the presentation of webpages, and relies on tags to store different types of information. [W3Schools](https://www.w3schools.com/tags/) is a great resource to learn more, but some relevant keywords for us are as follows:

*head:* A container for metadata (data about data), placed between the *html* tag and the *body* tag.

*body:* Contains all the contents of an HTML document, such as headings, paragraphs, images, hyperlinks, tables, lists, etc.

*div:* Whenever you include certain content, you enclose it together inside this single entity.
It can act as the parent for a lot of different elements.

*href:* Specifies the URL the link goes to.

*a:* The links are described in this tag, where the webpage that will get loaded on
click of this link is mentioned in its property href.

*p:* Whenever some information is to be displayed on the webpage as a block of text,
this tag is used. Each such tag appears as its own paragraph.

*tbody:* Defines a grouping for the content of an HTML table.

*tr:* Defines a row in an HTML table.

*th:* Defines a header in an HTML table.

*td:* Defines a data cell in an HTML table.

*table:* Tables are displayed in HTML with the help of this tag,
where data is displayed in cells formed by intersection of rows and columns.

Here's a basic example:
```html
<html>
<head>
  <title>Title of the Page</title>
</head>
<body>
<div>
  <a href="https://www.example.com">Link</a>
</div>
<p>Paragraph</p>
<table>
  <tr>
    <th>Col 1</th>
    <th>Col 2</th>
  </tr>
  <tr>
    <td>Row 1 Data 1</td>
    <td>Row 1 Data 2</td>
  </tr>
</table>
</body>
</html>
```

# Analyzing Our Website

![Alt Text](https://d2p3bygnnzw9w3.cloudfront.net/req/202005291/logos/bbr-logo.svg)

We will scrape statistics from a [set of](https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=per_game&per_minute_base=36&per_poss_base=100&lg_id=NBA&is_playoffs=N&year_min=1985&year_max=2020&franch_id=&season_start=1&season_end=-1&age_min=0&age_max=99&shoot_hand=&height_min=0&height_max=99&birth_country_is=Y&birth_country=&birth_state=&college_id=&draft_year=&is_active=&debut_yr_nba_start=&debut_yr_nba_end=&is_hof=&is_as=&as_comp=gt&as_val=0&award=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&qual=&c1stat=pts_per_g&c1comp=gt&c1val=20&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&c5stat=&c5comp=&c6mult=&c6stat=&order_by=pts_per_g&order_by_asc=&offset=0) basketball players who have scored more than twenty points per game in a season after 1984. We are able to get a specific data set due to the query commands offered by www.basketball-reference.com (a fantastic website), but there are limits to their tools. To capture the data table, we first will right-click and select "Inspect Element" on Michael Jordan's name in the first row. As you hover over various parts of the code on the right, you will see that there is a link (https://www.basketball-reference.com/players/j/jordami01.html) wrapped by a *td* set, and that there are pairs of *td* and *data-stat=* for each stat down the line like season, age, team_id, etc. These are wrapped by a *tr* pair for *data-row=0*, which is preceded by a *tbody* pair.

![Alt Text](image1)


# Scraping the Data

Understanding where our data lies is crucial for implementing the scraper. BeautifulSoup uses an html parser to locate the data we want, so being able to give it the tags that relate to that stats table is the engine behind our machine. Before we start, we want to ensure we have the following libraries installed using pip install *package_name*: pandas, numpy, requests and bs4. The first thing we want to do for our scraper is determine how many pages there are to scrape. For this particular dataset, it is laid out across 9 different URLs. We have to examine what changes occur between two pages of our dataset. It is a really long link, but we can see at the end there is a part that says offset=0 for the first page, but is equal to 100 for the second page, and increments by 100 each time.

This, combined with having nine pages of information, means we will create a for loop that goes from 0 up to 900 in increments of 100 (offsets 0 through 800), and we will copy the base url (everything through 'offset='), and then add our for loop variable i to the end of the url with str(i). So, base url + str(i).

![Alt Text](image2)

For this dataset, I want to only extract the player, the season, their team, and their points per game.
Going back to our *td* pairs, I can click inspect element on each element and see what the corresponding *data-stat=* is. For this, it would be the tags 'player','season','team_id','pts_per_g'. We can create two Pandas dataframes to store our information: a final set to hold all the information, and a temp one we use to append a new page of information to our final set.
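In code, the setup so far looks like the following sketch. It mirrors scraper.py, with two caveats: the long query string is truncated here for readability (the full link is shown above), and the features are stored in a list rather than the set used in scraper.py, which keeps the column order fixed.

```python
import numpy as np
import pandas as pd

#columns we want to extract (scraper.py uses a set; a list keeps the order fixed)
features = ['player','season','team_id','pts_per_g']

#final set to hold every page of results
final_df = pd.DataFrame(columns = features)

#offsets 0, 100, ..., 800 -- one per page of results
pages = np.arange(0,900,100)

#everything through 'offset=' in the really long link, truncated here
base_url = "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=per_game&...&order_by=pts_per_g&order_by_asc=&offset="

for i in pages:
    url = base_url + str(i)   #each iteration targets one page of the table
```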
### Access Server Information

At the start of each loop iteration, we are going to make an HTTP request and store the response. This stored response has to be converted to text and passed into a BeautifulSoup object, with the second parameter being a built in parser, cleverly named 'html.parser'. Our BeautifulSoup parser has two related functions to call: find() and find_all(). find() searches the document and directly returns the first match rather than a list, so it is handy when we only need a single instance of something, like the table itself. For us, we are looking for up to 100 instances of relevant rows per page, so we want to use find_all(). We can create two for loops: one to loop over each relevant feature we want (player, season, etc.), and another one to loop over each *td* pair in our find_all(*td*, *data-stat=feature*). Since we want the information as plain text, we convert each cell as we go. All of this is stored in a variable.

![Alt Text](image3)

# Append Data

We now have our data; we want to re-format it and add it to a final set. We take the stats variable and pass it into a Pandas DataFrame. Due to how the information was stored, our stats variable is a 4-element list with each element being size 100. We want the opposite of that, so we call .transpose() on our new dataframe to properly format it, and set its columns to be equal to our features variable.
We then call .append() with the final set DataFrame.

# Bonus: Cleaning The Data

We were able to work with a cleaned data set that didn't require too many changes. However, I may want to make more specific queries on the existing data set or eliminate any potential null values. For example, I can use the .isin() command to extract a value or list of values. So, if I only wanted the list of twenty point per game scorers from the Suns, Bulls, Lakers, Celtics and Knicks, I can extract them with isin(). For null values, say I was looking at an older data set that started from 1964 instead of 1984. The three point shot didn't exist before 1979, so I can use dropna() to remove any instances of seasons before the three point era. This snippet below showcases a basic use of isin() and dropna(). If you comment out the dropna() call and add a few null rows to the dataset, you can then call dropna() and watch them go away.

![Alt Text](image4)

From b076790f506dae7c2ea72015b0274b09afaeae17 Mon Sep 17 00:00:00 2001
From: Alex Doytchinov
Date: Sat, 13 Jun 2020 21:55:26 -0700
Subject: [PATCH 04/10] Update blog.md

---
 blog.md | 53 +++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 28 deletions(-)

diff --git a/blog.md b/blog.md
index a65d5ec..4854c55 100644
--- a/blog.md
+++ b/blog.md
@@ -1,38 +1,34 @@
# Scraping and Sanitizing Data in Python with Beautiful Soup

There are a lot of tables embedded in websites, like the Billboard Hot 100, TV channel ratings, address books, etc. However, this data is not readily available to be downloaded. That's where web scraping comes in, allowing you to extract information from the internet.
Sometimes, this can yield unorganized data, but it is a key tool that vastly increases our access to information. We will use a few Python libraries, most notably BeautifulSoup and Pandas, to help us gather some information off of the internet.

# Pre-Req: HTML Basics

![Alt Text](https://i.imgur.com/kVOdBf7.png)

If you ever used inspect element to change around the title of a webpage while messing around with your friends, you were programming HTML! HTML is a language used to affect the presentation of a website, and relies on tags to store different types of information. [W3Schools](https://www.w3schools.com/tags/) is a great resource to learn more, but here are some immediately relevant keywords:

*head:* A container for metadata (data about data), placed between the *html* and *body* tags.

*body:* Holds the content of an HTML document, including headings, paragraphs, images, hyperlinks, tables, lists, etc.

*div:* A wrapper that groups content together as one unit. It can act as the parent for a lot of different elements.

*href:* Holds the link's URL.

*a:* Defines a link; the page it loads is given by its href attribute.

*p:* Used to display text on a page.

*tbody:* Defines a grouping for the content of an HTML table.

*tr:* Signifies an HTML table row.

*th:* Signifies an HTML table header.

*td:* Signifies a data cell in an HTML table.

*table:* Helps format data in cells of rows and columns.
Here's a basic example:
```html
@@ -63,35 +59,36 @@

# Analyzing Our Website

![Alt Text](https://d2p3bygnnzw9w3.cloudfront.net/req/202005291/logos/bbr-logo.svg)

We will scrape a [set of](https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=per_game&per_minute_base=36&per_poss_base=100&lg_id=NBA&is_playoffs=N&year_min=1985&year_max=2020&franch_id=&season_start=1&season_end=-1&age_min=0&age_max=99&shoot_hand=&height_min=0&height_max=99&birth_country_is=Y&birth_country=&birth_state=&college_id=&draft_year=&is_active=&debut_yr_nba_start=&debut_yr_nba_end=&is_hof=&is_as=&as_comp=gt&as_val=0&award=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&qual=&c1stat=pts_per_g&c1comp=gt&c1val=20&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&c5stat=&c5comp=&c6mult=&c6stat=&order_by=pts_per_g&order_by_asc=&offset=0) basketball statistics to find players who scored 20 points per game or more since 1984. www.basketball-reference.com (a fantastic website) has a lot of query tools built into their website, but their filtering is limited, so we need to scrape the data to get what we want. To capture the data table, we first will right-click and select "Inspect Element" on Michael Jordan's name in the first row. As you hover over various parts of the code that pops up, you will see that there is a link (https://www.basketball-reference.com/players/j/jordami01.html) wrapped by a *td* set, and that there are pairs of *td* and *data-stat=* for each stat down the line like season, age, team_id, etc. These are wrapped by a *tr* pair for *data-row=0*, which is preceded by a *tbody* pair. These are the relevant tags we will need to reference when scraping.

![Alt Text](https://i.imgur.com/g8mi9vx.png)


# Scraping the Data

Understanding where our data lies is crucial for implementing the scraper.
BeautifulSoup uses an html parser to locate the data we want, but we have to give it some baseline information to do so.

Before we start, we want to ensure we have the following libraries installed using pip install *package_name*: pandas, numpy, requests and bs4. The first thing we want to do for our scraper is determine how many pages of information there are. For this particular dataset, it is laid out across 9 different URLs. We have to examine what changes occur between two pages. It is a really long link, but we can see at the end there is a part that says offset=0 for the first page, but is equal to 100 for the second page, and increments by 100 each time.

Knowing what changes in our url, we will create a for loop that goes from 0 up to 900 in increments of 100 (offsets 0 through 800), and we will copy the base url (everything through 'offset='), and then add our for loop variable i to the end of the url with str(i). So, url = (base url) + str(i).

![Alt Text](https://i.imgur.com/XQu2tug.png)

For this dataset, I want to only extract the player, the season, their team, and their points per game.
Going back to our *td* pairs, I can click inspect element on each element and see what the corresponding *data-stat=* is for each statistic. In HTML, they are tagged as 'player','season','team_id','pts_per_g'. We can create two Pandas dataframes to store our information: a temporary DataFrame we use to append a new page of information to our final set, and a final set to hold all the information.

### Access Server Information

At the start of each loop iteration, we request our url and store the HTTP response as plain text. We then pass it into a BeautifulSoup object, with the second parameter being a built in parser, cleverly named 'html.parser'. Our BeautifulSoup parser has two related functions to call: find() and find_all(). find() searches the document and directly returns the first match rather than a list, while find_all() returns a list of every match. We can use find() when we only need a single instance of something, like the table itself.
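Here's a tiny standalone illustration of that difference, using toy markup rather than the real page:

```python
from bs4 import BeautifulSoup

html = "<table><tr><td data-stat='player'>Jordan</td></tr><tr><td data-stat='player'>Barkley</td></tr></table>"
soup = BeautifulSoup(html, 'html.parser')

print(soup.find('td'))      #first match only: <td data-stat="player">Jordan</td>
print(soup.find_all('td'))  #a list containing both <td> tags
```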
For us, we are looking for up to 100 instances of relevant rows per page, so we want to use find_all(). We can create two for loops: one to loop over each relevant feature we want (player, season, etc.), and another one to loop over each *td* pair in our find_all(*td*, *data-stat=feature*). Since we want the information as plain text, we convert each cell to text as we go.

![Alt Text](https://i.imgur.com/l5MaQso.png)
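In text form, the loop body looks like the following sketch. It mirrors scraper.py and assumes the features, final_df, pages, and base_url from the setup shown earlier; scraper.py calls findAll(), the older bs4 alias for find_all(). The re-formatting at the end is covered in the next section.

```python
import requests
import pandas as pd
from bs4 import BeautifulSoup

for i in pages:
    #request one page and hand the raw HTML to BeautifulSoup
    page = requests.get(base_url + str(i))
    soup = BeautifulSoup(page.text, 'html.parser')

    #one inner list per feature: the text of every td whose data-stat matches
    stats = [[td.getText() for td in soup.find_all('td', {'data-stat': f})] for f in features]

    #rows and columns arrive flipped, so transpose before appending
    stats_df = pd.DataFrame(stats).transpose()
    stats_df.columns = features
    final_df = final_df.append(stats_df)  #pd.concat([final_df, stats_df]) on pandas >= 2.0
```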
# Append Data

We now have our data; we want to re-format it and add it to a final set. We take the stats variable and pass it into a Pandas DataFrame. Due to how the information was stored, our stats variable is a 4-element list with each element being size 100. We want the opposite of that, so we call .transpose() on our new dataframe to flip the rows to columns and vice versa, and set the column names to our list of features. We then call .append() with the final set DataFrame.

# Bonus: Cleaning The Data

We were able to work with a cleaned data set that didn't require too many changes. However, one may want to make more specific queries on the existing data set or eliminate any potential null values. For example, I can use the .isin() command to extract a value or list of values. So, if I only wanted the list of twenty point per game scorers from the Suns, Bulls, Lakers, Celtics and Knicks, I can extract them with isin(). For null values, imagine an older data set that started from 1964 instead of 1984. The three point line didn't exist before 1979, so I can use dropna() to remove any instances of seasons before the three point era. This snippet below showcases a basic use of isin() and dropna().

![Alt Text](https://i.imgur.com/Mn052zQ.png)

From c660c101812f518efe00c563b5a1c1ed2335b418 Mon Sep 17 00:00:00 2001
From: Alex Doytchinov
Date: Sun, 14 Jun 2020 12:26:04 -0700
Subject: [PATCH 05/10] Additional print statements

---
 scraper.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scraper.py b/scraper.py
index 6fe98bd..8752a0a 100644
--- a/scraper.py
+++ b/scraper.py
@@ -33,3 +33,6 @@
 new_df = new_df.append(pd.Series(dtype='object'),ignore_index=True)
 new_df = new_df.dropna()
 final_df = final_df.dropna()
+
+print(final_df.head())
+print(new_df.head())

From 07c0e3c1cb16ab71440ee2fb805196c33e18874e Mon Sep 17 00:00:00 2001
From: Kyle Begovich
Date: Sun, 14 Jun 2020 22:41:30 -0500
Subject: [PATCH 06/10] semantic updates

updating inline python code with backticks and fixing find_all() into findAll()

---
 blog.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/blog.md b/blog.md
index 4854c55..8e3b768 100644
--- a/blog.md
+++ b/blog.md
@@ -70,25 +70,25 @@ Understanding where our data lies is crucial for implementing the scraper. Beaut

Before we start, we want to ensure we have the following libraries installed using pip install *package_name*: pandas, numpy, requests and bs4. The first thing we want to do for our scraper is determine how many pages of information there are. For this particular dataset, it is laid out across 9 different URLs. We have to examine what changes occur between two pages. It is a really long link, but we can see at the end there is a part that says offset=0 for the first page, but is equal to 100 for the second page, and increments by 100 each time.

Knowing what changes in our url, we will create a for loop that goes from 0 up to 900 in increments of 100 (offsets 0 through 800), and we will copy the base url (everything through `offset=`), and then add our for loop variable `i` to the end of the url with `str(i)`. So, `url = (base url) + str(i)`.

![Alt Text](https://i.imgur.com/XQu2tug.png)

For this dataset, I want to only extract the player, the season, their team, and their points per game.
Going back to our *td* pairs, I can click inspect element on each element and see what the corresponding *data-stat=* is for each statistic.
In HTML, they are tagged as 'player', 'season', 'team_id', 'pts_per_g'. We can create two Pandas dataframes to store our information: a temporary DataFrame we use to append a new page of information to our final set, and a final set to hold all the information.

### Access Server Information

At the start of each loop iteration, we request our url and store the HTTP response as plain text. We then pass it into a BeautifulSoup object, with the second parameter being a built in parser, cleverly named 'html.parser'. Our BeautifulSoup parser has two related functions to call: `find()` and `findAll()`. `find()` searches the document and directly returns the first match rather than a list, while `findAll()` returns a list of every match. We can use `find()` when we only need a single instance of something, like the table itself. For us, we are looking for up to 100 instances of relevant rows per page, so we want to use `findAll()`. We can create two for loops: one to loop over each relevant feature we want (player, season, etc.), and another one to loop over each *td* pair in our `findAll(*td*, *data-stat=feature*)`. Since we want the information as plain text, we convert each cell to text as we go.

![Alt Text](https://i.imgur.com/l5MaQso.png)

# Append Data

We now have our data; we want to re-format it and add it to a final set. We take the stats variable and pass it into a Pandas DataFrame. Due to how the information was stored, our stats variable is a 4-element list with each element being size 100. We want the opposite of that, so we call `.transpose()` on our new dataframe to flip the rows to columns and vice versa, and set the column names to our list of features. We then call `.append()` with the final set DataFrame.

# Bonus: Cleaning The Data

We were able to work with a cleaned data set that didn't require too many changes. However, one may want to make more specific queries on the existing data set or eliminate any potential null values. For example, I can use the `.isin()` command to extract a value or list of values.
So, if I only wanted the list of twenty point per game scorers from the Suns, Bulls, Lakers, Celtics and Knicks, I can extract them with isin(). For null values, imagine an older data set that started from 1964 instead of 1984. The three point line didn't exist before 1979, so I can use dropna() to remove any instances of seasons before the three point era. This snippet below showcases a basic use of isin() and dropna(). +We were able to work with a cleaned data set that didn't require too much to change. However, one may want to make more specific queries on the existing data set or eliminate any potential null values. For example, I can use the `.isin()` command to extract a value or list of values. So, if I only wanted the list of twenty point per game scorers from the Suns, Bulls, Lakers, Celtics and Knicks, I can extract them with `isin()`. For null values, imagine an older data set that started from 1964 instead of 1984. The three point line didn't exist before 1979, so I can use `dropna()` to remove any instances of seasons before the three point era. This snippet below showcases a basic use of `isin()` and `dropna()`. ![Alt Text](https://i.imgur.com/Mn052zQ.png) From d83573c5d17a1d078b8d9d85c1cd166c815441ce Mon Sep 17 00:00:00 2001 From: Alex Doytchinov Date: Wed, 17 Jun 2020 18:00:27 -0700 Subject: [PATCH 07/10] Update blog.md --- blog.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/blog.md b/blog.md index 8e3b768..0fcfab8 100644 --- a/blog.md +++ b/blog.md @@ -68,6 +68,8 @@ We will scrape statistics from a [set of](https://www.basketball-reference.com/p Understanding where our data lies is crucial for implementing the scraper. BeautifulSoup uses an html parser to locate the data we want, but we have to give it some baseline information to do so. +## Installing Dependencies + Before we start, we want to ensure we have the following libraries installed using pip install *package_name*: pandas, numpy, requests and bs4. The first thing we want to do for our scraper is determine how many pages of information there are. For this particular dataset, it is laid out across 9 different URLs. We have to examine what changes occur between two pages. It is a really long link, but we can see at the end there is a part that says offset=0 for the first page, but is equal to 100 for the second page, and increments by 100 each time. Knowing what changes in our url, we will create a for loop that goes from 0 to 900 in increments of 100, and we will copy the base url (everything through `offset=`), and then add our for loop variable `i` to the end of the url with `str(i)`. So, `url = (base url) + str(i)`. @@ -89,6 +91,14 @@ We now have our data, we want to re-format our data and add it to a final set. W # Bonus: Cleaning The Data -We were able to work with a cleaned data set that didn't require too much to change. However, one may want to make more specific queries on the existing data set or eliminate any potential null values. For example, I can use the `.isin()` command to extract a value or list of values. So, if I only wanted the list of twenty point per game scorers from the Suns, Bulls, Lakers, Celtics and Knicks, I can extract them with `isin()`. For null values, imagine an older data set that started from 1964 instead of 1984. The three point line didn't exist before 1979, so I can use `dropna()` to remove any instances of seasons before the three point era. This snippet below showcases a basic use of `isin()` and `dropna()`. 
+We were able to work with a cleaned data set that didn't require too many changes. However, one may want to make more specific queries on the existing data set or eliminate any potentially null or missing values. For example, I can use the `.isin()` command to extract a value or list of values. So, if I only wanted the list of twenty point per game scorers from the Suns, Bulls, Lakers, Celtics and Knicks, I can extract them with `isin()`. For null values, imagine an older data set that started from 1964 instead of 1984. The three point line didn't exist before 1979, so I can use `dropna()` to remove any instances of seasons before the three point era. This snippet below showcases a basic use of `isin()` and `dropna()`.

![Alt Text](https://i.imgur.com/Mn052zQ.png)
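As plain text, the cleaning snippet from scraper.py (it continues from the scraping loop, so final_df and pandas are assumed to be in scope):

```python
#keep only the rows from a handful of franchises
new_df = final_df[final_df.team_id.isin(['CHI','LAL','BOS','NYK','PHO'])]

#add one empty row, then drop every row containing nulls to demonstrate dropna()
new_df = new_df.append(pd.Series(dtype='object'),ignore_index=True)
new_df = new_df.dropna()
final_df = final_df.dropna()
```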
+
+Before Query | After Query
+:-------------------------:|:-------------------------:
+![](https://i.imgur.com/4KNhGZJ.png) | ![](https://i.imgur.com/d0pnUmY.png)
+
+Before Drop | After Drop
+:-------------------------:|:-------------------------:
+![](https://i.imgur.com/1q4i4qd.png) | ![](https://i.imgur.com/88GuIQ4.png)

From 2f49f2368fb74933a9e978ffc66e8c081760595f Mon Sep 17 00:00:00 2001
From: Alex Doytchinov
Date: Wed, 17 Jun 2020 18:03:43 -0700
Subject: [PATCH 08/10] Update blog.md

---
 blog.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/blog.md b/blog.md
index 0fcfab8..23889f7 100644
--- a/blog.md
+++ b/blog.md
@@ -63,12 +63,11 @@ We will scrape statistics from a [set of](https://www.basketball-reference.com/p

![Alt Text](https://i.imgur.com/g8mi9vx.png)

-
# Scraping the Data

Understanding where our data lies is crucial for implementing the scraper. BeautifulSoup uses an html parser to locate the data we want, but we have to give it some baseline information to do so.

-## Installing Dependencies
+## Installing Dependencies and Scanning URL

Before we start, we want to ensure we have the following libraries installed using pip install *package_name*: pandas, numpy, requests and bs4. The first thing we want to do for our scraper is determine how many pages of information there are. For this particular dataset, it is laid out across 9 different URLs. We have to examine what changes occur between two pages. It is a really long link, but we can see at the end there is a part that says offset=0 for the first page, but is equal to 100 for the second page, and increments by 100 each time.

From 37b9a34a9c92e93241eb713a4017b22c7328035d Mon Sep 17 00:00:00 2001
From: Alex Doytchinov
Date: Wed, 17 Jun 2020 18:11:19 -0700
Subject: [PATCH 09/10] Added to_csv

---
 scraper.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scraper.py b/scraper.py
index 8752a0a..a9d458e 100644
--- a/scraper.py
+++ b/scraper.py
@@ -34,5 +34,8 @@
 new_df = new_df.dropna()
 final_df = final_df.dropna()
 
+#display the information and pass the final set into a csv
 print(final_df.head())
 print(new_df.head())
+
+final_df.to_csv('20ppg.csv')

From 779780f309f907e15bf8b3f0278c5325bb950fc8 Mon Sep 17 00:00:00 2001
From: Alex Doytchinov
Date: Wed, 17 Jun 2020 18:13:21 -0700
Subject: [PATCH 10/10] Added way to send to a csv file

---
 blog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blog.md b/blog.md
index 23889f7..5572e66 100644
--- a/blog.md
+++ b/blog.md
@@ -86,7 +86,7 @@

# Append Data

We now have our data; we want to re-format it and add it to a final set. We take the stats variable and pass it into a Pandas DataFrame. Due to how the information was stored, our stats variable is a 4-element list with each element being size 100. We want the opposite of that, so we call `.transpose()` on our new dataframe to flip the rows to columns and vice versa, and set the column names to our list of features. We then call `.append()` with the final set DataFrame. We can send this to a .csv file using `final_df.to_csv('league_20ppg.csv')`, which will be saved in the script's working directory.

# Bonus: Cleaning The Data
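To sanity-check the export, you can read the file back with pandas. This short snippet is an illustrative addition, assuming the 20ppg.csv written by scraper.py sits in the working directory:

```python
import pandas as pd

#read back the csv written by scraper.py and spot-check it
df = pd.read_csv('20ppg.csv', index_col=0)
print(df.head())
print(df.shape)
```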