diff --git a/Week-08-Regression/Exercise-DONT-EDIT-MAKE-COPY.ipynb b/Week-08-Regression/Exercise-DONT-EDIT-MAKE-COPY.ipynb new file mode 100644 index 00000000..1ab4e3ce --- /dev/null +++ b/Week-08-Regression/Exercise-DONT-EDIT-MAKE-COPY.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Your First Regression Model: Predicting Wine Quality\n", + "\n", + "Dataset: Wine Quality Dataset (`data/winequality-red.csv`)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import our libraries.\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Seaborn / matplotlib for visualization \n", + "import seaborn as sns\n", + "sns.set()\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# Helper function to split our data\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Linear Regression model\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "# Metrics to evaluate our model\n", + "from sklearn import metrics\n", + "\n", + "# Statsmodels for detailed regression statistics\n", + "import statsmodels.api as sm\n", + "from statsmodels.tools.eval_measures import rmse\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Constants\n", + "\n", + "It is good programming practice to use constants: _centralizing_ semantically identical values in one place avoids repetition errors and saves you the effort of retyping the same expression.\n", + "\n", + "`DATASET_PATH` identifies the path to the dataset being loaded and operated on. `RANDOM_STATE` makes otherwise random operations reproducible run after run. 
Keep whatever value you set it to unless you want slightly different results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DATASET_PATH = 'data/winequality-red.csv'\n", + "RANDOM_STATE = 42\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary Inspection\n", + "\n", + "See what the raw file looks like!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Look at the first 5 lines of the raw contents of the file first.\n", + "\n", + "with open(DATASET_PATH, 'r') as file:\n", + " for line_number in range(5):\n", + " if line := file.readline():\n", + " print(line)\n", + " else:\n", + " break # Stop; there are fewer than 5 lines.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preliminary Load\n", + "\n", + "Load the data. This is not the final form of the data which will be used, but it's a `DataFrame` for further inspection so we can decide what to do with it next.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the dataset into a pandas dataframe.\n", + "\n", + "df = ?\n", + "df\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspect and Explore (EDA)\n", + "1. Shape and size\n", + "2. Describe\n", + "3. Info\n", + "4. Check for nulls\n", + "5. Check for dupes\n", + "6. Plot\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Shape and size\n", + "print(df.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2. Describe\n", + "df.describe()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 3. 
Get info on cols\n", + "df.info()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspect null values\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect / check for nulls.\n", + "df.isnull().sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Percentage of null values per column\n", + "((df.isnull().sum() / len(df)) * 100).round(2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspect duplicate rows\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_dupes = df.duplicated().sum()\n", + "print(\"Number of duplicate rows are %i.\" % n_dupes)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize our data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Distribution of our target variable (quality)\n", + "plt.figure(figsize=(10, 5))\n", + "df['quality'].hist(bins=6, alpha=0.7, color='darkred')\n", + "plt.title('Distribution of Wine Quality Scores')\n", + "plt.xlabel('Quality Rating')\n", + "plt.ylabel('Frequency')\n", + "plt.show()\n", + "\n", + "print(\"Quality Statistics:\")\n", + "print(df['quality'].describe())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Selection\n", + "\n", + "Choose the column corresponding to `alcohol` to be your `X`. 
Target `quality` as your `y`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set X to the desired feature.\n", + "\n", + "\n", + "# Set y to be our target variable.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split to Testing and Training Datasets\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Split our data into testing and training pairs.\n", + "\n", + "\n", + "# Print the number of rows and columns of our training and testing data.\n", + "print('X_train: %d rows, %d columns' % X_train.shape)\n", + "print('X_test: %d rows, %d columns' % X_test.shape)\n", + "print('y_train: %d rows' % y_train.shape[0])\n", + "print('y_test: %d rows' % y_test.shape[0])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build and train your model\n", + "\n", + "Initialize an empty Linear Regression model, and then fit your model to your training data.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize our linear regression model.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation\n", + "\n", + "Make predictions with your test data and save the predictions as `y_pred`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Make predictions of your test data and save them as `y_pred`.\n", + "\n", + "\n", + "y_pred\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calculate and print the R², RMSE, and MAE scores of your model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2. 
Calculate and print the R², RMSE, and MAE scores of your model.\n", + "\n", + "print(\"R² Score: %.4f\" % )\n", + "print(\"RMSE: %.4f\" % )\n", + "print(\"MAE: %.4f\" % )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot predicted vs actual values.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 3. Plot predicted vs actual values.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extra Credit 1\n", + "\n", + "Build a better model using multiple features (alcohol, volatile acidity, sulphates).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the new X variable with multiple features, and reuse the same y variable from before.\n", + "\n", + "\n", + "# Split our data into testing and training. Remember to use the same random state as you used before.\n", + "\n", + "\n", + "# Initialize our model.\n", + "\n", + "\n", + "# Fit-train our model using our training data.\n", + "\n", + "\n", + "# Make new predictions using our testing data.\n", + "\n", + "\n", + "# Print each of our scores to inspect performance.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extra Credit 2\n", + "\n", + "Use `statsmodels` to create a summary report. Interpret the results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Add a constant term to the independent variables.\n", + "\n", + "\n", + "# Fit the OLS model.\n", + "\n", + "\n", + "# Print the summary and interpret the results.\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}