diff --git a/.cursor/rules/python-jupyter.mdc b/.cursor/rules/python-jupyter.mdc
index 6fa418e..3b8d0b4 100644
--- a/.cursor/rules/python-jupyter.mdc
+++ b/.cursor/rules/python-jupyter.mdc
@@ -15,6 +15,7 @@ alwaysApply: false
- Prefer vectorized operations over explicit loops for better performance.
- Use descriptive variable names that reflect the data they contain.
- Follow PEP 8 style guidelines for Python code.
+ - Only import names that are actually used; remove unused imports.
Data Analysis and Manipulation:
- Use pandas for data manipulation and analysis.
diff --git a/README.md b/README.md
deleted file mode 100644
index 5c212eb..0000000
--- a/README.md
+++ /dev/null
@@ -1,83 +0,0 @@
-# Data Exploration Project
-
-This project provides a structured environment for data exploration and analysis using Jupyter notebooks.
-
-## Getting Started
-
-### Prerequisites
-
-- Python 3.8 or higher
-- pip (Python package installer)
-
-### Setup Instructions
-
-1. **Clone or download this repository**
-
-2. **Create a virtual environment (recommended)**
-
- ```bash
- # On macOS/Linux
- python3 -m venv venv
- source venv/bin/activate
-
- # On Windows
- python -m venv venv
- venv\Scripts\activate
- ```
-
-3. **Install the required packages**
-
- ```bash
- pip install -r requirements.txt
- ```
-
-4. **Launch Jupyter Notebook**
-
- ```bash
- jupyter notebook
- ```
-
-5. **Open the data_exploration.ipynb notebook**
- - A browser window should open automatically. If not, copy the URL displayed in the terminal and paste it into your web browser.
- - Navigate to and click on `data_exploration.ipynb` to open the notebook.
-
-## Project Structure
-
-- `data_exploration.ipynb`: Starter Jupyter notebook for data analysis
-- `requirements.txt`: Contains all the Python dependencies
-- `data/`: Directory where you can store your datasets (create this as needed)
-
-## Adding Your Data
-
-You can add your data files to the project:
-
-1. Create a `data` directory (if not already present):
-
- ```bash
- mkdir data
- ```
-
-2. Place your data files (CSV, Excel, etc.) in the `data` directory.
-
-3. In the notebook, load your data:
- ```python
- df = pd.read_csv('data/your_file.csv')
- ```
-
-## Common Tasks
-
-- **Data Loading**: Use pandas to read different file formats (CSV, Excel, JSON, etc.)
-- **Data Cleaning**: Handle missing values, duplicates, outliers
-- **Exploratory Analysis**: Descriptive statistics, correlation analysis
-- **Data Visualization**: Create charts and plots using matplotlib and seaborn
-- **Feature Engineering**: Create new features or transform existing ones
-- **Model Building**: Build and evaluate machine learning models using scikit-learn
-
-## Useful Extensions
-
-Consider installing these additional Jupyter extensions for enhanced productivity:
-
-```bash
-pip install jupyterlab # JupyterLab interface
-pip install nbextensions # Notebook extensions
-```
diff --git a/analysis.ipynb b/analysis.ipynb
deleted file mode 100644
index 9638289..0000000
--- a/analysis.ipynb
+++ /dev/null
@@ -1,43 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "cad336dc-cdc4-4dc0-8a6a-e4df4cacea45",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Hello World\n"
- ]
- }
- ],
- "source": [
- "print(\"Hello World\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/analysis.py b/analysis.py
new file mode 100644
index 0000000..6959587
--- /dev/null
+++ b/analysis.py
@@ -0,0 +1,692 @@
+import pandas as pd
+import os
+import glob
+from dme_dictionary import DATA_DICTIONARY
+import numpy as np
+import locale
+
+# Apply the user's default locale ('' = take it from the environment). NOTE(review): no locale-aware formatting call is visible in this script - confirm this is still needed
+locale.setlocale(locale.LC_ALL, '')
+
+# Set pandas display options so wide DataFrames print in full
+pd.set_option('display.max_columns', None) # Show all columns
+pd.set_option('display.width', None) # None = let pandas auto-detect the terminal width
+# Don't add new lines in wide DataFrames
+pd.set_option('display.expand_frame_repr', False)
+
+# Root folder; each year's CSV is looked up under <data_dir>/<year>/
+data_dir = 'data'
+years = range(2018, 2023) # 2018 to 2022 (range end is exclusive)
+
+# One DataFrame per successfully loaded year is collected here
+dfs = []
+
+# Load each year's file; years without a CSV are reported and skipped
+for year in years:
+ # Find every CSV in this year's folder
+ csv_files = glob.glob(f"{data_dir}/{year}/*.csv")
+
+ if not csv_files:
+ print(f"No CSV files found for year {year}")
+ continue
+
+ # Take the first match (one file per year is expected). NOTE: glob order is arbitrary and any extra CSVs in the folder are silently ignored
+ csv_file = csv_files[0]
+ print(f"Loading data from {csv_file}")
+
+ # low_memory=False reads the file in one pass so column dtypes are inferred from all rows rather than per chunk
+ df = pd.read_csv(csv_file, low_memory=False)
+
+ # Tag every row with its source year so years stay distinguishable after concatenation
+ df['year'] = year
+
+ # Append to our list of DataFrames
+ dfs.append(df)
+
+# Format a dollar amount compactly: "$X.XM" at or above 1,000,000, otherwise "$X.XK" (one decimal place)
+
+
+def format_dollar_amount(amount):
+ if amount >= 1000000:
+ return f"${amount/1000000:.1f}M"
+ else:
+ return f"${amount/1000:.1f}K"  # also applies below 1,000 (e.g. 500 -> "$0.5K")
+
+
+# Combine all yearly DataFrames into one; the whole analysis is skipped when nothing was loaded
+if dfs:
+ combined_df = pd.concat(dfs, ignore_index=True)
+ print(f"Combined DataFrame shape: {combined_df.shape}")
+
+ # Display the first few rows with all columns
+ # print("\nFirst few rows of the combined DataFrame:")
+ # print(combined_df.head())
+
+ # Map each column name to its description
+ column_info = {}
+
+ # Use the DATA_DICTIONARY entry when one exists, otherwise a placeholder string
+ for column in combined_df.columns:
+ if column in DATA_DICTIONARY:
+ column_info[column] = DATA_DICTIONARY[column]
+ else:
+ column_info[column] = "Description not available"
+
+ # Display column information (only this header is printed; the per-column detail below is commented out)
+ print("\nColumn Information:")
+ # for column, count in zip(combined_df.columns, combined_df.count()):
+ # description = column_info.get(column, "Description not available")
+ # print(f"Column: {column}")
+ # print(f" Description: {description}")
+ # print(f" Non-null count: {count}/{len(combined_df)} entries")
+ # print(f" Data type: {combined_df[column].dtype}")
+ # print()
+
+ # Store the descriptions on the DataFrame itself under attrs['column_descriptions']
+ combined_df.attrs['column_descriptions'] = column_info
+
+ # Summary statistics for numerical columns
+ # print("\nSummary statistics for numerical columns:")
+ # print(combined_df.describe())
+
+ print("\n" + "="*100)
+ print("Year-over-Year Growth Rate Analysis")
+ print("="*100)
+
+ # Create a DataFrame to analyze suppliers by year-over-year growth rate
+ # First, group by supplier (NPI + organization name) and year to get annual totals
+ supplier_yearly = combined_df.groupby(['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org', 'year']).agg({
+ 'Suplr_Sbmtd_Chrgs': 'sum',
+ 'Suplr_Mdcr_Pymt_Amt': 'sum',
+ 'Tot_Suplr_Benes': 'mean', # Average number of beneficiaries
+ 'Tot_Suplr_Clms': 'sum' # Total claims
+ }).reset_index()
+
+ # Pivot so each year is a column. NOTE: despite the name, pivot_charges holds Medicare payment amounts, not submitted charges
+ pivot_charges = supplier_yearly.pivot_table(
+ index=['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org'],
+ columns='year',
+ values='Suplr_Mdcr_Pymt_Amt',
+ fill_value=0  # a supplier with no rows in a year gets 0 for that year
+ )
+
+ # Calculate year-over-year growth rates (one column per year pair, indexed like pivot_charges)
+ growth_rates = pd.DataFrame(index=pivot_charges.index)
+
+ # Growth for each year pair (2019/2018, 2020/2019, etc.). NOTE(review): the years are hard-coded, so a year that failed to load raises KeyError here
+ for year_pair in [(2019, 2018), (2020, 2019), (2021, 2020), (2022, 2021)]:
+ current, previous = year_pair
+ growth_rates[f'growth_{current}'] = (
+ (pivot_charges[current] - pivot_charges[previous]) /
+ pivot_charges[previous].replace(0, float('nan'))  # zero base year -> NaN instead of a division by zero
+ ) * 100 # Convert to percentage
+
+ # Average the yearly growth rates per supplier (mean skips NaN entries by default)
+ growth_cols = [
+ col for col in growth_rates.columns if col.startswith('growth_')]
+ growth_rates['avg_growth'] = growth_rates[growth_cols].mean(axis=1)
+
+ # Keep only suppliers with a positive payment total in every year from 2018 to 2022
+ valid_suppliers = pivot_charges[(pivot_charges[2018] > 0) &
+ (pivot_charges[2019] > 0) &
+ (pivot_charges[2020] > 0) &
+ (pivot_charges[2021] > 0) &
+ (pivot_charges[2022] > 0)]
+
+ # Filter suppliers with significant payment amounts (at least $100K in the last year)
+ significant_suppliers = valid_suppliers[valid_suppliers[2022] >= 100000]
+ print(
+ f"Filtering to {len(significant_suppliers)} suppliers with at least $100,000 in payments in 2022")
+
+ # Select the growth rates of the significant suppliers by index label (a .loc lookup, not a merge)
+ valid_growth = growth_rates.loc[significant_suppliers.index].reset_index()
+
+ # Sort by average growth rate in descending order
+ top_growth = valid_growth.sort_values('avg_growth', ascending=False)
+
+ # All-years totals per supplier, used for reporting alongside the growth figures
+ supplier_totals = supplier_yearly.groupby(['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org']).agg({
+ 'Suplr_Sbmtd_Chrgs': 'sum',
+ 'Suplr_Mdcr_Pymt_Amt': 'sum',
+ 'Tot_Suplr_Benes': 'mean',
+ 'Tot_Suplr_Clms': 'sum'
+ }).reset_index()
+
+ top_growth_with_data = pd.merge(
+ top_growth,
+ supplier_totals,
+ on=['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org']
+ )
+
+ # Print the report header for the top 10 suppliers by average growth
+ print("The analysis identified suppliers with the highest growth rates based on Medicare payment amounts from 2018 to 2022.")
+ print("Here are the top 10 suppliers with extraordinary growth (minimum $100K in 2022 payments):\n")
+
+ # Get top 10 suppliers (top_growth_with_data is already sorted by avg_growth, descending)
+ top_10_suppliers = top_growth_with_data.head(10)
+ top_10_npi = top_10_suppliers['Suplr_NPI'].tolist()
+
+ # Yearly rows for just these suppliers. NOTE(review): matched on NPI only, so rows for the same NPI under a different organization name are included too
+ top_supplier_data = supplier_yearly[supplier_yearly['Suplr_NPI'].isin(
+ top_10_npi)]
+
+ # Format and display each supplier's information (i is the 1-based rank)
+ for i, (_, supplier) in enumerate(top_10_suppliers.iterrows(), 1):
+ npi = supplier['Suplr_NPI']
+ name = supplier['Suplr_Prvdr_Last_Name_Org']
+ avg_growth = supplier['avg_growth']
+ total_payments = supplier['Suplr_Mdcr_Pymt_Amt']
+
+ # Get yearly data for this supplier, oldest year first
+ yearly_data = top_supplier_data[top_supplier_data['Suplr_NPI'] == npi].sort_values(
+ 'year')
+
+ print(f"{i}. **{name}** (NPI: {npi})")
+ print(f" - Average growth rate: {avg_growth:.2f}%")
+ print(
+ f" - Total Medicare payments: ${total_payments/1000000:.2f} million")
+
+ # Build one formatted payment string per year, using "$0" for a year with no row
+ yearly_payments = []
+ for year in range(2018, 2023):
+ year_data = yearly_data[yearly_data['year'] == year]
+ if not year_data.empty:
+ payment = year_data['Suplr_Mdcr_Pymt_Amt'].values[0]
+ yearly_payments.append(format_dollar_amount(payment))
+ else:
+ yearly_payments.append("$0")
+
+ print(
+ f" - Yearly payments: 2018: {yearly_payments[0]}, 2019: {yearly_payments[1]}, 2020: {yearly_payments[2]}, 2021: {yearly_payments[3]}, 2022: {yearly_payments[4]}")
+
+ # Parallel lists (same row order as yearly_data) used for the pattern checks below
+ payment_pattern = yearly_data['Suplr_Mdcr_Pymt_Amt'].tolist()
+ years_list = yearly_data['year'].tolist()
+ benes_pattern = yearly_data['Tot_Suplr_Benes'].tolist()
+
+ # Identify the largest year-over-year percentage jump; starting at 0 means only increases can qualify
+ max_jump = 0
+ max_jump_year_idx = 0
+ for j in range(1, len(payment_pattern)):
+ if payment_pattern[j-1] > 0:
+ jump_pct = (
+ payment_pattern[j] - payment_pattern[j-1]) / payment_pattern[j-1] * 100
+ if jump_pct > max_jump:
+ max_jump = jump_pct
+ max_jump_year_idx = j
+
+ if max_jump_year_idx > 0:  # index 0 means no increase was found, so nothing is reported
+ from_year = years_list[max_jump_year_idx-1]
+ to_year = years_list[max_jump_year_idx]
+ from_amount = payment_pattern[max_jump_year_idx-1]
+ to_amount = payment_pattern[max_jump_year_idx]
+
+ # Format amounts with K or M suffix based on size
+ from_amount_str = format_dollar_amount(from_amount)
+ to_amount_str = format_dollar_amount(to_amount)
+
+ print(
+ f" - Growth pattern: Major increase from {from_year} to {to_year} ({from_amount_str} to {to_amount_str})")
+
+ # Check for consistent growth: every entry must be strictly greater than the one before it
+ growth_consistent = True
+ for j in range(1, len(payment_pattern)):
+ if payment_pattern[j] <= payment_pattern[j-1]:
+ growth_consistent = False
+ break
+
+ if growth_consistent and len(payment_pattern) > 2:
+ print(" - Pattern shows consistent year-over-year growth")
+
+ # Beneficiary change between the first and last non-missing values. NOTE(review): the message says "increase" even when the change is negative
+ if not pd.isna(benes_pattern).all() and len(benes_pattern) >= 2:
+ first_valid_idx = next((i for i, x in enumerate(
+ benes_pattern) if not pd.isna(x)), None)
+ last_valid_idx = next((i for i, x in enumerate(
+ reversed(benes_pattern)) if not pd.isna(x)), None)
+ if first_valid_idx is not None and last_valid_idx is not None:
+ last_valid_idx = len(benes_pattern) - 1 - last_valid_idx  # convert the reversed position to a forward index
+ first_benes = benes_pattern[first_valid_idx]
+ last_benes = benes_pattern[last_valid_idx]
+ if not pd.isna(first_benes) and not pd.isna(last_benes) and first_benes > 0:
+ bene_growth = (last_benes - first_benes) / \
+ first_benes * 100
+ print(
+ f" - Beneficiary growth: {bene_growth:.1f}% increase (from {first_benes:.0f} to {last_benes:.0f})")
+
+ print("") # Add a blank line between suppliers
+
+ # =====================================
+ # Analysis of High Submitted Charges vs Low Allowed/Paid Amounts
+ # =====================================
+ print("\n" + "="*100)
+ print("Analysis of High Submitted Charges with Low Allowed/Paid Amounts")
+ print("="*100)
+
+ # Aggregate data by supplier (NPI + organization name) across all years
+ supplier_totals_with_allowed = combined_df.groupby(['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org']).agg({
+ 'Suplr_Sbmtd_Chrgs': 'sum',
+ 'Suplr_Mdcr_Alowd_Amt': 'sum',
+ 'Suplr_Mdcr_Pymt_Amt': 'sum',
+ 'Tot_Suplr_Benes': 'mean',
+ 'Tot_Suplr_Clms': 'sum'
+ }).reset_index()
+
+ # Calculate ratios. NOTE(review): a zero allowed/paid total gives an inf ratio, which ranks first in the descending sorts below - confirm that is intended
+ supplier_totals_with_allowed['submitted_allowed_ratio'] = supplier_totals_with_allowed['Suplr_Sbmtd_Chrgs'] / \
+ supplier_totals_with_allowed['Suplr_Mdcr_Alowd_Amt']
+ supplier_totals_with_allowed['submitted_paid_ratio'] = supplier_totals_with_allowed['Suplr_Sbmtd_Chrgs'] / \
+ supplier_totals_with_allowed['Suplr_Mdcr_Pymt_Amt']
+
+ # Keep suppliers with at least $100,000 submitted; this rebinds significant_suppliers, replacing the growth-analysis subset
+ significant_suppliers = supplier_totals_with_allowed[
+ supplier_totals_with_allowed['Suplr_Sbmtd_Chrgs'] >= 100000]
+
+ # Find outliers with highest submitted-to-allowed ratio
+ top_submitted_allowed_outliers = significant_suppliers.sort_values(
+ 'submitted_allowed_ratio', ascending=False).head(10)
+
+ print("Top 10 Suppliers with Highest Submitted Charges to Allowed Amount Ratio:\n")
+
+ for i, (_, supplier) in enumerate(top_submitted_allowed_outliers.iterrows(), 1):
+ npi = supplier['Suplr_NPI']
+ name = supplier['Suplr_Prvdr_Last_Name_Org']
+ submitted = supplier['Suplr_Sbmtd_Chrgs']
+ allowed = supplier['Suplr_Mdcr_Alowd_Amt']
+ paid = supplier['Suplr_Mdcr_Pymt_Amt']
+ ratio = supplier['submitted_allowed_ratio']
+
+ # Format amounts with K or M suffix based on size
+ submitted_str = format_dollar_amount(submitted)
+ allowed_str = format_dollar_amount(allowed)
+ paid_str = format_dollar_amount(paid)
+
+ print(f"{i}. **{name}** (NPI: {npi})")
+ print(f" - Submitted charges: {submitted_str}")
+ print(f" - Allowed amount: {allowed_str}")
+ print(f" - Paid amount: {paid_str}")
+ print(f" - Submitted to allowed ratio: {ratio:.2f}x")
+ print(
+ f" - Allowed amount is {(allowed/submitted)*100:.1f}% of submitted charges")
+ print(
+ f" - Paid amount is {(paid/submitted)*100:.1f}% of submitted charges")
+ print("") # Add a blank line between suppliers
+
+ # Find outliers with highest submitted-to-paid ratio (same filtered set, different sort key)
+ top_submitted_paid_outliers = significant_suppliers.sort_values(
+ 'submitted_paid_ratio', ascending=False).head(10)
+
+ print("\nTop 10 Suppliers with Highest Submitted Charges to Paid Amount Ratio:\n")
+
+ for i, (_, supplier) in enumerate(top_submitted_paid_outliers.iterrows(), 1):
+ npi = supplier['Suplr_NPI']
+ name = supplier['Suplr_Prvdr_Last_Name_Org']
+ submitted = supplier['Suplr_Sbmtd_Chrgs']
+ allowed = supplier['Suplr_Mdcr_Alowd_Amt']
+ paid = supplier['Suplr_Mdcr_Pymt_Amt']
+ ratio = supplier['submitted_paid_ratio']
+
+ # Format amounts with K or M suffix based on size
+ submitted_str = format_dollar_amount(submitted)
+ allowed_str = format_dollar_amount(allowed)
+ paid_str = format_dollar_amount(paid)
+
+ print(f"{i}. **{name}** (NPI: {npi})")
+ print(f" - Submitted charges: {submitted_str}")
+ print(f" - Allowed amount: {allowed_str}")
+ print(f" - Paid amount: {paid_str}")
+ print(f" - Submitted to paid ratio: {ratio:.2f}x")
+ print(
+ f" - Paid amount is {(paid/submitted)*100:.1f}% of submitted charges")
+ print("") # Add a blank line between suppliers
+
+ # =====================================
+ # Peer Group Analysis for Fraud Detection
+ # =====================================
+ print("\n" + "="*100)
+ print("Peer Group Analysis for Fraud Detection")
+ print("="*100)
+
+ if dfs:  # NOTE(review): always true here - this code already sits inside the outer `if dfs:` branch
+ # Columns the peer group analysis reads from combined_df
+ required_columns = ['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org', 'Suplr_Prvdr_Spclty_Desc',
+ 'Suplr_Prvdr_State_Abrvtn', 'Suplr_Sbmtd_Chrgs', 'Suplr_Mdcr_Pymt_Amt',
+ 'Tot_Suplr_Clms', 'Tot_Suplr_Srvcs']
+
+ # Check if all required columns exist in the combined dataframe
+ missing_columns = [
+ col for col in required_columns if col not in combined_df.columns]
+ if missing_columns:
+ print(
+ f"Warning: Missing columns needed for peer group analysis: {missing_columns}")
+ print("Skipping peer group analysis.")
+ else:
+ # All-years totals per supplier; one row per distinct NPI / name / specialty / state combination
+ supplier_metrics = combined_df.groupby(['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org',
+ 'Suplr_Prvdr_Spclty_Desc', 'Suplr_Prvdr_State_Abrvtn']).agg({
+ 'Suplr_Sbmtd_Chrgs': 'sum',
+ 'Suplr_Mdcr_Pymt_Amt': 'sum',
+ 'Tot_Suplr_Clms': 'sum',
+ 'Tot_Suplr_Srvcs': 'sum'
+ }).reset_index()
+
+ # Add per-claim averages. NOTE(review): a supplier with zero total claims would produce inf/NaN here - confirm that cannot occur
+ supplier_metrics['Avg_Chrg_Per_Clm'] = supplier_metrics['Suplr_Sbmtd_Chrgs'] / \
+ supplier_metrics['Tot_Suplr_Clms']
+ supplier_metrics['Avg_Pymt_Per_Clm'] = supplier_metrics['Suplr_Mdcr_Pymt_Amt'] / \
+ supplier_metrics['Tot_Suplr_Clms']
+ supplier_metrics['Avg_Srvcs_Per_Clm'] = supplier_metrics['Tot_Suplr_Srvcs'] / \
+ supplier_metrics['Tot_Suplr_Clms']
+ # 1. Analysis by Specialty
+ print("\nAnalysis by Specialty:")
+ print("-" * 50)
+
+ # Keep specialties with at least 5 rows in supplier_metrics for meaningful comparison
+ specialty_counts = supplier_metrics['Suplr_Prvdr_Spclty_Desc'].value_counts(
+ )
+ valid_specialties = specialty_counts[specialty_counts >= 5].index.tolist(
+ )
+
+ if valid_specialties:
+ print(
+ f"Found {len(valid_specialties)} specialties with at least 5 suppliers for peer comparison.")
+
+ # Peer statistics per specialty; only the 'median' columns are read below, 'mean' and 'std' are computed but not used in this section
+ peer_specialty_metrics = supplier_metrics[supplier_metrics['Suplr_Prvdr_Spclty_Desc'].isin(valid_specialties)].groupby(
+ 'Suplr_Prvdr_Spclty_Desc').agg({
+ 'Suplr_Sbmtd_Chrgs': ['median', 'mean', 'std'],
+ 'Suplr_Mdcr_Pymt_Amt': ['median', 'mean', 'std'],
+ 'Tot_Suplr_Clms': ['median', 'mean', 'std'],
+ 'Tot_Suplr_Srvcs': ['median', 'mean', 'std'],
+ 'Avg_Chrg_Per_Clm': ['median', 'mean', 'std'],
+ 'Avg_Pymt_Per_Clm': ['median', 'mean', 'std'],
+ 'Avg_Srvcs_Per_Clm': ['median', 'mean', 'std']
+ })
+
+ # Find outliers within each specialty (suppliers with metrics > 3x the median)
+ outliers_by_specialty = []
+
+ for specialty in valid_specialties:
+ specialty_group = supplier_metrics[supplier_metrics['Suplr_Prvdr_Spclty_Desc'] == specialty]
+ specialty_medians = peer_specialty_metrics.loc[specialty]  # one row of statistics, keyed by (column, statistic)
+
+ # Check for outliers in claims, charges, and payments
+ claim_outliers = specialty_group[specialty_group['Tot_Suplr_Clms']
+ > 3 * specialty_medians[('Tot_Suplr_Clms', 'median')]]
+ charge_outliers = specialty_group[specialty_group['Suplr_Sbmtd_Chrgs']
+ > 3 * specialty_medians[('Suplr_Sbmtd_Chrgs', 'median')]]
+ payment_outliers = specialty_group[specialty_group['Suplr_Mdcr_Pymt_Amt']
+ > 3 * specialty_medians[('Suplr_Mdcr_Pymt_Amt', 'median')]]
+
+ # Stack the three outlier sets so that counting rows per NPI gives the number of categories flagged
+ all_outliers = pd.concat([
+ claim_outliers[['Suplr_NPI']].assign(metric='claims'),
+ charge_outliers[['Suplr_NPI']].assign(
+ metric='charges'),
+ payment_outliers[['Suplr_NPI']].assign(
+ metric='payments')
+ ])
+
+ outlier_counts = all_outliers.groupby('Suplr_NPI').size()
+ multiple_outliers = outlier_counts[outlier_counts >= 2].index.tolist(
+ )
+
+ if multiple_outliers:
+ for npi in multiple_outliers:
+ supplier = specialty_group[specialty_group['Suplr_NPI']
+ == npi].iloc[0]  # first matching row for this NPI within the specialty
+ outliers_by_specialty.append({
+ 'NPI': npi,
+ 'Name': supplier['Suplr_Prvdr_Last_Name_Org'],
+ 'Specialty': specialty,
+ 'State': supplier['Suplr_Prvdr_State_Abrvtn'],
+ 'Total_Claims': supplier['Tot_Suplr_Clms'],
+ 'Claim_Ratio': supplier['Tot_Suplr_Clms'] / specialty_medians[('Tot_Suplr_Clms', 'median')],
+ 'Total_Charges': supplier['Suplr_Sbmtd_Chrgs'],
+ 'Charge_Ratio': supplier['Suplr_Sbmtd_Chrgs'] / specialty_medians[('Suplr_Sbmtd_Chrgs', 'median')],
+ 'Total_Payments': supplier['Suplr_Mdcr_Pymt_Amt'],
+ 'Payment_Ratio': supplier['Suplr_Mdcr_Pymt_Amt'] / specialty_medians[('Suplr_Mdcr_Pymt_Amt', 'median')]
+ })
+
+ # Display the top outliers by specialty
+ if outliers_by_specialty:
+ # Show the 10 entries with the highest combined ratio (sum of the three ratios)
+ for outlier in sorted(outliers_by_specialty,
+ key=lambda x: (
+ x['Claim_Ratio'] + x['Charge_Ratio'] + x['Payment_Ratio']),
+ reverse=True)[:10]:
+ print(
+ f"\n**{outlier['Name']}** (NPI: {outlier['NPI']})")
+ print(
+ f" Specialty: {outlier['Specialty']} | State: {outlier['State']}")
+ print(
+ f" Total Claims: {outlier['Total_Claims']:.0f} ({outlier['Claim_Ratio']:.1f}x specialty median)")
+
+ # Format monetary values
+ charges_str = format_dollar_amount(
+ outlier['Total_Charges'])
+ payments_str = format_dollar_amount(
+ outlier['Total_Payments'])
+
+ print(
+ f" Total Charges: {charges_str} ({outlier['Charge_Ratio']:.1f}x specialty median)")
+ print(
+ f" Total Payments: {payments_str} ({outlier['Payment_Ratio']:.1f}x specialty median)")
+ else:
+ print("No significant specialty outliers found.")
+ else:
+ print(
+ "No specialties with enough suppliers for meaningful peer comparison.")
+
+ # 2. Analysis by State
+ print("\nAnalysis by State:")
+ print("-" * 50)
+
+ # Keep states with at least 5 rows in supplier_metrics for meaningful comparison
+ state_counts = supplier_metrics['Suplr_Prvdr_State_Abrvtn'].value_counts(
+ )
+ valid_states = state_counts[state_counts >= 5].index.tolist()
+
+ if valid_states:
+ print(
+ f"Found {len(valid_states)} states with at least 5 suppliers for peer comparison.")
+
+ # Peer statistics per state; only the 'median' columns are read below, 'mean' and 'std' are computed but not used in this section
+ peer_state_metrics = supplier_metrics[supplier_metrics['Suplr_Prvdr_State_Abrvtn'].isin(valid_states)].groupby(
+ 'Suplr_Prvdr_State_Abrvtn').agg({
+ 'Suplr_Sbmtd_Chrgs': ['median', 'mean', 'std'],
+ 'Suplr_Mdcr_Pymt_Amt': ['median', 'mean', 'std'],
+ 'Tot_Suplr_Clms': ['median', 'mean', 'std'],
+ 'Tot_Suplr_Srvcs': ['median', 'mean', 'std'],
+ 'Avg_Chrg_Per_Clm': ['median', 'mean', 'std'],
+ 'Avg_Pymt_Per_Clm': ['median', 'mean', 'std'],
+ 'Avg_Srvcs_Per_Clm': ['median', 'mean', 'std']
+ })
+
+ # Find outliers within each state (suppliers with metrics > 3x the median)
+ outliers_by_state = []
+
+ for state in valid_states:
+ state_group = supplier_metrics[supplier_metrics['Suplr_Prvdr_State_Abrvtn'] == state]
+ state_medians = peer_state_metrics.loc[state]  # one row of statistics, keyed by (column, statistic)
+
+ # Check for outliers in claims, charges, and payments
+ claim_outliers = state_group[state_group['Tot_Suplr_Clms']
+ > 3 * state_medians[('Tot_Suplr_Clms', 'median')]]
+ charge_outliers = state_group[state_group['Suplr_Sbmtd_Chrgs']
+ > 3 * state_medians[('Suplr_Sbmtd_Chrgs', 'median')]]
+ payment_outliers = state_group[state_group['Suplr_Mdcr_Pymt_Amt']
+ > 3 * state_medians[('Suplr_Mdcr_Pymt_Amt', 'median')]]
+
+ # Stack the three outlier sets so that counting rows per NPI gives the number of categories flagged
+ all_outliers = pd.concat([
+ claim_outliers[['Suplr_NPI']].assign(metric='claims'),
+ charge_outliers[['Suplr_NPI']].assign(
+ metric='charges'),
+ payment_outliers[['Suplr_NPI']].assign(
+ metric='payments')
+ ])
+
+ outlier_counts = all_outliers.groupby('Suplr_NPI').size()
+ multiple_outliers = outlier_counts[outlier_counts >= 2].index.tolist(
+ )
+
+ if multiple_outliers:
+ for npi in multiple_outliers:
+ supplier = state_group[state_group['Suplr_NPI']
+ == npi].iloc[0]  # first matching row for this NPI within the state
+ outliers_by_state.append({
+ 'NPI': npi,
+ 'Name': supplier['Suplr_Prvdr_Last_Name_Org'],
+ 'Specialty': supplier['Suplr_Prvdr_Spclty_Desc'],
+ 'State': state,
+ 'Total_Claims': supplier['Tot_Suplr_Clms'],
+ 'Claim_Ratio': supplier['Tot_Suplr_Clms'] / state_medians[('Tot_Suplr_Clms', 'median')],
+ 'Total_Charges': supplier['Suplr_Sbmtd_Chrgs'],
+ 'Charge_Ratio': supplier['Suplr_Sbmtd_Chrgs'] / state_medians[('Suplr_Sbmtd_Chrgs', 'median')],
+ 'Total_Payments': supplier['Suplr_Mdcr_Pymt_Amt'],
+ 'Payment_Ratio': supplier['Suplr_Mdcr_Pymt_Amt'] / state_medians[('Suplr_Mdcr_Pymt_Amt', 'median')]
+ })
+
+ # Display the top outliers by state
+ if outliers_by_state:
+ # Show the 10 entries with the highest combined ratio (sum of the three ratios)
+ for outlier in sorted(outliers_by_state,
+ key=lambda x: (
+ x['Claim_Ratio'] + x['Charge_Ratio'] + x['Payment_Ratio']),
+ reverse=True)[:10]:
+ print(
+ f"\n**{outlier['Name']}** (NPI: {outlier['NPI']})")
+ print(
+ f" State: {outlier['State']} | Specialty: {outlier['Specialty']}")
+ print(
+ f" Total Claims: {outlier['Total_Claims']:.0f} ({outlier['Claim_Ratio']:.1f}x state median)")
+
+ # Format monetary values
+ charges_str = format_dollar_amount(
+ outlier['Total_Charges'])
+ payments_str = format_dollar_amount(
+ outlier['Total_Payments'])
+
+ print(
+ f" Total Charges: {charges_str} ({outlier['Charge_Ratio']:.1f}x state median)")
+ print(
+ f" Total Payments: {payments_str} ({outlier['Payment_Ratio']:.1f}x state median)")
+ else:
+ print("No significant state outliers found.")
+ else:
+ print("No states with enough suppliers for meaningful peer comparison.")
+
+ # 3. Combined specialty-state analysis for the most precise peer grouping
+ print("\nAnalysis by Combined Specialty-State Groups:")
+ print("-" * 50)
+
+ # Build a "<specialty> - <state>" label per row to use as the peer-group key
+ supplier_metrics['Specialty_State'] = supplier_metrics['Suplr_Prvdr_Spclty_Desc'] + \
+ ' - ' + supplier_metrics['Suplr_Prvdr_State_Abrvtn']
+
+ # Keep specialty-state combinations with at least 5 rows in supplier_metrics
+ specialty_state_counts = supplier_metrics['Specialty_State'].value_counts(
+ )
+ valid_specialty_states = specialty_state_counts[specialty_state_counts >= 5].index.tolist(
+ )
+
+ if valid_specialty_states:
+ print(
+ f"Found {len(valid_specialty_states)} specialty-state combinations with at least 5 suppliers.")
+
+ # Peer statistics per combination; only the 'median' columns are read below, 'mean' and 'std' are computed but not used in this section
+ peer_combined_metrics = supplier_metrics[supplier_metrics['Specialty_State'].isin(valid_specialty_states)].groupby(
+ 'Specialty_State').agg({
+ 'Suplr_Sbmtd_Chrgs': ['median', 'mean', 'std'],
+ 'Suplr_Mdcr_Pymt_Amt': ['median', 'mean', 'std'],
+ 'Tot_Suplr_Clms': ['median', 'mean', 'std'],
+ 'Tot_Suplr_Srvcs': ['median', 'mean', 'std'],
+ 'Avg_Chrg_Per_Clm': ['median', 'mean', 'std'],
+ 'Avg_Pymt_Per_Clm': ['median', 'mean', 'std'],
+ 'Avg_Srvcs_Per_Clm': ['median', 'mean', 'std']
+ })
+
+ # Find outliers within each specialty-state group (metrics > 3x the group median)
+ outliers_combined = []
+
+ for group in valid_specialty_states:
+ combined_group = supplier_metrics[supplier_metrics['Specialty_State'] == group]
+ combined_medians = peer_combined_metrics.loc[group]  # one row of statistics, keyed by (column, statistic)
+
+ # Check for outliers in claims, charges, and payments
+ claim_outliers = combined_group[combined_group['Tot_Suplr_Clms']
+ > 3 * combined_medians[('Tot_Suplr_Clms', 'median')]]
+ charge_outliers = combined_group[combined_group['Suplr_Sbmtd_Chrgs']
+ > 3 * combined_medians[('Suplr_Sbmtd_Chrgs', 'median')]]
+ payment_outliers = combined_group[combined_group['Suplr_Mdcr_Pymt_Amt']
+ > 3 * combined_medians[('Suplr_Mdcr_Pymt_Amt', 'median')]]
+
+ # Stack the three outlier sets so that counting rows per NPI gives the number of categories flagged
+ all_outliers = pd.concat([
+ claim_outliers[['Suplr_NPI']].assign(metric='claims'),
+ charge_outliers[['Suplr_NPI']].assign(
+ metric='charges'),
+ payment_outliers[['Suplr_NPI']].assign(
+ metric='payments')
+ ])
+
+ outlier_counts = all_outliers.groupby('Suplr_NPI').size()
+ multiple_outliers = outlier_counts[outlier_counts >= 2].index.tolist(
+ )
+
+ if multiple_outliers:
+ for npi in multiple_outliers:
+ supplier = combined_group[combined_group['Suplr_NPI']
+ == npi].iloc[0]  # first matching row for this NPI within the group
+ outliers_combined.append({
+ 'NPI': npi,
+ 'Name': supplier['Suplr_Prvdr_Last_Name_Org'],
+ 'Specialty': supplier['Suplr_Prvdr_Spclty_Desc'],
+ 'State': supplier['Suplr_Prvdr_State_Abrvtn'],
+ 'Group': group,
+ 'Total_Claims': supplier['Tot_Suplr_Clms'],
+ 'Claim_Ratio': supplier['Tot_Suplr_Clms'] / combined_medians[('Tot_Suplr_Clms', 'median')],
+ 'Total_Charges': supplier['Suplr_Sbmtd_Chrgs'],
+ 'Charge_Ratio': supplier['Suplr_Sbmtd_Chrgs'] / combined_medians[('Suplr_Sbmtd_Chrgs', 'median')],
+ 'Total_Payments': supplier['Suplr_Mdcr_Pymt_Amt'],
+ 'Payment_Ratio': supplier['Suplr_Mdcr_Pymt_Amt'] / combined_medians[('Suplr_Mdcr_Pymt_Amt', 'median')]
+ })
+
+ # Display the top outliers by combined group
+ if outliers_combined:
+ print(
+ "\nMost Significant Outliers by Combined Specialty-State Group:")
+ # Show the 10 entries with the highest combined ratio (sum of the three ratios)
+ for outlier in sorted(outliers_combined,
+ key=lambda x: (
+ x['Claim_Ratio'] + x['Charge_Ratio'] + x['Payment_Ratio']),
+ reverse=True)[:10]:
+ print(
+ f"\n**{outlier['Name']}** (NPI: {outlier['NPI']})")
+ print(
+ f" Specialty: {outlier['Specialty']} | State: {outlier['State']}")
+ print(
+ f" Total Claims: {outlier['Total_Claims']:.0f} ({outlier['Claim_Ratio']:.1f}x peer group median)")
+
+ # Format monetary values
+ charges_str = format_dollar_amount(
+ outlier['Total_Charges'])
+ payments_str = format_dollar_amount(
+ outlier['Total_Payments'])
+
+ print(
+ f" Total Charges: {charges_str} ({outlier['Charge_Ratio']:.1f}x peer group median)")
+ print(
+ f" Total Payments: {payments_str} ({outlier['Payment_Ratio']:.1f}x peer group median)")
+ else:
+ print("No significant combined specialty-state outliers found.")
+ else:
+ print(
+ "No specialty-state combinations with enough suppliers for meaningful peer comparison.")
+else:
+ print("No data was loaded. Please check if the CSV files exist.")
+
+
+print("Stopping here")  # NOTE(review): looks like a leftover debug marker; it runs whether or not data was loaded
diff --git a/dme_analysis.ipynb b/dme_analysis.ipynb
index 9638289..0432eb2 100644
--- a/dme_analysis.ipynb
+++ b/dme_analysis.ipynb
@@ -3,25 +3,2796 @@
{
"cell_type": "code",
"execution_count": 1,
- "id": "cad336dc-cdc4-4dc0-8a6a-e4df4cacea45",
+ "id": "373321ec",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# 1. Imports & Settings\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import glob\n",
+ "import locale\n",
+ "from dme_dictionary import DATA_DICTIONARY # Assuming you have a Python file that defines DATA_DICTIONARY\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "plt.rcParams[\"figure.figsize\"] = (8, 5)\n",
+ "\n",
+ "# Set locale for currency formatting if desired\n",
+ "locale.setlocale(locale.LC_ALL, '')\n",
+ "\n",
+ "# Pandas display options\n",
+ "pd.set_option('display.max_columns', None) # Show all columns\n",
+ "pd.set_option('display.width', None) # Avoid wrapping output\n",
+ "pd.set_option('display.expand_frame_repr', False) # Single-line output for wide DataFrames"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "68dda7d8",
+ "metadata": {},
+ "source": [
+ "# Medicare DME Supplier Analysis\n",
+ "\n",
+ "This notebook demonstrates how to:\n",
+ "1. Load Medicare Durable Medical Equipment (DME) supplier data spanning multiple years (2018–2022).\n",
+ "2. Analyze key metrics (submitted charges, Medicare payments, beneficiary counts) over time.\n",
+ "3. Compute year-over-year growth rates and identify significant spikes.\n",
+ "4. Examine suppliers whose submitted charges are high relative to their allowed or paid amounts.\n",
+ "5. Perform peer-group analyses by specialty, state, and combined specialty–state.\n",
+ "\n",
+ "We'll highlight outliers that may be worth investigating for potential fraud or anomalies."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9bc5b0ae",
+ "metadata": {},
+ "source": [
+ "## 2. Data Loading\n",
+ "We'll load each year's CSV file from 2018 to 2022, then combine them into a single DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "c40d3ac0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading data from data/2018/mup_dme_ry24_p05_v10_dy18_supr.csv\n",
+ "Loading data from data/2019/mup_dme_ry24_p05_v10_dy19_supr.csv\n",
+ "Loading data from data/2020/mup_dme_ry24_p05_v10_dy20_supr.csv\n",
+ "Loading data from data/2021/mup_dme_ry24_p05_v10_dy21_supr.csv\n",
+ "Loading data from data/2022/mup_dme_ry24_p05_v10_dy22_supr.csv\n",
+ "\n",
+ "Combined DataFrame shape: (352611, 95)\n"
+ ]
+ }
+ ],
+ "source": [
+ "data_dir = 'data' # Adjust if your data folder is elsewhere\n",
+ "years = range(2018, 2023) # 2018 to 2022\n",
+ "\n",
+ "dfs = []\n",
+ "\n",
+ "for year in years:\n",
+ " csv_files = glob.glob(f\"{data_dir}/{year}/*.csv\")\n",
+ " if not csv_files:\n",
+ " print(f\"No CSV files found for year {year}\")\n",
+ " continue\n",
+ " \n",
+ " # Take the first CSV found (assumes one CSV per year folder; glob result order depends on the file system)\n",
+ " csv_file = csv_files[0]\n",
+ " print(f\"Loading data from {csv_file}\")\n",
+ " \n",
+ " # Read the CSV, then add a 'year' column\n",
+ " df = pd.read_csv(csv_file, low_memory=False)\n",
+ " df['year'] = year\n",
+ " \n",
+ " dfs.append(df)\n",
+ "\n",
+ "if dfs:\n",
+ " combined_df = pd.concat(dfs, ignore_index=True)\n",
+ " print(f\"\\nCombined DataFrame shape: {combined_df.shape}\")\n",
+ "else:\n",
+ " combined_df = pd.DataFrame()\n",
+ " print(\"No data was loaded.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "37a28f37",
+ "metadata": {},
+ "source": [
+ "### Basic Exploration\n",
+ "Let's take a quick look at the combined DataFrame's structure and confirm that it has the columns we expect."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "c7036199",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "First few rows:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Suplr_NPI | \n",
+ " Suplr_Prvdr_Last_Name_Org | \n",
+ " Suplr_Prvdr_First_Name | \n",
+ " Suplr_Prvdr_MI | \n",
+ " Suplr_Prvdr_Crdntls | \n",
+ " Suplr_Prvdr_Gndr | \n",
+ " Suplr_Prvdr_Ent_Cd | \n",
+ " Suplr_Prvdr_St1 | \n",
+ " Suplr_Prvdr_St2 | \n",
+ " Suplr_Prvdr_City | \n",
+ " Suplr_Prvdr_State_Abrvtn | \n",
+ " Suplr_Prvdr_State_FIPS | \n",
+ " Suplr_Prvdr_Zip5 | \n",
+ " Suplr_Prvdr_RUCA | \n",
+ " Suplr_Prvdr_RUCA_Desc | \n",
+ " Suplr_Prvdr_Cntry | \n",
+ " Suplr_Prvdr_Spclty_Desc | \n",
+ " Suplr_Prvdr_Spclty_Srce | \n",
+ " Tot_Suplr_HCPCS_Cds | \n",
+ " Tot_Suplr_Benes | \n",
+ " Tot_Suplr_Clms | \n",
+ " Tot_Suplr_Srvcs | \n",
+ " Suplr_Sbmtd_Chrgs | \n",
+ " Suplr_Mdcr_Alowd_Amt | \n",
+ " Suplr_Mdcr_Pymt_Amt | \n",
+ " Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " DME_Sprsn_Ind | \n",
+ " DME_Tot_Suplr_HCPCS_Cds | \n",
+ " DME_Tot_Suplr_Benes | \n",
+ " DME_Tot_Suplr_Clms | \n",
+ " DME_Tot_Suplr_Srvcs | \n",
+ " DME_Suplr_Sbmtd_Chrgs | \n",
+ " DME_Suplr_Mdcr_Alowd_Amt | \n",
+ " DME_Suplr_Mdcr_Pymt_Amt | \n",
+ " DME_Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " POS_Sprsn_Ind | \n",
+ " POS_Tot_Suplr_HCPCS_Cds | \n",
+ " POS_Tot_Suplr_Benes | \n",
+ " POS_Tot_Suplr_Clms | \n",
+ " POS_Tot_Suplr_Srvcs | \n",
+ " POS_Suplr_Sbmtd_Chrgs | \n",
+ " POS_Suplr_Mdcr_Alowd_Amt | \n",
+ " POS_Suplr_Mdcr_Pymt_Amt | \n",
+ " POS_Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " Drug_Sprsn_Ind | \n",
+ " Drug_Tot_Suplr_HCPCS_Cds | \n",
+ " Drug_Tot_Suplr_Benes | \n",
+ " Drug_Tot_Suplr_Clms | \n",
+ " Drug_Tot_Suplr_Srvcs | \n",
+ " Drug_Suplr_Sbmtd_Chrgs | \n",
+ " Drug_Suplr_Mdcr_Alowd_Amt | \n",
+ " Drug_Suplr_Mdcr_Pymt_Amt | \n",
+ " Drug_Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " Bene_Avg_Age | \n",
+ " Bene_Age_LT_65_Cnt | \n",
+ " Bene_Age_65_74_Cnt | \n",
+ " Bene_Age_75_84_Cnt | \n",
+ " Bene_Age_GT_84_Cnt | \n",
+ " Bene_Feml_Cnt | \n",
+ " Bene_Male_Cnt | \n",
+ " Bene_Race_Wht_Cnt | \n",
+ " Bene_Race_Black_Cnt | \n",
+ " Bene_Race_Api_Cnt | \n",
+ " Bene_Race_Hspnc_Cnt | \n",
+ " Bene_Race_Natind_Cnt | \n",
+ " Bene_Race_Othr_Cnt | \n",
+ " Bene_Ndual_Cnt | \n",
+ " Bene_Dual_Cnt | \n",
+ " Bene_CC_BH_ADHD_OthCD_V1_Pct | \n",
+ " Bene_CC_BH_Alcohol_Drug_V1_Pct | \n",
+ " Bene_CC_BH_Tobacco_V1_Pct | \n",
+ " Bene_CC_BH_Alz_NonAlzdem_V2_Pct | \n",
+ " Bene_CC_BH_Anxiety_V1_Pct | \n",
+ " Bene_CC_BH_Bipolar_V1_Pct | \n",
+ " Bene_CC_BH_Mood_V2_Pct | \n",
+ " Bene_CC_BH_Depress_V1_Pct | \n",
+ " Bene_CC_BH_PD_V1_Pct | \n",
+ " Bene_CC_BH_PTSD_V1_Pct | \n",
+ " Bene_CC_BH_Schizo_OthPsy_V1_Pct | \n",
+ " Bene_CC_PH_Asthma_V2_Pct | \n",
+ " Bene_CC_PH_Afib_V2_Pct | \n",
+ " Bene_CC_PH_Cancer6_V2_Pct | \n",
+ " Bene_CC_PH_CKD_V2_Pct | \n",
+ " Bene_CC_PH_COPD_V2_Pct | \n",
+ " Bene_CC_PH_Diabetes_V2_Pct | \n",
+ " Bene_CC_PH_HF_NonIHD_V2_Pct | \n",
+ " Bene_CC_PH_Hyperlipidemia_V2_Pct | \n",
+ " Bene_CC_PH_Hypertension_V2_Pct | \n",
+ " Bene_CC_PH_IschemicHeart_V2_Pct | \n",
+ " Bene_CC_PH_Osteoporosis_V2_Pct | \n",
+ " Bene_CC_PH_Parkinson_V2_Pct | \n",
+ " Bene_CC_PH_Arthritis_V2_Pct | \n",
+ " Bene_CC_PH_Stroke_TIA_V2_Pct | \n",
+ " Bene_Avg_Risk_Scre | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1003000399 | \n",
+ " Reconstructive Hand To Shoulder Of Indiana, Llc | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " 13431 Old Meridian Street | \n",
+ " Suite 225 | \n",
+ " Carmel | \n",
+ " IN | \n",
+ " 18 | \n",
+ " 46032 | \n",
+ " 1.0 | \n",
+ " Metropolitan area core: primary flow within an... | \n",
+ " US | \n",
+ " General Surgery | \n",
+ " Claim-Specialty | \n",
+ " 15 | \n",
+ " 235.0 | \n",
+ " 301 | \n",
+ " 340 | \n",
+ " 83033.00 | \n",
+ " 70600.40 | \n",
+ " 54545.85 | \n",
+ " 56320.86 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " NaN | \n",
+ " 15.0 | \n",
+ " 235.0 | \n",
+ " 301.0 | \n",
+ " 340.0 | \n",
+ " 83033.00 | \n",
+ " 70600.40 | \n",
+ " 54545.85 | \n",
+ " 56320.86 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 72.862661 | \n",
+ " 20.0 | \n",
+ " 120.0 | \n",
+ " 74.0 | \n",
+ " 21.0 | \n",
+ " 148.0 | \n",
+ " 87.0 | \n",
+ " 222.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 220.0 | \n",
+ " 15.0 | \n",
+ " NaN | \n",
+ " 0.051064 | \n",
+ " 0.102128 | \n",
+ " NaN | \n",
+ " 0.200000 | \n",
+ " NaN | \n",
+ " 0.255319 | \n",
+ " 0.234043 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.085106 | \n",
+ " 0.085106 | \n",
+ " 0.131915 | \n",
+ " 0.102128 | \n",
+ " 0.165957 | \n",
+ " 0.217021 | \n",
+ " 0.093617 | \n",
+ " 0.676596 | \n",
+ " 0.668085 | \n",
+ " 0.229787 | \n",
+ " 0.144681 | \n",
+ " 0.0 | \n",
+ " 0.646809 | \n",
+ " 0.046809 | \n",
+ " 0.975801 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1003000845 | \n",
+ " James D.Schlenker Mdsc | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " 6311 W 95th St | \n",
+ " NaN | \n",
+ " Oak Lawn | \n",
+ " IL | \n",
+ " 17 | \n",
+ " 60453 | \n",
+ " 1.0 | \n",
+ " Metropolitan area core: primary flow within an... | \n",
+ " US | \n",
+ " Plastic and Reconstructive Surgery | \n",
+ " Claim-Specialty | \n",
+ " 8 | \n",
+ " 19.0 | \n",
+ " 22 | \n",
+ " 22 | \n",
+ " 4168.00 | \n",
+ " 4034.22 | \n",
+ " 3138.12 | \n",
+ " 4635.72 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " NaN | \n",
+ " 8.0 | \n",
+ " 19.0 | \n",
+ " 22.0 | \n",
+ " 22.0 | \n",
+ " 4168.00 | \n",
+ " 4034.22 | \n",
+ " 3138.12 | \n",
+ " 4635.72 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 74.631579 | \n",
+ " 0.0 | \n",
+ " 11.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 16.0 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.000000 | \n",
+ " 0.842105 | \n",
+ " 0.736842 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.736842 | \n",
+ " NaN | \n",
+ " 1.065053 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1003001934 | \n",
+ " Yi Rui International Corp | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " 4307 8th Ave | \n",
+ " NaN | \n",
+ " Brooklyn | \n",
+ " NY | \n",
+ " 36 | \n",
+ " 11232 | \n",
+ " 1.0 | \n",
+ " Metropolitan area core: primary flow within an... | \n",
+ " US | \n",
+ " Pharmacy | \n",
+ " Claim-Specialty | \n",
+ " 5 | \n",
+ " NaN | \n",
+ " 37 | \n",
+ " 796 | \n",
+ " 2739.60 | \n",
+ " 549.08 | \n",
+ " 339.39 | \n",
+ " 407.47 | \n",
+ " NaN | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " 35.0 | \n",
+ " 46.0 | \n",
+ " 2448.28 | \n",
+ " 512.46 | \n",
+ " 321.75 | \n",
+ " 389.83 | \n",
+ " # | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " * | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 75.285714 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 3.668578 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1003002254 | \n",
+ " Walgreen Co. | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " 5104 Bobby Hicks Hwy | \n",
+ " NaN | \n",
+ " Gray | \n",
+ " TN | \n",
+ " 47 | \n",
+ " 37615 | \n",
+ " 1.0 | \n",
+ " Metropolitan area core: primary flow within an... | \n",
+ " US | \n",
+ " Centralized Flu | \n",
+ " Claim-Specialty | \n",
+ " 10 | \n",
+ " 56.0 | \n",
+ " 150 | \n",
+ " 3681 | \n",
+ " 31078.36 | \n",
+ " 5276.87 | \n",
+ " 3699.85 | \n",
+ " 3835.41 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " 56.0 | \n",
+ " 148.0 | \n",
+ " 390.0 | \n",
+ " 26475.28 | \n",
+ " 3226.01 | \n",
+ " 2111.05 | \n",
+ " 2246.61 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " NaN | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " 12.0 | \n",
+ " 3291.0 | \n",
+ " 4603.08 | \n",
+ " 2050.86 | \n",
+ " 1588.8 | \n",
+ " 1588.8 | \n",
+ " 71.240000 | \n",
+ " NaN | \n",
+ " 28.0 | \n",
+ " 16.0 | \n",
+ " NaN | \n",
+ " 37.0 | \n",
+ " 19.0 | \n",
+ " 56.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 45.0 | \n",
+ " 11.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.214286 | \n",
+ " NaN | \n",
+ " 0.285714 | \n",
+ " 0.267857 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.196429 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.196429 | \n",
+ " NaN | \n",
+ " 0.821429 | \n",
+ " NaN | \n",
+ " 0.714286 | \n",
+ " 0.839286 | \n",
+ " 0.232143 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.446429 | \n",
+ " NaN | \n",
+ " 1.171945 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 1003002767 | \n",
+ " Thomas J Mcelligott Md Pc | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " 2415 Wall St Se | \n",
+ " Suite B | \n",
+ " Conyers | \n",
+ " GA | \n",
+ " 13 | \n",
+ " 30013 | \n",
+ " 1.0 | \n",
+ " Metropolitan area core: primary flow within an... | \n",
+ " US | \n",
+ " Orthopedic Surgery | \n",
+ " Claim-Specialty | \n",
+ " 10 | \n",
+ " 38.0 | \n",
+ " 44 | \n",
+ " 45 | \n",
+ " 4920.71 | \n",
+ " 4808.81 | \n",
+ " 3344.82 | \n",
+ " 3420.32 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " NaN | \n",
+ " 10.0 | \n",
+ " 38.0 | \n",
+ " 44.0 | \n",
+ " 45.0 | \n",
+ " 4920.71 | \n",
+ " 4808.81 | \n",
+ " 3344.82 | \n",
+ " 3420.32 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 73.727273 | \n",
+ " NaN | \n",
+ " 15.0 | \n",
+ " 13.0 | \n",
+ " NaN | \n",
+ " 26.0 | \n",
+ " 12.0 | \n",
+ " 26.0 | \n",
+ " 11.0 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.631579 | \n",
+ " 0.631579 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.447368 | \n",
+ " NaN | \n",
+ " 1.302857 | \n",
+ " 2018 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Suplr_NPI Suplr_Prvdr_Last_Name_Org Suplr_Prvdr_First_Name Suplr_Prvdr_MI Suplr_Prvdr_Crdntls Suplr_Prvdr_Gndr Suplr_Prvdr_Ent_Cd Suplr_Prvdr_St1 Suplr_Prvdr_St2 Suplr_Prvdr_City Suplr_Prvdr_State_Abrvtn Suplr_Prvdr_State_FIPS Suplr_Prvdr_Zip5 Suplr_Prvdr_RUCA Suplr_Prvdr_RUCA_Desc Suplr_Prvdr_Cntry Suplr_Prvdr_Spclty_Desc Suplr_Prvdr_Spclty_Srce Tot_Suplr_HCPCS_Cds Tot_Suplr_Benes Tot_Suplr_Clms Tot_Suplr_Srvcs Suplr_Sbmtd_Chrgs Suplr_Mdcr_Alowd_Amt Suplr_Mdcr_Pymt_Amt Suplr_Mdcr_Stdzd_Pymt_Amt DME_Sprsn_Ind DME_Tot_Suplr_HCPCS_Cds DME_Tot_Suplr_Benes DME_Tot_Suplr_Clms DME_Tot_Suplr_Srvcs DME_Suplr_Sbmtd_Chrgs DME_Suplr_Mdcr_Alowd_Amt DME_Suplr_Mdcr_Pymt_Amt DME_Suplr_Mdcr_Stdzd_Pymt_Amt POS_Sprsn_Ind POS_Tot_Suplr_HCPCS_Cds POS_Tot_Suplr_Benes POS_Tot_Suplr_Clms POS_Tot_Suplr_Srvcs POS_Suplr_Sbmtd_Chrgs POS_Suplr_Mdcr_Alowd_Amt POS_Suplr_Mdcr_Pymt_Amt POS_Suplr_Mdcr_Stdzd_Pymt_Amt Drug_Sprsn_Ind Drug_Tot_Suplr_HCPCS_Cds Drug_Tot_Suplr_Benes Drug_Tot_Suplr_Clms Drug_Tot_Suplr_Srvcs Drug_Suplr_Sbmtd_Chrgs Drug_Suplr_Mdcr_Alowd_Amt Drug_Suplr_Mdcr_Pymt_Amt Drug_Suplr_Mdcr_Stdzd_Pymt_Amt Bene_Avg_Age Bene_Age_LT_65_Cnt Bene_Age_65_74_Cnt Bene_Age_75_84_Cnt Bene_Age_GT_84_Cnt Bene_Feml_Cnt Bene_Male_Cnt Bene_Race_Wht_Cnt Bene_Race_Black_Cnt Bene_Race_Api_Cnt Bene_Race_Hspnc_Cnt Bene_Race_Natind_Cnt Bene_Race_Othr_Cnt Bene_Ndual_Cnt Bene_Dual_Cnt Bene_CC_BH_ADHD_OthCD_V1_Pct Bene_CC_BH_Alcohol_Drug_V1_Pct Bene_CC_BH_Tobacco_V1_Pct Bene_CC_BH_Alz_NonAlzdem_V2_Pct Bene_CC_BH_Anxiety_V1_Pct Bene_CC_BH_Bipolar_V1_Pct Bene_CC_BH_Mood_V2_Pct Bene_CC_BH_Depress_V1_Pct Bene_CC_BH_PD_V1_Pct Bene_CC_BH_PTSD_V1_Pct Bene_CC_BH_Schizo_OthPsy_V1_Pct Bene_CC_PH_Asthma_V2_Pct Bene_CC_PH_Afib_V2_Pct Bene_CC_PH_Cancer6_V2_Pct Bene_CC_PH_CKD_V2_Pct Bene_CC_PH_COPD_V2_Pct Bene_CC_PH_Diabetes_V2_Pct Bene_CC_PH_HF_NonIHD_V2_Pct Bene_CC_PH_Hyperlipidemia_V2_Pct Bene_CC_PH_Hypertension_V2_Pct Bene_CC_PH_IschemicHeart_V2_Pct Bene_CC_PH_Osteoporosis_V2_Pct Bene_CC_PH_Parkinson_V2_Pct 
Bene_CC_PH_Arthritis_V2_Pct Bene_CC_PH_Stroke_TIA_V2_Pct Bene_Avg_Risk_Scre year\n",
+ "0 1003000399 Reconstructive Hand To Shoulder Of Indiana, Llc NaN NaN NaN NaN O 13431 Old Meridian Street Suite 225 Carmel IN 18 46032 1.0 Metropolitan area core: primary flow within an... US General Surgery Claim-Specialty 15 235.0 301 340 83033.00 70600.40 54545.85 56320.86 NaN 0.0 0.0 0.0 0.0 0.00 0.00 0.00 0.00 NaN 15.0 235.0 301.0 340.0 83033.00 70600.40 54545.85 56320.86 NaN 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.0 72.862661 20.0 120.0 74.0 21.0 148.0 87.0 222.0 NaN NaN NaN 0.0 NaN 220.0 15.0 NaN 0.051064 0.102128 NaN 0.200000 NaN 0.255319 0.234043 NaN NaN NaN 0.085106 0.085106 0.131915 0.102128 0.165957 0.217021 0.093617 0.676596 0.668085 0.229787 0.144681 0.0 0.646809 0.046809 0.975801 2018\n",
+ "1 1003000845 James D.Schlenker Mdsc NaN NaN NaN NaN O 6311 W 95th St NaN Oak Lawn IL 17 60453 1.0 Metropolitan area core: primary flow within an... US Plastic and Reconstructive Surgery Claim-Specialty 8 19.0 22 22 4168.00 4034.22 3138.12 4635.72 NaN 0.0 0.0 0.0 0.0 0.00 0.00 0.00 0.00 NaN 8.0 19.0 22.0 22.0 4168.00 4034.22 3138.12 4635.72 NaN 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.0 74.631579 0.0 11.0 NaN NaN NaN NaN 16.0 NaN 0.0 NaN 0.0 NaN NaN NaN 0.0 0.000000 0.000000 NaN NaN 0.0 NaN NaN 0.0 0.0 0.0 0.000000 NaN NaN NaN NaN NaN 0.000000 0.842105 0.736842 NaN NaN NaN 0.736842 NaN 1.065053 2018\n",
+ "2 1003001934 Yi Rui International Corp NaN NaN NaN NaN O 4307 8th Ave NaN Brooklyn NY 36 11232 1.0 Metropolitan area core: primary flow within an... US Pharmacy Claim-Specialty 5 NaN 37 796 2739.60 549.08 339.39 407.47 NaN 4.0 NaN 35.0 46.0 2448.28 512.46 321.75 389.83 # NaN NaN NaN NaN NaN NaN NaN NaN * NaN NaN NaN NaN NaN NaN NaN NaN 75.285714 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.668578 2018\n",
+ "3 1003002254 Walgreen Co. NaN NaN NaN NaN O 5104 Bobby Hicks Hwy NaN Gray TN 47 37615 1.0 Metropolitan area core: primary flow within an... US Centralized Flu Claim-Specialty 10 56.0 150 3681 31078.36 5276.87 3699.85 3835.41 NaN 6.0 56.0 148.0 390.0 26475.28 3226.01 2111.05 2246.61 NaN 0.0 0.0 0.0 0.0 0.00 0.00 0.00 0.00 NaN 4.0 NaN 12.0 3291.0 4603.08 2050.86 1588.8 1588.8 71.240000 NaN 28.0 16.0 NaN 37.0 19.0 56.0 0.0 0.0 0.0 0.0 0.0 45.0 11.0 0.0 NaN NaN NaN 0.214286 NaN 0.285714 0.267857 0.0 0.0 0.0 0.196429 NaN NaN 0.196429 NaN 0.821429 NaN 0.714286 0.839286 0.232143 NaN 0.0 0.446429 NaN 1.171945 2018\n",
+ "4 1003002767 Thomas J Mcelligott Md Pc NaN NaN NaN NaN O 2415 Wall St Se Suite B Conyers GA 13 30013 1.0 Metropolitan area core: primary flow within an... US Orthopedic Surgery Claim-Specialty 10 38.0 44 45 4920.71 4808.81 3344.82 3420.32 NaN 0.0 0.0 0.0 0.0 0.00 0.00 0.00 0.00 NaN 10.0 38.0 44.0 45.0 4920.71 4808.81 3344.82 3420.32 NaN 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.0 73.727273 NaN 15.0 13.0 NaN 26.0 12.0 26.0 11.0 NaN 0.0 0.0 NaN NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.631579 0.631579 NaN NaN 0.0 0.447368 NaN 1.302857 2018"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Column Names:\n",
+ "['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org', 'Suplr_Prvdr_First_Name', 'Suplr_Prvdr_MI', 'Suplr_Prvdr_Crdntls', 'Suplr_Prvdr_Gndr', 'Suplr_Prvdr_Ent_Cd', 'Suplr_Prvdr_St1', 'Suplr_Prvdr_St2', 'Suplr_Prvdr_City', 'Suplr_Prvdr_State_Abrvtn', 'Suplr_Prvdr_State_FIPS', 'Suplr_Prvdr_Zip5', 'Suplr_Prvdr_RUCA', 'Suplr_Prvdr_RUCA_Desc', 'Suplr_Prvdr_Cntry', 'Suplr_Prvdr_Spclty_Desc', 'Suplr_Prvdr_Spclty_Srce', 'Tot_Suplr_HCPCS_Cds', 'Tot_Suplr_Benes', 'Tot_Suplr_Clms', 'Tot_Suplr_Srvcs', 'Suplr_Sbmtd_Chrgs', 'Suplr_Mdcr_Alowd_Amt', 'Suplr_Mdcr_Pymt_Amt', 'Suplr_Mdcr_Stdzd_Pymt_Amt', 'DME_Sprsn_Ind', 'DME_Tot_Suplr_HCPCS_Cds', 'DME_Tot_Suplr_Benes', 'DME_Tot_Suplr_Clms', 'DME_Tot_Suplr_Srvcs', 'DME_Suplr_Sbmtd_Chrgs', 'DME_Suplr_Mdcr_Alowd_Amt', 'DME_Suplr_Mdcr_Pymt_Amt', 'DME_Suplr_Mdcr_Stdzd_Pymt_Amt', 'POS_Sprsn_Ind', 'POS_Tot_Suplr_HCPCS_Cds', 'POS_Tot_Suplr_Benes', 'POS_Tot_Suplr_Clms', 'POS_Tot_Suplr_Srvcs', 'POS_Suplr_Sbmtd_Chrgs', 'POS_Suplr_Mdcr_Alowd_Amt', 'POS_Suplr_Mdcr_Pymt_Amt', 'POS_Suplr_Mdcr_Stdzd_Pymt_Amt', 'Drug_Sprsn_Ind', 'Drug_Tot_Suplr_HCPCS_Cds', 'Drug_Tot_Suplr_Benes', 'Drug_Tot_Suplr_Clms', 'Drug_Tot_Suplr_Srvcs', 'Drug_Suplr_Sbmtd_Chrgs', 'Drug_Suplr_Mdcr_Alowd_Amt', 'Drug_Suplr_Mdcr_Pymt_Amt', 'Drug_Suplr_Mdcr_Stdzd_Pymt_Amt', 'Bene_Avg_Age', 'Bene_Age_LT_65_Cnt', 'Bene_Age_65_74_Cnt', 'Bene_Age_75_84_Cnt', 'Bene_Age_GT_84_Cnt', 'Bene_Feml_Cnt', 'Bene_Male_Cnt', 'Bene_Race_Wht_Cnt', 'Bene_Race_Black_Cnt', 'Bene_Race_Api_Cnt', 'Bene_Race_Hspnc_Cnt', 'Bene_Race_Natind_Cnt', 'Bene_Race_Othr_Cnt', 'Bene_Ndual_Cnt', 'Bene_Dual_Cnt', 'Bene_CC_BH_ADHD_OthCD_V1_Pct', 'Bene_CC_BH_Alcohol_Drug_V1_Pct', 'Bene_CC_BH_Tobacco_V1_Pct', 'Bene_CC_BH_Alz_NonAlzdem_V2_Pct', 'Bene_CC_BH_Anxiety_V1_Pct', 'Bene_CC_BH_Bipolar_V1_Pct', 'Bene_CC_BH_Mood_V2_Pct', 'Bene_CC_BH_Depress_V1_Pct', 'Bene_CC_BH_PD_V1_Pct', 'Bene_CC_BH_PTSD_V1_Pct', 'Bene_CC_BH_Schizo_OthPsy_V1_Pct', 'Bene_CC_PH_Asthma_V2_Pct', 'Bene_CC_PH_Afib_V2_Pct', 'Bene_CC_PH_Cancer6_V2_Pct', 
'Bene_CC_PH_CKD_V2_Pct', 'Bene_CC_PH_COPD_V2_Pct', 'Bene_CC_PH_Diabetes_V2_Pct', 'Bene_CC_PH_HF_NonIHD_V2_Pct', 'Bene_CC_PH_Hyperlipidemia_V2_Pct', 'Bene_CC_PH_Hypertension_V2_Pct', 'Bene_CC_PH_IschemicHeart_V2_Pct', 'Bene_CC_PH_Osteoporosis_V2_Pct', 'Bene_CC_PH_Parkinson_V2_Pct', 'Bene_CC_PH_Arthritis_V2_Pct', 'Bene_CC_PH_Stroke_TIA_V2_Pct', 'Bene_Avg_Risk_Scre', 'year']\n",
+ "\n",
+ "Number of unique suppliers: 86467\n",
+ "\n",
+ "Summary of numeric columns:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Suplr_NPI | \n",
+ " Suplr_Prvdr_Zip5 | \n",
+ " Suplr_Prvdr_RUCA | \n",
+ " Tot_Suplr_HCPCS_Cds | \n",
+ " Tot_Suplr_Benes | \n",
+ " Tot_Suplr_Clms | \n",
+ " Tot_Suplr_Srvcs | \n",
+ " Suplr_Sbmtd_Chrgs | \n",
+ " Suplr_Mdcr_Alowd_Amt | \n",
+ " Suplr_Mdcr_Pymt_Amt | \n",
+ " Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " DME_Tot_Suplr_HCPCS_Cds | \n",
+ " DME_Tot_Suplr_Benes | \n",
+ " DME_Tot_Suplr_Clms | \n",
+ " DME_Tot_Suplr_Srvcs | \n",
+ " DME_Suplr_Sbmtd_Chrgs | \n",
+ " DME_Suplr_Mdcr_Alowd_Amt | \n",
+ " DME_Suplr_Mdcr_Pymt_Amt | \n",
+ " DME_Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " POS_Tot_Suplr_HCPCS_Cds | \n",
+ " POS_Tot_Suplr_Benes | \n",
+ " POS_Tot_Suplr_Clms | \n",
+ " POS_Tot_Suplr_Srvcs | \n",
+ " POS_Suplr_Sbmtd_Chrgs | \n",
+ " POS_Suplr_Mdcr_Alowd_Amt | \n",
+ " POS_Suplr_Mdcr_Pymt_Amt | \n",
+ " POS_Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " Drug_Tot_Suplr_HCPCS_Cds | \n",
+ " Drug_Tot_Suplr_Benes | \n",
+ " Drug_Tot_Suplr_Clms | \n",
+ " Drug_Tot_Suplr_Srvcs | \n",
+ " Drug_Suplr_Sbmtd_Chrgs | \n",
+ " Drug_Suplr_Mdcr_Alowd_Amt | \n",
+ " Drug_Suplr_Mdcr_Pymt_Amt | \n",
+ " Drug_Suplr_Mdcr_Stdzd_Pymt_Amt | \n",
+ " Bene_Avg_Age | \n",
+ " Bene_Age_LT_65_Cnt | \n",
+ " Bene_Age_65_74_Cnt | \n",
+ " Bene_Age_75_84_Cnt | \n",
+ " Bene_Age_GT_84_Cnt | \n",
+ " Bene_Feml_Cnt | \n",
+ " Bene_Male_Cnt | \n",
+ " Bene_Race_Wht_Cnt | \n",
+ " Bene_Race_Black_Cnt | \n",
+ " Bene_Race_Api_Cnt | \n",
+ " Bene_Race_Hspnc_Cnt | \n",
+ " Bene_Race_Natind_Cnt | \n",
+ " Bene_Race_Othr_Cnt | \n",
+ " Bene_Ndual_Cnt | \n",
+ " Bene_Dual_Cnt | \n",
+ " Bene_CC_BH_ADHD_OthCD_V1_Pct | \n",
+ " Bene_CC_BH_Alcohol_Drug_V1_Pct | \n",
+ " Bene_CC_BH_Tobacco_V1_Pct | \n",
+ " Bene_CC_BH_Alz_NonAlzdem_V2_Pct | \n",
+ " Bene_CC_BH_Anxiety_V1_Pct | \n",
+ " Bene_CC_BH_Bipolar_V1_Pct | \n",
+ " Bene_CC_BH_Mood_V2_Pct | \n",
+ " Bene_CC_BH_Depress_V1_Pct | \n",
+ " Bene_CC_BH_PD_V1_Pct | \n",
+ " Bene_CC_BH_PTSD_V1_Pct | \n",
+ " Bene_CC_BH_Schizo_OthPsy_V1_Pct | \n",
+ " Bene_CC_PH_Asthma_V2_Pct | \n",
+ " Bene_CC_PH_Afib_V2_Pct | \n",
+ " Bene_CC_PH_Cancer6_V2_Pct | \n",
+ " Bene_CC_PH_CKD_V2_Pct | \n",
+ " Bene_CC_PH_COPD_V2_Pct | \n",
+ " Bene_CC_PH_Diabetes_V2_Pct | \n",
+ " Bene_CC_PH_HF_NonIHD_V2_Pct | \n",
+ " Bene_CC_PH_Hyperlipidemia_V2_Pct | \n",
+ " Bene_CC_PH_Hypertension_V2_Pct | \n",
+ " Bene_CC_PH_IschemicHeart_V2_Pct | \n",
+ " Bene_CC_PH_Osteoporosis_V2_Pct | \n",
+ " Bene_CC_PH_Parkinson_V2_Pct | \n",
+ " Bene_CC_PH_Arthritis_V2_Pct | \n",
+ " Bene_CC_PH_Stroke_TIA_V2_Pct | \n",
+ " Bene_Avg_Risk_Scre | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 3.526110e+05 | \n",
+ " 352611.000000 | \n",
+ " 352575.000000 | \n",
+ " 352611.000000 | \n",
+ " 331904.000000 | \n",
+ " 352611.000000 | \n",
+ " 3.526110e+05 | \n",
+ " 3.526110e+05 | \n",
+ " 3.526110e+05 | \n",
+ " 3.526110e+05 | \n",
+ " 3.526110e+05 | \n",
+ " 334378.000000 | \n",
+ " 312252.000000 | \n",
+ " 334378.000000 | \n",
+ " 3.343780e+05 | \n",
+ " 3.343780e+05 | \n",
+ " 3.343780e+05 | \n",
+ " 3.343780e+05 | \n",
+ " 3.343780e+05 | \n",
+ " 292989.000000 | \n",
+ " 271386.000000 | \n",
+ " 292989.000000 | \n",
+ " 2.929890e+05 | \n",
+ " 2.929890e+05 | \n",
+ " 2.929890e+05 | \n",
+ " 2.929890e+05 | \n",
+ " 2.929890e+05 | \n",
+ " 279663.000000 | \n",
+ " 195592.000000 | \n",
+ " 279663.000000 | \n",
+ " 2.796630e+05 | \n",
+ " 2.796630e+05 | \n",
+ " 2.796630e+05 | \n",
+ " 2.796630e+05 | \n",
+ " 2.796630e+05 | \n",
+ " 352549.000000 | \n",
+ " 112648.000000 | \n",
+ " 257838.000000 | \n",
+ " 210441.000000 | \n",
+ " 90651.000000 | \n",
+ " 253106.000000 | \n",
+ " 253106.000000 | \n",
+ " 297628.000000 | \n",
+ " 135898.000000 | \n",
+ " 152090.000000 | \n",
+ " 153492.000000 | \n",
+ " 281658.000000 | \n",
+ " 130667.000000 | \n",
+ " 179575.000000 | \n",
+ " 179575.000000 | \n",
+ " 210454.000000 | \n",
+ " 110545.000000 | \n",
+ " 128558.000000 | \n",
+ " 103844.000000 | \n",
+ " 184461.000000 | \n",
+ " 121693.000000 | \n",
+ " 194930.000000 | \n",
+ " 182561.000000 | \n",
+ " 170846.000000 | \n",
+ " 180278.000000 | \n",
+ " 151709.000000 | \n",
+ " 133002.000000 | \n",
+ " 157471.000000 | \n",
+ " 131889.000000 | \n",
+ " 231256.000000 | \n",
+ " 189497.000000 | \n",
+ " 289577.000000 | \n",
+ " 176141.000000 | \n",
+ " 311898.000000 | \n",
+ " 315415.000000 | \n",
+ " 227186.000000 | \n",
+ " 115383.000000 | \n",
+ " 163625.000000 | \n",
+ " 264873.000000 | \n",
+ " 102463.000000 | \n",
+ " 352548.000000 | \n",
+ " 352611.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 1.499823e+09 | \n",
+ " 47761.974436 | \n",
+ " 1.938816 | \n",
+ " 19.162681 | \n",
+ " 180.003712 | \n",
+ " 723.630647 | \n",
+ " 2.894472e+04 | \n",
+ " 4.327424e+05 | \n",
+ " 1.558504e+05 | \n",
+ " 1.198852e+05 | \n",
+ " 1.189166e+05 | \n",
+ " 8.810921 | \n",
+ " 145.596758 | \n",
+ " 642.879322 | \n",
+ " 3.905078e+03 | \n",
+ " 2.565282e+05 | \n",
+ " 8.591894e+04 | \n",
+ " 6.550221e+04 | \n",
+ " 6.456093e+04 | \n",
+ " 6.513569 | \n",
+ " 48.193319 | \n",
+ " 80.856425 | \n",
+ " 3.355593e+03 | \n",
+ " 6.501689e+04 | \n",
+ " 4.470813e+04 | \n",
+ " 3.454446e+04 | \n",
+ " 3.460400e+04 | \n",
+ " 2.651927 | \n",
+ " 15.184762 | \n",
+ " 53.643879 | \n",
+ " 1.554591e+04 | \n",
+ " 3.928560e+04 | \n",
+ " 1.697234e+04 | \n",
+ " 1.326474e+04 | \n",
+ " 1.314945e+04 | \n",
+ " 72.135987 | \n",
+ " 70.165835 | \n",
+ " 93.109103 | \n",
+ " 83.160492 | \n",
+ " 68.880751 | \n",
+ " 126.068106 | \n",
+ " 103.751768 | \n",
+ " 157.496126 | \n",
+ " 38.625859 | \n",
+ " 6.402137 | \n",
+ " 20.363218 | \n",
+ " 0.683773 | \n",
+ " 6.168627 | \n",
+ " 216.677667 | \n",
+ " 81.802339 | \n",
+ " 0.001365 | \n",
+ " 0.056437 | \n",
+ " 0.137417 | \n",
+ " 0.075832 | \n",
+ " 0.270268 | \n",
+ " 0.023278 | \n",
+ " 0.295474 | \n",
+ " 0.268735 | \n",
+ " 0.003625 | \n",
+ " 0.003438 | \n",
+ " 0.011297 | \n",
+ " 0.154784 | \n",
+ " 0.205620 | \n",
+ " 0.170630 | \n",
+ " 0.364954 | \n",
+ " 0.296001 | \n",
+ " 0.738303 | \n",
+ " 0.245400 | \n",
+ " 0.803431 | \n",
+ " 0.854808 | \n",
+ " 0.356955 | \n",
+ " 0.132541 | \n",
+ " 0.004815 | \n",
+ " 0.491985 | \n",
+ " 0.085324 | \n",
+ " 1.759540 | \n",
+ " 2019.933791 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 2.877778e+08 | \n",
+ " 28443.077792 | \n",
+ " 2.593615 | \n",
+ " 25.023950 | \n",
+ " 1318.464715 | \n",
+ " 5759.184130 | \n",
+ " 1.137952e+06 | \n",
+ " 6.107114e+06 | \n",
+ " 2.104732e+06 | \n",
+ " 1.642990e+06 | \n",
+ " 1.640688e+06 | \n",
+ " 17.959107 | \n",
+ " 1210.207291 | \n",
+ " 5472.581913 | \n",
+ " 1.183744e+05 | \n",
+ " 4.536824e+06 | \n",
+ " 1.222667e+06 | \n",
+ " 9.536508e+05 | \n",
+ " 9.520616e+05 | \n",
+ " 18.036465 | \n",
+ " 582.654243 | \n",
+ " 1658.046524 | \n",
+ " 1.774209e+05 | \n",
+ " 1.399145e+06 | \n",
+ " 7.558673e+05 | \n",
+ " 5.801982e+05 | \n",
+ " 5.969880e+05 | \n",
+ " 2.654579 | \n",
+ " 508.361346 | \n",
+ " 2599.424059 | \n",
+ " 1.233497e+06 | \n",
+ " 2.072189e+06 | \n",
+ " 9.877956e+05 | \n",
+ " 7.734391e+05 | \n",
+ " 7.665664e+05 | \n",
+ " 4.203287 | \n",
+ " 362.180891 | \n",
+ " 576.926665 | \n",
+ " 551.356612 | \n",
+ " 375.786927 | \n",
+ " 859.590068 | \n",
+ " 663.652059 | \n",
+ " 1086.588625 | \n",
+ " 283.259321 | \n",
+ " 53.643838 | \n",
+ " 181.647129 | \n",
+ " 9.812065 | \n",
+ " 45.984446 | \n",
+ " 1322.832122 | \n",
+ " 533.498517 | \n",
+ " 0.008951 | \n",
+ " 0.076374 | \n",
+ " 0.100486 | \n",
+ " 0.102199 | \n",
+ " 0.102902 | \n",
+ " 0.044075 | \n",
+ " 0.108544 | \n",
+ " 0.102275 | \n",
+ " 0.012957 | \n",
+ " 0.013120 | \n",
+ " 0.044755 | \n",
+ " 0.087742 | \n",
+ " 0.084632 | \n",
+ " 0.135977 | \n",
+ " 0.153952 | \n",
+ " 0.140641 | \n",
+ " 0.251118 | \n",
+ " 0.108972 | \n",
+ " 0.145535 | \n",
+ " 0.146975 | \n",
+ " 0.112958 | \n",
+ " 0.083479 | \n",
+ " 0.017198 | \n",
+ " 0.137264 | \n",
+ " 0.070467 | \n",
+ " 0.655410 | \n",
+ " 1.417299 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 1.003000e+09 | \n",
+ " 601.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 11.000000 | \n",
+ " 11.000000 | \n",
+ " 1.100000e+01 | \n",
+ " 1.960000e+01 | \n",
+ " 1.662000e+01 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.176000 | \n",
+ " 2018.000000 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 1.255346e+09 | \n",
+ " 25504.000000 | \n",
+ " 1.000000 | \n",
+ " 8.000000 | \n",
+ " 28.000000 | \n",
+ " 60.000000 | \n",
+ " 4.770000e+02 | \n",
+ " 1.507708e+04 | \n",
+ " 4.234210e+03 | \n",
+ " 3.093515e+03 | \n",
+ " 3.221385e+03 | \n",
+ " 3.000000 | \n",
+ " 15.000000 | \n",
+ " 26.000000 | \n",
+ " 4.900000e+01 | \n",
+ " 2.726097e+03 | \n",
+ " 6.501400e+02 | \n",
+ " 4.462525e+02 | \n",
+ " 4.783675e+02 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 70.280000 | \n",
+ " 13.000000 | \n",
+ " 18.000000 | \n",
+ " 17.000000 | \n",
+ " 11.000000 | \n",
+ " 24.000000 | \n",
+ " 19.000000 | \n",
+ " 23.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 29.000000 | \n",
+ " 15.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.074721 | \n",
+ " 0.000000 | \n",
+ " 0.205950 | \n",
+ " 0.000000 | \n",
+ " 0.225806 | \n",
+ " 0.205128 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.110497 | \n",
+ " 0.156250 | \n",
+ " 0.118056 | \n",
+ " 0.277108 | \n",
+ " 0.203704 | \n",
+ " 0.537500 | \n",
+ " 0.180812 | \n",
+ " 0.714286 | \n",
+ " 0.769231 | \n",
+ " 0.282609 | \n",
+ " 0.091703 | \n",
+ " 0.000000 | \n",
+ " 0.400000 | \n",
+ " 0.000000 | \n",
+ " 1.363045 | \n",
+ " 2019.000000 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 1.497926e+09 | \n",
+ " 44125.000000 | \n",
+ " 1.000000 | \n",
+ " 12.000000 | \n",
+ " 53.000000 | \n",
+ " 146.000000 | \n",
+ " 3.592000e+03 | \n",
+ " 3.964752e+04 | \n",
+ " 1.171618e+04 | \n",
+ " 8.781310e+03 | \n",
+ " 8.958790e+03 | \n",
+ " 5.000000 | \n",
+ " 39.000000 | \n",
+ " 112.000000 | \n",
+ " 2.430000e+02 | \n",
+ " 1.437400e+04 | \n",
+ " 2.624245e+03 | \n",
+ " 1.843840e+03 | \n",
+ " 1.960370e+03 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 3.000000 | \n",
+ " 0.000000 | \n",
+ " 15.000000 | \n",
+ " 1.954000e+03 | \n",
+ " 3.273830e+03 | \n",
+ " 3.129500e+02 | \n",
+ " 2.231000e+02 | \n",
+ " 2.212200e+02 | \n",
+ " 72.557692 | \n",
+ " 24.000000 | \n",
+ " 30.000000 | \n",
+ " 26.000000 | \n",
+ " 19.000000 | \n",
+ " 39.000000 | \n",
+ " 31.000000 | \n",
+ " 45.000000 | \n",
+ " 11.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 59.000000 | \n",
+ " 25.000000 | \n",
+ " 0.000000 | \n",
+ " 0.040710 | \n",
+ " 0.135802 | \n",
+ " 0.057075 | \n",
+ " 0.259615 | \n",
+ " 0.000000 | \n",
+ " 0.282353 | \n",
+ " 0.258065 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.147059 | \n",
+ " 0.201835 | \n",
+ " 0.151899 | \n",
+ " 0.351351 | \n",
+ " 0.277108 | \n",
+ " 0.811594 | \n",
+ " 0.236842 | \n",
+ " 0.806452 | \n",
+ " 0.857143 | \n",
+ " 0.349057 | \n",
+ " 0.130435 | \n",
+ " 0.000000 | \n",
+ " 0.474729 | \n",
+ " 0.086397 | \n",
+ " 1.665859 | \n",
+ " 2020.000000 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 1.740658e+09 | \n",
+ " 74104.000000 | \n",
+ " 1.000000 | \n",
+ " 18.000000 | \n",
+ " 105.000000 | \n",
+ " 315.000000 | \n",
+ " 1.050350e+04 | \n",
+ " 9.733840e+04 | \n",
+ " 3.444066e+04 | \n",
+ " 2.628954e+04 | \n",
+ " 2.645689e+04 | \n",
+ " 6.000000 | \n",
+ " 81.000000 | \n",
+ " 256.000000 | \n",
+ " 5.780000e+02 | \n",
+ " 3.535016e+04 | \n",
+ " 6.448807e+03 | \n",
+ " 4.567375e+03 | \n",
+ " 4.816860e+03 | \n",
+ " 4.000000 | \n",
+ " 19.000000 | \n",
+ " 23.000000 | \n",
+ " 5.800000e+01 | \n",
+ " 8.737850e+03 | \n",
+ " 6.100790e+03 | \n",
+ " 4.672270e+03 | \n",
+ " 4.877140e+03 | \n",
+ " 5.000000 | \n",
+ " 16.000000 | \n",
+ " 42.000000 | \n",
+ " 6.352000e+03 | \n",
+ " 1.723049e+04 | \n",
+ " 5.056030e+03 | \n",
+ " 3.889745e+03 | \n",
+ " 3.861795e+03 | \n",
+ " 74.583333 | \n",
+ " 54.000000 | \n",
+ " 55.000000 | \n",
+ " 49.000000 | \n",
+ " 48.000000 | \n",
+ " 74.000000 | \n",
+ " 59.000000 | \n",
+ " 93.000000 | \n",
+ " 25.000000 | \n",
+ " 0.000000 | \n",
+ " 15.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 125.000000 | \n",
+ " 54.000000 | \n",
+ " 0.000000 | \n",
+ " 0.094395 | \n",
+ " 0.193682 | \n",
+ " 0.109195 | \n",
+ " 0.322835 | \n",
+ " 0.042476 | \n",
+ " 0.350000 | \n",
+ " 0.320755 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.193548 | \n",
+ " 0.254902 | \n",
+ " 0.190440 | \n",
+ " 0.432432 | \n",
+ " 0.366667 | \n",
+ " 0.920000 | \n",
+ " 0.302857 | \n",
+ " 0.894118 | \n",
+ " 0.937500 | \n",
+ " 0.421053 | \n",
+ " 0.173554 | \n",
+ " 0.000000 | \n",
+ " 0.564815 | \n",
+ " 0.123711 | \n",
+ " 2.018267 | \n",
+ " 2021.000000 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 1.993000e+09 | \n",
+ " 99901.000000 | \n",
+ " 99.000000 | \n",
+ " 538.000000 | \n",
+ " 237906.000000 | \n",
+ " 685662.000000 | \n",
+ " 3.290728e+08 | \n",
+ " 1.836805e+09 | \n",
+ " 3.991887e+08 | \n",
+ " 3.116929e+08 | \n",
+ " 3.117852e+08 | \n",
+ " 313.000000 | \n",
+ " 237906.000000 | \n",
+ " 685662.000000 | \n",
+ " 2.316198e+07 | \n",
+ " 1.830416e+09 | \n",
+ " 2.082855e+08 | \n",
+ " 1.622225e+08 | \n",
+ " 1.622216e+08 | \n",
+ " 277.000000 | \n",
+ " 94760.000000 | \n",
+ " 255217.000000 | \n",
+ " 3.156666e+07 | \n",
+ " 2.291182e+08 | \n",
+ " 1.343600e+08 | \n",
+ " 1.055647e+08 | \n",
+ " 1.126160e+08 | \n",
+ " 14.000000 | \n",
+ " 98084.000000 | \n",
+ " 619016.000000 | \n",
+ " 3.286327e+08 | \n",
+ " 4.578468e+08 | \n",
+ " 1.979528e+08 | \n",
+ " 1.550935e+08 | \n",
+ " 1.530220e+08 | \n",
+ " 108.000000 | \n",
+ " 46481.000000 | \n",
+ " 82335.000000 | \n",
+ " 81159.000000 | \n",
+ " 27931.000000 | \n",
+ " 142147.000000 | \n",
+ " 95759.000000 | \n",
+ " 170138.000000 | \n",
+ " 44508.000000 | \n",
+ " 4326.000000 | \n",
+ " 28959.000000 | \n",
+ " 740.000000 | \n",
+ " 2859.000000 | \n",
+ " 158443.000000 | \n",
+ " 79463.000000 | \n",
+ " 0.565217 | \n",
+ " 1.636364 | \n",
+ " 1.636364 | \n",
+ " 1.068421 | \n",
+ " 1.727273 | \n",
+ " 1.545455 | \n",
+ " 1.727273 | \n",
+ " 1.636364 | \n",
+ " 0.833333 | \n",
+ " 0.444444 | \n",
+ " 1.636364 | \n",
+ " 1.454545 | \n",
+ " 2.461538 | \n",
+ " 2.380952 | \n",
+ " 3.601562 | \n",
+ " 1.636364 | \n",
+ " 2.589844 | \n",
+ " 3.153846 | \n",
+ " 3.261719 | \n",
+ " 3.753906 | \n",
+ " 2.367188 | \n",
+ " 1.272727 | \n",
+ " 1.144068 | \n",
+ " 1.750000 | \n",
+ " 0.846154 | \n",
+ " 16.340466 | \n",
+ " 2022.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Suplr_NPI Suplr_Prvdr_Zip5 Suplr_Prvdr_RUCA Tot_Suplr_HCPCS_Cds Tot_Suplr_Benes Tot_Suplr_Clms Tot_Suplr_Srvcs Suplr_Sbmtd_Chrgs Suplr_Mdcr_Alowd_Amt Suplr_Mdcr_Pymt_Amt Suplr_Mdcr_Stdzd_Pymt_Amt DME_Tot_Suplr_HCPCS_Cds DME_Tot_Suplr_Benes DME_Tot_Suplr_Clms DME_Tot_Suplr_Srvcs DME_Suplr_Sbmtd_Chrgs DME_Suplr_Mdcr_Alowd_Amt DME_Suplr_Mdcr_Pymt_Amt DME_Suplr_Mdcr_Stdzd_Pymt_Amt POS_Tot_Suplr_HCPCS_Cds POS_Tot_Suplr_Benes POS_Tot_Suplr_Clms POS_Tot_Suplr_Srvcs POS_Suplr_Sbmtd_Chrgs POS_Suplr_Mdcr_Alowd_Amt POS_Suplr_Mdcr_Pymt_Amt POS_Suplr_Mdcr_Stdzd_Pymt_Amt Drug_Tot_Suplr_HCPCS_Cds Drug_Tot_Suplr_Benes Drug_Tot_Suplr_Clms Drug_Tot_Suplr_Srvcs Drug_Suplr_Sbmtd_Chrgs Drug_Suplr_Mdcr_Alowd_Amt Drug_Suplr_Mdcr_Pymt_Amt Drug_Suplr_Mdcr_Stdzd_Pymt_Amt Bene_Avg_Age Bene_Age_LT_65_Cnt Bene_Age_65_74_Cnt Bene_Age_75_84_Cnt Bene_Age_GT_84_Cnt Bene_Feml_Cnt Bene_Male_Cnt Bene_Race_Wht_Cnt Bene_Race_Black_Cnt Bene_Race_Api_Cnt Bene_Race_Hspnc_Cnt Bene_Race_Natind_Cnt Bene_Race_Othr_Cnt Bene_Ndual_Cnt Bene_Dual_Cnt Bene_CC_BH_ADHD_OthCD_V1_Pct Bene_CC_BH_Alcohol_Drug_V1_Pct Bene_CC_BH_Tobacco_V1_Pct Bene_CC_BH_Alz_NonAlzdem_V2_Pct Bene_CC_BH_Anxiety_V1_Pct Bene_CC_BH_Bipolar_V1_Pct Bene_CC_BH_Mood_V2_Pct Bene_CC_BH_Depress_V1_Pct Bene_CC_BH_PD_V1_Pct Bene_CC_BH_PTSD_V1_Pct Bene_CC_BH_Schizo_OthPsy_V1_Pct Bene_CC_PH_Asthma_V2_Pct Bene_CC_PH_Afib_V2_Pct Bene_CC_PH_Cancer6_V2_Pct Bene_CC_PH_CKD_V2_Pct Bene_CC_PH_COPD_V2_Pct Bene_CC_PH_Diabetes_V2_Pct Bene_CC_PH_HF_NonIHD_V2_Pct Bene_CC_PH_Hyperlipidemia_V2_Pct Bene_CC_PH_Hypertension_V2_Pct Bene_CC_PH_IschemicHeart_V2_Pct Bene_CC_PH_Osteoporosis_V2_Pct Bene_CC_PH_Parkinson_V2_Pct Bene_CC_PH_Arthritis_V2_Pct Bene_CC_PH_Stroke_TIA_V2_Pct Bene_Avg_Risk_Scre year\n",
+ "count 3.526110e+05 352611.000000 352575.000000 352611.000000 331904.000000 352611.000000 3.526110e+05 3.526110e+05 3.526110e+05 3.526110e+05 3.526110e+05 334378.000000 312252.000000 334378.000000 3.343780e+05 3.343780e+05 3.343780e+05 3.343780e+05 3.343780e+05 292989.000000 271386.000000 292989.000000 2.929890e+05 2.929890e+05 2.929890e+05 2.929890e+05 2.929890e+05 279663.000000 195592.000000 279663.000000 2.796630e+05 2.796630e+05 2.796630e+05 2.796630e+05 2.796630e+05 352549.000000 112648.000000 257838.000000 210441.000000 90651.000000 253106.000000 253106.000000 297628.000000 135898.000000 152090.000000 153492.000000 281658.000000 130667.000000 179575.000000 179575.000000 210454.000000 110545.000000 128558.000000 103844.000000 184461.000000 121693.000000 194930.000000 182561.000000 170846.000000 180278.000000 151709.000000 133002.000000 157471.000000 131889.000000 231256.000000 189497.000000 289577.000000 176141.000000 311898.000000 315415.000000 227186.000000 115383.000000 163625.000000 264873.000000 102463.000000 352548.000000 352611.000000\n",
+ "mean 1.499823e+09 47761.974436 1.938816 19.162681 180.003712 723.630647 2.894472e+04 4.327424e+05 1.558504e+05 1.198852e+05 1.189166e+05 8.810921 145.596758 642.879322 3.905078e+03 2.565282e+05 8.591894e+04 6.550221e+04 6.456093e+04 6.513569 48.193319 80.856425 3.355593e+03 6.501689e+04 4.470813e+04 3.454446e+04 3.460400e+04 2.651927 15.184762 53.643879 1.554591e+04 3.928560e+04 1.697234e+04 1.326474e+04 1.314945e+04 72.135987 70.165835 93.109103 83.160492 68.880751 126.068106 103.751768 157.496126 38.625859 6.402137 20.363218 0.683773 6.168627 216.677667 81.802339 0.001365 0.056437 0.137417 0.075832 0.270268 0.023278 0.295474 0.268735 0.003625 0.003438 0.011297 0.154784 0.205620 0.170630 0.364954 0.296001 0.738303 0.245400 0.803431 0.854808 0.356955 0.132541 0.004815 0.491985 0.085324 1.759540 2019.933791\n",
+ "std 2.877778e+08 28443.077792 2.593615 25.023950 1318.464715 5759.184130 1.137952e+06 6.107114e+06 2.104732e+06 1.642990e+06 1.640688e+06 17.959107 1210.207291 5472.581913 1.183744e+05 4.536824e+06 1.222667e+06 9.536508e+05 9.520616e+05 18.036465 582.654243 1658.046524 1.774209e+05 1.399145e+06 7.558673e+05 5.801982e+05 5.969880e+05 2.654579 508.361346 2599.424059 1.233497e+06 2.072189e+06 9.877956e+05 7.734391e+05 7.665664e+05 4.203287 362.180891 576.926665 551.356612 375.786927 859.590068 663.652059 1086.588625 283.259321 53.643838 181.647129 9.812065 45.984446 1322.832122 533.498517 0.008951 0.076374 0.100486 0.102199 0.102902 0.044075 0.108544 0.102275 0.012957 0.013120 0.044755 0.087742 0.084632 0.135977 0.153952 0.140641 0.251118 0.108972 0.145535 0.146975 0.112958 0.083479 0.017198 0.137264 0.070467 0.655410 1.417299\n",
+ "min 1.003000e+09 601.000000 1.000000 1.000000 11.000000 11.000000 1.100000e+01 1.960000e+01 1.662000e+01 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.176000 2018.000000\n",
+ "25% 1.255346e+09 25504.000000 1.000000 8.000000 28.000000 60.000000 4.770000e+02 1.507708e+04 4.234210e+03 3.093515e+03 3.221385e+03 3.000000 15.000000 26.000000 4.900000e+01 2.726097e+03 6.501400e+02 4.462525e+02 4.783675e+02 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 70.280000 13.000000 18.000000 17.000000 11.000000 24.000000 19.000000 23.000000 0.000000 0.000000 0.000000 0.000000 0.000000 29.000000 15.000000 0.000000 0.000000 0.074721 0.000000 0.205950 0.000000 0.225806 0.205128 0.000000 0.000000 0.000000 0.110497 0.156250 0.118056 0.277108 0.203704 0.537500 0.180812 0.714286 0.769231 0.282609 0.091703 0.000000 0.400000 0.000000 1.363045 2019.000000\n",
+ "50% 1.497926e+09 44125.000000 1.000000 12.000000 53.000000 146.000000 3.592000e+03 3.964752e+04 1.171618e+04 8.781310e+03 8.958790e+03 5.000000 39.000000 112.000000 2.430000e+02 1.437400e+04 2.624245e+03 1.843840e+03 1.960370e+03 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 3.000000 0.000000 15.000000 1.954000e+03 3.273830e+03 3.129500e+02 2.231000e+02 2.212200e+02 72.557692 24.000000 30.000000 26.000000 19.000000 39.000000 31.000000 45.000000 11.000000 0.000000 0.000000 0.000000 0.000000 59.000000 25.000000 0.000000 0.040710 0.135802 0.057075 0.259615 0.000000 0.282353 0.258065 0.000000 0.000000 0.000000 0.147059 0.201835 0.151899 0.351351 0.277108 0.811594 0.236842 0.806452 0.857143 0.349057 0.130435 0.000000 0.474729 0.086397 1.665859 2020.000000\n",
+ "75% 1.740658e+09 74104.000000 1.000000 18.000000 105.000000 315.000000 1.050350e+04 9.733840e+04 3.444066e+04 2.628954e+04 2.645689e+04 6.000000 81.000000 256.000000 5.780000e+02 3.535016e+04 6.448807e+03 4.567375e+03 4.816860e+03 4.000000 19.000000 23.000000 5.800000e+01 8.737850e+03 6.100790e+03 4.672270e+03 4.877140e+03 5.000000 16.000000 42.000000 6.352000e+03 1.723049e+04 5.056030e+03 3.889745e+03 3.861795e+03 74.583333 54.000000 55.000000 49.000000 48.000000 74.000000 59.000000 93.000000 25.000000 0.000000 15.000000 0.000000 0.000000 125.000000 54.000000 0.000000 0.094395 0.193682 0.109195 0.322835 0.042476 0.350000 0.320755 0.000000 0.000000 0.000000 0.193548 0.254902 0.190440 0.432432 0.366667 0.920000 0.302857 0.894118 0.937500 0.421053 0.173554 0.000000 0.564815 0.123711 2.018267 2021.000000\n",
+ "max 1.993000e+09 99901.000000 99.000000 538.000000 237906.000000 685662.000000 3.290728e+08 1.836805e+09 3.991887e+08 3.116929e+08 3.117852e+08 313.000000 237906.000000 685662.000000 2.316198e+07 1.830416e+09 2.082855e+08 1.622225e+08 1.622216e+08 277.000000 94760.000000 255217.000000 3.156666e+07 2.291182e+08 1.343600e+08 1.055647e+08 1.126160e+08 14.000000 98084.000000 619016.000000 3.286327e+08 4.578468e+08 1.979528e+08 1.550935e+08 1.530220e+08 108.000000 46481.000000 82335.000000 81159.000000 27931.000000 142147.000000 95759.000000 170138.000000 44508.000000 4326.000000 28959.000000 740.000000 2859.000000 158443.000000 79463.000000 0.565217 1.636364 1.636364 1.068421 1.727273 1.545455 1.727273 1.636364 0.833333 0.444444 1.636364 1.454545 2.461538 2.380952 3.601562 1.636364 2.589844 3.153846 3.261719 3.753906 2.367188 1.272727 1.144068 1.750000 0.846154 16.340466 2022.000000"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Overview of the combined data; every step is skipped when the frame is empty.\n",
+ "if not combined_df.empty:\n",
+ "    # Preview of the leading rows (head() with its default row count).\n",
+ "    print(\"\\nFirst few rows:\")\n",
+ "    display(combined_df.head())\n",
+ "\n",
+ "    # Header and the list of column labels in one call, separated by a newline.\n",
+ "    print(\"\\nColumn Names:\", combined_df.columns.tolist(), sep=\"\\n\")\n",
+ "\n",
+ "    # Number of distinct values in the supplier NPI column.\n",
+ "    print(f\"\\nNumber of unique suppliers: {combined_df['Suplr_NPI'].nunique()}\")\n",
+ "\n",
+ "    # describe() restricted to numeric dtypes via np.number.\n",
+ "    print(\"\\nSummary of numeric columns:\")\n",
+ "    display(combined_df.describe(include=[np.number]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0106077",
+ "metadata": {},
+ "source": [
+ "## 3. Mapping Columns to Data Dictionary\n",
+ "We've got a `DATA_DICTIONARY` that provides definitions for each column. Let's map them to the DataFrame's columns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "744715a8",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Data Dictionary Mapping:\n",
+ "\n",
+ "- Suplr_NPI: Supplier NPI - NPI for the Supplier on the DMEPOS claim\n",
+ "- Suplr_Prvdr_Last_Name_Org: Supplier Last Name/Organization Name - When registered as individual, the Supplier's last name. When registered as organization, this is the organization name\n",
+ "- Suplr_Prvdr_First_Name: Supplier First Name - When registered as individual, the Supplier's first name\n",
+ "- Suplr_Prvdr_MI: Supplier Middle Initial - When registered as individual, the Supplier's middle initial\n",
+ "- Suplr_Prvdr_Crdntls: Supplier Credentials - When registered as individual, these are the Supplier's credentials\n",
+ "- Suplr_Prvdr_Gndr: Supplier Gender - When registered as individual, this is the Supplier's gender\n",
+ "- Suplr_Prvdr_Ent_Cd: Supplier Entity Code - 'I' identifies Suppliers registered as individuals, 'O' identifies Suppliers registered as organizations\n",
+ "- Suplr_Prvdr_St1: Supplier Street 1 - First line of the Supplier's street address\n",
+ "- Suplr_Prvdr_St2: Supplier Street 2 - Second line of the Supplier's street address\n",
+ "- Suplr_Prvdr_City: Supplier City - The city where the Supplier is located\n",
+ "- Suplr_Prvdr_State_Abrvtn: Supplier State - State postal abbreviation where the Supplier is located\n",
+ "- Suplr_Prvdr_State_FIPS: Supplier State FIPS Code - FIPS code for Supplier's state\n",
+ "- Suplr_Prvdr_Zip5: Supplier ZIP - The Supplier's ZIP code\n",
+ "- Suplr_Prvdr_RUCA: Supplier RUCA - Rural-Urban Commuting Area Code for the Supplier ZIP code\n",
+ "- Suplr_Prvdr_RUCA_Desc: Supplier RUCA Description - Description of Rural-Urban Commuting Area (RUCA) Code\n",
+ "- Suplr_Prvdr_Cntry: Supplier Country - Country where the Supplier is located\n",
+ "- Suplr_Prvdr_Spclty_Desc: Supplier Provider Specialty Description - Derived from Medicare provider/supplier specialty code\n",
+ "- Suplr_Prvdr_Spclty_Srce: Supplier Provider Specialty Source - Source of the Supplier Specialty (claims-specialty or NPPES-specialty)\n",
+ "- Tot_Suplr_HCPCS_Cds: Number of Supplier HCPCS - Total unique DMEPOS product/service HCPCS codes\n",
+ "- Tot_Suplr_Benes: Number of Supplier Beneficiaries - Total unique beneficiaries (<11 are suppressed)\n",
+ "- Tot_Suplr_Clms: Number of Supplier Claims - Total DMEPOS claims submitted\n",
+ "- Tot_Suplr_Srvcs: Number of Supplier Services - Total DMEPOS products/services rendered\n",
+ "- Suplr_Sbmtd_Chrgs: Supplier Submitted Charges - Total charges submitted for DMEPOS products/services\n",
+ "- Suplr_Mdcr_Alowd_Amt: Supplier Medicare Allowed Amount - Total Medicare allowed amount\n",
+ "- Suplr_Mdcr_Pymt_Amt: Supplier Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance\n",
+ "- Suplr_Mdcr_Stdzd_Pymt_Amt: Supplier Medicare Standard Payment Amount - Standardized Medicare payments\n",
+ "- DME_Sprsn_Ind: Durable Medical Equipment Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed\n",
+ "- DME_Tot_Suplr_HCPCS_Cds: Number of DME HCPCS - Total unique DME HCPCS codes\n",
+ "- DME_Tot_Suplr_Benes: Number of DME Beneficiaries - Total unique beneficiaries with DME claims (<11 are suppressed)\n",
+ "- DME_Tot_Suplr_Clms: Number of DME Claims - Total DME claims submitted\n",
+ "- DME_Tot_Suplr_Srvcs: Number of DME Services - Total DME products/services rendered\n",
+ "- DME_Suplr_Sbmtd_Chrgs: DME Submitted Charges - Total charges submitted for DME products/services\n",
+ "- DME_Suplr_Mdcr_Alowd_Amt: DME Medicare Allowed Amount - Total Medicare allowed amount for DME\n",
+ "- DME_Suplr_Mdcr_Pymt_Amt: DME Medicare Payment Amount - Amount Medicare paid for DME after deductible/coinsurance\n",
+ "- DME_Suplr_Mdcr_Stdzd_Pymt_Amt: DME Medicare Standard Payment Amount - Standardized Medicare payments for DME\n",
+ "- POS_Sprsn_Ind: Prosthetic and Orthotic Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed\n",
+ "- POS_Tot_Suplr_HCPCS_Cds: Number of Prosthetic/Orthotic HCPCS - Total unique prosthetic/orthotic HCPCS codes\n",
+ "- POS_Tot_Suplr_Benes: Number of Prosthetic/Orthotic Beneficiaries - Total unique beneficiaries\n",
+ "- POS_Tot_Suplr_Clms: Number of Prosthetic/Orthotic Claims - Total prosthetic/orthotic claims submitted\n",
+ "- POS_Tot_Suplr_Srvcs: Number of Prosthetic/Orthotic Services - Total prosthetic/orthotic products/services\n",
+ "- POS_Suplr_Sbmtd_Chrgs: Prosthetic/Orthotic Submitted Charges - Total charges submitted for prosthetic/orthotic\n",
+ "- POS_Suplr_Mdcr_Alowd_Amt: Prosthetic/Orthotic Medicare Allowed Amount - Total Medicare allowed amount\n",
+ "- POS_Suplr_Mdcr_Pymt_Amt: Prosthetic/Orthotic Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance\n",
+ "- POS_Suplr_Mdcr_Stdzd_Pymt_Amt: Prosthetic/Orthotic Medicare Standard Payment Amount - Standardized Medicare payments\n",
+ "- Drug_Sprsn_Ind: Drug and Nutritional Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed\n",
+ "- Drug_Tot_Suplr_HCPCS_Cds: Number of Drug/Nutritional HCPCS - Total unique drug/nutritional HCPCS codes\n",
+ "- Drug_Tot_Suplr_Benes: Number of Drug/Nutritional Beneficiaries - Total unique beneficiaries\n",
+ "- Drug_Tot_Suplr_Clms: Number of Drug/Nutritional Claims - Total drug/nutritional claims submitted\n",
+ "- Drug_Tot_Suplr_Srvcs: Number of Drug/Nutritional Services - Total drug/nutritional products/services\n",
+ "- Drug_Suplr_Sbmtd_Chrgs: Drug/Nutritional Submitted Charges - Total charges submitted for drug/nutritional\n",
+ "- Drug_Suplr_Mdcr_Alowd_Amt: Drug/Nutritional Medicare Allowed Amount - Total Medicare allowed amount\n",
+ "- Drug_Suplr_Mdcr_Pymt_Amt: Drug/Nutritional Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance\n",
+ "- Drug_Suplr_Mdcr_Stdzd_Pymt_Amt: Drug/Nutritional Medicare Standard Payment Amount - Standardized Medicare payments\n",
+ "- Bene_Avg_Age: Average Age of Beneficiaries - Average age at end of calendar year or time of death\n",
+ "- Bene_Age_LT_65_Cnt: Number of Beneficiaries <65 - Count of beneficiaries under 65 years old\n",
+ "- Bene_Age_65_74_Cnt: Number of Beneficiaries 65-74 - Count of beneficiaries between 65-74 years old\n",
+ "- Bene_Age_75_84_Cnt: Number of Beneficiaries 75-84 - Count of beneficiaries between 75-84 years old\n",
+ "- Bene_Age_GT_84_Cnt: Number of Beneficiaries >84 - Count of beneficiaries over 84 years old\n",
+ "- Bene_Feml_Cnt: Number of Female Beneficiaries - Count of female beneficiaries\n",
+ "- Bene_Male_Cnt: Number of Male Beneficiaries - Count of male beneficiaries\n",
+ "- Bene_Race_Wht_Cnt: Number of White Beneficiaries - Count of non-Hispanic white beneficiaries\n",
+ "- Bene_Race_Black_Cnt: Number of Black Beneficiaries - Count of non-Hispanic Black/African American beneficiaries\n",
+ "- Bene_Race_Api_Cnt: Number of Asian/PI Beneficiaries - Count of Asian Pacific Islander beneficiaries\n",
+ "- Bene_Race_Hspnc_Cnt: Number of Hispanic Beneficiaries - Count of Hispanic beneficiaries\n",
+ "- Bene_Race_Natind_Cnt: Number of Native American/Alaska Native Beneficiaries - Count of American Indian/Alaska Native beneficiaries\n",
+ "- Bene_Race_Othr_Cnt: Number of Other Race Beneficiaries - Count of beneficiaries with race not elsewhere classified\n",
+ "- Bene_Ndual_Cnt: Number of Medicare & Medicaid Beneficiaries - Count of dual-eligible beneficiaries\n",
+ "- Bene_Dual_Cnt: Number of Medicare-Only Beneficiaries - Count of Medicare-only beneficiaries\n",
+ "- Bene_CC_BH_ADHD_OthCD_V1_Pct: Percent with ADHD and Other Conduct Disorders\n",
+ "- Bene_CC_BH_Alcohol_Drug_V1_Pct: Percent with Alcohol and Drug Use Disorders\n",
+ "- Bene_CC_BH_Tobacco_V1_Pct: Percent with Tobacco Use Disorders\n",
+ "- Bene_CC_BH_Alz_NonAlzdem_V2_Pct: Percent with Alzheimer's and Non-Alzheimer's Dementia\n",
+ "- Bene_CC_BH_Anxiety_V1_Pct: Percent with Anxiety Disorders\n",
+ "- Bene_CC_BH_Bipolar_V1_Pct: Percent with Bipolar Disorder\n",
+ "- Bene_CC_BH_Mood_V2_Pct: Percent with Depression, Bipolar or Other Mood Disorders\n",
+ "- Bene_CC_BH_Depress_V1_Pct: Percent with Major Depressive Affective Disorder\n",
+ "- Bene_CC_BH_PD_V1_Pct: Percent with Personality Disorders\n",
+ "- Bene_CC_BH_PTSD_V1_Pct: Percent with Post-Traumatic Stress Disorder\n",
+ "- Bene_CC_BH_Schizo_OthPsy_V1_Pct: Percent with Schizophrenia and Other Psychotic Disorders\n",
+ "- Bene_CC_PH_Asthma_V2_Pct: Percent with Asthma\n",
+ "- Bene_CC_PH_Afib_V2_Pct: Percent with Atrial Fibrillation and Flutter\n",
+ "- Bene_CC_PH_Cancer6_V2_Pct: Percent with Cancer (combined 6 cancer indicators)\n",
+ "- Bene_CC_PH_CKD_V2_Pct: Percent with Chronic Kidney Disease\n",
+ "- Bene_CC_PH_COPD_V2_Pct: Percent with Chronic Obstructive Pulmonary Disease\n",
+ "- Bene_CC_PH_Diabetes_V2_Pct: Percent with Diabetes\n",
+ "- Bene_CC_PH_HF_NonIHD_V2_Pct: Percent with Heart Failure and Non-Ischemic Heart Disease\n",
+ "- Bene_CC_PH_Hyperlipidemia_V2_Pct: Percent with Hyperlipidemia\n",
+ "- Bene_CC_PH_Hypertension_V2_Pct: Percent with Hypertension\n",
+ "- Bene_CC_PH_IschemicHeart_V2_Pct: Percent with Ischemic Heart Disease\n",
+ "- Bene_CC_PH_Osteoporosis_V2_Pct: Percent with Osteoporosis\n",
+ "- Bene_CC_PH_Parkinson_V2_Pct: Percent with Parkinson's Disease\n",
+ "- Bene_CC_PH_Arthritis_V2_Pct: Percent with Rheumatoid Arthritis/Osteoarthritis\n",
+ "- Bene_CC_PH_Stroke_TIA_V2_Pct: Percent with Stroke/Transient Ischemic Attack\n",
+ "- Bene_Avg_Risk_Scre: Average HCC Risk Score of Beneficiaries\n",
+ "- year: Year of the data\n"
+ ]
+ }
+ ],
+ "source": [
+ "if not combined_df.empty:\n",
+ "    # One entry per column: its DATA_DICTIONARY text, or a placeholder when it has none.\n",
+ "    column_info = {\n",
+ "        column: DATA_DICTIONARY.get(column, \"Description not available\")\n",
+ "        for column in combined_df.columns\n",
+ "    }\n",
+ "\n",
+ "    # Optional: keep the mapping on the DataFrame itself for later reference.\n",
+ "    combined_df.attrs['column_descriptions'] = column_info\n",
+ "\n",
+ "    # Print one line per column, in DataFrame column order.\n",
+ "    print(\"Data Dictionary Mapping:\\n\")\n",
+ "    for col in combined_df.columns:\n",
+ "        print(f\"- {col}: {column_info[col]}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a009a7cc",
+ "metadata": {},
+ "source": [
+ "## 4. Helper: Format Dollar Amounts\n",
+ "A small function to display large numbers with K/M suffixes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "0462c05c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def format_dollar_amount(amount):\n",
+ "    \"\"\"Format a dollar amount as a short string with a K or M suffix.\n",
+ "\n",
+ "    Magnitudes of at least 1,000,000 are shown in millions and magnitudes of\n",
+ "    at least 1,000 in thousands, both with one decimal place; anything\n",
+ "    smaller is formatted as whole dollars. Negative amounts are scaled by\n",
+ "    magnitude and the minus sign is placed before the dollar sign.\n",
+ "\n",
+ "    Args:\n",
+ "        amount: Number to format; it must support abs(), comparison with\n",
+ "            ints, division and float-style format specs.\n",
+ "\n",
+ "    Returns:\n",
+ "        The formatted string, e.g. '$1.5M', '$2.5K', '$999' or '-$2.5K'.\n",
+ "    \"\"\"\n",
+ "    # Branch on the magnitude so negative values also get a K/M suffix;\n",
+ "    # comparing the signed value would send every negative amount to the\n",
+ "    # plain branch and print the sign after the dollar sign.\n",
+ "    sign = \"-\" if amount < 0 else \"\"\n",
+ "    magnitude = abs(amount)\n",
+ "    if magnitude >= 1_000_000:\n",
+ "        return f\"{sign}${magnitude / 1_000_000:.1f}M\"\n",
+ "    if magnitude >= 1_000:\n",
+ "        return f\"{sign}${magnitude / 1_000:.1f}K\"\n",
+ "    return f\"{sign}${magnitude:,.0f}\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29348be9",
+ "metadata": {},
+ "source": [
+ "## 5. Year-over-Year Growth Analysis\n",
+ "We'll look at *Medicare Payment Amount* by Supplier (NPI) across years and compute year-over-year (YoY) growth:\n",
+ "\n",
+ "- Filter for suppliers that appear in all relevant years (2018–2022).\n",
+ "- Only consider suppliers with a meaningful 2022 Medicare payment total (>= 100K dollars), to focus on large-volume providers.\n",
+ "- Identify the top 10 by average growth rate."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "0cdba951",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Top 10 Suppliers by Average Year-over-Year Growth (2018–2022), \n",
+ "\n",
+ " Filtered to those with >= $100K in 2022 payments:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Suplr_NPI | \n",
+ " Suplr_Prvdr_Last_Name_Org | \n",
+ " growth_2019 | \n",
+ " growth_2020 | \n",
+ " growth_2021 | \n",
+ " growth_2022 | \n",
+ " avg_growth | \n",
+ " Suplr_Sbmtd_Chrgs | \n",
+ " Suplr_Mdcr_Pymt_Amt | \n",
+ " Tot_Suplr_Benes | \n",
+ " Tot_Suplr_Clms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 500 | \n",
+ " 1063967768 | \n",
+ " P-Cares Medical Supplies, Llc | \n",
+ " 216.762990 | \n",
+ " -18.279120 | \n",
+ " -1.438596 | \n",
+ " 59704.308031 | \n",
+ " 14975.338326 | \n",
+ " 2.996986e+07 | \n",
+ " 15894839.27 | \n",
+ " 3029.60 | \n",
+ " 21002 | \n",
+ "
\n",
+ " \n",
+ " | 5958 | \n",
+ " 1891275590 | \n",
+ " Lincare Inc | \n",
+ " 43427.677155 | \n",
+ " 40.042523 | \n",
+ " 9.862999 | \n",
+ " 1.397273 | \n",
+ " 10869.744987 | \n",
+ " 1.351100e+08 | \n",
+ " 19498239.26 | \n",
+ " 8006.60 | \n",
+ " 250598 | \n",
+ "
\n",
+ " \n",
+ " | 3078 | \n",
+ " 1457837080 | \n",
+ " Respiratory Services Of Western New York, Inc. | \n",
+ " 24521.584413 | \n",
+ " 489.458451 | \n",
+ " 59.869097 | \n",
+ " 46.621208 | \n",
+ " 6279.383292 | \n",
+ " 1.742520e+06 | \n",
+ " 636034.24 | \n",
+ " 442.00 | \n",
+ " 8743 | \n",
+ "
\n",
+ " \n",
+ " | 5538 | \n",
+ " 1821424789 | \n",
+ " Vohra Post Acute Care Physicians Of Texas, Pllc | \n",
+ " 24557.093269 | \n",
+ " 105.801915 | \n",
+ " 24.589830 | \n",
+ " 1.304009 | \n",
+ " 6172.197256 | \n",
+ " 1.954372e+07 | \n",
+ " 7851147.07 | \n",
+ " 1268.25 | \n",
+ " 21386 | \n",
+ "
\n",
+ " \n",
+ " | 3389 | \n",
+ " 1508938127 | \n",
+ " Aahi St Joseph Mercy Hospital Inc | \n",
+ " -73.766346 | \n",
+ " -96.932529 | \n",
+ " 23208.805119 | \n",
+ " 139.646393 | \n",
+ " 5794.438159 | \n",
+ " 1.052284e+06 | \n",
+ " 477838.88 | \n",
+ " NaN | \n",
+ " 421 | \n",
+ "
\n",
+ " \n",
+ " | 1197 | \n",
+ " 1174553804 | \n",
+ " Care One Medical Equipment And Supplies, Inc. | \n",
+ " 19205.127235 | \n",
+ " 50.732207 | \n",
+ " -6.816210 | \n",
+ " 34.941358 | \n",
+ " 4820.996148 | \n",
+ " 3.241600e+06 | \n",
+ " 1038009.02 | \n",
+ " 529.25 | \n",
+ " 11495 | \n",
+ "
\n",
+ " \n",
+ " | 3365 | \n",
+ " 1508826199 | \n",
+ " The Home Health Store Of Tomball, Inc. | \n",
+ " 70.638944 | \n",
+ " 23.053685 | \n",
+ " 53.075010 | \n",
+ " 17908.424045 | \n",
+ " 4513.797921 | \n",
+ " 2.914811e+07 | \n",
+ " 15253804.77 | \n",
+ " 5578.40 | \n",
+ " 56914 | \n",
+ "
\n",
+ " \n",
+ " | 5022 | \n",
+ " 1750391751 | \n",
+ " Amerihealth Medical Group, Inc. | \n",
+ " 18039.357850 | \n",
+ " -46.804741 | \n",
+ " 6.478163 | \n",
+ " -16.942326 | \n",
+ " 4495.522236 | \n",
+ " 2.947875e+06 | \n",
+ " 1106496.20 | \n",
+ " 873.60 | \n",
+ " 19592 | \n",
+ "
\n",
+ " \n",
+ " | 3923 | \n",
+ " 1598044208 | \n",
+ " Scooter Chair Repair Georgia, Llc | \n",
+ " 16495.684706 | \n",
+ " 46.102762 | \n",
+ " -17.790577 | \n",
+ " -18.348840 | \n",
+ " 4126.412013 | \n",
+ " 1.078492e+07 | \n",
+ " 4825597.72 | \n",
+ " 219.60 | \n",
+ " 3544 | \n",
+ "
\n",
+ " \n",
+ " | 2865 | \n",
+ " 1437108214 | \n",
+ " Christian Home Health Services, Inc | \n",
+ " 16471.811385 | \n",
+ " -19.951771 | \n",
+ " 10.856242 | \n",
+ " 4.742256 | \n",
+ " 4116.864528 | \n",
+ " 1.948006e+06 | \n",
+ " 573413.07 | \n",
+ " 391.75 | \n",
+ " 7344 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Suplr_NPI Suplr_Prvdr_Last_Name_Org growth_2019 growth_2020 growth_2021 growth_2022 avg_growth Suplr_Sbmtd_Chrgs Suplr_Mdcr_Pymt_Amt Tot_Suplr_Benes Tot_Suplr_Clms\n",
+ "500 1063967768 P-Cares Medical Supplies, Llc 216.762990 -18.279120 -1.438596 59704.308031 14975.338326 2.996986e+07 15894839.27 3029.60 21002\n",
+ "5958 1891275590 Lincare Inc 43427.677155 40.042523 9.862999 1.397273 10869.744987 1.351100e+08 19498239.26 8006.60 250598\n",
+ "3078 1457837080 Respiratory Services Of Western New York, Inc. 24521.584413 489.458451 59.869097 46.621208 6279.383292 1.742520e+06 636034.24 442.00 8743\n",
+ "5538 1821424789 Vohra Post Acute Care Physicians Of Texas, Pllc 24557.093269 105.801915 24.589830 1.304009 6172.197256 1.954372e+07 7851147.07 1268.25 21386\n",
+ "3389 1508938127 Aahi St Joseph Mercy Hospital Inc -73.766346 -96.932529 23208.805119 139.646393 5794.438159 1.052284e+06 477838.88 NaN 421\n",
+ "1197 1174553804 Care One Medical Equipment And Supplies, Inc. 19205.127235 50.732207 -6.816210 34.941358 4820.996148 3.241600e+06 1038009.02 529.25 11495\n",
+ "3365 1508826199 The Home Health Store Of Tomball, Inc. 70.638944 23.053685 53.075010 17908.424045 4513.797921 2.914811e+07 15253804.77 5578.40 56914\n",
+ "5022 1750391751 Amerihealth Medical Group, Inc. 18039.357850 -46.804741 6.478163 -16.942326 4495.522236 2.947875e+06 1106496.20 873.60 19592\n",
+ "3923 1598044208 Scooter Chair Repair Georgia, Llc 16495.684706 46.102762 -17.790577 -18.348840 4126.412013 1.078492e+07 4825597.72 219.60 3544\n",
+ "2865 1437108214 Christian Home Health Services, Inc 16471.811385 -19.951771 10.856242 4.742256 4116.864528 1.948006e+06 573413.07 391.75 7344"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "if not combined_df.empty:\n",
+ "    # 5.1 Sum the relevant metrics per (supplier, year), then derive the\n",
+ "    # year-over-year growth of Medicare payments. Every computation below reads\n",
+ "    # the window from analysis_years (the report heading text is still fixed).\n",
+ "    analysis_years = [2018, 2019, 2020, 2021, 2022]\n",
+ "    supplier_keys = ['Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org']\n",
+ "    metric_aggs = {\n",
+ "        'Suplr_Sbmtd_Chrgs': 'sum',\n",
+ "        'Suplr_Mdcr_Pymt_Amt': 'sum',\n",
+ "        'Tot_Suplr_Benes': 'mean',  # averaged, not summed\n",
+ "        'Tot_Suplr_Clms': 'sum'\n",
+ "    }\n",
+ "\n",
+ "    # One row per (supplier, year)\n",
+ "    supplier_yearly = combined_df.groupby(\n",
+ "        supplier_keys + ['year'], as_index=False\n",
+ "    ).agg(metric_aggs)\n",
+ "\n",
+ "    # Create a pivot where columns are years, values are 'Suplr_Mdcr_Pymt_Amt'\n",
+ "    pivot_charges = supplier_yearly.pivot_table(\n",
+ "        index=supplier_keys,\n",
+ "        columns='year',\n",
+ "        values='Suplr_Mdcr_Pymt_Amt',\n",
+ "        fill_value=0\n",
+ "    )\n",
+ "    # A year with no rows at all produces no column. Add it as zeros so the\n",
+ "    # lookups below cannot raise KeyError; the filter then rejects every supplier.\n",
+ "    for year in analysis_years:\n",
+ "        if year not in pivot_charges.columns:\n",
+ "            pivot_charges[year] = 0\n",
+ "\n",
+ "    # YOY growth in percent for (2019 vs 2018), (2020 vs 2019), etc.\n",
+ "    growth_rates = pd.DataFrame(index=pivot_charges.index)\n",
+ "    for previous, current in zip(analysis_years, analysis_years[1:]):\n",
+ "        # A zero base year is replaced by NaN, so the growth is NaN instead of inf\n",
+ "        base = pivot_charges[previous].replace(0, np.nan)\n",
+ "        growth_rates[f'growth_{current}'] = (\n",
+ "            (pivot_charges[current] - pivot_charges[previous]) / base\n",
+ "        ) * 100\n",
+ "\n",
+ "    growth_cols = [col for col in growth_rates.columns if col.startswith('growth_')]\n",
+ "    growth_rates['avg_growth'] = growth_rates[growth_cols].mean(axis=1)\n",
+ "\n",
+ "    # Filter: Supplier must have >0 in every earlier year, and >=100k in the last year\n",
+ "    *earlier_years, final_year = analysis_years\n",
+ "    filter_mask = pivot_charges[final_year] >= 100000\n",
+ "    for year in earlier_years:\n",
+ "        filter_mask &= pivot_charges[year] > 0\n",
+ "\n",
+ "    valid_suppliers = pivot_charges[filter_mask]\n",
+ "    valid_growth = growth_rates.loc[valid_suppliers.index].reset_index()\n",
+ "\n",
+ "    # Merge with aggregated totals (all years combined) just for more reporting info\n",
+ "    supplier_totals = supplier_yearly.groupby(\n",
+ "        supplier_keys, as_index=False\n",
+ "    ).agg(metric_aggs)\n",
+ "\n",
+ "    growth_merged = pd.merge(\n",
+ "        valid_growth,\n",
+ "        supplier_totals,\n",
+ "        on=supplier_keys,\n",
+ "        how='left'\n",
+ "    )\n",
+ "\n",
+ "    # Sort by average growth descending\n",
+ "    top_growth = growth_merged.sort_values('avg_growth', ascending=False).head(10)\n",
+ "\n",
+ "    print(\"\\nTop 10 Suppliers by Average Year-over-Year Growth (2018–2022), \\n\\n\",\n",
+ "          \"Filtered to those with >= $100K in 2022 payments:\")\n",
+ "    display(top_growth)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1af5e0cf",
"metadata": {},
+ "source": [
+ "### Display Year-by-Year Payment Patterns for Top 10\n",
+ "We'll show each supplier's biggest jump and beneficiary growth, if available."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "25fc60d9",
+ "metadata": {
+ "tags": []
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Hello World\n"
+ "\n",
+ "Detailed Patterns for Top 10 Growth Suppliers:\n",
+ "\n",
+ "1. P-Cares Medical Supplies, Llc (NPI: 1063967768)\n",
+ " - Average Growth: 14975.34%\n",
+ " - Total Medicare Payments (2018–2022): $15.9M\n",
+ " - Year-by-year Payments: 2018: $10.4K, 2019: $32.8K, 2020: $26.8K, 2021: $26.4K, 2022: $15.8M\n",
+ " - Largest Jump: 2021.0 to 2022.0 (+59704.31%)\n",
+ " - Beneficiary Growth: 52782.1% \n",
+ "\n",
+ "2. Lincare Inc (NPI: 1891275590)\n",
+ " - Average Growth: 10869.74%\n",
+ " - Total Medicare Payments (2018–2022): $19.5M\n",
+ " - Year-by-year Payments: 2018: $8.1K, 2019: $3.5M, 2020: $5.0M, 2021: $5.5M, 2022: $5.5M\n",
+ " - Largest Jump: 2018.0 to 2019.0 (+43427.68%)\n",
+ " - Beneficiary Growth: 10389.0% \n",
+ "\n",
+ "3. Respiratory Services Of Western New York, Inc. (NPI: 1457837080)\n",
+ " - Average Growth: 6279.38%\n",
+ " - Total Medicare Payments (2018–2022): $636.0K\n",
+ " - Year-by-year Payments: 2018: $86, 2019: $21.1K, 2020: $124.4K, 2021: $198.9K, 2022: $291.6K\n",
+ " - Largest Jump: 2018.0 to 2019.0 (+24521.58%)\n",
+ " - Beneficiary Growth: 480.9% \n",
+ "\n",
+ "4. Vohra Post Acute Care Physicians Of Texas, Pllc (NPI: 1821424789)\n",
+ " - Average Growth: 6172.20%\n",
+ " - Total Medicare Payments (2018–2022): $7.9M\n",
+ " - Year-by-year Payments: 2018: $3.9K, 2019: $954.7K, 2020: $2.0M, 2021: $2.4M, 2022: $2.5M\n",
+ " - Largest Jump: 2018.0 to 2019.0 (+24557.09%)\n",
+ " - Beneficiary Growth: 134.7% \n",
+ "\n",
+ "5. Aahi St Joseph Mercy Hospital Inc (NPI: 1508938127)\n",
+ " - Average Growth: 5794.44%\n",
+ " - Total Medicare Payments (2018–2022): $477.8K\n",
+ " - Year-by-year Payments: 2018: $62.5K, 2019: $16.4K, 2020: $503, 2021: $117.3K, 2022: $281.1K\n",
+ " - Largest Jump: 2020.0 to 2021.0 (+23208.81%)\n",
+ "\n",
+ "6. Care One Medical Equipment And Supplies, Inc. (NPI: 1174553804)\n",
+ " - Average Growth: 4821.00%\n",
+ " - Total Medicare Payments (2018–2022): $1.0M\n",
+ " - Year-by-year Payments: 2018: $925, 2019: $178.6K, 2020: $269.2K, 2021: $250.8K, 2022: $338.5K\n",
+ " - Largest Jump: 2018.0 to 2019.0 (+19205.13%)\n",
+ " - Beneficiary Growth: 70.1% \n",
+ "\n",
+ "7. The Home Health Store Of Tomball, Inc. (NPI: 1508826199)\n",
+ " - Average Growth: 4513.80%\n",
+ " - Total Medicare Payments (2018–2022): $15.3M\n",
+ " - Year-by-year Payments: 2018: $26.0K, 2019: $44.4K, 2020: $54.6K, 2021: $83.5K, 2022: $15.0M\n",
+ " - Largest Jump: 2021.0 to 2022.0 (+17908.42%)\n",
+ " - Beneficiary Growth: 25079.6% \n",
+ "\n",
+ "8. Amerihealth Medical Group, Inc. (NPI: 1750391751)\n",
+ " - Average Growth: 4495.52%\n",
+ " - Total Medicare Payments (2018–2022): $1.1M\n",
+ " - Year-by-year Payments: 2018: $2.4K, 2019: $429.8K, 2020: $228.6K, 2021: $243.5K, 2022: $202.2K\n",
+ " - Largest Jump: 2018.0 to 2019.0 (+18039.36%)\n",
+ " - Beneficiary Growth: 2878.8% \n",
+ "\n",
+ "9. Scooter Chair Repair Georgia, Llc (NPI: 1598044208)\n",
+ " - Average Growth: 4126.41%\n",
+ " - Total Medicare Payments (2018–2022): $4.8M\n",
+ " - Year-by-year Payments: 2018: $6.3K, 2019: $1.0M, 2020: $1.5M, 2021: $1.2M, 2022: $1.0M\n",
+ " - Largest Jump: 2018.0 to 2019.0 (+16495.68%)\n",
+ " - Beneficiary Growth: 131.0% \n",
+ "\n",
+ "10. Christian Home Health Services, Inc (NPI: 1437108214)\n",
+ " - Average Growth: 4116.86%\n",
+ " - Total Medicare Payments (2018–2022): $573.4K\n",
+ " - Year-by-year Payments: 2018: $955, 2019: $158.3K, 2020: $126.7K, 2021: $140.4K, 2022: $147.1K\n",
+ " - Largest Jump: 2018.0 to 2019.0 (+16471.81%)\n",
+ " - Beneficiary Growth: -27.0% \n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n",
+ "/var/folders/g6/2s70_fq11hn4czzmpgd40ky80000gn/T/ipykernel_34747/4120139511.py:19: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " data.sort_values('year', inplace=True)\n"
]
}
],
"source": [
- "print(\"Hello World\")"
+ "if not combined_df.empty:\n",
+ "    # Create a function to display details for the top-10\n",
+ "    def show_top_10_growth_details(top_df, supplier_yearly_df):\n",
+ "        \"\"\"Print payment history, largest jump and beneficiary growth per supplier.\n",
+ "\n",
+ "        Args:\n",
+ "            top_df: DataFrame of suppliers to report, in display order. Reads the\n",
+ "                columns Suplr_NPI, Suplr_Prvdr_Last_Name_Org, avg_growth and\n",
+ "                Suplr_Mdcr_Pymt_Amt.\n",
+ "            supplier_yearly_df: DataFrame of supplier/year rows. Reads the columns\n",
+ "                Suplr_NPI, year, Suplr_Mdcr_Pymt_Amt and Tot_Suplr_Benes.\n",
+ "        \"\"\"\n",
+ "        print(\"\\nDetailed Patterns for Top 10 Growth Suppliers:\\n\")\n",
+ "        top_npi = top_df['Suplr_NPI'].tolist()\n",
+ "\n",
+ "        # Sort once, out of place. Boolean filtering keeps row order, so each\n",
+ "        # per-supplier slice below is already in year order and is never mutated\n",
+ "        # (an in-place sort on such a slice raises SettingWithCopyWarning).\n",
+ "        subset = supplier_yearly_df[supplier_yearly_df['Suplr_NPI'].isin(top_npi)]\n",
+ "        subset = subset.sort_values(['Suplr_NPI', 'year'])\n",
+ "\n",
+ "        for i, row in enumerate(top_df.itertuples(), start=1):\n",
+ "            npi = row.Suplr_NPI\n",
+ "\n",
+ "            # Grab the rows for this supplier\n",
+ "            data = subset[subset['Suplr_NPI'] == npi]\n",
+ "\n",
+ "            print(f\"{i}. {row.Suplr_Prvdr_Last_Name_Org} (NPI: {npi})\")\n",
+ "            print(f\" - Average Growth: {row.avg_growth:.2f}%\")\n",
+ "            print(f\" - Total Medicare Payments (2018–2022): {format_dollar_amount(row.Suplr_Mdcr_Pymt_Amt)}\")\n",
+ "\n",
+ "            # Read the two columns separately: DataFrame.values on a mixed\n",
+ "            # int/float frame upcasts the years to float (printed as 2021.0).\n",
+ "            years = data['year'].astype(int).tolist()\n",
+ "            payments = data['Suplr_Mdcr_Pymt_Amt'].tolist()\n",
+ "            yearly = list(zip(years, payments))\n",
+ "\n",
+ "            # Show year-by-year; a year without a row is reported as $0\n",
+ "            first_payment = {}\n",
+ "            for year, amount in yearly:\n",
+ "                first_payment.setdefault(year, amount)\n",
+ "            year_strs = [\n",
+ "                f\"{y}: {format_dollar_amount(first_payment[y])}\" if y in first_payment else f\"{y}: $0\"\n",
+ "                for y in range(2018, 2023)\n",
+ "            ]\n",
+ "            print(\" - Year-by-year Payments: \" + \", \".join(year_strs))\n",
+ "\n",
+ "            # Identify the largest positive jump between consecutive reported years\n",
+ "            max_jump = 0\n",
+ "            jump_year = None\n",
+ "            for (prev_year, prev_amt), (curr_year, curr_amt) in zip(yearly, yearly[1:]):\n",
+ "                if prev_amt > 0:\n",
+ "                    yoy_pct = (curr_amt - prev_amt) / prev_amt * 100\n",
+ "                    if yoy_pct > max_jump:\n",
+ "                        max_jump = yoy_pct\n",
+ "                        jump_year = (prev_year, curr_year)\n",
+ "\n",
+ "            if jump_year:\n",
+ "                print(f\" - Largest Jump: {jump_year[0]} to {jump_year[1]} (+{max_jump:.2f}%)\")\n",
+ "\n",
+ "            # Beneficiary growth from the first to the last year that has a count\n",
+ "            benes = data[['year', 'Tot_Suplr_Benes']].dropna().sort_values('year')\n",
+ "            if len(benes) > 1:\n",
+ "                first_benes = benes['Tot_Suplr_Benes'].iloc[0]\n",
+ "                last_benes = benes['Tot_Suplr_Benes'].iloc[-1]\n",
+ "                if first_benes > 0:\n",
+ "                    bene_growth = (last_benes - first_benes) / first_benes * 100\n",
+ "                    print(f\" - Beneficiary Growth: {bene_growth:.1f}% \")\n",
+ "\n",
+ "            print(\"\")\n",
+ "\n",
+ "    show_top_10_growth_details(top_growth, supplier_yearly)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bee5376d",
+ "metadata": {},
+ "source": [
+ "# 6. Analysis of High Submitted vs. Low Allowed/Paid Amounts\n",
+ "We check each supplier's total submitted charges vs. the allowed and paid amounts across **all** years."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "2201637e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top 10 Suppliers: Highest Submitted Charges vs. Allowed Amount Ratio\n",
+ "\n",
+ "- Flatbush Rx Corp (NPI: 1669839536)\n",
+ " Submitted: $252.8K, Allowed: $1.1K, Paid: $616\n",
+ " Submitted : Allowed = 221.97x\n",
+ "\n",
+ "- Arooba Corp (NPI: 1649225152)\n",
+ " Submitted: $312.0K, Allowed: $1.7K, Paid: $1.1K\n",
+ " Submitted : Allowed = 182.41x\n",
+ "\n",
+ "- Mingocare Inc (NPI: 1003228156)\n",
+ " Submitted: $702.7K, Allowed: $4.0K, Paid: $2.4K\n",
+ " Submitted : Allowed = 177.83x\n",
+ "\n",
+ "- Nile City Pharmacy Inc (NPI: 1578076212)\n",
+ " Submitted: $106.4K, Allowed: $702, Paid: $524\n",
+ " Submitted : Allowed = 151.50x\n",
+ "\n",
+ "- Farmacia Julia Discount #2 Llc (NPI: 1457430274)\n",
+ " Submitted: $410.4K, Allowed: $3.4K, Paid: $2.1K\n",
+ " Submitted : Allowed = 122.31x\n",
+ "\n",
+ "- Gamer Pharmacy Inc (NPI: 1588697692)\n",
+ " Submitted: $9.3M, Allowed: $76.9K, Paid: $56.8K\n",
+ " Submitted : Allowed = 120.95x\n",
+ "\n",
+ "- Madina Pharmacy Inc (NPI: 1538525316)\n",
+ " Submitted: $427.3K, Allowed: $4.3K, Paid: $2.7K\n",
+ " Submitted : Allowed = 99.99x\n",
+ "\n",
+ "- Colonial Pharmacy Inc (NPI: 1255438198)\n",
+ " Submitted: $407.8K, Allowed: $4.1K, Paid: $2.5K\n",
+ " Submitted : Allowed = 98.28x\n",
+ "\n",
+ "- Blue Ridge Pharmacy Inc (NPI: 1538564596)\n",
+ " Submitted: $2.0M, Allowed: $21.0K, Paid: $15.6K\n",
+ " Submitted : Allowed = 92.84x\n",
+ "\n",
+ "- Welch Pharmacy Inc (NPI: 1336326792)\n",
+ " Submitted: $1.2M, Allowed: $13.4K, Paid: $8.3K\n",
+ " Submitted : Allowed = 92.37x\n",
+ "\n",
+ "\n",
+ "Top 10 Suppliers: Highest Submitted Charges vs. Paid Amount Ratio\n",
+ "\n",
+ "- Flatbush Rx Corp (NPI: 1669839536)\n",
+ " Submitted: $252.8K, Allowed: $1.1K, Paid: $616\n",
+ " Submitted : Paid = 410.09x\n",
+ "\n",
+ "- Mingocare Inc (NPI: 1003228156)\n",
+ " Submitted: $702.7K, Allowed: $4.0K, Paid: $2.4K\n",
+ " Submitted : Paid = 292.70x\n",
+ "\n",
+ "- Arooba Corp (NPI: 1649225152)\n",
+ " Submitted: $312.0K, Allowed: $1.7K, Paid: $1.1K\n",
+ " Submitted : Paid = 285.22x\n",
+ "\n",
+ "- Nile City Pharmacy Inc (NPI: 1578076212)\n",
+ " Submitted: $106.4K, Allowed: $702, Paid: $524\n",
+ " Submitted : Paid = 202.93x\n",
+ "\n",
+ "- Farmacia Julia Discount #2 Llc (NPI: 1457430274)\n",
+ " Submitted: $410.4K, Allowed: $3.4K, Paid: $2.1K\n",
+ " Submitted : Paid = 196.85x\n",
+ "\n",
+ "- Colonial Pharmacy Inc (NPI: 1255438198)\n",
+ " Submitted: $407.8K, Allowed: $4.1K, Paid: $2.5K\n",
+ " Submitted : Paid = 165.16x\n",
+ "\n",
+ "- Gamer Pharmacy Inc (NPI: 1588697692)\n",
+ " Submitted: $9.3M, Allowed: $76.9K, Paid: $56.8K\n",
+ " Submitted : Paid = 163.71x\n",
+ "\n",
+ "- Madina Pharmacy Inc (NPI: 1538525316)\n",
+ " Submitted: $427.3K, Allowed: $4.3K, Paid: $2.7K\n",
+ " Submitted : Paid = 155.57x\n",
+ "\n",
+ "- Welch Pharmacy Inc (NPI: 1336326792)\n",
+ " Submitted: $1.2M, Allowed: $13.4K, Paid: $8.3K\n",
+ " Submitted : Paid = 148.70x\n",
+ "\n",
+ "- Crystal Drugs Inc (NPI: 1124049184)\n",
+ " Submitted: $720.3K, Allowed: $8.2K, Paid: $5.2K\n",
+ " Submitted : Paid = 137.98x\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "if not combined_df.empty:\n",
+ "    # Totals per supplier over every row of combined_df\n",
+ "    supplier_totals_ap = combined_df.groupby([\n",
+ "        'Suplr_NPI',\n",
+ "        'Suplr_Prvdr_Last_Name_Org'\n",
+ "    ], as_index=False).agg({\n",
+ "        'Suplr_Sbmtd_Chrgs': 'sum',\n",
+ "        'Suplr_Mdcr_Alowd_Amt': 'sum',\n",
+ "        'Suplr_Mdcr_Pymt_Amt': 'sum',\n",
+ "        'Tot_Suplr_Benes': 'mean',\n",
+ "        'Tot_Suplr_Clms': 'sum'\n",
+ "    })\n",
+ "\n",
+ "    # The 1e-9 keeps the division defined when a denominator total is 0; such a\n",
+ "    # supplier gets a very large finite ratio instead of inf or NaN.\n",
+ "    supplier_totals_ap['submitted_allowed_ratio'] = (\n",
+ "        supplier_totals_ap['Suplr_Sbmtd_Chrgs'] / (supplier_totals_ap['Suplr_Mdcr_Alowd_Amt'] + 1e-9)\n",
+ "    )\n",
+ "    supplier_totals_ap['submitted_paid_ratio'] = (\n",
+ "        supplier_totals_ap['Suplr_Sbmtd_Chrgs'] / (supplier_totals_ap['Suplr_Mdcr_Pymt_Amt'] + 1e-9)\n",
+ "    )\n",
+ "\n",
+ "    # Focus on those with at least $100K submitted charges to reduce noise\n",
+ "    significant_ap = supplier_totals_ap[supplier_totals_ap['Suplr_Sbmtd_Chrgs'] >= 100000]\n",
+ "\n",
+ "    def _print_ratio_ranking(ranked, ratio_col, ratio_label):\n",
+ "        \"\"\"Print one entry per row of ranked: supplier, dollar totals, then the ratio.\n",
+ "\n",
+ "        Args:\n",
+ "            ranked: DataFrame of rows to print, already sorted and truncated.\n",
+ "            ratio_col: Name of the column holding the ratio to display.\n",
+ "            ratio_label: Text printed in front of the ratio value.\n",
+ "        \"\"\"\n",
+ "        # itertuples keeps each column's own dtype, so the NPI prints as an integer\n",
+ "        for row in ranked.itertuples(index=False):\n",
+ "            print(f\"- {row.Suplr_Prvdr_Last_Name_Org} (NPI: {row.Suplr_NPI})\")\n",
+ "            print(f\" Submitted: {format_dollar_amount(row.Suplr_Sbmtd_Chrgs)}, Allowed: {format_dollar_amount(row.Suplr_Mdcr_Alowd_Amt)}, Paid: {format_dollar_amount(row.Suplr_Mdcr_Pymt_Amt)}\")\n",
+ "            print(f\" {ratio_label} = {getattr(row, ratio_col):.2f}x\\n\")\n",
+ "\n",
+ "    # Highest submitted-to-allowed ratio\n",
+ "    top_allowed = significant_ap.sort_values(\n",
+ "        'submitted_allowed_ratio', ascending=False\n",
+ "    ).head(10)\n",
+ "\n",
+ "    print(\"Top 10 Suppliers: Highest Submitted Charges vs. Allowed Amount Ratio\\n\")\n",
+ "    _print_ratio_ranking(top_allowed, 'submitted_allowed_ratio', 'Submitted : Allowed')\n",
+ "\n",
+ "    # Highest submitted-to-paid ratio\n",
+ "    top_paid = significant_ap.sort_values(\n",
+ "        'submitted_paid_ratio', ascending=False\n",
+ "    ).head(10)\n",
+ "\n",
+ "    print(\"\\nTop 10 Suppliers: Highest Submitted Charges vs. Paid Amount Ratio\\n\")\n",
+ "    _print_ratio_ranking(top_paid, 'submitted_paid_ratio', 'Submitted : Paid')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ebfae2d2",
+ "metadata": {},
+ "source": [
+ "# 7. Peer Group Analysis\n",
+ "Analyze suppliers in the context of their **specialty**, **state**, or combined specialty–state. \n",
+ "Outliers are flagged if they exceed 3× the peer group's median in more than one of these metrics:\n",
+ "- Total Claims\n",
+ "- Total Submitted Charges\n",
+ "- Total Payments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "8f0e0b17",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "## Peer Group Analysis by Specialty\n",
+ "\n",
+ "Significant Specialty Outliers (exceeding 3× median in >=2 metrics):\n",
+ "- Accredo Health Group Inc (NPI: 1417915653)\n",
+ " Specialty: Pharmacy | State: PA\n",
+ " Claims: 209,938, Charges: $3464.6M, Payments: $1267.1M\n",
+ "\n",
+ "- North Coast Medical Supply, Llc (NPI: 1245259282)\n",
+ " Specialty: Pharmacy | State: CA\n",
+ " Claims: 1,236,598, Charges: $3458.1M, Payments: $245.3M\n",
+ "\n",
+ "- Lincare Pharmacy Services Inc. (NPI: 1780748939)\n",
+ " Specialty: Pharmacy | State: FL\n",
+ " Claims: 2,533,531, Charges: $2178.0M, Payments: $644.5M\n",
+ "\n",
+ "- Zoll Services Llc (NPI: 1164535274)\n",
+ " Specialty: Other Medical Supply Company | State: PA\n",
+ " Claims: 345,064, Charges: $1365.1M, Payments: $738.0M\n",
+ "\n",
+ "- Degc Enterprises (U.S.), Inc. (NPI: 1295827780)\n",
+ " Specialty: Pharmacy | State: FL\n",
+ " Claims: 1,329,923, Charges: $1291.7M, Payments: $325.9M\n",
+ "\n",
+ "- United States Medical Supply, Llc (NPI: 1700889227)\n",
+ " Specialty: Other Medical Supply Company | State: FL\n",
+ " Claims: 3,296,437, Charges: $1103.4M, Payments: $297.8M\n",
+ "\n",
+ "- 180 Medical Inc (NPI: 1639160708)\n",
+ " Specialty: Other Medical Supply Company | State: OK\n",
+ " Claims: 1,224,531, Charges: $1036.1M, Payments: $420.4M\n",
+ "\n",
+ "- Rgh Enterprises, Llc (NPI: 1609858729)\n",
+ " Specialty: All Other Suppliers | State: OH\n",
+ " Claims: 1,242,822, Charges: $965.4M, Payments: $271.6M\n",
+ "\n",
+ "- Lincare Pharmacy Services Inc. (NPI: 1003970260)\n",
+ " Specialty: Pharmacy | State: CA\n",
+ " Claims: 953,473, Charges: $817.3M, Payments: $240.2M\n",
+ "\n",
+ "- Coram Alternate Site Services Inc (NPI: 1386674067)\n",
+ " Specialty: All Other Suppliers | State: MN\n",
+ " Claims: 26,085, Charges: $786.8M, Payments: $53.1M\n",
+ "\n",
+ "\n",
+ "## Peer Group Analysis by State\n",
+ "\n",
+ "Significant State Outliers (>= 3× median in >=2 metrics):\n",
+ "- Accredo Health Group Inc (NPI: 1417915653)\n",
+ " State: PA | Specialty: Pharmacy\n",
+ " Claims: 209,938, Charges: $3464.6M, Payments: $1267.1M\n",
+ "\n",
+ "- North Coast Medical Supply, Llc (NPI: 1245259282)\n",
+ " State: CA | Specialty: Pharmacy\n",
+ " Claims: 1,236,598, Charges: $3458.1M, Payments: $245.3M\n",
+ "\n",
+ "- Lincare Pharmacy Services Inc. (NPI: 1780748939)\n",
+ " State: FL | Specialty: Pharmacy\n",
+ " Claims: 2,533,531, Charges: $2178.0M, Payments: $644.5M\n",
+ "\n",
+ "- Zoll Services Llc (NPI: 1164535274)\n",
+ " State: PA | Specialty: Other Medical Supply Company\n",
+ " Claims: 345,064, Charges: $1365.1M, Payments: $738.0M\n",
+ "\n",
+ "- Degc Enterprises (U.S.), Inc. (NPI: 1295827780)\n",
+ " State: FL | Specialty: Pharmacy\n",
+ " Claims: 1,329,923, Charges: $1291.7M, Payments: $325.9M\n",
+ "\n",
+ "- United States Medical Supply, Llc (NPI: 1700889227)\n",
+ " State: FL | Specialty: Other Medical Supply Company\n",
+ " Claims: 3,296,437, Charges: $1103.4M, Payments: $297.8M\n",
+ "\n",
+ "- 180 Medical Inc (NPI: 1639160708)\n",
+ " State: OK | Specialty: Other Medical Supply Company\n",
+ " Claims: 1,224,531, Charges: $1036.1M, Payments: $420.4M\n",
+ "\n",
+ "- Rgh Enterprises, Llc (NPI: 1609858729)\n",
+ " State: OH | Specialty: All Other Suppliers\n",
+ " Claims: 1,242,822, Charges: $965.4M, Payments: $271.6M\n",
+ "\n",
+ "- Lincare Pharmacy Services Inc. (NPI: 1003970260)\n",
+ " State: CA | Specialty: Pharmacy\n",
+ " Claims: 953,473, Charges: $817.3M, Payments: $240.2M\n",
+ "\n",
+ "- Coram Alternate Site Services Inc (NPI: 1386674067)\n",
+ " State: MN | Specialty: All Other Suppliers\n",
+ " Claims: 26,085, Charges: $786.8M, Payments: $53.1M\n",
+ "\n",
+ "\n",
+ "## Peer Group Analysis by Combined Specialty–State\n",
+ "\n",
+ "Significant Combined Specialty–State Outliers (>= 3× median in >=2 metrics):\n",
+ "- Accredo Health Group Inc (NPI: 1417915653)\n",
+ " Specialty: Pharmacy | State: PA\n",
+ " Claims: 209,938, Charges: $3464.6M, Payments: $1267.1M\n",
+ "\n",
+ "- North Coast Medical Supply, Llc (NPI: 1245259282)\n",
+ " Specialty: Pharmacy | State: CA\n",
+ " Claims: 1,236,598, Charges: $3458.1M, Payments: $245.3M\n",
+ "\n",
+ "- Lincare Pharmacy Services Inc. (NPI: 1780748939)\n",
+ " Specialty: Pharmacy | State: FL\n",
+ " Claims: 2,533,531, Charges: $2178.0M, Payments: $644.5M\n",
+ "\n",
+ "- Zoll Services Llc (NPI: 1164535274)\n",
+ " Specialty: Other Medical Supply Company | State: PA\n",
+ " Claims: 345,064, Charges: $1365.1M, Payments: $738.0M\n",
+ "\n",
+ "- Degc Enterprises (U.S.), Inc. (NPI: 1295827780)\n",
+ " Specialty: Pharmacy | State: FL\n",
+ " Claims: 1,329,923, Charges: $1291.7M, Payments: $325.9M\n",
+ "\n",
+ "- United States Medical Supply, Llc (NPI: 1700889227)\n",
+ " Specialty: Other Medical Supply Company | State: FL\n",
+ " Claims: 3,296,437, Charges: $1103.4M, Payments: $297.8M\n",
+ "\n",
+ "- 180 Medical Inc (NPI: 1639160708)\n",
+ " Specialty: Other Medical Supply Company | State: OK\n",
+ " Claims: 1,224,531, Charges: $1036.1M, Payments: $420.4M\n",
+ "\n",
+ "- Lincare Pharmacy Services Inc. (NPI: 1003970260)\n",
+ " Specialty: Pharmacy | State: CA\n",
+ " Claims: 953,473, Charges: $817.3M, Payments: $240.2M\n",
+ "\n",
+ "- Coram Alternate Site Services Inc (NPI: 1386674067)\n",
+ " Specialty: All Other Suppliers | State: MN\n",
+ " Claims: 26,085, Charges: $786.8M, Payments: $53.1M\n",
+ "\n",
+ "- Caremark, L.L.C. (NPI: 1134100134)\n",
+ " Specialty: All Other Suppliers | State: IL\n",
+ " Claims: 61,520, Charges: $737.9M, Payments: $244.4M\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "if not combined_df.empty:\n",
+ " # Ensure we have columns needed for specialty/state analysis\n",
+ " required_cols = [\n",
+ " 'Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org',\n",
+ " 'Suplr_Prvdr_Spclty_Desc', 'Suplr_Prvdr_State_Abrvtn',\n",
+ " 'Suplr_Sbmtd_Chrgs', 'Suplr_Mdcr_Pymt_Amt',\n",
+ " 'Tot_Suplr_Clms', 'Tot_Suplr_Srvcs'\n",
+ " ]\n",
+ " missing_cols = [c for c in required_cols if c not in combined_df.columns]\n",
+ " if missing_cols:\n",
+ " print(f\"Missing columns for Peer Group Analysis: {missing_cols}\")\n",
+ " else:\n",
+ " supplier_metrics = combined_df.groupby([\n",
+ " 'Suplr_NPI', 'Suplr_Prvdr_Last_Name_Org',\n",
+ " 'Suplr_Prvdr_Spclty_Desc', 'Suplr_Prvdr_State_Abrvtn'\n",
+ " ], as_index=False).agg({\n",
+ " 'Suplr_Sbmtd_Chrgs': 'sum',\n",
+ " 'Suplr_Mdcr_Pymt_Amt': 'sum',\n",
+ " 'Tot_Suplr_Clms': 'sum',\n",
+ " 'Tot_Suplr_Srvcs': 'sum'\n",
+ " })\n",
+ "\n",
+ " # Add derived metrics\n",
+ " supplier_metrics['Avg_Chrg_Per_Clm'] = supplier_metrics['Suplr_Sbmtd_Chrgs'] / supplier_metrics['Tot_Suplr_Clms'].replace(0, np.nan)\n",
+ " supplier_metrics['Avg_Pymt_Per_Clm'] = supplier_metrics['Suplr_Mdcr_Pymt_Amt'] / supplier_metrics['Tot_Suplr_Clms'].replace(0, np.nan)\n",
+ " supplier_metrics['Avg_Srvcs_Per_Clm'] = supplier_metrics['Tot_Suplr_Srvcs'] / supplier_metrics['Tot_Suplr_Clms'].replace(0, np.nan)\n",
+ "\n",
+ " print(\"\\n## Peer Group Analysis by Specialty\\n\")\n",
+ " specialty_counts = supplier_metrics['Suplr_Prvdr_Spclty_Desc'].value_counts()\n",
+ " valid_specialties = specialty_counts[specialty_counts >= 5].index # at least 5 suppliers\n",
+ "\n",
+ " if len(valid_specialties) > 0:\n",
+ " peer_specialty_metrics = supplier_metrics[supplier_metrics['Suplr_Prvdr_Spclty_Desc'].isin(valid_specialties)].groupby('Suplr_Prvdr_Spclty_Desc').agg({\n",
+ " 'Suplr_Sbmtd_Chrgs': ['median'],\n",
+ " 'Suplr_Mdcr_Pymt_Amt': ['median'],\n",
+ " 'Tot_Suplr_Clms': ['median'],\n",
+ " 'Tot_Suplr_Srvcs': ['median']\n",
+ " })\n",
+ " peer_specialty_metrics.columns = [\"_\".join(col) for col in peer_specialty_metrics.columns]\n",
+ "\n",
+ " outliers_by_specialty = []\n",
+ "\n",
+ " for specialty in valid_specialties:\n",
+ " group = supplier_metrics[supplier_metrics['Suplr_Prvdr_Spclty_Desc'] == specialty]\n",
+ " med_clms = peer_specialty_metrics.loc[specialty, 'Tot_Suplr_Clms_median']\n",
+ " med_chrg = peer_specialty_metrics.loc[specialty, 'Suplr_Sbmtd_Chrgs_median']\n",
+ " med_pay = peer_specialty_metrics.loc[specialty, 'Suplr_Mdcr_Pymt_Amt_median']\n",
+ "\n",
+ " # Compare each supplier to 3x median\n",
+ " claim_outliers = group[group['Tot_Suplr_Clms'] > 3 * med_clms]\n",
+ " charge_outliers = group[group['Suplr_Sbmtd_Chrgs'] > 3 * med_chrg]\n",
+ " payment_outliers = group[group['Suplr_Mdcr_Pymt_Amt'] > 3 * med_pay]\n",
+ "\n",
+ " # Combine\n",
+ " all_out = pd.concat([\n",
+ " claim_outliers[['Suplr_NPI']].assign(flag='claims'),\n",
+ " charge_outliers[['Suplr_NPI']].assign(flag='charges'),\n",
+ " payment_outliers[['Suplr_NPI']].assign(flag='payments')\n",
+ " ], ignore_index=True)\n",
+ " # We want suppliers that appear at least in 2 out of 3 categories\n",
+ " outlier_counts = all_out.groupby('Suplr_NPI').size()\n",
+ " multi_flags = outlier_counts[outlier_counts >= 2].index\n",
+ " multi_outliers = group[group['Suplr_NPI'].isin(multi_flags)]\n",
+ "\n",
+ " for idx, row in multi_outliers.iterrows():\n",
+ " outliers_by_specialty.append({\n",
+ " 'NPI': row['Suplr_NPI'],\n",
+ " 'Name': row['Suplr_Prvdr_Last_Name_Org'],\n",
+ " 'Specialty': row['Suplr_Prvdr_Spclty_Desc'],\n",
+ " 'State': row['Suplr_Prvdr_State_Abrvtn'],\n",
+ " 'Total_Claims': row['Tot_Suplr_Clms'],\n",
+ " 'Total_Charges': row['Suplr_Sbmtd_Chrgs'],\n",
+ " 'Total_Payments': row['Suplr_Mdcr_Pymt_Amt']\n",
+ " })\n",
+ "\n",
+ " if len(outliers_by_specialty) > 0:\n",
+ " # Just show top 10 by total charges\n",
+ " outliers_by_specialty = sorted(\n",
+ " outliers_by_specialty,\n",
+ " key=lambda x: x['Total_Charges'],\n",
+ " reverse=True\n",
+ " )\n",
+ "\n",
+ " print(\"Significant Specialty Outliers (exceeding 3× median in >=2 metrics):\")\n",
+ " for outlier in outliers_by_specialty[:10]:\n",
+ " print(f\"- {outlier['Name']} (NPI: {outlier['NPI']})\")\n",
+ " print(f\" Specialty: {outlier['Specialty']} | State: {outlier['State']}\")\n",
+ " print(f\" Claims: {outlier['Total_Claims']:,}, Charges: {format_dollar_amount(outlier['Total_Charges'])}, Payments: {format_dollar_amount(outlier['Total_Payments'])}\\n\")\n",
+ " else:\n",
+ " print(\"No multi-metric outliers by specialty.\")\n",
+ " else:\n",
+ " print(\"No specialty with >=5 suppliers.\")\n",
+ "\n",
+ " print(\"\\n## Peer Group Analysis by State\\n\")\n",
+ " state_counts = supplier_metrics['Suplr_Prvdr_State_Abrvtn'].value_counts()\n",
+ " valid_states = state_counts[state_counts >= 5].index\n",
+ "\n",
+ " if len(valid_states) > 0:\n",
+ " peer_state_metrics = supplier_metrics[supplier_metrics['Suplr_Prvdr_State_Abrvtn'].isin(valid_states)].groupby('Suplr_Prvdr_State_Abrvtn').agg({\n",
+ " 'Suplr_Sbmtd_Chrgs': ['median'],\n",
+ " 'Suplr_Mdcr_Pymt_Amt': ['median'],\n",
+ " 'Tot_Suplr_Clms': ['median'],\n",
+ " 'Tot_Suplr_Srvcs': ['median']\n",
+ " })\n",
+ " peer_state_metrics.columns = [\"_\".join(col) for col in peer_state_metrics.columns]\n",
+ "\n",
+ " outliers_by_state = []\n",
+ "\n",
+ " for st in valid_states:\n",
+ " group = supplier_metrics[supplier_metrics['Suplr_Prvdr_State_Abrvtn'] == st]\n",
+ " med_clms = peer_state_metrics.loc[st, 'Tot_Suplr_Clms_median']\n",
+ " med_chrg = peer_state_metrics.loc[st, 'Suplr_Sbmtd_Chrgs_median']\n",
+ " med_pay = peer_state_metrics.loc[st, 'Suplr_Mdcr_Pymt_Amt_median']\n",
+ "\n",
+ " # Compare each supplier to 3x the state median\n",
+ " claim_outliers = group[group['Tot_Suplr_Clms'] > 3 * med_clms]\n",
+ " charge_outliers = group[group['Suplr_Sbmtd_Chrgs'] > 3 * med_chrg]\n",
+ " payment_outliers = group[group['Suplr_Mdcr_Pymt_Amt'] > 3 * med_pay]\n",
+ "\n",
+ " all_out = pd.concat([\n",
+ " claim_outliers[['Suplr_NPI']].assign(flag='claims'),\n",
+ " charge_outliers[['Suplr_NPI']].assign(flag='charges'),\n",
+ " payment_outliers[['Suplr_NPI']].assign(flag='payments')\n",
+ " ], ignore_index=True)\n",
+ " outlier_counts = all_out.groupby('Suplr_NPI').size()\n",
+ " multi_flags = outlier_counts[outlier_counts >= 2].index\n",
+ " multi_outliers = group[group['Suplr_NPI'].isin(multi_flags)]\n",
+ "\n",
+ " for idx, row in multi_outliers.iterrows():\n",
+ " outliers_by_state.append({\n",
+ " 'NPI': row['Suplr_NPI'],\n",
+ " 'Name': row['Suplr_Prvdr_Last_Name_Org'],\n",
+ " 'Specialty': row['Suplr_Prvdr_Spclty_Desc'],\n",
+ " 'State': st,\n",
+ " 'Total_Claims': row['Tot_Suplr_Clms'],\n",
+ " 'Total_Charges': row['Suplr_Sbmtd_Chrgs'],\n",
+ " 'Total_Payments': row['Suplr_Mdcr_Pymt_Amt']\n",
+ " })\n",
+ "\n",
+ " if len(outliers_by_state) > 0:\n",
+ " outliers_by_state = sorted(\n",
+ " outliers_by_state,\n",
+ " key=lambda x: x['Total_Charges'],\n",
+ " reverse=True\n",
+ " )\n",
+ " print(\"Significant State Outliers (>= 3× median in >=2 metrics):\")\n",
+ " for outlier in outliers_by_state[:10]:\n",
+ " print(f\"- {outlier['Name']} (NPI: {outlier['NPI']})\")\n",
+ " print(f\" State: {outlier['State']} | Specialty: {outlier['Specialty']}\")\n",
+ " print(f\" Claims: {outlier['Total_Claims']:,}, Charges: {format_dollar_amount(outlier['Total_Charges'])}, Payments: {format_dollar_amount(outlier['Total_Payments'])}\\n\")\n",
+ " else:\n",
+ " print(\"No multi-metric outliers by state.\")\n",
+ " else:\n",
+ " print(\"No states with >=5 suppliers.\")\n",
+ "\n",
+ " print(\"\\n## Peer Group Analysis by Combined Specialty–State\\n\")\n",
+ " supplier_metrics['SpecState'] = supplier_metrics['Suplr_Prvdr_Spclty_Desc'].astype(str) + ' - ' + supplier_metrics['Suplr_Prvdr_State_Abrvtn'].astype(str)\n",
+ " combo_counts = supplier_metrics['SpecState'].value_counts()\n",
+ " valid_specstates = combo_counts[combo_counts >= 5].index\n",
+ "\n",
+ " if len(valid_specstates) > 0:\n",
+ " # Calculate medians for each group\n",
+ " combo_medians = supplier_metrics[supplier_metrics['SpecState'].isin(valid_specstates)].groupby('SpecState').agg({\n",
+ " 'Suplr_Sbmtd_Chrgs': 'median',\n",
+ " 'Suplr_Mdcr_Pymt_Amt': 'median',\n",
+ " 'Tot_Suplr_Clms': 'median',\n",
+ " 'Tot_Suplr_Srvcs': 'median'\n",
+ " })\n",
+ " outliers_combined = []\n",
+ " \n",
+ " for cs in valid_specstates:\n",
+ " group = supplier_metrics[supplier_metrics['SpecState'] == cs]\n",
+ " med_clms = combo_medians.loc[cs, 'Tot_Suplr_Clms']\n",
+ " med_chrg = combo_medians.loc[cs, 'Suplr_Sbmtd_Chrgs']\n",
+ " med_pay = combo_medians.loc[cs, 'Suplr_Mdcr_Pymt_Amt']\n",
+ "\n",
+ " claim_outliers = group[group['Tot_Suplr_Clms'] > 3 * med_clms]\n",
+ " charge_outliers = group[group['Suplr_Sbmtd_Chrgs'] > 3 * med_chrg]\n",
+ " payment_outliers = group[group['Suplr_Mdcr_Pymt_Amt'] > 3 * med_pay]\n",
+ "\n",
+ " all_out = pd.concat([\n",
+ " claim_outliers[['Suplr_NPI']].assign(flag='claims'),\n",
+ " charge_outliers[['Suplr_NPI']].assign(flag='charges'),\n",
+ " payment_outliers[['Suplr_NPI']].assign(flag='payments')\n",
+ " ], ignore_index=True)\n",
+ " outlier_counts = all_out.groupby('Suplr_NPI').size()\n",
+ " multi_flags = outlier_counts[outlier_counts >= 2].index\n",
+ "\n",
+ " multi_outliers = group[group['Suplr_NPI'].isin(multi_flags)]\n",
+ " for idx, row in multi_outliers.iterrows():\n",
+ " outliers_combined.append({\n",
+ " 'NPI': row['Suplr_NPI'],\n",
+ " 'Name': row['Suplr_Prvdr_Last_Name_Org'],\n",
+ " 'SpecState': cs,\n",
+ " 'Specialty': row['Suplr_Prvdr_Spclty_Desc'],\n",
+ " 'State': row['Suplr_Prvdr_State_Abrvtn'],\n",
+ " 'Total_Claims': row['Tot_Suplr_Clms'],\n",
+ " 'Total_Charges': row['Suplr_Sbmtd_Chrgs'],\n",
+ " 'Total_Payments': row['Suplr_Mdcr_Pymt_Amt']\n",
+ " })\n",
+ " \n",
+ " if outliers_combined:\n",
+ " # Sort by total charges just as a quick way to highlight big outliers\n",
+ " outliers_combined = sorted(\n",
+ " outliers_combined,\n",
+ " key=lambda x: x['Total_Charges'],\n",
+ " reverse=True\n",
+ " )\n",
+ " print(\"Significant Combined Specialty–State Outliers (>= 3× median in >=2 metrics):\")\n",
+ " for outlier in outliers_combined[:10]:\n",
+ " print(f\"- {outlier['Name']} (NPI: {outlier['NPI']})\")\n",
+ " print(f\" Specialty: {outlier['Specialty']} | State: {outlier['State']}\")\n",
+ " print(f\" Claims: {outlier['Total_Claims']:,}, Charges: {format_dollar_amount(outlier['Total_Charges'])}, Payments: {format_dollar_amount(outlier['Total_Payments'])}\\n\")\n",
+ " else:\n",
+ " print(\"No multi-metric outliers at the combined specialty–state level.\")\n",
+ " else:\n",
+ " print(\"No combined specialty–state groups with >=5 suppliers.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9b851377",
+ "metadata": {},
+ "source": [
+ "# 8. Conclusions & Next Steps\n",
+ "We've combined multi-year DME data, identified year-over-year outliers, analyzed high submitted-to-allowed and submitted-to-paid ratios, and performed peer-group checks.\n",
+ "\n",
+ "### Potential Enhancements\n",
+ "1. **Additional Metrics**: Incorporate DME-specific categories (e.g., prosthetics vs. drug/nutrition) and investigate outliers in each.\n",
+ "2. **Machine Learning**: Replace threshold-based outlier detection with algorithms (Isolation Forest, DBSCAN, etc.).\n",
+ "3. **Visualization**: Plot distributions, boxplots, or time-series charts for top suspicious suppliers.\n",
+ "4. **Interactive Dashboards**: Provide an interface for users to adjust thresholds and instantly see flagged suppliers.\n"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "PY311LLM",
"language": "python",
"name": "python3"
},
diff --git a/dme_data_analysis.py b/dme_data_analysis.py
deleted file mode 100644
index c2b29aa..0000000
--- a/dme_data_analysis.py
+++ /dev/null
@@ -1,952 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-DME Data Analysis Script
-This script imports and analyzes the DME data files by year.
-"""
-
-import pandas as pd
-import numpy as np
-import os
-from pprint import pprint
-from collections import defaultdict, Counter
-import matplotlib.pyplot as plt
-import seaborn as sns
-import sys
-
-
-def import_dme_data(file_path):
- """
- Import and preprocess DME data from a CSV file.
-
- Parameters:
- -----------
- file_path : str
- Path to the CSV file containing DME data
-
- Returns:
- --------
- df : DataFrame
- Processed DataFrame containing DME data
- """
- print(f"Importing data from {file_path}...")
-
- try:
- # Import data with appropriate dtypes to handle monetary values correctly
- df = pd.read_csv(file_path, low_memory=False)
-
- # Convert monetary columns to numeric
- money_columns = [
- col for col in df.columns if 'Pymt' in col or 'Amt' in col]
- for col in money_columns:
- if col in df.columns:
- df[col] = pd.to_numeric(df[col], errors='coerce')
-
- print(f"Successfully imported data with shape: {df.shape}")
- return df
-
- except Exception as e:
- print(f"Error importing data: {str(e)}")
- return None
-
-
-# Data dictionary mapping variable names to their descriptions
-DATA_DICTIONARY = {
- # Supplier Information
- "Suplr_NPI": "Supplier NPI - NPI for the Supplier on the DMEPOS claim",
- "Suplr_Prvdr_Last_Name_Org": "Supplier Last Name/Organization Name - When registered as individual, the Supplier's last name. When registered as organization, this is the organization name",
- "Suplr_Prvdr_First_Name": "Supplier First Name - When registered as individual, the Supplier's first name",
- "Suplr_Prvdr_MI": "Supplier Middle Initial - When registered as individual, the Supplier's middle initial",
- "Suplr_Prvdr_Crdntls": "Supplier Credentials - When registered as individual, these are the Supplier's credentials",
- "Suplr_Prvdr_Gndr": "Supplier Gender - When registered as individual, this is the Supplier's gender",
- "Suplr_Prvdr_Ent_Cd": "Supplier Entity Code - 'I' identifies Suppliers registered as individuals, 'O' identifies Suppliers registered as organizations",
- "Suplr_Prvdr_St1": "Supplier Street 1 - First line of the Supplier's street address",
- "Suplr_Prvdr_St2": "Supplier Street 2 - Second line of the Supplier's street address",
- "Suplr_Prvdr_City": "Supplier City - The city where the Supplier is located",
- "Suplr_Prvdr_State_Abrvtn": "Supplier State - State postal abbreviation where the Supplier is located",
- "Suplr_Prvdr_State_FIPS": "Supplier State FIPS Code - FIPS code for Supplier's state",
- "Suplr_Prvdr_Zip5": "Supplier ZIP - The Supplier's ZIP code",
- "Suplr_Prvdr_RUCA": "Supplier RUCA - Rural-Urban Commuting Area Code for the Supplier ZIP code",
- "Suplr_Prvdr_RUCA_Desc": "Supplier RUCA Description - Description of Rural-Urban Commuting Area (RUCA) Code",
- "Suplr_Prvdr_Cntry": "Supplier Country - Country where the Supplier is located",
- "Suplr_Prvdr_Spclty_Desc": "Supplier Provider Specialty Description - Derived from Medicare provider/supplier specialty code",
- "Suplr_Prvdr_Spclty_Srce": "Supplier Provider Specialty Source - Source of the Supplier Specialty (claims-specialty or NPPES-specialty)",
-
- # Total Supplier Claims/Services
- "Tot_Suplr_HCPCS_Cds": "Number of Supplier HCPCS - Total unique DMEPOS product/service HCPCS codes",
- "Tot_Suplr_Benes": "Number of Supplier Beneficiaries - Total unique beneficiaries (<11 are suppressed)",
- "Tot_Suplr_Clms": "Number of Supplier Claims - Total DMEPOS claims submitted",
- "Tot_Suplr_Srvcs": "Number of Supplier Services - Total DMEPOS products/services rendered",
- "Suplr_Sbmtd_Chrgs": "Supplier Submitted Charges - Total charges submitted for DMEPOS products/services",
- "Suplr_Mdcr_Alowd_Amt": "Supplier Medicare Allowed Amount - Total Medicare allowed amount",
- "Suplr_Mdcr_Pymt_Amt": "Supplier Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance",
- "Suplr_Mdcr_Stdzd_Pymt_Amt": "Supplier Medicare Standard Payment Amount - Standardized Medicare payments",
-
- # DME-specific Fields
- "DME_Sprsn_Ind": "Durable Medical Equipment Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed",
- "DME_Tot_Suplr_HCPCS_Cds": "Number of DME HCPCS - Total unique DME HCPCS codes",
- "DME_Tot_Suplr_Benes": "Number of DME Beneficiaries - Total unique beneficiaries with DME claims (<11 are suppressed)",
- "DME_Tot_Suplr_Clms": "Number of DME Claims - Total DME claims submitted",
- "DME_Tot_Suplr_Srvcs": "Number of DME Services - Total DME products/services rendered",
- "DME_Suplr_Sbmtd_Chrgs": "DME Submitted Charges - Total charges submitted for DME products/services",
- "DME_Suplr_Mdcr_Alowd_Amt": "DME Medicare Allowed Amount - Total Medicare allowed amount for DME",
- "DME_Suplr_Mdcr_Pymt_Amt": "DME Medicare Payment Amount - Amount Medicare paid for DME after deductible/coinsurance",
- "DME_Suplr_Mdcr_Stdzd_Pymt_Amt": "DME Medicare Standard Payment Amount - Standardized Medicare payments for DME",
-
- # Prosthetic and Orthotic Fields
- "POS_Sprsn_Ind": "Prosthetic and Orthotic Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed",
- "POS_Tot_Suplr_HCPCS_Cds": "Number of Prosthetic/Orthotic HCPCS - Total unique prosthetic/orthotic HCPCS codes",
- "POS_Tot_Suplr_Benes": "Number of Prosthetic/Orthotic Beneficiaries - Total unique beneficiaries",
- "POS_Tot_Suplr_Clms": "Number of Prosthetic/Orthotic Claims - Total prosthetic/orthotic claims submitted",
- "POS_Tot_Suplr_Srvcs": "Number of Prosthetic/Orthotic Services - Total prosthetic/orthotic products/services",
- "POS_Suplr_Sbmtd_Chrgs": "Prosthetic/Orthotic Submitted Charges - Total charges submitted for prosthetic/orthotic",
- "POS_Suplr_Mdcr_Alowd_Amt": "Prosthetic/Orthotic Medicare Allowed Amount - Total Medicare allowed amount",
- "POS_Suplr_Mdcr_Pymt_Amt": "Prosthetic/Orthotic Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance",
- "POS_Suplr_Mdcr_Stdzd_Pymt_Amt": "Prosthetic/Orthotic Medicare Standard Payment Amount - Standardized Medicare payments",
-
- # Drug and Nutritional Fields
- "Drug_Sprsn_Ind": "Drug and Nutritional Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed",
- "Drug_Tot_Suplr_HCPCS_Cds": "Number of Drug/Nutritional HCPCS - Total unique drug/nutritional HCPCS codes",
- "Drug_Tot_Suplr_Benes": "Number of Drug/Nutritional Beneficiaries - Total unique beneficiaries",
- "Drug_Tot_Suplr_Clms": "Number of Drug/Nutritional Claims - Total drug/nutritional claims submitted",
- "Drug_Tot_Suplr_Srvcs": "Number of Drug/Nutritional Services - Total drug/nutritional products/services",
- "Drug_Suplr_Sbmtd_Chrgs": "Drug/Nutritional Submitted Charges - Total charges submitted for drug/nutritional",
- "Drug_Suplr_Mdcr_Alowd_Amt": "Drug/Nutritional Medicare Allowed Amount - Total Medicare allowed amount",
- "Drug_Suplr_Mdcr_Pymt_Amt": "Drug/Nutritional Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance",
- "Drug_Suplr_Mdcr_Stdzd_Pymt_Amt": "Drug/Nutritional Medicare Standard Payment Amount - Standardized Medicare payments",
-
- # Beneficiary Demographics
- "Bene_Avg_Age": "Average Age of Beneficiaries - Average age at end of calendar year or time of death",
- "Bene_Age_LT_65_Cnt": "Number of Beneficiaries <65 - Count of beneficiaries under 65 years old",
- "Bene_Age_65_74_Cnt": "Number of Beneficiaries 65-74 - Count of beneficiaries between 65-74 years old",
- "Bene_Age_75_84_Cnt": "Number of Beneficiaries 75-84 - Count of beneficiaries between 75-84 years old",
- "Bene_Age_GT_84_Cnt": "Number of Beneficiaries >84 - Count of beneficiaries over 84 years old",
- "Bene_Feml_Cnt": "Number of Female Beneficiaries - Count of female beneficiaries",
- "Bene_Male_Cnt": "Number of Male Beneficiaries - Count of male beneficiaries",
- "Bene_Race_Wht_Cnt": "Number of White Beneficiaries - Count of non-Hispanic white beneficiaries",
- "Bene_Race_Black_Cnt": "Number of Black Beneficiaries - Count of non-Hispanic Black/African American beneficiaries",
- "Bene_Race_Api_Cnt": "Number of Asian/PI Beneficiaries - Count of Asian Pacific Islander beneficiaries",
- "Bene_Race_Hspnc_Cnt": "Number of Hispanic Beneficiaries - Count of Hispanic beneficiaries",
- "Bene_Race_Natind_Cnt": "Number of Native American/Alaska Native Beneficiaries - Count of American Indian/Alaska Native beneficiaries",
- "Bene_Race_Othr_Cnt": "Number of Other Race Beneficiaries - Count of beneficiaries with race not elsewhere classified",
- "Bene_Ndual_Cnt": "Number of Medicare & Medicaid Beneficiaries - Count of dual-eligible beneficiaries",
- "Bene_Dual_Cnt": "Number of Medicare-Only Beneficiaries - Count of Medicare-only beneficiaries",
-
- # Beneficiary Health Conditions (Mental/Behavioral Health)
- "Bene_CC_BH_ADHD_OthCD_V1_Pct": "Percent with ADHD and Other Conduct Disorders",
- "Bene_CC_BH_Alcohol_Drug_V1_Pct": "Percent with Alcohol and Drug Use Disorders",
- "Bene_CC_BH_Tobacco_V1_Pct": "Percent with Tobacco Use Disorders",
- "Bene_CC_BH_Alz_NonAlzdem_V2_Pct": "Percent with Alzheimer's and Non-Alzheimer's Dementia",
- "Bene_CC_BH_Anxiety_V1_Pct": "Percent with Anxiety Disorders",
- "Bene_CC_BH_Bipolar_V1_Pct": "Percent with Bipolar Disorder",
- "Bene_CC_BH_Mood_V2_Pct": "Percent with Depression, Bipolar or Other Mood Disorders",
- "Bene_CC_BH_Depress_V1_Pct": "Percent with Major Depressive Affective Disorder",
- "Bene_CC_BH_PD_V1_Pct": "Percent with Personality Disorders",
- "Bene_CC_BH_PTSD_V1_Pct": "Percent with Post-Traumatic Stress Disorder",
- "Bene_CC_BH_Schizo_OthPsy_V1_Pct": "Percent with Schizophrenia and Other Psychotic Disorders",
-
- # Beneficiary Health Conditions (Physical Health)
- "Bene_CC_PH_Asthma_V2_Pct": "Percent with Asthma",
- "Bene_CC_PH_Afib_V2_Pct": "Percent with Atrial Fibrillation and Flutter",
- "Bene_CC_PH_Cancer6_V2_Pct": "Percent with Cancer (combined 6 cancer indicators)",
- "Bene_CC_PH_CKD_V2_Pct": "Percent with Chronic Kidney Disease",
- "Bene_CC_PH_COPD_V2_Pct": "Percent with Chronic Obstructive Pulmonary Disease",
- "Bene_CC_PH_Diabetes_V2_Pct": "Percent with Diabetes",
- "Bene_CC_PH_HF_NonIHD_V2_Pct": "Percent with Heart Failure and Non-Ischemic Heart Disease",
- "Bene_CC_PH_Hyperlipidemia_V2_Pct": "Percent with Hyperlipidemia",
- "Bene_CC_PH_Hypertension_V2_Pct": "Percent with Hypertension",
- "Bene_CC_PH_IschemicHeart_V2_Pct": "Percent with Ischemic Heart Disease",
- "Bene_CC_PH_Osteoporosis_V2_Pct": "Percent with Osteoporosis",
- "Bene_CC_PH_Parkinson_V2_Pct": "Percent with Parkinson's Disease",
- "Bene_CC_PH_Arthritis_V2_Pct": "Percent with Rheumatoid Arthritis/Osteoarthritis",
- "Bene_CC_PH_Stroke_TIA_V2_Pct": "Percent with Stroke/Transient Ischemic Attack",
-
- # Risk Score
- "Bene_Avg_Risk_Scre": "Average HCC Risk Score of Beneficiaries"
-}
-
-
-def get_column_category(column_name):
- """Return the category for a given column name based on prefix."""
- if column_name.startswith('Suplr_'):
- return "Supplier Information"
- elif column_name.startswith('DME_'):
- return "Durable Medical Equipment"
- elif column_name.startswith('POS_'):
- return "Prosthetics and Orthotics"
- elif column_name.startswith('Drug_'):
- return "Drug and Nutritional Products"
- elif column_name.startswith('Bene_CC_BH_'):
- return "Beneficiary Behavioral Health Conditions"
- elif column_name.startswith('Bene_CC_PH_'):
- return "Beneficiary Physical Health Conditions"
- elif column_name.startswith('Bene_'):
- return "Beneficiary Demographics"
- else:
- return "Other"
-
-
-def get_top_suppliers(df, top_n=10):
- """Return the top suppliers by number of beneficiaries."""
- top_suppliers = df.sort_values(
- 'DME_Tot_Suplr_Benes', ascending=False).head(top_n)
-
- # Format results for better readability
- results = []
- for _, row in top_suppliers.iterrows():
- supplier_name = row['Suplr_Prvdr_Last_Name_Org']
- beneficiaries = row['DME_Tot_Suplr_Benes']
- claims = row['DME_Tot_Suplr_Clms']
- payments = row['DME_Suplr_Mdcr_Pymt_Amt']
-
- results.append({
- 'Supplier': supplier_name,
- 'Beneficiaries': beneficiaries,
- 'Claims': claims,
- 'Medicare Payments': f"${payments:,.2f}"
- })
-
- return pd.DataFrame(results)
-
-
-def get_beneficiary_demographics(df):
- """Analyze beneficiary demographics from the data."""
- # Extract age distribution
- age_cols = ['Bene_Age_LT_65_Cnt', 'Bene_Age_65_74_Cnt',
- 'Bene_Age_75_84_Cnt', 'Bene_Age_GT_84_Cnt']
- age_totals = df[age_cols].sum()
- total_benes = age_totals.sum()
- age_pcts = (age_totals / total_benes * 100).round(2)
-
- # Extract gender distribution
- gender_cols = ['Bene_Feml_Cnt', 'Bene_Male_Cnt']
- gender_totals = df[gender_cols].sum()
- gender_pcts = (gender_totals / gender_totals.sum() * 100).round(2)
-
- # Extract race distribution
- race_cols = ['Bene_Race_Wht_Cnt', 'Bene_Race_Black_Cnt', 'Bene_Race_Api_Cnt',
- 'Bene_Race_Hspnc_Cnt', 'Bene_Race_Natind_Cnt', 'Bene_Race_Othr_Cnt']
- race_totals = df[race_cols].sum()
- race_pcts = (race_totals / race_totals.sum() * 100).round(2)
-
- # Format results with readable labels from data dictionary
- age_results = {DATA_DICTIONARY[col].split(
- ' - ')[0]: pct for col, pct in zip(age_cols, age_pcts)}
- gender_results = {DATA_DICTIONARY[col].split(
- ' - ')[0]: pct for col, pct in zip(gender_cols, gender_pcts)}
- race_results = {DATA_DICTIONARY[col].split(
- ' - ')[0]: pct for col, pct in zip(race_cols, race_pcts)}
-
- return {
- 'Age Distribution': age_results,
- 'Gender Distribution': gender_results,
- 'Race Distribution': race_results
- }
-
-
-def get_common_health_conditions(df):
- """Extract the most common health conditions among beneficiaries."""
- # Physical health conditions
- ph_cols = [col for col in df.columns if col.startswith(
- 'Bene_CC_PH_') and col.endswith('_Pct')]
- ph_values = []
-
- for col in ph_cols:
- # Calculate weighted average (weighted by number of beneficiaries)
- weighted_avg = (df[col] * df['DME_Tot_Suplr_Benes']
- ).sum() / df['DME_Tot_Suplr_Benes'].sum()
- ph_values.append((DATA_DICTIONARY[col], weighted_avg))
-
- # Behavioral health conditions
- bh_cols = [col for col in df.columns if col.startswith(
- 'Bene_CC_BH_') and col.endswith('_Pct')]
- bh_values = []
-
- for col in bh_cols:
- # Calculate weighted average (weighted by number of beneficiaries)
- weighted_avg = (df[col] * df['DME_Tot_Suplr_Benes']
- ).sum() / df['DME_Tot_Suplr_Benes'].sum()
- bh_values.append((DATA_DICTIONARY[col], weighted_avg))
-
- # Sort by prevalence
- ph_values.sort(key=lambda x: x[1], reverse=True)
- bh_values.sort(key=lambda x: x[1], reverse=True)
-
- return {
- 'Physical Health Conditions': ph_values,
- 'Behavioral Health Conditions': bh_values
- }
-
-
-def analyze_spending_patterns(df_by_year):
- """Analyze spending patterns across years."""
- year_data = []
-
- for year, df in df_by_year.items():
- # Calculate total beneficiaries and spending
- total_benes = df['DME_Tot_Suplr_Benes'].sum()
- total_spend = df['DME_Suplr_Mdcr_Pymt_Amt'].sum()
-
- # Calculate spending per beneficiary
- spend_per_bene = total_spend / total_benes if total_benes > 0 else 0
-
- # Calculate distribution of spending by DME, POS, and Drug categories
- dme_spend = df['DME_Suplr_Mdcr_Pymt_Amt'].sum()
- pos_spend = df['POS_Suplr_Mdcr_Pymt_Amt'].sum()
- drug_spend = df['Drug_Suplr_Mdcr_Pymt_Amt'].sum()
-
- # Add to results
- year_data.append({
- 'Year': year,
- 'Total Beneficiaries': total_benes,
- 'Total Spending': total_spend,
- 'Spending Per Beneficiary': spend_per_bene,
- 'DME Spending': dme_spend,
- 'Prosthetic/Orthotic Spending': pos_spend,
- 'Drug Spending': drug_spend
- })
-
- return pd.DataFrame(year_data)
-
-
-# -------------------- Visualization Functions --------------------
-
-def plot_spending_trends(spend_df):
- """
- Create visualizations for spending trends over time.
-
- Parameters:
- -----------
- spend_df : DataFrame
- DataFrame with yearly spending data, as returned by analyze_spending_patterns
-
- Returns:
- --------
- fig : matplotlib Figure
- The figure containing the visualizations
- """
- # Set the style
- sns.set_style('whitegrid')
-
- # Create a figure with 2x2 subplots
- fig, axes = plt.subplots(2, 2, figsize=(16, 14))
-
- # Total beneficiaries by year
- sns.lineplot(x='Year', y='Total Beneficiaries', data=spend_df,
- marker='o', linewidth=3, markersize=10, ax=axes[0, 0], color='#1f77b4')
- axes[0, 0].set_title('Total Beneficiaries by Year', fontsize=16)
- axes[0, 0].ticklabel_format(style='plain', axis='y')
- axes[0, 0].grid(True)
-
- # Total spending by year
- sns.lineplot(x='Year', y='Total Spending', data=spend_df,
- marker='o', linewidth=3, markersize=10, ax=axes[0, 1], color='#ff7f0e')
- axes[0, 1].set_title('Total Medicare DME Spending by Year', fontsize=16)
- axes[0, 1].ticklabel_format(style='plain', axis='y')
- axes[0, 1].yaxis.set_major_formatter(
- plt.FuncFormatter(lambda x, pos: f'${x/1e9:.1f}B'))
- axes[0, 1].grid(True)
-
- # Spending per beneficiary by year
- sns.lineplot(x='Year', y='Spending Per Beneficiary', data=spend_df,
- marker='o', linewidth=3, markersize=10, ax=axes[1, 0], color='#2ca02c')
- axes[1, 0].set_title('Average Spending Per Beneficiary', fontsize=16)
- axes[1, 0].yaxis.set_major_formatter(
- plt.FuncFormatter(lambda x, pos: f'${x:.0f}'))
- axes[1, 0].grid(True)
-
- # Spending by category stacked area chart
- category_data = spend_df[['Year', 'DME Spending',
- 'Prosthetic/Orthotic Spending', 'Drug Spending']]
- category_data_stacked = category_data.set_index('Year')
-
- # Convert to billions for better readability
- category_data_stacked = category_data_stacked / 1e9
-
- # Plot stacked area chart
- category_data_stacked.plot.area(stacked=True, ax=axes[1, 1],
- color=['#1f77b4', '#ff7f0e', '#2ca02c'],
- alpha=0.7)
- axes[1, 1].set_title('Spending by Category', fontsize=16)
- axes[1, 1].set_ylabel('Spending (Billions $)')
- axes[1, 1].yaxis.set_major_formatter(
- plt.FuncFormatter(lambda x, pos: f'${x:.1f}B'))
- axes[1, 1].legend(loc='upper left')
- axes[1, 1].grid(True)
-
- plt.tight_layout()
- return fig
-
-
-def plot_demographics(df, year=None):
- """
- Create visualizations for beneficiary demographics.
-
- Parameters:
- -----------
- df : DataFrame or dict
- Either a DataFrame for a specific year or the df_by_year dictionary
- year : int, optional
- If df is a dictionary, specify which year to visualize
-
- Returns:
- --------
- fig : matplotlib Figure
- The figure containing the visualizations
- """
- # If we have multiple years, extract the specified year
- if isinstance(df, dict) and year is not None:
- if year in df:
- df = df[year]
- else:
- raise ValueError(f"Year {year} not found in data")
-
- # Get demographics data
- demo_results = get_beneficiary_demographics(df)
-
- # Create a figure with 3 subplots for age, gender, and race
- fig, axes = plt.subplots(1, 3, figsize=(18, 6))
-
- # Age distribution
- age_data = demo_results['Age Distribution']
- age_labels = list(age_data.keys())
- age_values = list(age_data.values())
-
- axes[0].pie(age_values, labels=age_labels, autopct='%1.1f%%',
- startangle=90, colors=sns.color_palette("Blues", len(age_labels)))
- axes[0].set_title('Age Distribution', fontsize=16)
-
- # Gender distribution
- gender_data = demo_results['Gender Distribution']
- gender_labels = list(gender_data.keys())
- gender_values = list(gender_data.values())
-
- axes[1].pie(gender_values, labels=gender_labels, autopct='%1.1f%%',
- startangle=90, colors=sns.color_palette("Set2", len(gender_labels)))
- axes[1].set_title('Gender Distribution', fontsize=16)
-
- # Race distribution
- race_data = demo_results['Race Distribution']
- race_labels = list(race_data.keys())
- race_values = list(race_data.values())
-
- # Sort by percentage (descending)
- sorted_race = sorted(zip(race_labels, race_values),
- key=lambda x: x[1], reverse=True)
- race_labels, race_values = zip(*sorted_race)
-
- axes[2].pie(race_values, labels=race_labels, autopct='%1.1f%%',
- startangle=90, colors=sns.color_palette("Set3", len(race_labels)))
- axes[2].set_title('Race Distribution', fontsize=16)
-
- plt.tight_layout()
- return fig
-
-
-def plot_health_conditions(df, year=None, top_n=10):
- """
- Create visualizations for health conditions prevalence.
-
- Parameters:
- -----------
- df : DataFrame or dict
- Either a DataFrame for a specific year or the df_by_year dictionary
- year : int, optional
- If df is a dictionary, specify which year to visualize
- top_n : int, optional
- Number of top conditions to display (default: 10)
-
- Returns:
- --------
- fig : matplotlib Figure
- The figure containing the visualizations
- """
- # If we have multiple years, extract the specified year
- if isinstance(df, dict) and year is not None:
- if year in df:
- df = df[year]
- else:
- raise ValueError(f"Year {year} not found in data")
-
- # Get health conditions data
- conditions = get_common_health_conditions(df)
-
- # Create a figure with 2 subplots for physical and behavioral health
- fig, axes = plt.subplots(1, 2, figsize=(20, 10))
-
- # Physical health conditions
- ph_data = conditions['Physical Health Conditions'][:top_n]
- ph_labels = [cond for cond, _ in ph_data]
- ph_values = [val for _, val in ph_data]
-
- # Horizontal bar chart for physical health
- sns.barplot(x=ph_values, y=ph_labels, palette="Blues_d", ax=axes[0])
- axes[0].set_title('Top Physical Health Conditions', fontsize=16)
- axes[0].set_xlabel('Percentage of Beneficiaries', fontsize=12)
- axes[0].xaxis.set_major_formatter(
- plt.FuncFormatter(lambda x, pos: f'{x:.2f}%'))
- axes[0].grid(axis='x')
-
- # Behavioral health conditions
- bh_data = conditions['Behavioral Health Conditions'][:top_n]
- bh_labels = [cond for cond, _ in bh_data]
- bh_values = [val for _, val in bh_data]
-
- # Horizontal bar chart for behavioral health
- sns.barplot(x=bh_values, y=bh_labels, palette="Oranges_d", ax=axes[1])
- axes[1].set_title('Top Behavioral Health Conditions', fontsize=16)
- axes[1].set_xlabel('Percentage of Beneficiaries', fontsize=12)
- axes[1].xaxis.set_major_formatter(
- plt.FuncFormatter(lambda x, pos: f'{x:.2f}%'))
- axes[1].grid(axis='x')
-
- plt.tight_layout()
- return fig
-
-
-def plot_top_suppliers(df, year=None, top_n=10):
- """
- Create visualizations for top suppliers.
-
- Parameters:
- -----------
- df : DataFrame or dict
- Either a DataFrame for a specific year or the df_by_year dictionary
- year : int, optional
- If df is a dictionary, specify which year to visualize
- top_n : int, optional
- Number of top suppliers to display (default: 10)
-
- Returns:
- --------
- fig : matplotlib Figure
- The figure containing the visualizations
- """
- # If we have multiple years, extract the specified year
- if isinstance(df, dict) and year is not None:
- if year in df:
- df = df[year]
- else:
- raise ValueError(f"Year {year} not found in data")
-
- # Get top suppliers data
- top_suppliers_df = get_top_suppliers(df, top_n=top_n)
-
- # Convert payments string to numeric values
- top_suppliers_df['Medicare Payments (Numeric)'] = top_suppliers_df['Medicare Payments'].str.replace(
- '$', '').str.replace(',', '').astype(float)
-
- # Sort by payment amount
- top_suppliers_df = top_suppliers_df.sort_values(
- 'Medicare Payments (Numeric)', ascending=True)
-
- # Create a figure with 2 subplots
- fig, axes = plt.subplots(1, 2, figsize=(20, 10))
-
- # Payments bar chart
- sns.barplot(x='Medicare Payments (Numeric)', y='Supplier', data=top_suppliers_df,
- palette="viridis", ax=axes[0])
- axes[0].set_title(
- f'Top {top_n} Suppliers by Medicare Payments', fontsize=16)
- axes[0].set_xlabel('Medicare Payments ($)', fontsize=12)
- axes[0].xaxis.set_major_formatter(
- plt.FuncFormatter(lambda x, pos: f'${x/1e6:.1f}M'))
- axes[0].grid(axis='x')
-
- # Beneficiaries bar chart
- top_suppliers_df = top_suppliers_df.sort_values(
- 'Beneficiaries', ascending=True)
- sns.barplot(x='Beneficiaries', y='Supplier', data=top_suppliers_df,
- palette="plasma", ax=axes[1])
- axes[1].set_title(
- f'Top {top_n} Suppliers by Number of Beneficiaries', fontsize=16)
- axes[1].set_xlabel('Number of Beneficiaries', fontsize=12)
- axes[1].xaxis.set_major_formatter(
- plt.FuncFormatter(lambda x, pos: f'{x:.0f}'))
- axes[1].grid(axis='x')
-
- plt.tight_layout()
- return fig
-
-
-def plot_geographical_distribution(df, year=None):
- """
- Create visualizations for the geographical distribution of suppliers.
-
- Parameters:
- -----------
- df : DataFrame or dict
- Either a DataFrame for a specific year or the df_by_year dictionary
- year : int, optional
- If df is a dictionary, specify which year to visualize
-
- Returns:
- --------
- fig : matplotlib Figure
- The figure containing the visualizations
- """
- # If we have multiple years, extract the specified year
- if isinstance(df, dict) and year is not None:
- if year in df:
- df = df[year]
- else:
- raise ValueError(f"Year {year} not found in data")
-
- # Create a figure with 2 subplots
- fig, axes = plt.subplots(1, 2, figsize=(20, 8))
-
- # State distribution
- state_counts = df['Suplr_Prvdr_State_Abrvtn'].value_counts().reset_index()
- state_counts.columns = ['State', 'Suppliers']
-
- # Sort by count (descending) and get top 15
- state_counts = state_counts.sort_values(
- 'Suppliers', ascending=False).head(15)
-
- sns.barplot(x='Suppliers', y='State', data=state_counts,
- palette="viridis", ax=axes[0])
- axes[0].set_title('Top 15 States by Number of Suppliers', fontsize=16)
- axes[0].set_xlabel('Number of Suppliers', fontsize=12)
- axes[0].grid(axis='x')
-
- # Rural vs Urban distribution
- if 'Suplr_Prvdr_RUCA_Desc' in df.columns:
- ruca_counts = df['Suplr_Prvdr_RUCA_Desc'].value_counts().reset_index()
- ruca_counts.columns = ['RUCA Description', 'Suppliers']
-
- explode = [0.1] * len(ruca_counts) # Explode all slices
-
- # Plot pie chart for RUCA distribution
- axes[1].pie(ruca_counts['Suppliers'], labels=ruca_counts['RUCA Description'],
- autopct='%1.1f%%', startangle=90,
- colors=sns.color_palette("Set2", len(ruca_counts)),
- explode=explode)
- axes[1].set_title(
- 'Supplier Distribution by Rural-Urban Classification', fontsize=16)
- else:
- axes[1].text(0.5, 0.5, 'RUCA Description not available',
- ha='center', va='center', fontsize=14)
- axes[1].set_title(
- 'Rural-Urban Distribution (Not Available)', fontsize=16)
-
- plt.tight_layout()
- return fig
-
-
-def create_notebook_visualizations(df_by_year):
- """
- Create all visualizations for a Jupyter notebook.
-
- This is a convenience function that calls all visualization functions
- and returns them for display in a Jupyter notebook.
-
- Parameters:
- -----------
- df_by_year : dict
- Dictionary with yearly dataframes, as created in main()
-
- Returns:
- --------
- visualizations : dict
- Dictionary with all visualizations
- """
- import matplotlib.pyplot as plt
-
- # Most recent year
- recent_year = max(df_by_year.keys())
-
- # Create spending trend visualizations
- spend_df = analyze_spending_patterns(df_by_year)
- spending_fig = plot_spending_trends(spend_df)
-
- # Create demographics visualizations for most recent year
- demographics_fig = plot_demographics(df_by_year[recent_year])
-
- # Create health conditions visualizations for most recent year
- health_conditions_fig = plot_health_conditions(df_by_year[recent_year])
-
- # Create top suppliers visualizations for most recent year
- suppliers_fig = plot_top_suppliers(df_by_year[recent_year])
-
- # Create geographical distribution visualizations for most recent year
- geo_fig = plot_geographical_distribution(df_by_year[recent_year])
-
- # Return all visualizations
- return {
- 'spending_trends': spending_fig,
- 'demographics': demographics_fig,
- 'health_conditions': health_conditions_fig,
- 'top_suppliers': suppliers_fig,
- 'geographical_distribution': geo_fig
- }
-
-
-def main():
- """Main function to import and analyze DME data files."""
- print("DME Data Analysis")
- print("================\n")
-
- # Dictionary to store dataframes by year
- df_by_year = {}
-
- # Import data for years 2017-2022
- for year in range(2017, 2023):
- csv_path = f"data/{year}/mup_dme_ry24_p05_v10_dy{str(year)[-2:]}_supr.csv"
- if os.path.exists(csv_path):
- print(f"Importing data for {year}...")
- df_by_year[year] = pd.read_csv(csv_path, low_memory=False)
- print(
- f"✓ Data for {year} imported successfully. Shape: {df_by_year[year].shape}")
- else:
- print(f"Warning: No data file found for {year}")
-
- print("\nAll available data files have been imported.")
-
- # Data Overview
- print("\n1. Data Overview")
- print("---------------\n")
-
- # Create a summary table
- summary_data = {
- 'Year': [],
- 'Suppliers': [],
- 'Total Beneficiaries': [],
- 'Total Claims': [],
- 'Total Payments ($)': []
- }
-
- for year, df in df_by_year.items():
- summary_data['Year'].append(year)
- summary_data['Suppliers'].append(df.shape[0])
- summary_data['Total Beneficiaries'].append(
- df['DME_Tot_Suplr_Benes'].sum())
- summary_data['Total Claims'].append(df['DME_Tot_Suplr_Clms'].sum())
- summary_data['Total Payments ($)'].append(
- df['DME_Suplr_Mdcr_Pymt_Amt'].sum())
-
- summary_df = pd.DataFrame(summary_data)
- print("Summary statistics across years:")
- print(summary_df.to_string(index=False,
- float_format=lambda x: f"{x:,.0f}" if isinstance(x, (int, float)) else x))
-
- # Calculate year-over-year changes
- if len(summary_df) > 1:
- yoy_data = {
- 'Metric': ['Suppliers', 'Beneficiaries', 'Claims', 'Payments'],
- 'Change 2021-2022 (%)': [0, 0, 0, 0]
- }
-
- # Calculate year-over-year changes for the most recent years
- if 2021 in df_by_year and 2022 in df_by_year:
- suppliers_2021 = summary_df[summary_df['Year']
- == 2021]['Suppliers'].values[0]
- suppliers_2022 = summary_df[summary_df['Year']
- == 2022]['Suppliers'].values[0]
- bene_2021 = summary_df[summary_df['Year'] ==
- 2021]['Total Beneficiaries'].values[0]
- bene_2022 = summary_df[summary_df['Year'] ==
- 2022]['Total Beneficiaries'].values[0]
- claims_2021 = summary_df[summary_df['Year']
- == 2021]['Total Claims'].values[0]
- claims_2022 = summary_df[summary_df['Year']
- == 2022]['Total Claims'].values[0]
- payments_2021 = summary_df[summary_df['Year']
- == 2021]['Total Payments ($)'].values[0]
- payments_2022 = summary_df[summary_df['Year']
- == 2022]['Total Payments ($)'].values[0]
-
- # Calculate percentage changes
- yoy_data['Change 2021-2022 (%)'][0] = (
- (suppliers_2022 - suppliers_2021) / suppliers_2021) * 100
- yoy_data['Change 2021-2022 (%)'][1] = (
- (bene_2022 - bene_2021) / bene_2021) * 100
- yoy_data['Change 2021-2022 (%)'][2] = (
- (claims_2022 - claims_2021) / claims_2021) * 100
- yoy_data['Change 2021-2022 (%)'][3] = (
- (payments_2022 - payments_2021) / payments_2021) * 100
-
- yoy_df = pd.DataFrame(yoy_data)
- print("\nYear-over-year changes (2021-2022):")
- print(yoy_df.to_string(
- index=False, float_format=lambda x: f"{x:.2f}%"))
-
- # Column categories
- print("\nColumn Categories:")
- recent_year = max(df_by_year.keys())
- df = df_by_year[recent_year]
-
- categories = set()
- for col in df.columns:
- categories.add(get_column_category(col))
-
- for category in sorted(categories):
- # Print a few example columns for each category
- example_cols = [
- col for col in df.columns if get_column_category(col) == category][:3]
- print(
- f" - {category}: {len([col for col in df.columns if get_column_category(col) == category])} columns")
- print(f" Examples: {', '.join(example_cols)}")
- for col in example_cols:
- if col in DATA_DICTIONARY:
- print(f" {col}: {DATA_DICTIONARY[col]}")
-
- # Top Suppliers
- print("\n2. Top Suppliers")
- print("--------------\n")
- recent_year = max(df_by_year.keys())
- top_suppliers = get_top_suppliers(df_by_year[recent_year])
- print(f"Top suppliers for {recent_year}:")
- print(top_suppliers.to_string(index=False))
-
- # Beneficiary Demographics
- print("\n3. Beneficiary Demographics")
- print("--------------------------\n")
- demographics = get_beneficiary_demographics(df_by_year[recent_year])
- print(f"Demographics for {recent_year}:")
-
- # Print age distribution
- print("\nAge Distribution:")
- for age_group, percentage in demographics['Age Distribution'].items():
- print(f" - {age_group}: {percentage:.2f}%")
-
- # Print gender distribution
- print("\nGender Distribution:")
- for gender, percentage in demographics['Gender Distribution'].items():
- print(f" - {gender}: {percentage:.2f}%")
-
- # Print race distribution
- print("\nRace Distribution:")
- for race, percentage in demographics['Race Distribution'].items():
- print(f" - {race}: {percentage:.2f}%")
-
- # Health Conditions
- print("\n4. Common Health Conditions")
- print("-------------------------\n")
- conditions = get_common_health_conditions(df_by_year[recent_year])
- print(f"Health conditions for {recent_year}:")
-
- # Print physical health conditions
- print("\nPhysical Health Conditions:")
- for condition, percentage in conditions['Physical Health Conditions'][:10]:
- print(f" - {condition}: {percentage:.2f}%")
-
- # Print behavioral health conditions
- print("\nBehavioral Health Conditions:")
- for condition, percentage in conditions['Behavioral Health Conditions'][:10]:
- print(f" - {condition}: {percentage:.2f}%")
-
- # Spending Patterns
- print("\n5. Medicare Spending Patterns")
- print("---------------------------\n")
- spending_df = analyze_spending_patterns(df_by_year)
-
- # Format the DataFrame for display with appropriate formatting
- formatted_spending_df = spending_df.copy()
-
- # Format monetary columns with dollar signs
- monetary_cols = ['Total Spending', 'Spending Per Beneficiary', 'DME Spending',
- 'Prosthetic/Orthotic Spending', 'Drug Spending']
- for col in monetary_cols:
- if col in formatted_spending_df.columns:
- formatted_spending_df[col] = formatted_spending_df[col].apply(
- lambda x: f"${x:,.2f}")
-
- # Format count columns with commas
- count_cols = ['Year', 'Total Beneficiaries']
- for col in count_cols:
- if col in formatted_spending_df.columns:
- formatted_spending_df[col] = formatted_spending_df[col].apply(
- lambda x: f"{x:,.0f}")
-
- print("Medicare spending patterns across years:")
- print(formatted_spending_df.to_string(index=False))
-
- # ----- VISUALIZATIONS -----
- print("\n\n6. Generating Visualizations")
- print("---------------------------\n")
-
- # Setting plot style
- sns.set_style('whitegrid')
- plt.rcParams['figure.figsize'] = [14, 9]
-
- # Generate all visualizations
- visualizations = {}
-
- # 1. Spending Trends
- print("Generating spending trends visualization...")
- spending_trends_fig = plot_spending_trends(spending_df)
- visualizations['spending_trends'] = spending_trends_fig
-
- # 2. Demographics
- print("Generating demographics visualization...")
- demographics_fig = plot_demographics(df_by_year[recent_year])
- visualizations['demographics'] = demographics_fig
-
- # 3. Health Conditions
- print("Generating health conditions visualization...")
- health_conditions_fig = plot_health_conditions(df_by_year[recent_year])
- visualizations['health_conditions'] = health_conditions_fig
-
- # 4. Top Suppliers
- print("Generating top suppliers visualization...")
- suppliers_fig = plot_top_suppliers(df_by_year[recent_year])
- visualizations['top_suppliers'] = suppliers_fig
-
- # 5. Geographical Distribution
- print("Generating geographical distribution visualization...")
- geo_fig = plot_geographical_distribution(df_by_year[recent_year])
- visualizations['geographical_distribution'] = geo_fig
-
- # 6. Custom visualization: YoY percentage changes
- print("Generating year-over-year changes visualization...")
-
- # Calculate YoY percentage changes
- spending_df['Beneficiaries % Change'] = spending_df['Total Beneficiaries'].pct_change() * \
- 100
- spending_df['Spending % Change'] = spending_df['Total Spending'].pct_change() * \
- 100
- spending_df['Per Beneficiary % Change'] = spending_df['Spending Per Beneficiary'].pct_change() * \
- 100
-
- # Create plot
- yoy_fig, ax = plt.subplots(figsize=(14, 8))
- metrics = ['Beneficiaries % Change',
- 'Spending % Change', 'Per Beneficiary % Change']
- colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
-
- for i, metric in enumerate(metrics):
- ax.plot(spending_df['Year'][1:], spending_df[metric][1:],
- marker='o', linewidth=3, markersize=10,
- label=metric.replace(' % Change', ''),
- color=colors[i])
-
- ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)
- ax.set_title(
- 'Year-over-Year Percentage Changes in Key Metrics', fontsize=16)
- ax.legend(fontsize=12)
- ax.grid(True)
- ax.set_xlabel('Year', fontsize=14)
- ax.set_ylabel('Percentage Change (%)', fontsize=14)
- visualizations['yoy_changes'] = yoy_fig
-
- # Save visualizations to files if not in a notebook environment
- try:
- # Check if we're in a notebook environment
- if 'ipykernel' not in sys.modules:
- print("\nSaving visualizations to files...")
- os.makedirs('visualizations', exist_ok=True)
- for name, fig in visualizations.items():
- fig.savefig(
- f'visualizations/{name}.png', dpi=300, bbox_inches='tight')
- print(f"Saved: visualizations/{name}.png")
- except:
- print("Note: Visualizations will be displayed if run in a Jupyter notebook")
-
- # When run in Jupyter, the figures will be displayed inline
- return df_by_year, visualizations
-
-
-if __name__ == "__main__":
- import sys
- main()
diff --git a/dme_dictionary.py b/dme_dictionary.py
new file mode 100644
index 0000000..f5cd696
--- /dev/null
+++ b/dme_dictionary.py
@@ -0,0 +1,116 @@
+DATA_DICTIONARY = {
+ # Supplier Information
+ "Suplr_NPI": "Supplier NPI - NPI for the Supplier on the DMEPOS claim",
+ "Suplr_Prvdr_Last_Name_Org": "Supplier Last Name/Organization Name - When registered as individual, the Supplier's last name. When registered as organization, this is the organization name",
+ "Suplr_Prvdr_First_Name": "Supplier First Name - When registered as individual, the Supplier's first name",
+ "Suplr_Prvdr_MI": "Supplier Middle Initial - When registered as individual, the Supplier's middle initial",
+ "Suplr_Prvdr_Crdntls": "Supplier Credentials - When registered as individual, these are the Supplier's credentials",
+ "Suplr_Prvdr_Gndr": "Supplier Gender - When registered as individual, this is the Supplier's gender",
+ "Suplr_Prvdr_Ent_Cd": "Supplier Entity Code - 'I' identifies Suppliers registered as individuals, 'O' identifies Suppliers registered as organizations",
+ "Suplr_Prvdr_St1": "Supplier Street 1 - First line of the Supplier's street address",
+ "Suplr_Prvdr_St2": "Supplier Street 2 - Second line of the Supplier's street address",
+ "Suplr_Prvdr_City": "Supplier City - The city where the Supplier is located",
+ "Suplr_Prvdr_State_Abrvtn": "Supplier State - State postal abbreviation where the Supplier is located",
+ "Suplr_Prvdr_State_FIPS": "Supplier State FIPS Code - FIPS code for Supplier's state",
+ "Suplr_Prvdr_Zip5": "Supplier ZIP - The Supplier's ZIP code",
+ "Suplr_Prvdr_RUCA": "Supplier RUCA - Rural-Urban Commuting Area Code for the Supplier ZIP code",
+ "Suplr_Prvdr_RUCA_Desc": "Supplier RUCA Description - Description of Rural-Urban Commuting Area (RUCA) Code",
+ "Suplr_Prvdr_Cntry": "Supplier Country - Country where the Supplier is located",
+ "Suplr_Prvdr_Spclty_Desc": "Supplier Provider Specialty Description - Derived from Medicare provider/supplier specialty code",
+ "Suplr_Prvdr_Spclty_Srce": "Supplier Provider Specialty Source - Source of the Supplier Specialty (claims-specialty or NPPES-specialty)",
+
+ # Total Supplier Claims/Services
+ "Tot_Suplr_HCPCS_Cds": "Number of Supplier HCPCS - Total unique DMEPOS product/service HCPCS codes",
+ "Tot_Suplr_Benes": "Number of Supplier Beneficiaries - Total unique beneficiaries (<11 are suppressed)",
+ "Tot_Suplr_Clms": "Number of Supplier Claims - Total DMEPOS claims submitted",
+ "Tot_Suplr_Srvcs": "Number of Supplier Services - Total DMEPOS products/services rendered",
+ "Suplr_Sbmtd_Chrgs": "Supplier Submitted Charges - Total charges submitted for DMEPOS products/services",
+ "Suplr_Mdcr_Alowd_Amt": "Supplier Medicare Allowed Amount - Total Medicare allowed amount",
+ "Suplr_Mdcr_Pymt_Amt": "Supplier Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance",
+ "Suplr_Mdcr_Stdzd_Pymt_Amt": "Supplier Medicare Standard Payment Amount - Standardized Medicare payments",
+
+ # DME-specific Fields
+ "DME_Sprsn_Ind": "Durable Medical Equipment Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed",
+ "DME_Tot_Suplr_HCPCS_Cds": "Number of DME HCPCS - Total unique DME HCPCS codes",
+ "DME_Tot_Suplr_Benes": "Number of DME Beneficiaries - Total unique beneficiaries with DME claims (<11 are suppressed)",
+ "DME_Tot_Suplr_Clms": "Number of DME Claims - Total DME claims submitted",
+ "DME_Tot_Suplr_Srvcs": "Number of DME Services - Total DME products/services rendered",
+ "DME_Suplr_Sbmtd_Chrgs": "DME Submitted Charges - Total charges submitted for DME products/services",
+ "DME_Suplr_Mdcr_Alowd_Amt": "DME Medicare Allowed Amount - Total Medicare allowed amount for DME",
+ "DME_Suplr_Mdcr_Pymt_Amt": "DME Medicare Payment Amount - Amount Medicare paid for DME after deductible/coinsurance",
+ "DME_Suplr_Mdcr_Stdzd_Pymt_Amt": "DME Medicare Standard Payment Amount - Standardized Medicare payments for DME",
+
+ # Prosthetic and Orthotic Fields
+ "POS_Sprsn_Ind": "Prosthetic and Orthotic Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed",
+ "POS_Tot_Suplr_HCPCS_Cds": "Number of Prosthetic/Orthotic HCPCS - Total unique prosthetic/orthotic HCPCS codes",
+ "POS_Tot_Suplr_Benes": "Number of Prosthetic/Orthotic Beneficiaries - Total unique beneficiaries",
+ "POS_Tot_Suplr_Clms": "Number of Prosthetic/Orthotic Claims - Total prosthetic/orthotic claims submitted",
+ "POS_Tot_Suplr_Srvcs": "Number of Prosthetic/Orthotic Services - Total prosthetic/orthotic products/services",
+ "POS_Suplr_Sbmtd_Chrgs": "Prosthetic/Orthotic Submitted Charges - Total charges submitted for prosthetic/orthotic",
+ "POS_Suplr_Mdcr_Alowd_Amt": "Prosthetic/Orthotic Medicare Allowed Amount - Total Medicare allowed amount",
+ "POS_Suplr_Mdcr_Pymt_Amt": "Prosthetic/Orthotic Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance",
+ "POS_Suplr_Mdcr_Stdzd_Pymt_Amt": "Prosthetic/Orthotic Medicare Standard Payment Amount - Standardized Medicare payments",
+
+ # Drug and Nutritional Fields
+ "Drug_Sprsn_Ind": "Drug and Nutritional Suppression Indicator - '*'=suppressed (1-10 claims), '#'=counter-suppressed",
+ "Drug_Tot_Suplr_HCPCS_Cds": "Number of Drug/Nutritional HCPCS - Total unique drug/nutritional HCPCS codes",
+ "Drug_Tot_Suplr_Benes": "Number of Drug/Nutritional Beneficiaries - Total unique beneficiaries",
+ "Drug_Tot_Suplr_Clms": "Number of Drug/Nutritional Claims - Total drug/nutritional claims submitted",
+ "Drug_Tot_Suplr_Srvcs": "Number of Drug/Nutritional Services - Total drug/nutritional products/services",
+ "Drug_Suplr_Sbmtd_Chrgs": "Drug/Nutritional Submitted Charges - Total charges submitted for drug/nutritional",
+ "Drug_Suplr_Mdcr_Alowd_Amt": "Drug/Nutritional Medicare Allowed Amount - Total Medicare allowed amount",
+ "Drug_Suplr_Mdcr_Pymt_Amt": "Drug/Nutritional Medicare Payment Amount - Amount Medicare paid after deductible/coinsurance",
+ "Drug_Suplr_Mdcr_Stdzd_Pymt_Amt": "Drug/Nutritional Medicare Standard Payment Amount - Standardized Medicare payments",
+
+ # Beneficiary Demographics
+ "Bene_Avg_Age": "Average Age of Beneficiaries - Average age at end of calendar year or time of death",
+ "Bene_Age_LT_65_Cnt": "Number of Beneficiaries <65 - Count of beneficiaries under 65 years old",
+ "Bene_Age_65_74_Cnt": "Number of Beneficiaries 65-74 - Count of beneficiaries between 65-74 years old",
+ "Bene_Age_75_84_Cnt": "Number of Beneficiaries 75-84 - Count of beneficiaries between 75-84 years old",
+ "Bene_Age_GT_84_Cnt": "Number of Beneficiaries >84 - Count of beneficiaries over 84 years old",
+ "Bene_Feml_Cnt": "Number of Female Beneficiaries - Count of female beneficiaries",
+ "Bene_Male_Cnt": "Number of Male Beneficiaries - Count of male beneficiaries",
+ "Bene_Race_Wht_Cnt": "Number of White Beneficiaries - Count of non-Hispanic white beneficiaries",
+ "Bene_Race_Black_Cnt": "Number of Black Beneficiaries - Count of non-Hispanic Black/African American beneficiaries",
+ "Bene_Race_Api_Cnt": "Number of Asian/PI Beneficiaries - Count of Asian Pacific Islander beneficiaries",
+ "Bene_Race_Hspnc_Cnt": "Number of Hispanic Beneficiaries - Count of Hispanic beneficiaries",
+ "Bene_Race_Natind_Cnt": "Number of Native American/Alaska Native Beneficiaries - Count of American Indian/Alaska Native beneficiaries",
+ "Bene_Race_Othr_Cnt": "Number of Other Race Beneficiaries - Count of beneficiaries with race not elsewhere classified",
+ "Bene_Ndual_Cnt": "Number of Medicare-Only Beneficiaries - Count of Medicare-only beneficiaries",
+ "Bene_Dual_Cnt": "Number of Medicare & Medicaid Beneficiaries - Count of dual-eligible beneficiaries",
+
+ # Beneficiary Health Conditions (Mental/Behavioral Health)
+ "Bene_CC_BH_ADHD_OthCD_V1_Pct": "Percent with ADHD and Other Conduct Disorders",
+ "Bene_CC_BH_Alcohol_Drug_V1_Pct": "Percent with Alcohol and Drug Use Disorders",
+ "Bene_CC_BH_Tobacco_V1_Pct": "Percent with Tobacco Use Disorders",
+ "Bene_CC_BH_Alz_NonAlzdem_V2_Pct": "Percent with Alzheimer's and Non-Alzheimer's Dementia",
+ "Bene_CC_BH_Anxiety_V1_Pct": "Percent with Anxiety Disorders",
+ "Bene_CC_BH_Bipolar_V1_Pct": "Percent with Bipolar Disorder",
+ "Bene_CC_BH_Mood_V2_Pct": "Percent with Depression, Bipolar or Other Mood Disorders",
+ "Bene_CC_BH_Depress_V1_Pct": "Percent with Major Depressive Affective Disorder",
+ "Bene_CC_BH_PD_V1_Pct": "Percent with Personality Disorders",
+ "Bene_CC_BH_PTSD_V1_Pct": "Percent with Post-Traumatic Stress Disorder",
+ "Bene_CC_BH_Schizo_OthPsy_V1_Pct": "Percent with Schizophrenia and Other Psychotic Disorders",
+
+ # Beneficiary Health Conditions (Physical Health)
+ "Bene_CC_PH_Asthma_V2_Pct": "Percent with Asthma",
+ "Bene_CC_PH_Afib_V2_Pct": "Percent with Atrial Fibrillation and Flutter",
+ "Bene_CC_PH_Cancer6_V2_Pct": "Percent with Cancer (combined 6 cancer indicators)",
+ "Bene_CC_PH_CKD_V2_Pct": "Percent with Chronic Kidney Disease",
+ "Bene_CC_PH_COPD_V2_Pct": "Percent with Chronic Obstructive Pulmonary Disease",
+ "Bene_CC_PH_Diabetes_V2_Pct": "Percent with Diabetes",
+ "Bene_CC_PH_HF_NonIHD_V2_Pct": "Percent with Heart Failure and Non-Ischemic Heart Disease",
+ "Bene_CC_PH_Hyperlipidemia_V2_Pct": "Percent with Hyperlipidemia",
+ "Bene_CC_PH_Hypertension_V2_Pct": "Percent with Hypertension",
+ "Bene_CC_PH_IschemicHeart_V2_Pct": "Percent with Ischemic Heart Disease",
+ "Bene_CC_PH_Osteoporosis_V2_Pct": "Percent with Osteoporosis",
+ "Bene_CC_PH_Parkinson_V2_Pct": "Percent with Parkinson's Disease",
+ "Bene_CC_PH_Arthritis_V2_Pct": "Percent with Rheumatoid Arthritis/Osteoarthritis",
+ "Bene_CC_PH_Stroke_TIA_V2_Pct": "Percent with Stroke/Transient Ischemic Attack",
+
+ # Risk Score
+ "Bene_Avg_Risk_Scre": "Average HCC Risk Score of Beneficiaries",
+
+ # Year column (added by our script)
+ "year": "Year of the data"
+}
diff --git a/dme_notebook_example.ipynb b/dme_notebook_example.ipynb
deleted file mode 100644
index 0519ecb..0000000
--- a/dme_notebook_example.ipynb
+++ /dev/null
@@ -1 +0,0 @@
-
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index f40278b..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-jupyter==1.0.0
-notebook==7.3.2
-pandas==2.2.3
-numpy==1.26.0
-matplotlib==3.9.2
-seaborn==0.13.2
-scikit-learn==1.6.1
\ No newline at end of file