From 5d6c323481129cd6fb48590a722cb4a09b98bd2c Mon Sep 17 00:00:00 2001 From: Ravishankar Sivasubramaniam Date: Tue, 30 Sep 2025 19:18:39 -0500 Subject: [PATCH 1/4] Rebrand to Conversational SQL, update docs, organize guides, improve developer experience --- README.md | 153 ++++++++++--------------- docs/D1_SETUP.md | 94 +++++++--------- docs/ENVIRONMENT_SETUP.md | 222 ++++++++----------------------------- docs/GOOGLE_OAUTH_SETUP.md | 61 ++++++++++ docs/R2_SETUP.md | 102 +++++++++-------- 5 files changed, 259 insertions(+), 373 deletions(-) create mode 100644 docs/GOOGLE_OAUTH_SETUP.md diff --git a/README.md b/README.md index 8e1ebce..64b4e55 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,50 @@ -# Single Family Loan Analytics Platform -Transform natural language questions into powerful SQL queries for mortgage loan portfolio analysis using AI-powered ontology-driven intelligence. +# Conversational SQL -## 🎯 What This Does +**Conversational SQL** is an open-source framework for transforming natural language questions into powerful SQL queries for any tabular dataset. It’s designed for developers, data scientists, and teams who want to build AI-powered analytics tools with minimal effort. -This platform enables mortgage analysts and data scientists to query millions of loan records using plain English, automatically generating precise SQL through an ontological data model that understands mortgage finance domain knowledge. +## πŸš€ Why Conversational SQL? + +Stop writing complex SQL by hand! With Conversational SQL, you can: +- Ask questions in plain English and get optimized SQL instantly +- Integrate with multiple AI providers (Anthropic Claude, AWS Bedrock, local models) +- Extend to any domain with ontological data modeling +- Build interactive dashboards, query builders, and analytics apps + +## πŸ† Flagship Use Case: Single Family Loan Analytics + +This repo features a production-grade implementation for mortgage loan portfolio analysis. It’s a showcase of how Conversational SQL can power real-world, domain-specific analytics. + +**Key Features:** +- **Natural Language to SQL**: "Show me high-risk loans in California" β†’ SQL +- **Ontological Intelligence**: 110+ fields, 15 business domains, semantic relationships +- **Real-time Analytics**: Dashboards, metrics, risk indicators +- **Multi-Provider AI**: Anthropic Claude, AWS Bedrock, local models +- **Cloudflare D1 Logging**: All user logins and queries are securely logged using Cloudflare D1 (no external DB required) -**Key Capabilities:** -- **Natural Language to SQL**: Ask questions like "Show me high-risk loans in California" β†’ Get optimized SQL -- **Ontological Intelligence**: 110+ data fields organized across 15 business domains with semantic relationships -- **Real-time Analytics**: Interactive dashboards with loan performance metrics and risk indicators -- **Multi-Provider AI**: Support for AWS Bedrock, Anthropic Claude, and local models ## 🧠 How Ontology Improves SQL Generation -Traditional NL-to-SQL systems struggle with domain-specific terminology and field relationships. Our ontological approach: +Conversational SQL uses an ontological approach to bridge the gap between natural language and complex, domain-specific SQL. This enables: -### 1. 
**Domain-Aware Context** -``` -Instead of: "Show loans in bad condition" -Ontology understands: DLQ_STATUS = '03' (90+ days delinquent) -``` +- **Accurate mapping of business terms to data fields** +- **Automatic handling of semantic relationships and business rules** +- **Consistent, explainable query generation for analytics and reporting** -### 2. **Semantic Relationships** -``` -"High-risk borrowers" automatically includes: -- CSCORE_B < 620 (credit quality) -- OLTV > 95% (equity position) -- DTI > 43% (payment capacity) -``` +**Examples:** -### 3. **Business Intelligence Integration** -``` -"Portfolio concentration risk" generates: -SELECT STATE, SUM(CURRENT_UPB)/1000000 as UPB_MM, - COUNT(*) as loan_count, - SUM(CURRENT_UPB)/(SELECT SUM(CURRENT_UPB) FROM data)*100 as pct_portfolio -FROM data GROUP BY STATE HAVING pct_portfolio > 15 -``` +- *Domain-Aware Context*: Instead of "Show loans in bad condition," the ontology maps this to `DLQ_STATUS = '03'` (90+ days delinquent). +- *Semantic Relationships*: "High-risk borrowers" automatically includes: + - `CSCORE_B < 620` (credit quality) + - `OLTV > 95%` (equity position) + - `DTI > 43%` (payment capacity) +- *Business Intelligence Integration*: "Portfolio concentration risk" generates: + ```sql + SELECT STATE, SUM(CURRENT_UPB)/1000000 as UPB_MM, + COUNT(*) as loan_count, + SUM(CURRENT_UPB)/(SELECT SUM(CURRENT_UPB) FROM data)*100 as pct_portfolio + FROM data GROUP BY STATE HAVING pct_portfolio > 15 + ``` ## πŸ—οΈ Architecture @@ -47,6 +54,7 @@ graph TB R2[Cloudflare R2 Storage] PQ[Parquet Files
9M+ loan records] DUCK[DuckDB Engine] + D1[Cloudflare D1 Logging] end subgraph "Intelligence Layer" @@ -76,6 +84,7 @@ graph TB R2 --> PQ PQ --> DUCK ONT --> REL + D1 --> UI UI --> QRY QRY --> ONT @@ -90,47 +99,13 @@ graph TB classDef appNodes fill:#e8f5e8 classDef secNodes fill:#fff3e0 - class R2,PQ,DUCK dataNodes + class R2,PQ,DUCK,D1 dataNodes class ONT,REL,BED,CLA,LOC aiNodes class UI,QRY,VIZ,EXP appNodes class AUTH,SESS secNodes ``` -## πŸš€ Core Components - -### **1. Ontological Data Dictionary** -- **15 Business Domains**: Identification, Temporal, Credit Risk, Geographic, etc. -- **110+ Field Mappings**: Complete mortgage loan lifecycle coverage -- **Semantic Relationships**: Credit triangle (Credit + Collateral + Capacity) -- **Risk Intelligence**: Built-in risk tiers and business rules - -### **2. AI-Powered Query Engine** -- **Multi-Provider Support**: AWS Bedrock (Claude, Titan), Anthropic Claude API -- **Context-Aware Generation**: Domain knowledge + field relationships + business rules -- **Query Validation**: Syntax checking, field validation, performance optimization -- **Natural Language Processing**: Complex financial terminology understanding - -### **3. Interactive Analytics Interface** -- **Query Builder**: Natural language input with AI-powered SQL generation -- **Ontology Explorer**: Interactive data model navigation -- **Advanced SQL Editor**: Direct query access with schema reference -- **Real-time Results**: Sub-second query execution on 9M+ records - -### **4. Secure Access** -- **Authentication**: Google OAuth integration -- **Session Management**: Secure session handling -- **Data Privacy**: No PII storage, secure cloud storage -- **Query Tracking**: Session-based query history - -## πŸ“Š Data Overview - -- **Records**: 9+ million individual loan performance observations -- **Time Range**: 1999-2025 loan originations with monthly updates -- **Coverage**: All 50 states + DC, $12.4 trillion original UPB -- **Performance**: 0.3% lifetime loss rate, 98% current payment rate -- **Storage**: Optimized Parquet format for high-performance analytics - -## πŸš€ Quick Start +## �️ Quick Start ### Prerequisites - Python 3.11+ @@ -159,40 +134,30 @@ cp .env.example .env streamlit run app.py ``` -## πŸ“– Setup Guides -**Essential Setup Documentation:** +## πŸ“– Developer Setup Guides -- **[πŸ” Google OAuth Setup](GOOGLE_OAUTH_SETUP.md)** - Authentication configuration -- **[☁️ Cloud Storage Setup](docs/R2_SETUP.md)** - Cloudflare R2 data storage configuration -- **[πŸš€ Deployment Guide](docs/DEPLOYMENT.md)** - Production deployment instructions -- **[βš™οΈ Environment Setup](docs/ENVIRONMENT_SETUP.md)** - Development environment configuration +All setup and deployment guides are located in the `docs/` directory: -## 🎯 Use Cases +- **[Google OAuth Setup](docs/GOOGLE_OAUTH_SETUP.md)** β€” Authentication configuration +- **[Cloud Storage Setup](docs/R2_SETUP.md)** β€” Cloudflare R2 data storage configuration +- **[Cloudflare D1 Setup](docs/D1_SETUP.md)** β€” Logging user activity with Cloudflare D1 +- **[Environment Setup](docs/ENVIRONMENT_SETUP.md)** β€” Environment variables and dependencies +- **[Deployment Guide](docs/DEPLOYMENT.md)** β€” Deploy to Streamlit Cloud or locally -**Portfolio Risk Management** -``` -"Show me all loans in Florida with FICO scores below 620" -β†’ Geographic + credit risk analysis -``` -**Performance Analytics** -``` -"What's the delinquency rate by vintage year for California loans?" 
-β†’ Temporal + geographic performance trending -``` -**Concentration Risk** -``` -"Which states have more than 15% of our portfolio?" -β†’ Geographic concentration analysis -``` +## πŸ’‘ Extending Conversational SQL -**Credit Quality Assessment** -``` -"Compare average DTI and LTV by credit score tier" -β†’ Multi-dimensional credit risk profiling -``` +Conversational SQL is designed for easy adaptation to any tabular dataset. To use it for your own data, simply swap out the ontology and schema files for your domain. + + +## 🎯 Example Use Cases + +- **Portfolio Risk Management**: "Show me all loans in Florida with FICO scores below 620" +- **Performance Analytics**: "What's the delinquency rate by vintage year for California loans?" +- **Concentration Risk**: "Which states have more than 15% of our portfolio?" +- **Credit Quality Assessment**: "Compare average DTI and LTV by credit score tier" ## 🀝 Contributing @@ -204,4 +169,4 @@ This project is licensed under the MIT License - see the LICENSE file for detail --- -**Built with:** Python β€’ Streamlit β€’ DuckDB β€’ AWS Bedrock β€’ Anthropic Claude β€’ Google OAuth β€’ Cloudflare R2 \ No newline at end of file +**Built with:** Python β€’ Streamlit β€’ DuckDB β€’ AWS Bedrock β€’ Anthropic Claude β€’ Google OAuth β€’ Cloudflare R2 β€’ Cloudflare D1 \ No newline at end of file diff --git a/docs/D1_SETUP.md b/docs/D1_SETUP.md index b63443c..00ca511 100644 --- a/docs/D1_SETUP.md +++ b/docs/D1_SETUP.md @@ -1,94 +1,78 @@ -# Cloudflare D1 Database Setup +npm install -g wrangler -Minimal setup for user activity logging (logins and queries). +# Cloudflare D1 Database Setup -## πŸš€ Quick Setup +This guide explains how to set up Cloudflare D1 for logging user activity (logins and queries) in Conversational SQL. -### 1. Create D1 Database +## Quick Setup +### 1. Create the D1 Database ```bash -# Install Wrangler (if not already installed) npm install -g wrangler - -# Login to Cloudflare wrangler login - -# Create D1 database wrangler d1 create nlptosql-logs ``` -### 2. Initialize Database Schema - +### 2. Initialize the Database Schema ```bash -# Run the schema creation wrangler d1 execute nlptosql-logs --file=scripts/d1_schema.sql ``` ### 3. Get Database Credentials - -After creating the database, get these values from your Cloudflare dashboard: - -1. **Account ID**: Dashboard β†’ Right sidebar β†’ Account ID -2. **Database ID**: From the `wrangler d1 create` output -3. **API Token**: Dashboard β†’ My Profile β†’ API Tokens β†’ Create Token - - Use template: "Custom token" - - Permissions: `Cloudflare D1:Edit` - - Account resources: Include your account +After creation, retrieve these from your Cloudflare dashboard: +- Account ID (right sidebar) +- Database ID (from `wrangler d1 create` output) +- API Token (create a custom token with `Cloudflare D1:Edit` permission) ### 4. 
Update Environment Variables - Add to your `.env` file: - ```bash -# Cloudflare D1 Database (optional) CLOUDFLARE_ACCOUNT_ID=your_account_id_here CLOUDFLARE_D1_DATABASE_ID=your_database_id_here CLOUDFLARE_API_TOKEN=your_api_token_here ``` -## πŸ“Š Database Schema +## Database Schema -**Minimal tables:** +Minimal tables: -### `user_logins` -- `id` - Auto increment primary key -- `user_id` - Google user ID -- `email` - User email -- `login_time` - Timestamp -- `user_agent` - Browser info +### user_logins +- id: Auto increment primary key +- user_id: Google user ID +- email: User email +- login_time: Timestamp +- user_agent: Browser info -### `user_queries` -- `id` - Auto increment primary key -- `user_id` - Google user ID -- `email` - User email -- `question` - Natural language question -- `sql_query` - Generated SQL -- `ai_provider` - claude/bedrock -- `execution_time` - Query execution time -- `query_time` - Timestamp +### user_queries +- id: Auto increment primary key +- user_id: Google user ID +- email: User email +- question: Natural language question +- sql_query: Generated SQL +- ai_provider: claude/bedrock +- execution_time: Query execution time +- query_time: Timestamp -## πŸ”’ Security +## Security - Uses Cloudflare's secure REST API -- API token with minimal D1 permissions only -- No sensitive data stored (just activity logs) -- Silent fail mode - app works without database - -## βœ… Benefits +- API token with minimal D1 permissions +- No sensitive data stored (only activity logs) +- App works without database if logging is disabled -- **Lightweight**: Only 2 simple tables -- **Optional**: App works fine without it -- **Fast**: Cloudflare D1 is globally distributed -- **Free tier**: Generous limits for logging -- **No maintenance**: Managed by Cloudflare +## Benefits -## πŸ”§ Testing +- Lightweight: Only two tables +- Optional: App functions without logging +- Fast: Globally distributed +- Free tier: Generous limits +- No maintenance: Managed by Cloudflare +## Testing ```bash -# Test database connectivity wrangler d1 execute nlptosql-logs --command="SELECT COUNT(*) FROM user_logins;" ``` --- -**That's it!** The logging is completely optional and runs silently in the background. \ No newline at end of file +Logging is optional and runs silently in the background. \ No newline at end of file diff --git a/docs/ENVIRONMENT_SETUP.md b/docs/ENVIRONMENT_SETUP.md index 5f537fb..f870cff 100644 --- a/docs/ENVIRONMENT_SETUP.md +++ b/docs/ENVIRONMENT_SETUP.md @@ -1,29 +1,26 @@ + # Environment Configuration Guide -Complete guide to configuring your Single Family Loan Analytics Platform environment variables and dependencies. +This guide provides clear instructions for configuring environment variables and dependencies for Conversational SQL and its Single Family Loan Analytics implementation. 
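+
+Once the `.env` file is in place, the application reads these values at startup (typically via `python-dotenv`). The snippet below is a minimal, illustrative sketch for confirming that a variable is visible to Python; the project's actual configuration code may differ:
+
+```python
+# Quick sanity check that .env values are being loaded.
+import os
+
+from dotenv import load_dotenv
+
+load_dotenv()  # reads .env from the current working directory
+
+for name in ("AI_PROVIDER", "PROCESSED_DATA_DIR", "CACHE_TTL"):
+    print(f"{name} = {os.getenv(name, '<not set>')}")
+```
+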
-## πŸ”§ Environment Variables +## Environment Variables -Create a `.env` file in your project root with the following configurations: +Create a `.env` file in your project root with the following settings: -### **Core Application** +### Core Application ```bash -# Application Settings STREAMLIT_SERVER_PORT=8501 STREAMLIT_SERVER_HEADLESS=true PYTHONPATH=/app - -# Data Configuration PROCESSED_DATA_DIR=data/processed/ CACHE_TTL=3600 FORCE_DATA_REFRESH=false ``` -### **AI Provider Configuration** +### AI Provider Configuration -#### Option 1: AWS Bedrock (Recommended) +#### AWS Bedrock (Recommended) ```bash -# AWS Bedrock Configuration AI_PROVIDER=bedrock AWS_ACCESS_KEY_ID=your_aws_access_key AWS_SECRET_ACCESS_KEY=your_aws_secret_key @@ -32,207 +29,78 @@ BEDROCK_MODEL_ID=anthropic.claude-3-sonnet-20240229-v1:0 BEDROCK_MAX_TOKENS=4096 ``` -#### Option 2: Anthropic Claude API +#### Anthropic Claude API ```bash -# Anthropic Claude Configuration AI_PROVIDER=claude ANTHROPIC_API_KEY=sk-ant-your-api-key-here CLAUDE_MODEL=claude-3-sonnet-20240229 CLAUDE_MAX_TOKENS=4096 ``` -### **Authentication (Optional)** +### Authentication (Optional) ```bash -# Supabase Authentication ENABLE_AUTH=false SUPABASE_URL=https://your-project.supabase.co SUPABASE_ANON_KEY=your-anon-key SUPABASE_SERVICE_ROLE_KEY=your-service-role-key ``` -### **Cloud Storage (Optional)** +### Cloud Storage (Optional) ```bash -# Cloudflare R2 Storage R2_ENDPOINT_URL=https://your-account-id.r2.cloudflarestorage.com R2_BUCKET_NAME=single-family-loan R2_ACCESS_KEY_ID=your-r2-access-key R2_SECRET_ACCESS_KEY=your-r2-secret-key ``` -## πŸ“¦ Dependencies +## Dependencies -### **Core Requirements** +Install all dependencies: ```bash -# Install all dependencies pip install -r requirements.txt - -# Or install individually: -pip install streamlit pandas duckdb boto3 python-dotenv -pip install anthropic # for Claude API -pip install boto3 # for AWS Bedrock -pip install supabase # for authentication -``` - -### **Development Dependencies** -```bash -# Development tools -pip install pytest black flake8 mypy -pip install streamlit-autorefresh # for development -``` - -## πŸ› οΈ System Requirements - -### **Minimum System Specs** -- **CPU**: 2+ cores -- **RAM**: 4GB minimum, 8GB recommended -- **Storage**: 2GB free space for data files -- **Python**: 3.9 or higher - -### **Recommended Production Specs** -- **CPU**: 4+ cores -- **RAM**: 16GB -- **Storage**: 10GB+ SSD -- **Network**: Stable internet for AI API calls - -## πŸš€ Quick Environment Setup - -### **1. Clone and Setup** -```bash -git clone -cd nlptosql -cp .env.example .env ``` -### **2. Configure AI Provider** -Choose one AI provider and configure accordingly: - -**For AWS Bedrock:** -```bash -# Add to .env -AI_PROVIDER=bedrock -AWS_ACCESS_KEY_ID=your_key -AWS_SECRET_ACCESS_KEY=your_secret -AWS_DEFAULT_REGION=us-east-1 -``` +## System Requirements -**For Claude API:** -```bash -# Add to .env -AI_PROVIDER=claude -ANTHROPIC_API_KEY=your_api_key -``` +### Minimum +- CPU: 2+ cores +- RAM: 4GB (8GB recommended) +- Storage: 2GB free space +- Python: 3.9 or higher -### **3. Install Dependencies** -```bash -pip install -r requirements.txt -``` +### Recommended (Production) +- CPU: 4+ cores +- RAM: 16GB +- Storage: 10GB+ SSD +- Network: Stable internet for AI API calls -### **4. 
Test Configuration** -```bash -# Test AI connectivity -python -c "from src.core import initialize_ai_client; print('βœ… AI client initialized' if initialize_ai_client() else '❌ AI setup failed')" +## Quick Setup -# Test data access -python -c "from src.core import scan_parquet_files; files = scan_parquet_files(); print(f'βœ… Found {len(files)} data files' if files else '⚠️ No data files found')" -``` +1. Clone the repository and set up your environment: + ```bash + git clone + cd nlptosql + cp .env.example .env + pip install -r requirements.txt + ``` +2. Configure your AI provider in `.env`. +3. Launch the application: + ```bash + streamlit run app.py + ``` -### **5. Launch Application** -```bash -streamlit run app.py -``` +## Troubleshooting -## βš™οΈ Configuration Options - -### **AI Model Selection** - -**AWS Bedrock Models:** -- `anthropic.claude-3-sonnet-20240229-v1:0` (Recommended) -- `anthropic.claude-3-haiku-20240307-v1:0` (Faster) -- `anthropic.claude-instant-v1` (Legacy) - -**Claude API Models:** -- `claude-3-sonnet-20240229` (Recommended) -- `claude-3-haiku-20240307` (Faster) -- `claude-2.1` (Legacy) - -### **Performance Tuning** - -**For Large Datasets:** -```bash -CACHE_TTL=7200 # Increase cache duration -STREAMLIT_SERVER_MAX_UPLOAD_SIZE=1000 # MB -``` - -**For Development:** -```bash -STREAMLIT_SERVER_HEADLESS=false -STREAMLIT_SERVER_RUN_ON_SAVE=true -CACHE_TTL=300 # Shorter cache for development -``` - -## πŸ” Troubleshooting - -### **Common Issues** - -**AI Provider Connection Failed:** -```bash -# Check credentials -echo $ANTHROPIC_API_KEY # Should not be empty -aws sts get-caller-identity # Should return AWS account info -``` - -**Data Files Not Found:** -```bash -# Check data directory -ls -la data/processed/ -# Should contain .parquet files -``` - -**Import Errors:** -```bash -# Check Python path -echo $PYTHONPATH -# Should include project root -``` - -**Memory Issues:** -```bash -# Monitor memory usage -htop -# Consider reducing CACHE_TTL or dataset size -``` - -### **Environment Validation Script** -```bash -# Create validate_env.py -python -c " -import os -from dotenv import load_dotenv - -load_dotenv() - -required_vars = ['AI_PROVIDER'] -optional_vars = ['ANTHROPIC_API_KEY', 'AWS_ACCESS_KEY_ID', 'SUPABASE_URL'] - -print('πŸ” Environment Validation:') -for var in required_vars: - value = os.getenv(var) - print(f'βœ… {var}: {\"Set\" if value else \"❌ Missing\"}') - -for var in optional_vars: - value = os.getenv(var) - if value: - print(f'βœ… {var}: Set') -" -``` +- Check credentials and environment variables. +- Ensure data files are present in `data/processed/`. +- Review logs for errors. -## πŸ“š Related Documentation +## Related Documentation -- **[πŸ” Authentication Setup](SUPABASE_SETUP.md)** - Configure user authentication -- **[☁️ Cloud Storage Setup](R2_SETUP.md)** - Set up data storage -- **[πŸš€ Deployment Guide](DEPLOYMENT.md)** - Production deployment -- **[⚑ Quick Start Guide](AUTH_QUICK_START.md)** - Get running quickly +- [Cloud Storage Setup](R2_SETUP.md) +- [Deployment Guide](DEPLOYMENT.md) +- [Cloudflare D1 Setup](D1_SETUP.md) --- -**Need Help?** Check our troubleshooting guide or open an issue on GitHub. \ No newline at end of file +For help, open an issue on GitHub. 
\ No newline at end of file diff --git a/docs/GOOGLE_OAUTH_SETUP.md b/docs/GOOGLE_OAUTH_SETUP.md new file mode 100644 index 0000000..1fd0bf2 --- /dev/null +++ b/docs/GOOGLE_OAUTH_SETUP.md @@ -0,0 +1,61 @@ +# Google OAuth Setup Guide + +This app now uses direct Google OAuth for authentication - no database required! + +## πŸš€ Quick Setup + +### 1. Get Google OAuth Credentials + +1. Go to [Google Cloud Console](https://console.cloud.google.com/) +2. Create a new project or select existing one +3. Go to **APIs & Services** β†’ **Credentials** +4. Click **+ CREATE CREDENTIALS** β†’ **OAuth 2.0 Client IDs** +5. Choose **Web application** +6. Add your redirect URIs (see below) +7. Copy the **Client ID** and **Client Secret** + +### 2. Configure Redirect URIs + +In your Google OAuth client, add these authorized redirect URIs: + +**For localhost:** +``` +http://localhost:8501 +``` + +**For IP address access:** +``` +http://YOUR_IP_ADDRESS:8501 +``` + +**For Replit:** +``` +https://your-repl-name.your-username.repl.co +``` + +**For Streamlit Cloud:** +``` +https://your-app-name.streamlit.app +``` + +### 3. Set Environment Variables + +Create a `.env` file with: + +```bash +# Google OAuth (required) +GOOGLE_CLIENT_ID=your_google_client_id_here +GOOGLE_CLIENT_SECRET=your_google_client_secret_here + +# Auth settings +ENABLE_AUTH=true +DEMO_MODE=false # Set to true for debugging +``` + +## βœ… Benefits of This Approach + +- **No Database**: No Supabase or external database needed +- **Simple**: Just Google OAuth credentials required +- **Fast**: Direct authentication flow +- **Clean**: No complex setup or multiple services +- **Secure**: Google handles all security aspects diff --git a/docs/R2_SETUP.md b/docs/R2_SETUP.md index 3ec45d7..6fe91dd 100644 --- a/docs/R2_SETUP.md +++ b/docs/R2_SETUP.md @@ -1,64 +1,72 @@ + # Cloudflare R2 Setup Guide -This guide will walk you through setting up Cloudflare R2 for data storage and generating the necessary API keys for the Single Family Loan application. +This guide provides clear, step-by-step instructions for configuring Cloudflare R2 as the data storage backend for Conversational SQL and its Single Family Loan Analytics implementation. ## Table of Contents -- [Prerequisites](#prerequisites) -- [Step 1: Create R2 Bucket](#step-1-create-r2-bucket) -- [Step 2: Generate R2 API Tokens](#step-2-generate-r2-api-tokens) -- [Step 3: Configure Application](#step-3-configure-application) -- [Step 4: Upload Data](#step-4-upload-data) -- [Step 5: Test Connection](#step-5-test-connection) -- [Troubleshooting](#troubleshooting) +- Prerequisites +- Create R2 Bucket +- Generate R2 API Tokens +- Configure Application +- Upload Data +- Test Connection +- Troubleshooting ## Prerequisites -- Cloudflare account (free tier available) -- Data files in Parquet format ready for upload +- Cloudflare account (free tier is sufficient) +- Parquet-format data files ready for upload -## Step 1: Create R2 Bucket +## 1. Create an R2 Bucket -1. **Log into Cloudflare Dashboard** - - Go to [Cloudflare Dashboard](https://dash.cloudflare.com) - - Sign in with your account credentials +1. Log in to your [Cloudflare Dashboard](https://dash.cloudflare.com). +2. In the sidebar, select **R2 Object Storage**. +3. Click **Create bucket**. +4. Enter a bucket name (e.g., `single-family-loan`). +5. Select a location near your deployment region. +6. Click **Create bucket** to finish. -2. **Navigate to R2 Object Storage** - - In the sidebar, click **"R2 Object Storage"** - - Click **"Create bucket"** +## 2. 
Generate R2 API Tokens -3. **Configure Bucket** - - **Bucket name**: `single-family-loan` (or your preferred name) - - **Location**: Choose closest to your deployment region - - Click **"Create bucket"** +1. In the Cloudflare Dashboard, go to **R2 Object Storage**. +2. Click **Manage R2 API tokens**. +3. Click **Create API token** and provide a descriptive name (e.g., `single-family-loan-token`). +4. Set permissions: + - `Object:Read` (required) + - `Object:List` (required) + - `Object:Write` (if you plan to upload via API) +5. Restrict resources to your account and the specific bucket. +6. Click **Continue to summary**, review, and then **Create token**. +7. Copy and save your `Access Key ID` and `Secret Access Key` securely. -## Step 2: Generate R2 API Tokens +## 3. Configure the Application -### Option A: R2 API Tokens (Recommended) +Add the following environment variables to your `.env` file: -1. **Navigate to API Tokens** - - In Cloudflare Dashboard, go to **"R2 Object Storage"** - - Click **"Manage R2 API tokens"** - -2. **Create New Token** - - Click **"Create API token"** - - Give it a descriptive name: `single-family-loan-token` - -3. **Set Permissions** - - **Permissions**: Select the following: - - βœ… `Object:Read` on your bucket - - βœ… `Object:List` on your bucket - - βœ… `Object:Write` (if you need to upload via API) - -4. **Set Resource Restrictions** - - **Include**: `All accounts` β†’ Your account - - **Include**: `All zones` β†’ Specific bucket β†’ `single-family-loan` - -5. **Generate Token** - - Click **"Continue to summary"** - - Review permissions and click **"Create token"** - - **Important**: Copy and save the following credentials immediately: - - `Access Key ID` - - `Secret Access Key` +```bash +R2_ENDPOINT_URL=https://.r2.cloudflarestorage.com +R2_BUCKET_NAME=single-family-loan +R2_ACCESS_KEY_ID= +R2_SECRET_ACCESS_KEY= +``` + +## 4. Upload Data + +Upload your Parquet data files to the R2 bucket using the Cloudflare dashboard or API tools. Ensure files are named and organized as expected by the application (see data documentation for details). + +## 5. Test Connection + +Start the application and verify that it can access and read data from the R2 bucket. Check logs for any errors related to storage or permissions. + +## Troubleshooting + +- Double-check your API token permissions and resource restrictions. +- Ensure your environment variables match your Cloudflare configuration. +- Review application logs for error messages. + +--- + +For further assistance, consult the official Cloudflare R2 documentation or open an issue in the Conversational SQL repository. 
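+
+If you prefer to verify connectivity from a script rather than through the app (see step 5 above), the sketch below lists a few objects from the bucket. It assumes `boto3` is installed and that the `R2_*` variables described above are set in your environment; adjust names to match your setup:
+
+```python
+# Minimal R2 connectivity check (R2 exposes an S3-compatible API).
+import os
+
+import boto3
+
+client = boto3.client(
+    "s3",
+    endpoint_url=os.environ["R2_ENDPOINT_URL"],
+    aws_access_key_id=os.environ["R2_ACCESS_KEY_ID"],
+    aws_secret_access_key=os.environ["R2_SECRET_ACCESS_KEY"],
+)
+
+response = client.list_objects_v2(Bucket=os.environ["R2_BUCKET_NAME"], MaxKeys=5)
+for obj in response.get("Contents", []):
+    print(obj["Key"], obj["Size"])
+```
+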
- `Endpoint URL` (will be in format: `https://[account-id].r2.cloudflarestorage.com`) ### Option B: Global API Key (Less Secure) From b6e158aa137425bd0dccfa6780ee1943f833c2d8 Mon Sep 17 00:00:00 2001 From: Ravishankar Sivasubramaniam Date: Wed, 1 Oct 2025 18:41:52 -0500 Subject: [PATCH 2/4] Rename project to converSQL --- .env.example | 273 ++++-- .../ISSUE_TEMPLATE/ai_engine_contribution.md | 144 +++ .github/ISSUE_TEMPLATE/bug_report.md | 55 ++ .github/ISSUE_TEMPLATE/feature_request.md | 68 ++ .github/workflows/ci.yml | 142 +++ .gitignore | 29 +- CONTRIBUTING.md | 552 +++++++++++ GOOGLE_OAUTH_SETUP.md | 106 --- Makefile | 108 ++- README.md | 332 +++++-- app.py | 619 ++++++++----- docs/AI_ENGINES.md | 867 ++++++++++++++++++ docs/ARCHITECTURE.md | 696 ++++++++++++++ docs/D1_SETUP.md | 6 +- ..._data_dictionary.md => DATA_DICTIONARY.md} | 2 +- docs/DATA_PIPELINE.md | 595 ++++++++++++ docs/DEPLOYMENT.md | 10 +- docs/ENVIRONMENT_SETUP.md | 21 +- .../pipeline_csv_to_parquet multifile.ipynb | 808 ++++++++++++++++ notebooks/pipeline_csv_to_parquet.ipynb | 552 +++++++++++ pyproject.toml | 13 + pytest.ini | 47 + requirements.txt | 10 +- scripts/supabase_schema.sql | 114 --- scripts/sync_data.py | 48 +- setup.cfg | 51 ++ src/__init__.py | 2 +- src/ai_engines/__init__.py | 16 + src/ai_engines/base.py | 210 +++++ src/ai_engines/bedrock_adapter.py | 212 +++++ src/ai_engines/claude_adapter.py | 148 +++ src/ai_engines/gemini_adapter.py | 203 ++++ src/ai_service.py | 315 +++---- src/core.py | 65 +- src/d1_logger.py | 40 +- src/data_dictionary.py | 373 ++++---- src/simple_auth.py | 164 ++-- src/simple_auth_components.py | 73 +- tests/__init__.py | 3 + tests/conftest.py | 90 ++ tests/integration/__init__.py | 3 + .../integration/test_adapters_integration.py | 101 ++ tests/unit/__init__.py | 9 + tests/unit/test_adapters.py | 172 ++++ tests/unit/test_ai_service_simple.py | 89 ++ 45 files changed, 7325 insertions(+), 1231 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/ai_engine_contribution.md create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/workflows/ci.yml create mode 100644 CONTRIBUTING.md delete mode 100644 GOOGLE_OAUTH_SETUP.md create mode 100644 docs/AI_ENGINES.md create mode 100644 docs/ARCHITECTURE.md rename docs/{comprehensive_data_dictionary.md => DATA_DICTIONARY.md} (99%) create mode 100644 docs/DATA_PIPELINE.md create mode 100644 notebooks/pipeline_csv_to_parquet multifile.ipynb create mode 100644 notebooks/pipeline_csv_to_parquet.ipynb create mode 100644 pyproject.toml create mode 100644 pytest.ini delete mode 100644 scripts/supabase_schema.sql create mode 100644 setup.cfg create mode 100644 src/ai_engines/__init__.py create mode 100644 src/ai_engines/base.py create mode 100644 src/ai_engines/bedrock_adapter.py create mode 100644 src/ai_engines/claude_adapter.py create mode 100644 src/ai_engines/gemini_adapter.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/test_adapters_integration.py create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/test_adapters.py create mode 100644 tests/unit/test_ai_service_simple.py diff --git a/.env.example b/.env.example index 065df47..41c1cb3 100644 --- a/.env.example +++ b/.env.example @@ -1,93 +1,222 @@ -# Single Family Loan Q&A Application Configuration -# Copy this file to .env and update with your actual values +# 
============================================================================= +# converSQL - Environment Configuration +# Multi-Provider AI-Powered SQL Generation +# ============================================================================= -# ======================================== -# DATA CONFIGURATION -# ======================================== +# ----------------------------------------------------------------------------- +# AI Provider Configuration (Multi-Provider Support) +# ----------------------------------------------------------------------------- + +# DEFAULT AI PROVIDER (Primary provider for SQL generation) +# Options: bedrock | claude | gemini +# +# - bedrock: AWS Bedrock (requires AWS credentials + model access) +# - claude: Anthropic Claude API (requires CLAUDE_API_KEY) +# - gemini: Google Gemini API (requires GOOGLE_API_KEY) +# +# Leave empty or set to 'auto' to use first available provider +# The app will automatically fall back to other configured providers if the primary is unavailable +AI_PROVIDER=claude + +# Enable prompt caching for supported providers (reduces costs and latency) +# Options: true | false +ENABLE_PROMPT_CACHE=true + +# ----------------------------------------------------------------------------- +# AWS Bedrock Configuration +# ----------------------------------------------------------------------------- +# To use Bedrock: Configure AWS credentials + set AI_PROVIDER=bedrock +# OR let it auto-select if credentials are available + +# AWS region for Bedrock service +# Recommended: us-west-2, us-east-1 (check model availability by region) +AWS_DEFAULT_REGION=us-west-2 + +# Bedrock model identifier +# Recommended models: +# - anthropic.claude-3-5-sonnet-20241022-v2:0 (Most capable, higher cost) +# - anthropic.claude-3-5-haiku-20241022-v1:0 (Fast, cost-effective) +BEDROCK_MODEL_ID=anthropic.claude-3-5-haiku-20241022-v1:0 + +# Bedrock guardrail configuration (OPTIONAL - for content filtering) +# Leave empty if not using guardrails +BEDROCK_GUARDRAIL_ID= +BEDROCK_GUARDRAIL_VERSION=DRAFT + +# AWS credentials (OPTIONAL if using IAM roles, EC2 instance profiles, or AWS CLI profiles) +# Only set these if you're not using AWS credential chain +# AWS_ACCESS_KEY_ID=your-access-key-id +# AWS_SECRET_ACCESS_KEY=your-secret-access-key + +# ----------------------------------------------------------------------------- +# Anthropic Claude API Configuration +# ----------------------------------------------------------------------------- +# To use Claude API: Set CLAUDE_API_KEY + set AI_PROVIDER=claude +# Get your API key from: https://console.anthropic.com/ + +# Anthropic API key (REQUIRED for Claude API access) +CLAUDE_API_KEY= + +# Claude model identifier +# Recommended models: +# - claude-3-5-sonnet-20241022 (Most capable, balanced cost/performance) +# - claude-3-5-haiku-20241022 (Fast, cost-effective) +CLAUDE_MODEL=claude-3-5-sonnet-20241022 + +# ----------------------------------------------------------------------------- +# Google Gemini Configuration +# ----------------------------------------------------------------------------- +# To use Gemini: Set GOOGLE_API_KEY + set AI_PROVIDER=gemini +# Get your API key from: https://aistudio.google.com/app/apikey + +# Google API key (REQUIRED for Gemini access) +GOOGLE_API_KEY= + +# Alternative: Use GEMINI_API_KEY if you prefer (both work) +# GEMINI_API_KEY= + +# Gemini model identifier +# Recommended models: +# - gemini-1.5-pro (Most capable) +# - gemini-1.5-flash (Fast, cost-effective) +GEMINI_MODEL=gemini-1.5-pro + +# 
----------------------------------------------------------------------------- +# Google OAuth Configuration +# ----------------------------------------------------------------------------- + +# Google OAuth credentials for authentication +GOOGLE_CLIENT_ID=your_google_client_id +GOOGLE_CLIENT_SECRET=your_google_client_secret + +# OAuth redirect URIs - Configure in Google Cloud Console: +# β†’ APIs & Services β†’ Credentials β†’ OAuth 2.0 Client ID +# +# For LOCALHOST development: http://localhost:8501 +# For REPLIT deployment: https://[repl-name].[username].repl.co +# For STREAMLIT CLOUD: https://[your-app-name].streamlit.app +# For CUSTOM DOMAIN: https://yourdomain.com + +# Session secret key for secure cookie signing +# Generate with: python -c "import secrets; print(secrets.token_hex(32))" +SESSION_SECRET_KEY=your-session-secret-key-here + +# ----------------------------------------------------------------------------- +# Data Configuration +# ----------------------------------------------------------------------------- + +# Directory containing processed data files (Parquet format) PROCESSED_DATA_DIR=data/processed/ + +# Default data file to load +DEFAULT_DATA_FILE=data.parquet + +# Cache configuration CACHE_TTL=3600 FORCE_DATA_REFRESH=false -# ======================================== -# CLOUDFLARE R2 CONFIGURATION -# ======================================== -# Your Cloudflare R2 bucket details -R2_ENDPOINT_URL=https://50ee71713e4e8762d5eab0e8ec442f1e.r2.cloudflarestorage.com +# ----------------------------------------------------------------------------- +# Cloudflare R2 Storage Configuration (Optional) +# ----------------------------------------------------------------------------- + +# R2 bucket name R2_BUCKET_NAME=single-family-loan + +# R2 access credentials R2_ACCESS_KEY_ID=your_r2_access_key_id R2_SECRET_ACCESS_KEY=your_r2_secret_access_key -R2_ACCOUNT_ID=your_cloudflare_account_id -# ======================================== -# AI PROVIDER CONFIGURATION -# ======================================== -# Choose your AI provider: 'claude' or 'bedrock' -AI_PROVIDER=claude +# R2 endpoint URL +R2_ENDPOINT_URL=https://50ee71713e4e8762d5eab0e8ec442f1e.r2.cloudflarestorage.com -# Claude API Configuration (Recommended) -CLAUDE_API_KEY=your_claude_api_key_here -CLAUDE_MODEL=claude-3-5-sonnet-20241022 +# Cloudflare account ID +R2_ACCOUNT_ID=your_cloudflare_account_id -# Amazon Bedrock Configuration (Alternative) -BEDROCK_GUARDRAIL_ID=your_actual_guardrail_id -BEDROCK_MODEL_ID=anthropic.claude-3-5-haiku-20241022-v1:0 -AWS_DEFAULT_REGION=us-west-2 -ENABLE_BEDROCK=true +# ----------------------------------------------------------------------------- +# Cloudflare D1 Database Configuration (Optional) +# ----------------------------------------------------------------------------- -# AWS Authentication (for Bedrock - choose one method): -# Option 1: AWS Access Keys (for development/testing) -AWS_ACCESS_KEY_ID=your_aws_access_key -AWS_SECRET_ACCESS_KEY=your_aws_secret_key +# D1 database ID for user activity logging +CLOUDFLARE_D1_DATABASE_ID=your_d1_database_id -# Option 2: Use AWS CLI credentials or IAM roles (recommended for production) -# Leave the above commented out to use AWS CLI or IAM role credentials +# D1 API token +CLOUDFLARE_API_TOKEN=your_cloudflare_api_token -# ======================================== -# PERFORMANCE OPTIMIZATION -# ======================================== -ENABLE_PROMPT_CACHE=true -PROMPT_CACHE_TTL=3600 +# Cloudflare account ID 
+CLOUDFLARE_ACCOUNT_ID=your_cloudflare_account_id -# ======================================== -# AUTHENTICATION (GOOGLE OAUTH) -# ======================================== -# Google OAuth Configuration (Direct - No Supabase needed) -GOOGLE_CLIENT_ID=your_google_client_id -GOOGLE_CLIENT_SECRET=your_google_client_secret +# ----------------------------------------------------------------------------- +# Application Settings +# ----------------------------------------------------------------------------- -# IMPORTANT: Configure these URLs in your Google OAuth settings: -# Go to Google Cloud Console β†’ APIs & Services β†’ Credentials -# Edit your OAuth 2.0 Client ID and add these redirect URIs: -# -# For LOCALHOST development: -# - http://localhost:8501 -# -# For REPLIT deployment: -# - https://[repl-name].[username].repl.co -# -# For STREAMLIT CLOUD deployment: -# - https://[your-app-name].streamlit.app -# -# For CUSTOM DOMAIN: -# - https://yourdomain.com +# Application environment +# Options: development, staging, production +ENVIRONMENT=development -# ======================================== -# STREAMLIT CONFIGURATION -# ======================================== -STREAMLIT_SERVER_PORT=8501 -STREAMLIT_SERVER_HEADLESS=true +# Enable debug mode (detailed error messages and logging) +# Options: true, false +DEBUG=true -# ======================================== -# APPLICATION FEATURES -# ======================================== +# Application features DEMO_MODE=false ENABLE_AUTH=false -# ======================================== -# CLOUDFLARE D1 DATABASE (OPTIONAL) -# ======================================== -# For user activity logging (logins and queries) -# Leave empty to disable database logging -CLOUDFLARE_ACCOUNT_ID=your_cloudflare_account_id -CLOUDFLARE_D1_DATABASE_ID=your_d1_database_id -CLOUDFLARE_API_TOKEN=your_cloudflare_api_token \ No newline at end of file +# Streamlit configuration +STREAMLIT_SERVER_PORT=8501 +STREAMLIT_SERVER_HEADLESS=true + +# ============================================================================= +# Setup Instructions +# ============================================================================= +# +# 1. Copy this file to .env: +# cp .env.example .env +# +# 2. CONFIGURE AI PROVIDERS (Choose one or more): +# +# OPTION A - Single Provider Setup: +# -------------------------------- +# Configure ONE provider and set AI_PROVIDER explicitly: +# +# For Claude API: +# - Set CLAUDE_API_KEY=sk-ant-... +# - Set AI_PROVIDER=claude +# +# For AWS Bedrock: +# - Configure AWS credentials (IAM role OR access keys) +# - Set AWS_DEFAULT_REGION and BEDROCK_MODEL_ID +# - Set AI_PROVIDER=bedrock +# +# For Google Gemini: +# - Set GOOGLE_API_KEY=AIza... +# - Set AI_PROVIDER=gemini +# +# OPTION B - Multi-Provider Setup (Recommended): +# ---------------------------------------------- +# Configure MULTIPLE providers for redundancy and flexibility: +# +# 1. Set up credentials for all providers you want to use +# (e.g., both CLAUDE_API_KEY and GOOGLE_API_KEY) +# +# 2. Set AI_PROVIDER to your preferred default provider +# OR leave empty for automatic selection +# +# 3. The app will show a provider selector in the sidebar +# allowing you to switch between available providers +# +# 3. CONFIGURE AUTHENTICATION (Optional - for user management): +# - Set GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET +# - Configure OAuth redirect URIs in Google Cloud Console +# - Set ENABLE_AUTH=true +# +# 4. 
CONFIGURE DATA PATHS (Optional - customize data location): +# - Review PROCESSED_DATA_DIR and DEFAULT_DATA_FILE +# - Configure R2 or D1 settings if using Cloudflare storage +# +# 5. REVIEW APPLICATION SETTINGS: +# - Set ENVIRONMENT (development/staging/production) +# - Enable/disable DEBUG mode +# - Configure DEMO_MODE if needed +# +# For detailed setup instructions, see docs/ENVIRONMENT_SETUP.md +# ============================================================================= \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/ai_engine_contribution.md b/.github/ISSUE_TEMPLATE/ai_engine_contribution.md new file mode 100644 index 0000000..e104938 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/ai_engine_contribution.md @@ -0,0 +1,144 @@ +--- +name: AI Engine Contribution +about: Propose or implement support for a new AI engine/provider +title: '[AI ENGINE] ' +labels: ai-engine, enhancement +assignees: '' +--- + +## AI Engine Information + +**Provider Name**: [e.g., Google Gemini, Ollama, OpenAI, etc.] +**API Documentation**: [Link to official API docs] +**Pricing Model**: [Free tier available? Pay-per-use? Self-hosted?] + +## Motivation +Why should converSQL support this AI engine? +- What unique capabilities does it offer? +- What use cases does it enable? +- Who would benefit from this integration? + +## Implementation Status +- [ ] Proof of concept completed +- [ ] Adapter class implemented +- [ ] Configuration added +- [ ] Error handling implemented +- [ ] Tests written +- [ ] Documentation updated +- [ ] Ready for review + +## Technical Details + +### API Access +- **Authentication Method**: [API Key, OAuth, Self-hosted, etc.] +- **Rate Limits**: [If applicable] +- **Model Options**: [List available models] +- **Recommended Model**: [For SQL generation] + +### Sample Code +If you've started implementation, share a code snippet: + +```python +class NewEngineAdapter: + def __init__(self): + self.client = None + self._initialize() + + def _initialize(self): + # Initialization code + pass + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + # Implementation + pass +``` + +## Configuration Requirements + +What environment variables or configuration will users need? + +```bash +# .env additions +NEW_ENGINE_API_KEY=xxx +NEW_ENGINE_MODEL=model-name +NEW_ENGINE_ENDPOINT=https://... # If applicable +``` + +## Testing Plan + +How will this be tested? +- [ ] Unit tests for adapter +- [ ] Integration tests with real API +- [ ] Mock tests for CI/CD +- [ ] Manual testing completed + +## Documentation Plan + +What documentation needs to be created/updated? +- [ ] Add setup instructions to docs/ +- [ ] Update AI_ENGINES.md guide +- [ ] Add example queries +- [ ] Update README.md +- [ ] Add troubleshooting section + +## Dependencies + +Will this require new dependencies? + +```python +# requirements.txt additions +new-ai-library==1.0.0 +``` + +## Challenges and Considerations + +Are there any challenges or special considerations? +- API limitations +- Cost concerns +- Performance characteristics +- Error handling nuances +- Model-specific quirks + +## Comparison with Existing Engines + +How does this compare to existing supported engines (Bedrock, Claude)? + +| Feature | Bedrock | Claude | New Engine | +|---------|---------|--------|------------| +| Cost | $$$ | $$ | ? | +| Speed | Fast | Very Fast | ? | +| Accuracy | High | Very High | ? | +| Self-hosted | No | No | ? | + +## Questions for Review + +Any specific questions for maintainers? +1. ... +2. ... 
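+
+## Example Test Sketch (Optional)
+
+If it helps reviewers, sketch how the adapter could be unit-tested without real credentials. The snippet below is purely illustrative: `NewEngineAdapter`, its import path, `is_available()`, and `NEW_ENGINE_API_KEY` are placeholders carried over from the sample code and configuration sections above.
+
+```python
+# Illustrative pytest outline for a new adapter (all names are placeholders).
+from src.ai_engines.new_engine_adapter import NewEngineAdapter  # hypothetical module
+
+
+def test_adapter_unavailable_without_credentials(monkeypatch):
+    # Without an API key configured, the adapter should report itself
+    # unavailable rather than raising during construction.
+    monkeypatch.delenv("NEW_ENGINE_API_KEY", raising=False)
+    adapter = NewEngineAdapter()
+    assert adapter.is_available() is False
+```
+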
+ +## Willingness to Maintain + +- [ ] I'm willing to maintain this adapter +- [ ] I can provide ongoing support +- [ ] I'll respond to related issues +- [ ] I need help with maintenance + +## Additional Resources + +- Link to example implementations +- Link to relevant research/benchmarks +- Link to community discussions +- Other helpful resources + +--- + +**Contribution Checklist** (for implementers): +- [ ] Forked repository +- [ ] Created feature branch +- [ ] Implemented adapter following patterns in src/ai_service.py +- [ ] Added tests +- [ ] Updated documentation +- [ ] Tested locally +- [ ] Ready to submit PR + +**See [AI Engine Development Guide](docs/AI_ENGINES.md) for detailed implementation instructions.** diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..06dc6a6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,55 @@ +--- +name: Bug Report +about: Report a bug to help us improve converSQL +title: '[BUG] ' +labels: bug +assignees: '' +--- + +## Bug Description +A clear and concise description of what the bug is. + +## To Reproduce +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '...' +3. Enter '...' +4. See error + +## Expected Behavior +A clear and concise description of what you expected to happen. + +## Actual Behavior +A clear and concise description of what actually happened. + +## Screenshots +If applicable, add screenshots to help explain your problem. + +## Environment +- **OS**: [e.g., macOS 13.0, Ubuntu 22.04, Windows 11] +- **Python Version**: [e.g., 3.11.5] +- **converSQL Version/Branch**: [e.g., main, v1.0.0] +- **AI Provider**: [e.g., Claude API, AWS Bedrock, None] +- **Browser** (if UI issue): [e.g., Chrome 118, Firefox 119] + +## Configuration +```yaml +# Relevant parts of your .env file (REMOVE SENSITIVE DATA!) +AI_PROVIDER=bedrock +ENABLE_AUTH=true +# etc. +``` + +## Error Logs +``` +Paste relevant error messages or logs here +``` + +## Additional Context +Add any other context about the problem here. For example: +- Does this happen consistently or intermittently? +- Did this work in a previous version? +- Are there any workarounds? + +## Possible Solution +(Optional) If you have ideas about what might be causing this or how to fix it, share them here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..73df6b1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,68 @@ +--- +name: Feature Request +about: Suggest a new feature or enhancement for converSQL +title: '[FEATURE] ' +labels: enhancement +assignees: '' +--- + +## Feature Description +A clear and concise description of the feature you'd like to see. + +## Problem or Use Case +**Is your feature request related to a problem?** +Describe the problem this feature would solve. For example: +- "I'm frustrated when I need to..." +- "It's difficult to..." +- "Users would benefit from..." + +## Proposed Solution +Describe how you envision this feature working: +- What would the user interface look like? +- What would the API or code interface be? +- How would users interact with it? + +## Example Usage +Show how this feature would be used: + +```python +# Code example +result = new_feature(parameters) +``` + +Or describe user workflow: +1. User opens... +2. User selects... +3. System displays... + +## Alternatives Considered +Have you considered alternative solutions or features? 
Describe them here: +- Alternative A: ... +- Alternative B: ... + +## Benefits +Who would benefit from this feature? +- [ ] Data analysts +- [ ] Data engineers +- [ ] Developers +- [ ] End users +- [ ] System administrators +- [ ] Other: ___________ + +## Additional Context +Add any other context, screenshots, mockups, or examples: +- Links to similar features in other tools +- Research or articles supporting this feature +- Mockups or diagrams + +## Implementation Notes +(Optional) If you have technical implementation ideas: +- Which files/modules would be affected? +- What dependencies might be needed? +- Are there any potential challenges? + +## Willingness to Contribute +- [ ] I'm willing to submit a PR to implement this feature +- [ ] I can help with testing +- [ ] I can help with documentation +- [ ] I'd like to discuss this first diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5b8e3ac --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,142 @@ +name: CI/CD Pipeline + +on: + push: + branches: [ main, enhance-pipeline ] + pull_request: + branches: [ main, enhance-pipeline ] + +jobs: + lint: + name: Code Quality Checks + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install black flake8 isort mypy + pip install -r requirements.txt + + - name: Check formatting with Black + run: black --check --line-length 120 src/ tests/ + + - name: Lint with Flake8 + run: flake8 src/ tests/ + + - name: Check import sorting with isort + run: isort --check-only --profile black src/ tests/ + + - name: Type check with mypy + run: mypy src/ + continue-on-error: true + + test: + name: Run Tests + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run unit tests with coverage + run: | + pytest tests/unit/ -v --cov=src --cov-report=xml --cov-report=term-missing + env: + PYTHONPATH: ${{ github.workspace }} + + - name: Upload coverage to Codecov + if: matrix.python-version == '3.11' + uses: codecov/codecov-action@v4 + with: + file: ./coverage.xml + flags: unittests + name: codecov-umbrella + fail_ci_if_error: false + token: ${{ secrets.CODECOV_TOKEN }} + + integration-test: + name: Integration Tests + runs-on: ubuntu-latest + needs: test + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run integration tests + run: | + pytest tests/integration/ -v -m integration + env: + PYTHONPATH: ${{ github.workspace }} + continue-on-error: true + + format: + name: Auto-format Code + runs-on: ubuntu-latest + if: github.event_name == 'push' + + steps: + - uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install formatting tools + run: | + python -m pip install --upgrade 
pip + pip install black isort + + - name: Format code with Black + run: black --line-length 120 src/ tests/ + + - name: Sort imports with isort + run: isort --profile black src/ tests/ + + - name: Commit changes + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git diff --quiet && git diff --staged --quiet || (git add -A && git commit -m "style: auto-format code with black and isort [skip ci]") + + - name: Push changes + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + branch: ${{ github.ref }} + if: success() diff --git a/.gitignore b/.gitignore index 48ec9aa..80514ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,18 @@ # Byte-compiled / optimized / DLL files __pycache__/ -*.py[codz] +*.py[cod] *$py.class +*.pyc +*.pyo +*~ + # C extensions *.so + +# Environment files +.env .env.yml +.env.local # Distribution / packaging .Python @@ -46,10 +54,13 @@ htmlcov/ nosetests.xml coverage.xml *.cover +*.py,cover *.py.cover .hypothesis/ .pytest_cache/ cover/ +test-results/ +*.xml # Translations *.mo @@ -57,6 +68,8 @@ cover/ # Django stuff: *.log +*.out +*.err local_settings.py db.sqlite3 db.sqlite3-journal @@ -143,6 +156,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.python-version # Spyder project settings .spyderproject @@ -205,3 +219,16 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# Project specific +*.tmp +*.temp +*.bak +*.swp +*.swo +.DS_Store + +# Backup files +*_old.py +*_backup.py +data/raw/2025Q1.csv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..88f6d03 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,552 @@ +# Contributing to converSQL + +Thank you for your interest in contributing to converSQL! We're building an open-source framework that makes data conversational, and we welcome contributions from developers, data engineers, analysts, and domain experts. + +## 🌟 How You Can Contribute + +There are many ways to contribute to converSQL: + +- **πŸ› Report bugs** β€” Help us identify and fix issues +- **πŸ’‘ Suggest features** β€” Share ideas for new capabilities +- **πŸ“ Improve documentation** β€” Make converSQL easier to understand and use +- **πŸ”§ Add AI engine adapters** β€” Extend support to new AI providers +- **🎨 Enhance the UI** β€” Improve the user experience +- **πŸ§ͺ Write tests** β€” Increase code coverage and reliability +- **πŸ—οΈ Add domain implementations** β€” Showcase converSQL in new industries + +--- + +## πŸš€ Getting Started + +### 1. Fork and Clone + +```bash +# Fork the repository on GitHub, then clone your fork +git clone https://github.com/YOUR_USERNAME/conversql.git +cd conversql + +# Add upstream remote +git remote add upstream https://github.com/ravishan16/conversql.git +``` + +### 2. Set Up Development Environment + +```bash +# Create virtual environment +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install dependencies +pip install -r requirements.txt + +# Install development dependencies +pip install pytest pytest-cov black flake8 mypy + +# Copy environment template +cp .env.example .env +# Edit .env with your development settings +``` + +### 3. 
Create a Feature Branch + +```bash +# Sync with upstream +git fetch upstream +git checkout main +git merge upstream/main + +# Create your feature branch +git checkout -b feature/your-feature-name +``` + +--- + +## πŸ“‹ Contribution Guidelines + +### Code Standards + +We follow Python best practices to maintain code quality: + +**Style Guide:** +- Follow [PEP 8](https://pep8.org/) style guidelines +- Use meaningful variable and function names +- Add docstrings to all public functions and classes +- Keep functions focused and concise (ideally <50 lines) +- Use type hints where appropriate + +**Formatting:** +```bash +# Format code with Black +black src/ app.py + +# Check with flake8 +flake8 src/ app.py --max-line-length=100 +``` + +**Example:** +```python +def execute_sql_query(sql_query: str, parquet_files: List[str]) -> pd.DataFrame: + """ + Execute SQL query using DuckDB on Parquet files. + + Args: + sql_query: SQL query string to execute + parquet_files: List of absolute paths to Parquet files + + Returns: + DataFrame with query results + + Raises: + Exception: If query execution fails + """ + try: + conn = duckdb.connect() + + for file_path in parquet_files: + table_name = os.path.splitext(os.path.basename(file_path))[0] + conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM '{file_path}'") + + result_df = conn.execute(sql_query).fetchdf() + conn.close() + + return result_df + + except Exception as e: + logger.error(f"Query execution failed: {e}") + raise +``` + +### Testing + +All new features and bug fixes should include tests: + +**Writing Tests:** +```python +# tests/test_core.py +import pytest +from src.core import execute_sql_query + +def test_execute_simple_query(sample_parquet_files): + """Test execution of a simple SELECT query.""" + sql = "SELECT COUNT(*) as count FROM data" + result = execute_sql_query(sql, sample_parquet_files) + + assert len(result) == 1 + assert 'count' in result.columns + assert result['count'][0] > 0 + +def test_execute_invalid_query(sample_parquet_files): + """Test handling of invalid SQL.""" + sql = "SELECT * FROM nonexistent_table" + + with pytest.raises(Exception): + execute_sql_query(sql, sample_parquet_files) +``` + +**Running Tests:** +```bash +# Run all tests +pytest + +# Run with coverage report +pytest --cov=src --cov-report=html + +# Run specific test file +pytest tests/test_core.py + +# Run specific test +pytest tests/test_core.py::test_execute_simple_query +``` + +### Documentation + +Good documentation makes converSQL accessible: + +**Code Documentation:** +- Add docstrings to all public functions, classes, and modules +- Use Google-style or NumPy-style docstrings +- Include examples in docstrings where helpful + +**Project Documentation:** +- Update relevant `.md` files in `docs/` for new features +- Add usage examples to README.md if applicable +- Create new documentation files for major features + +**Example Docstring:** +```python +def generate_sql_with_ai(user_question: str, schema_context: str) -> Tuple[str, str]: + """ + Generate SQL query from natural language using AI. + + This function sends the user's question along with database schema context + to an AI provider (Bedrock, Claude, etc.) and returns the generated SQL query. + + Args: + user_question: Natural language question from the user + schema_context: Database schema information including table structures, + relationships, and ontological context + + Returns: + Tuple of (sql_query, error_message). 
If successful, sql_query contains + the generated SQL and error_message is empty. If failed, sql_query is + empty and error_message contains the error details. + + Example: + >>> schema = get_table_schemas(parquet_files) + >>> sql, error = generate_sql_with_ai( + ... "Show top 10 states by loan volume", + ... schema + ... ) + >>> print(sql) + SELECT STATE, COUNT(*) as loan_count FROM data GROUP BY STATE ... + """ + service = get_ai_service() + sql_query, error_msg, provider = service.generate_sql(user_question, schema_context) + return sql_query, error_msg +``` + +--- + +## πŸ”§ Adding New AI Engine Adapters + +One of the best ways to contribute is by adding support for new AI providers. See our detailed guide: + +πŸ“„ **[AI Engine Development Guide](docs/AI_ENGINES.md)** + +### Quick Overview + +1. **Implement the adapter interface**: +```python +class GeminiAdapter(AIEngineAdapter): + def __init__(self): + self.client = None + self._initialize() + + def _initialize(self): + """Initialize Gemini client.""" + # Setup code here + + def is_available(self) -> bool: + """Check if Gemini is available.""" + return self.client is not None + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """Generate SQL using Gemini.""" + # Implementation here +``` + +2. **Register in AI service**: +```python +# src/ai_service.py +class AIService: + def __init__(self): + self.bedrock = BedrockClient() + self.claude = ClaudeClient() + self.gemini = GeminiAdapter() # Add new adapter + self._determine_active_provider() +``` + +3. **Add configuration**: +```python +# .env +GEMINI_API_KEY=your_gemini_key +GEMINI_MODEL=gemini-pro +``` + +4. **Test thoroughly**: +```python +def test_gemini_adapter(): + adapter = GeminiAdapter() + assert adapter.is_available() + + sql, error = adapter.generate_sql("SELECT * FROM data LIMIT 10") + assert sql + assert not error +``` + +--- + +## 🎯 Development Workflow + +### Making Changes + +1. **Write code** following our style guidelines +2. **Add tests** for new functionality +3. **Update documentation** as needed +4. **Run tests** to ensure everything works +5. **Format code** with Black +6. **Commit changes** with clear messages + +### Commit Messages + +Write clear, descriptive commit messages: + +**Good:** +``` +Add Gemini AI adapter support + +- Implement GeminiAdapter class with API integration +- Add configuration for Gemini API key and model +- Include error handling and fallback logic +- Add unit tests for Gemini adapter +- Update documentation with Gemini setup instructions +``` + +**Bad:** +``` +fixed stuff +updated files +changes +``` + +**Format:** +``` +: + + + + +``` + +**Types:** +- `feat`: New feature +- `fix`: Bug fix +- `docs`: Documentation changes +- `style`: Code style changes (formatting, etc.) +- `refactor`: Code refactoring +- `test`: Adding or updating tests +- `chore`: Maintenance tasks + +### Pull Request Process + +1. **Ensure all tests pass**: +```bash +pytest +``` + +2. **Update documentation** if you've added features + +3. **Push to your fork**: +```bash +git push origin feature/your-feature-name +``` + +4. **Create Pull Request** on GitHub: + - Use a clear, descriptive title + - Reference any related issues (`Fixes #123`) + - Describe what changed and why + - Include screenshots for UI changes + - List any breaking changes + +5. 
**Address review feedback**: + - Respond to comments + - Make requested changes + - Push updates to your branch + +### Pull Request Template + +When creating a PR, use this template: + +```markdown +## Description +Brief description of what this PR does. + +## Type of Change +- [ ] Bug fix +- [ ] New feature +- [ ] Breaking change +- [ ] Documentation update + +## Testing +Describe how you tested your changes: +- [ ] Unit tests added/updated +- [ ] Manual testing performed +- [ ] All existing tests pass + +## Checklist +- [ ] Code follows project style guidelines +- [ ] Self-review completed +- [ ] Comments added for complex logic +- [ ] Documentation updated +- [ ] No new warnings generated +- [ ] Tests added that prove fix/feature works +- [ ] Dependent changes merged + +## Related Issues +Fixes #(issue number) + +## Screenshots (if applicable) +Add screenshots for UI changes. +``` + +--- + +## πŸ› Reporting Bugs + +### Before Reporting + +1. **Check existing issues** β€” Your bug may already be reported +2. **Try the latest version** β€” The bug might be fixed +3. **Gather information** β€” Logs, error messages, steps to reproduce + +### Bug Report Template + +```markdown +**Describe the bug** +A clear description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Enter '....' +4. See error + +**Expected behavior** +What you expected to happen. + +**Actual behavior** +What actually happened. + +**Screenshots** +If applicable, add screenshots. + +**Environment:** +- OS: [e.g., macOS 13.0] +- Python version: [e.g., 3.11.5] +- converSQL version: [e.g., 1.0.0] +- AI Provider: [e.g., Claude API] + +**Additional context** +Any other relevant information. + +**Logs** +``` +Paste relevant log output here +``` +``` + +--- + +## πŸ’‘ Suggesting Features + +We love new ideas! When suggesting features: + +1. **Check existing issues** β€” Your idea might already be proposed +2. **Describe the use case** β€” Help us understand the problem you're solving +3. **Propose a solution** β€” Share your thoughts on implementation +4. **Consider alternatives** β€” What other approaches might work? + +### Feature Request Template + +```markdown +**Is your feature request related to a problem?** +A clear description of the problem. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear description of what you want to happen. + +**Describe alternatives you've considered** +Other solutions or features you've considered. + +**Use cases** +How would this feature be used? Who benefits? + +**Additional context** +Add any other context or screenshots. +``` + +--- + +## πŸ“š Documentation Contributions + +Documentation is crucial for adoption. Ways to contribute: + +- **Fix typos and unclear explanations** +- **Add examples and tutorials** +- **Improve setup guides** +- **Create video walkthroughs** +- **Translate documentation** (future) + +--- + +## πŸ—οΈ Domain-Specific Implementations + +Help showcase converSQL in new domains: + +1. **Choose a domain** (healthcare, e-commerce, finance, etc.) +2. **Find an open dataset** or create a sample dataset +3. **Define ontology** for that domain +4. **Create data pipeline** to transform data +5. **Build example queries** demonstrating value +6. 
**Document the implementation** + +Example domains we'd love to see: +- Healthcare: Patient outcomes, clinical trials +- E-commerce: Customer behavior, inventory +- Finance: Transaction analytics, fraud detection +- Education: Student performance, enrollment +- Transportation: Fleet management, routing +- Energy: Usage patterns, optimization + +--- + +## 🀝 Community Guidelines + +### Code of Conduct + +We are committed to providing a welcoming and inclusive environment: + +- **Be respectful** β€” Treat everyone with respect +- **Be collaborative** β€” Work together toward common goals +- **Be patient** β€” Help others learn and grow +- **Be constructive** β€” Provide helpful feedback +- **Be inclusive** β€” Welcome diverse perspectives + +### Getting Help + +If you need help: + +- **Check documentation** in the `docs/` folder +- **Search issues** for similar questions +- **Ask in discussions** on GitHub +- **Be specific** about your problem and what you've tried + +--- + +## πŸ“¬ Communication + +- **GitHub Issues** β€” Bug reports and feature requests +- **GitHub Discussions** β€” Questions, ideas, and general discussion +- **Pull Requests** β€” Code contributions +- **Email** β€” For security issues or private matters + +--- + +## πŸŽ‰ Recognition + +Contributors are recognized in several ways: + +- Listed in README.md contributors section +- Mentioned in release notes for significant contributions +- Invited to join core contributor team (for consistent contributors) + +--- + +## πŸ“„ License + +By contributing to converSQL, you agree that your contributions will be licensed under the MIT License. + +--- + +## πŸ™ Thank You! + +Every contribution, no matter how small, makes converSQL better. Whether you're fixing a typo, adding a feature, or helping others in discussions β€” **thank you** for being part of the converSQL community! + +**Questions?** Open an issue or start a discussion. We're here to help! + +--- + +**Happy Contributing! πŸš€** + +*Making data conversational, together.* diff --git a/GOOGLE_OAUTH_SETUP.md b/GOOGLE_OAUTH_SETUP.md deleted file mode 100644 index e3efe99..0000000 --- a/GOOGLE_OAUTH_SETUP.md +++ /dev/null @@ -1,106 +0,0 @@ -# Google OAuth Setup Guide - -This app now uses direct Google OAuth for authentication - no database required! - -## πŸš€ Quick Setup - -### 1. Get Google OAuth Credentials - -1. Go to [Google Cloud Console](https://console.cloud.google.com/) -2. Create a new project or select existing one -3. Go to **APIs & Services** β†’ **Credentials** -4. Click **+ CREATE CREDENTIALS** β†’ **OAuth 2.0 Client IDs** -5. Choose **Web application** -6. Add your redirect URIs (see below) -7. Copy the **Client ID** and **Client Secret** - -### 2. Configure Redirect URIs - -In your Google OAuth client, add these authorized redirect URIs: - -**For localhost:** -``` -http://localhost:8501 -``` - -**For IP address access:** -``` -http://YOUR_IP_ADDRESS:8501 -``` - -**For Replit:** -``` -https://your-repl-name.your-username.repl.co -``` - -**For Streamlit Cloud:** -``` -https://your-app-name.streamlit.app -``` - -### 3. 
Set Environment Variables - -Create a `.env` file with: - -```bash -# Google OAuth (required) -GOOGLE_CLIENT_ID=your_google_client_id_here -GOOGLE_CLIENT_SECRET=your_google_client_secret_here - -# Auth settings -ENABLE_AUTH=true -DEMO_MODE=false # Set to true for debugging -``` - -## βœ… Benefits of This Approach - -- **No Database**: No Supabase or external database needed -- **Simple**: Just Google OAuth credentials required -- **Fast**: Direct authentication flow -- **Clean**: No complex setup or multiple services -- **Secure**: Google handles all security aspects - -## πŸ”§ Configuration Options - -```bash -ENABLE_AUTH=false # Disable authentication entirely -DEMO_MODE=true # Show debug information -``` - -## πŸ› Debugging - -Set `DEMO_MODE=true` to see: -- OAuth URLs being generated -- Redirect URIs being used -- Authentication flow details -- Configuration status - -## πŸ“± How It Works - -1. User clicks "Sign in with Google" -2. App redirects to Google OAuth -3. Google redirects back with authorization code -4. App exchanges code for user information -5. User information stored in session (no database) -6. User stays authenticated until session expires - -## πŸ”’ Security Features - -- **CSRF Protection**: State parameter prevents attacks -- **Session-based**: No persistent storage of credentials -- **Google Security**: Leverages Google's security infrastructure -- **Automatic Cleanup**: Clears sensitive data on logout - -## πŸš€ Deployment - -The app automatically detects your environment and uses the correct redirect URI: -- Localhost: `http://localhost:8501` -- IP access: `http://your-ip:8501` -- Replit: `https://repl-url` -- Streamlit Cloud: `https://app-url` - -Just make sure to add all the URLs you'll use to your Google OAuth configuration! - ---- - -**That's it!** Much simpler than the previous Supabase setup. πŸŽ‰ \ No newline at end of file diff --git a/Makefile b/Makefile index ce99c6c..0bd3eb1 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ -# NLP to SQL - Makefile +# converSQL - Makefile # Professional development utilities for the Streamlit application -.PHONY: help clean clean-cache clean-logs start dev install test lint format check-deps +.PHONY: help clean clean-cache clean-logs start dev install install-dev test test-unit test-integration test-cov lint format format-check check-deps setup ci # Default target help: - @echo "πŸ” NLP to SQL - Development Commands" + @echo "πŸ” converSQL - Development Commands" @echo "==================================" @echo "" @echo "πŸ“± Application Commands:" @@ -19,16 +19,23 @@ help: @echo "" @echo "πŸ› οΈ Development Commands:" @echo " install Install dependencies from requirements.txt" - @echo " test Run tests (if available)" - @echo " lint Run code linting" - @echo " format Format code with black" + @echo " install-dev Install with development dependencies" + @echo " test Run all tests with coverage" + @echo " test-unit Run unit tests only" + @echo " test-integration Run integration tests only" + @echo " test-cov Run tests with coverage report" + @echo " lint Run code linting (flake8)" + @echo " format Format code with black and isort" + @echo " format-check Check code formatting without changes" @echo " check-deps Check for dependency updates" + @echo " setup Complete setup for new development environment" + @echo " ci Run full CI checks (format, lint, test)" @echo "" @echo "Usage: make " # Start Streamlit application with logging start: - @echo "πŸš€ Starting NLP to SQL Streamlit App..." 
+ @echo "πŸš€ Starting converSQL Streamlit App..." @echo "πŸ“ URL: http://localhost:8501" @echo "πŸ“ Logs will be displayed below..." @echo "==================================" @@ -48,7 +55,12 @@ clean: clean-cache clean-logs @find . -type f -name "*.tmp" -delete 2>/dev/null || true @find . -type f -name "*.temp" -delete 2>/dev/null || true @find . -type d -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true - @find . -type d -name ".coverage" -exec rm -rf {} + 2>/dev/null || true + @rm -rf .pytest_cache 2>/dev/null || true + @rm -f .coverage 2>/dev/null || true + @rm -f coverage.xml 2>/dev/null || true + @rm -rf htmlcov 2>/dev/null || true + @rm -rf .mypy_cache 2>/dev/null || true + @rm -rf .ruff_cache 2>/dev/null || true @echo "βœ… Cleanup completed!" # Clean Python cache files @@ -56,6 +68,7 @@ clean-cache: @echo "🧹 Cleaning Python cache files..." @find . -type f -name "*.pyc" -delete 2>/dev/null || true @find . -type f -name "*.pyo" -delete 2>/dev/null || true + @find . -type f -name "*~" -delete 2>/dev/null || true @find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true @find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true @find . -type f -name ".DS_Store" -delete 2>/dev/null || true @@ -66,6 +79,7 @@ clean-logs: @echo "🧹 Cleaning log files..." @find . -type f -name "*.log" -delete 2>/dev/null || true @find . -type f -name "*.out" -delete 2>/dev/null || true + @find . -type f -name "*.err" -delete 2>/dev/null || true @echo "βœ… Log files cleaned!" # Install dependencies @@ -74,32 +88,61 @@ install: @pip install -r requirements.txt @echo "βœ… Dependencies installed!" -# Run tests (if test files exist) +# Install with development dependencies +install-dev: install + @echo "πŸ“¦ Installing development dependencies..." + @pip install -r requirements.txt + @echo "βœ… All dependencies installed!" + +# Run all tests with coverage test: - @echo "πŸ§ͺ Running tests..." - @if [ -d "tests" ] || ls test_*.py 1> /dev/null 2>&1; then \ - python -m pytest -v; \ - else \ - echo "⚠️ No tests found. Create test files to enable testing."; \ - fi + @echo "πŸ§ͺ Running all tests with coverage..." + @pytest tests/ -v --cov=src --cov-report=term-missing --cov-report=html + @echo "βœ… Tests completed! Coverage report: htmlcov/index.html" + +# Run unit tests only +test-unit: + @echo "πŸ§ͺ Running unit tests..." + @pytest tests/unit/ -v -m "not integration" + @echo "βœ… Unit tests completed!" + +# Run integration tests only +test-integration: + @echo "πŸ§ͺ Running integration tests..." + @pytest tests/integration/ -v -m integration + @echo "βœ… Integration tests completed!" + +# Run tests with detailed coverage +test-cov: + @echo "πŸ§ͺ Running tests with detailed coverage..." + @pytest tests/ -v --cov=src --cov-report=term-missing --cov-report=html --cov-report=xml --cov-branch + @echo "βœ… Coverage reports generated:" + @echo " - Terminal: (shown above)" + @echo " - HTML: htmlcov/index.html" + @echo " - XML: coverage.xml" # Run linting lint: @echo "πŸ” Running code linting..." - @if command -v flake8 >/dev/null 2>&1; then \ - flake8 app.py --max-line-length=88 --ignore=E203,W503; \ - else \ - echo "⚠️ flake8 not installed. Run: pip install flake8"; \ - fi + @echo "Running flake8..." + @flake8 src/ tests/ app.py || echo "⚠️ Flake8 found issues" + @echo "Running mypy..." + @mypy src/ || echo "⚠️ MyPy found issues" + @echo "βœ… Linting completed!" -# Format code +# Format code with black and isort format: - @echo "✨ Formatting code..." 
- @if command -v black >/dev/null 2>&1; then \ - black app.py --line-length=88; \ - else \ - echo "⚠️ black not installed. Run: pip install black"; \ - fi + @echo "✨ Formatting code with black and isort..." + @black --line-length 120 src/ tests/ app.py + @isort --profile black src/ tests/ app.py + @echo "βœ… Code formatted!" + +# Check code formatting without making changes +format-check: + @echo "πŸ” Checking code formatting..." + @black --check --line-length 120 src/ tests/ app.py || (echo "❌ Code needs formatting. Run 'make format'" && exit 1) + @isort --check-only --profile black src/ tests/ app.py || (echo "❌ Imports need sorting. Run 'make format'" && exit 1) + @echo "βœ… Code formatting is correct!" # Check for dependency updates check-deps: @@ -113,10 +156,17 @@ check-deps: fi # Quick setup for new development environment -setup: install clean +setup: install-dev clean @echo "πŸŽ‰ Setup completed! Ready for development." @echo "" @echo "Quick start:" @echo " make start # Start the application" @echo " make dev # Start in development mode" - @echo " make help # Show all available commands" \ No newline at end of file + @echo " make test # Run tests" + @echo " make help # Show all available commands" + +# Run full CI checks locally +ci: clean format-check lint test-cov + @echo "βœ… All CI checks passed!" + @echo "" + @echo "Ready to commit and push!" \ No newline at end of file diff --git a/README.md b/README.md index 64b4e55..fba2b4b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,64 @@ -# Conversational SQL +# Conver--- -**Conversational SQL** is an open-source framework for transforming natural language questions into powerful SQL queries for any tabular dataset. It’s designed for developers, data scientists, and teams who want to build AI-powered analytics tools with minimal effort. +## πŸ“– The Story Behind converSQL + +### The Problem + +Data is everywhere, but accessing it remains a technical barrier. Analysts spend hours writing SQL queries. Business users wait for reports. Data scientists translate questions into complex joins and aggregations. Meanwhile, the insights trapped in your data remain just out of reach for those who need them most. + +Traditional BI tools offer pre-built dashboards, but they're rigid. They can't answer the questions you didn't anticipate. And when you need a custom query, you're back to writing SQL or waiting in the queue for engineering support. + +### The Open Data Opportunity + +What if we could turn this around? What if anyone could ask questions in plain English and get instant, accurate SQL queries tailored to their specific data domain? + +That's where converSQL comes in. 
Built on the principle that **data should be conversational**, converSQL combines: +- **Ontological modeling**: Structured knowledge about your data domains, relationships, and business rules +- **AI-powered generation**: Multiple AI engines (Bedrock, Claude, Gemini, Ollama) that understand context and generate accurate SQL +- **Open data focus**: Showcasing what's possible with publicly available datasets like Fannie Mae's Single Family Loan Performance Data + +### Our Mission + +We believe data analysis should be: +- **Accessible**: Ask questions in natural language, get answers in seconds +- **Intelligent**: Understand business context, not just column names +- **Extensible**: Easy to adapt to any domain with any data structure +- **Open**: Built on open-source principles, welcoming community contributions + +--- + +## 🏑 Flagship Implementation: Single Family Loan Analytics + +To demonstrate converSQL's capabilities, we've built a production-ready application analyzing **9+ million mortgage loan records** from Fannie Mae's public dataset. + +### Why This Matters + +The Single Family Loan Performance Data represents one of the most comprehensive public datasets on U.S. mortgage markets. It contains granular loan-level data spanning originations, performance, modifications, and defaults. But with 110+ columns and complex domain knowledge required, it's challenging to analyze effectively. + +**converSQL makes it conversational:** + +πŸ” **Natural Language Query:** +*"Show me high-risk loans in California with credit scores below 620"* + +✨ **Generated SQL:** +```sql +SELECT LOAN_ID, STATE, CSCORE_B, OLTV, DTI, DLQ_STATUS, CURRENT_UPB +FROM data +WHERE STATE = 'CA' + AND CSCORE_B < 620 + AND CSCORE_B IS NOT NULL +ORDER BY CSCORE_B ASC, OLTV DESC +LIMIT 20 +``` + +πŸ“Š **Instant Results** β€” with context-aware risk metrics and portfolio insights.L + +# converSQL + +> **Transform Natural Language into SQL β€” Intelligently** + +**converSQL** is an open-source framework that bridges the gap between human questions and database queries. Using ontological data modeling and AI-powered query generation, converSQL makes complex data analysis accessible to everyone β€” from analysts to executives β€” without requiring SQL expertise. ## πŸš€ Why Conversational SQL? @@ -15,95 +72,55 @@ Stop writing complex SQL by hand! With Conversational SQL, you can: This repo features a production-grade implementation for mortgage loan portfolio analysis. It’s a showcase of how Conversational SQL can power real-world, domain-specific analytics. -**Key Features:** -- **Natural Language to SQL**: "Show me high-risk loans in California" β†’ SQL -- **Ontological Intelligence**: 110+ fields, 15 business domains, semantic relationships -- **Real-time Analytics**: Dashboards, metrics, risk indicators -- **Multi-Provider AI**: Anthropic Claude, AWS Bedrock, local models -- **Cloudflare D1 Logging**: All user logins and queries are securely logged using Cloudflare D1 (no external DB required) +### Key Features +- **🧠 Ontological Intelligence**: 110+ fields organized into 15 business domains (Credit Risk, Geographic, Temporal, Performance, etc.) 
+- **🎯 Domain-Aware Context**: AI understands mortgage terminology β€” "high-risk" automatically considers credit scores, LTV ratios, and DTI +- **⚑ High-Performance Pipeline**: Pipe-separated CSVs β†’ Parquet with schema enforcement, achieving 10x compression and instant query performance +- **πŸ” Enterprise Security**: Google OAuth integration with Cloudflare D1 query logging +- **πŸš€ Multiple AI Engines**: Out-of-the-box support for AWS Bedrock, Claude API, and extensible to Gemini, Ollama, and more -## 🧠 How Ontology Improves SQL Generation +--- -Conversational SQL uses an ontological approach to bridge the gap between natural language and complex, domain-specific SQL. This enables: +## πŸ—οΈ Architecture -- **Accurate mapping of business terms to data fields** -- **Automatic handling of semantic relationships and business rules** -- **Consistent, explainable query generation for analytics and reporting** +converSQL follows a clean, layered architecture designed for extensibility: -**Examples:** +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Application Layer β”‚ +β”‚ (Streamlit UI β€’ Query Builder β€’ Ontology Explorer) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ AI Engine Layer β”‚ +β”‚ (Adapter Pattern: Bedrock β€’ Claude β€’ Gemini β€’ Ollama) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Intelligence Layer β”‚ +β”‚ (Ontology β€’ Schema Context β€’ Business Rules) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Data Layer β”‚ +β”‚ (Parquet Files β€’ DuckDB β€’ R2 Storage β€’ Query Execution) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` -- *Domain-Aware Context*: Instead of "Show loans in bad condition," the ontology maps this to `DLQ_STATUS = '03'` (90+ days delinquent). 
-- *Semantic Relationships*: "High-risk borrowers" automatically includes: - - `CSCORE_B < 620` (credit quality) - - `OLTV > 95%` (equity position) - - `DTI > 43%` (payment capacity) -- *Business Intelligence Integration*: "Portfolio concentration risk" generates: - ```sql - SELECT STATE, SUM(CURRENT_UPB)/1000000 as UPB_MM, - COUNT(*) as loan_count, - SUM(CURRENT_UPB)/(SELECT SUM(CURRENT_UPB) FROM data)*100 as pct_portfolio - FROM data GROUP BY STATE HAVING pct_portfolio > 15 - ``` +### The Data Engineering Pipeline -## πŸ—οΈ Architecture +Our showcase implementation demonstrates a complete data engineering workflow: -```mermaid -graph TB - subgraph "Data Layer" - R2[Cloudflare R2 Storage] - PQ[Parquet Files
9M+ loan records] - DUCK[DuckDB Engine] - D1[Cloudflare D1 Logging] - end - - subgraph "Intelligence Layer" - ONT[Ontological Data Dictionary
15 Domains Γ— 110 Fields] - VAL[Query Validator
Business Rules] - REL[Semantic Relationships] - end - - subgraph "AI Layer" - BED[AWS Bedrock] - CLA[Anthropic Claude] - LOC[Local Models] - end - - subgraph "Application Layer" - UI[Streamlit Interface] - QRY[Query Builder] - VIZ[Data Visualization] - EXP[Ontology Explorer] - end - - subgraph "Security Layer" - AUTH[Google OAuth] - SESS[Session Management] - end - - R2 --> PQ - PQ --> DUCK - ONT --> REL - D1 --> UI - - UI --> QRY - QRY --> ONT - QRY --> BED & CLA & LOC - BED & CLA & LOC --> DUCK - - AUTH --> UI - SESS --> QRY - - classDef dataNodes fill:#e1f5fe - classDef aiNodes fill:#f3e5f5 - classDef appNodes fill:#e8f5e8 - classDef secNodes fill:#fff3e0 - - class R2,PQ,DUCK,D1 dataNodes - class ONT,REL,BED,CLA,LOC aiNodes - class UI,QRY,VIZ,EXP appNodes - class AUTH,SESS secNodes -``` +1. **Ingestion**: Fannie Mae's pipe-separated loan performance files +2. **Transformation**: Schema enforcement with explicit data types (VARCHAR, Float, Int16, etc.) +3. **Storage**: Parquet format with SNAPPY compression (10x size reduction) +4. **Performance**: DuckDB for blazing-fast analytical queries +5. **Ontology**: Structured metadata linking business concepts to database schema + +πŸ“„ **[Learn more about the data pipeline β†’](docs/DATA_PIPELINE.md)** + +--- ## �️ Quick Start @@ -116,7 +133,7 @@ graph TB ### Installation ```bash git clone -cd nlptosql +cd converSQL pip install -r requirements.txt ``` @@ -147,26 +164,151 @@ All setup and deployment guides are located in the `docs/` directory: -## πŸ’‘ Extending Conversational SQL +## οΏ½ Documentation -Conversational SQL is designed for easy adaptation to any tabular dataset. To use it for your own data, simply swap out the ontology and schema files for your domain. +### Setup Guides +- **[Environment Setup](docs/ENVIRONMENT_SETUP.md)** β€” Configure environment variables and dependencies +- **[Data Pipeline Setup](docs/DATA_PIPELINE.md)** β€” Understand and customize the data pipeline +- **[Google OAuth Setup](docs/GOOGLE_OAUTH_SETUP.md)** β€” Enable authentication +- **[Cloud Storage Setup](docs/R2_SETUP.md)** β€” Configure Cloudflare R2 +- **[Deployment Guide](docs/DEPLOYMENT.md)** β€” Deploy to production +### Developer Guides +- **[Contributing Guide](CONTRIBUTING.md)** β€” How to contribute to converSQL +- **[AI Engine Development](docs/AI_ENGINES.md)** β€” Add support for new AI providers +- **[Architecture Overview](docs/ARCHITECTURE.md)** β€” Deep dive into system design -## 🎯 Example Use Cases - -- **Portfolio Risk Management**: "Show me all loans in Florida with FICO scores below 620" -- **Performance Analytics**: "What's the delinquency rate by vintage year for California loans?" -- **Concentration Risk**: "Which states have more than 15% of our portfolio?" -- **Credit Quality Assessment**: "Compare average DTI and LTV by credit score tier" +--- ## 🀝 Contributing -We welcome contributions! Please see our contributing guidelines for code standards, testing requirements, and the pull request process. +We welcome contributions from the community! Whether you're: +- πŸ› Reporting bugs +- πŸ’‘ Suggesting features +- πŸ”§ Adding new AI engine adapters +- πŸ“– Improving documentation +- 🎨 Enhancing the UI + +**Your contributions make converSQL better for everyone.** + +### How to Contribute + +1. **Fork the repository** +2. **Create a feature branch**: `git checkout -b feature/your-feature-name` +3. **Make your changes** with clear commit messages +4. **Test thoroughly** β€” ensure existing functionality still works +5. 
**Submit a pull request** with a detailed description + +πŸ“„ **[Read the full contributing guide β†’](CONTRIBUTING.md)** + +### Adding New AI Engines + +converSQL uses an adapter pattern for AI engines. Adding a new provider is straightforward: + +1. Implement the `AIEngineAdapter` interface +2. Add configuration options +3. Register in the AI service +4. Test and document + +πŸ“„ **[AI Engine Development Guide β†’](docs/AI_ENGINES.md)** + +--- + +## 🎯 Use Cases Beyond Loan Analytics + +While our flagship implementation focuses on mortgage data, converSQL is designed for **any domain** with tabular data: + +### Financial Services +- Credit card transaction analysis +- Investment portfolio performance +- Fraud detection patterns +- Regulatory reporting + +### Healthcare +- Patient outcomes analysis +- Clinical trial data exploration +- Hospital performance metrics +- Insurance claims analytics + +### E-commerce +- Customer behavior patterns +- Inventory optimization +- Sales performance tracking +- Supply chain analytics + +### Your Domain +**Bring your own data** β€” converSQL adapts through ontological modeling. Define your domains, specify relationships, and let AI handle the query generation. + +--- + +## 🌟 Why converSQL? + +### For Analysts +- **Stop writing SQL by hand** β€” describe what you want, get optimized queries +- **Explore data faster** β€” try different angles without syntax barriers +- **Focus on insights** β€” spend time analyzing, not coding + +### For Data Engineers +- **Modular architecture** β€” swap AI providers, storage backends, or UI components +- **Production-ready** β€” authentication, logging, caching, error handling built-in +- **Extensible ontology** β€” encode business logic once, reuse everywhere + +### For Organizations +- **Democratize data access** β€” empower non-technical users to explore data +- **Reduce bottlenecks** β€” less waiting for custom reports and queries +- **Open source** β€” no vendor lock-in, full transparency, community-driven development + +--- + +## πŸ›£οΈ Roadmap + +### Current Focus (v1.0) +- βœ… Multi-AI engine support (Bedrock, Claude, Gemini) +- βœ… Bedrock Guardrails integration for content filtering +- βœ… Ontological data modeling +- βœ… Single Family Loan Analytics showcase +- πŸ”„ Ollama adapter implementation +- πŸ”„ Enhanced query validation and optimization + +### Future Enhancements (v2.0+) +- Multi-table query generation with JOIN intelligence +- Query explanation and visualization +- Historical query learning and optimization +- More domain-specific implementations (healthcare, e-commerce, etc.) +- API server mode for programmatic access +- Web-based ontology editor + +**Have ideas?** [Open an issue](https://github.com/ravishan16/conversql/issues) or join the discussion! + +--- ## πŸ“„ License -This project is licensed under the MIT License - see the LICENSE file for details. +**MIT License** β€” Free to use, modify, and distribute. + +See the [LICENSE](LICENSE) file for details. 
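---

## 🧩 Sketch: Defining Your Own Ontology

The "bring your own data" note above says converSQL adapts to new domains through ontological modeling. As a rough illustration, the sketch below shows one possible shape for a custom domain entry. It mirrors the keys and attributes the showcase Streamlit app reads from its loan ontology (a per-domain `domain_description` plus a `fields` mapping whose values expose `domain`, `description`, `business_context`, and `risk_impact`), but the `FieldMeta` dataclass and the e-commerce field names are assumptions made for this example, not the project's actual definitions in `src/data_dictionary.py`.

```python
# Hypothetical ontology entry for an e-commerce dataset.
# FieldMeta and the field names below are illustrative assumptions; the flagship
# implementation keeps its real definitions in src/data_dictionary.py.
from dataclasses import dataclass
from typing import Optional


@dataclass
class FieldMeta:
    domain: str                         # business domain the field belongs to
    description: str                    # what the column contains
    business_context: str               # how analysts typically use it
    risk_impact: Optional[str] = None   # set only for risk-relevant fields


ECOMMERCE_ONTOLOGY = {
    "customer_behavior": {
        "domain_description": "How customers browse, purchase, and return products.",
        "fields": {
            "ORDER_VALUE": FieldMeta(
                domain="customer_behavior",
                description="Total order amount in USD.",
                business_context="Core input for basket-size and lifetime-value analysis.",
            ),
            "RETURN_FLAG": FieldMeta(
                domain="customer_behavior",
                description="1 if the order was returned, otherwise 0.",
                business_context="Return rates highlight sizing, quality, or fulfillment issues.",
                risk_impact="Elevated return rates erode net revenue and margin.",
            ),
        },
    },
}

# At query time, an ontology like this is serialized into the schema context that
# accompanies the user's natural-language question to the AI provider.
```

Keeping the ontology as plain, declarative data means adding a new domain is a matter of adding entries rather than changing the query-generation code.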
--- -**Built with:** Python β€’ Streamlit β€’ DuckDB β€’ AWS Bedrock β€’ Anthropic Claude β€’ Google OAuth β€’ Cloudflare R2 β€’ Cloudflare D1 \ No newline at end of file +## πŸ™ Acknowledgments + +- **Fannie Mae** for making Single Family Loan Performance Data publicly available +- **DuckDB** team for an incredible analytical database engine +- **Anthropic** and **AWS** for powerful AI models +- **Streamlit** for making data apps beautiful and easy +- **Open source community** for inspiration and contributions + +--- + +## πŸ“¬ Stay Connected + +- **⭐ Star this repo** to follow development +- **🐦 Share your use cases** β€” we'd love to hear how you're using converSQL +- **πŸ’¬ Join discussions** β€” ask questions, share ideas, help others +- **πŸ› Report issues** β€” help us improve + +--- + +**Built with ❀️ by the converSQL community** + +*Making data conversational, one query at a time.* \ No newline at end of file diff --git a/app.py b/app.py index 463ffd1..434bb5c 100644 --- a/app.py +++ b/app.py @@ -1,40 +1,44 @@ #!/usr/bin/env python3 """ -Single Family Loan Analytics Platform -Advanced data intelligence and AI-powered loan portfolio analysis. +converSQL - Natural Language to SQL Query Generation Platform +Powered by AI, ontological data modeling, and analytical intelligence. + +Multi-provider AI support for flexible SQL generation. """ -import streamlit as st -import pandas as pd import os import time -from typing import List + +import pandas as pd +import streamlit as st + +# Import AI service with new adapter pattern +from src.ai_service import generate_sql_with_ai, get_ai_service # Import core functionality from src.core import ( - initialize_ai_client, - scan_parquet_files, - get_table_schemas, - generate_sql_with_bedrock, execute_sql_query, + get_ai_service_status, get_analyst_questions, - get_ai_service_status + get_table_schemas, + scan_parquet_files, ) +from src.simple_auth import get_auth_service # Import authentication from src.simple_auth_components import simple_auth_wrapper -from src.simple_auth import get_auth_service # Configure page with professional styling st.set_page_config( - page_title="Single Family Loan Analytics", - page_icon="🏠", + page_title="converSQL - Natural Language to SQL", + page_icon="πŸ’¬", layout="wide", - initial_sidebar_state="expanded" + initial_sidebar_state="expanded", ) # Professional CSS styling -st.markdown(""" +st.markdown( + """ -""", unsafe_allow_html=True) +""", + unsafe_allow_html=True, +) # Load configuration from environment variables -DEMO_MODE = os.getenv('DEMO_MODE', 'false').lower() == 'true' +DEMO_MODE = os.getenv("DEMO_MODE", "false").lower() == "true" def format_file_size(size_bytes: int) -> str: @@ -100,6 +106,7 @@ def format_file_size(size_bytes: int) -> str: return "0 B" size_names = ["B", "KB", "MB", "GB", "TB"] import math + i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) s = round(size_bytes / p, 2) @@ -114,7 +121,7 @@ def display_results(result_df: pd.DataFrame, title: str, execution_time: float = if execution_time: performance_info += f" β€’ ⚑ {execution_time:.2f}s" st.success(performance_info) - + # More compact result metrics in fewer columns col1, col2, col3, col4 = st.columns([2, 2, 2, 3]) with col1: @@ -133,17 +140,13 @@ def display_results(result_df: pd.DataFrame, title: str, execution_time: float = data=csv_data, file_name=filename, mime="text/csv", - key=f"download_{title}" + key=f"download_{title}", ) - + # Use full width for the dataframe with responsive height height = min(600, 
max(200, len(result_df) * 35 + 50)) # Dynamic height based on rows - st.dataframe( - result_df, - width="stretch", - height=height - ) - + st.dataframe(result_df, width="stretch", height=height) + else: st.warning("⚠️ No results found") @@ -153,41 +156,44 @@ def load_parquet_files(): """Load and cache parquet files.""" return scan_parquet_files() + @st.cache_data(ttl=3600) # Cache for 1 hour def load_schema_context(_parquet_files): """Load and cache schema context.""" return get_table_schemas(_parquet_files) + @st.cache_resource(ttl=3600) # Cache for 1 hour -def load_ai_client(): - """Load and cache AI client.""" - return initialize_ai_client() +def load_ai_service(): + """Load and cache AI service with adapter pattern.""" + return get_ai_service() + def initialize_app_data(): """Initialize application data and AI services efficiently.""" # Initialize session state for non-data items only if missing - if 'generated_sql' not in st.session_state: + if "generated_sql" not in st.session_state: st.session_state.generated_sql = "" - if 'bedrock_error' not in st.session_state: - st.session_state.bedrock_error = "" - if 'show_edit_sql' not in st.session_state: + if "ai_error" not in st.session_state: + st.session_state.ai_error = "" + if "show_edit_sql" not in st.session_state: st.session_state.show_edit_sql = False # Check if we need to initialize data (avoid reinitializing on every rerun) - if 'app_initialized' not in st.session_state or not st.session_state.app_initialized: + if "app_initialized" not in st.session_state or not st.session_state.app_initialized: # Show spinner only if we're actually loading data - if 'parquet_files' not in st.session_state: + if "parquet_files" not in st.session_state: with st.spinner("πŸ”„ Loading data files..."): st.session_state.parquet_files = load_parquet_files() - if 'schema_context' not in st.session_state: + if "schema_context" not in st.session_state: with st.spinner("πŸ”„ Building schema context..."): st.session_state.schema_context = load_schema_context(st.session_state.parquet_files) - if 'ai_client' not in st.session_state or 'ai_provider' not in st.session_state: + if "ai_service" not in st.session_state: with st.spinner("πŸ”„ Initializing AI services..."): - st.session_state.ai_client, st.session_state.ai_provider = load_ai_client() - st.session_state.ai_available = st.session_state.ai_client is not None + st.session_state.ai_service = load_ai_service() + st.session_state.ai_available = st.session_state.ai_service.is_available() # Mark as initialized only after all components are loaded st.session_state.app_initialized = True @@ -195,61 +201,109 @@ def initialize_app_data(): def main(): """Main Streamlit application.""" - + # Check if data is available (should be loaded by now) - if not st.session_state.get('parquet_files', []): + if not st.session_state.get("parquet_files", []): st.error("❌ No data files found. Please ensure Parquet files are in the data/processed/ directory.") return - + # Professional sidebar with enhanced styling with st.sidebar: # Header with better styling - st.markdown(""" -

πŸ“Š System Status

- """, unsafe_allow_html=True) - + """, + unsafe_allow_html=True, + ) + # Data overview with metrics styling - parquet_files = st.session_state.get('parquet_files', []) - st.markdown(""" + parquet_files = st.session_state.get("parquet_files", []) + st.markdown( + """
Data Files: {}
- """.format(len(parquet_files)), unsafe_allow_html=True) - - # Professional AI status display - ai_status = get_ai_service_status() - if ai_status['available']: - provider_name = ai_status['active_provider'].title() - st.markdown(""" -
πŸ€– AI Assistant: {}
- """.format(provider_name), unsafe_allow_html=True) - + """.format( + provider_name + ), + unsafe_allow_html=True, + ) + + # AI Provider Selector (if multiple available) + ai_service = st.session_state.get("ai_service") + if ai_service: + available_providers = ai_service.get_available_providers() + + if len(available_providers) > 1: + st.markdown("---") + st.markdown("**πŸ”„ Switch AI Provider:**") + + provider_options = list(available_providers.keys()) + current_provider = ai_service.get_active_provider() + + # Find current index + default_index = ( + provider_options.index(current_provider) if current_provider in provider_options else 0 + ) + + selected_provider = st.selectbox( + "Select AI Provider", + options=provider_options, + format_func=lambda x: available_providers[x], + index=default_index, + key="sidebar_provider_selector", + help="Choose which AI provider to use for SQL generation", + ) + + # Update provider if changed + if selected_provider != current_provider: + ai_service.set_active_provider(selected_provider) + st.rerun() + # Show provider details in professional expander with st.expander("πŸ”§ AI Provider Details", expanded=False): - status = ai_status['provider_status'] - st.markdown(""" - **System Status:** - - **Active Provider**: {} - - **Bedrock**: {} - - **Claude API**: {} - """.format( - provider_name, - 'βœ… Available' if status['bedrock'] else '❌ Unavailable', - 'βœ… Available' if status['claude'] else '❌ Unavailable' - )) + status = ai_status["provider_status"] + + # Show all available providers + st.markdown("**Available Providers:**") + for provider_key, is_available in status.items(): + if provider_key != "active": + provider_display = provider_key.title() + icon = "βœ…" if is_available else "❌" + status_text = "Available" if is_available else "Unavailable" + active_marker = " **(Active)**" if provider_key == ai_status["active_provider"] else "" + st.markdown(f"- **{provider_display}**: {icon} {status_text}{active_marker}") else: - st.markdown(""" -
πŸ€– AI Assistant: Unavailable @@ -258,18 +312,23 @@ def main(): Configure Claude API or Bedrock access
- """, unsafe_allow_html=True) - + """, + unsafe_allow_html=True, + ) + # Professional configuration status with debug info if DEMO_MODE: - st.markdown(""" + st.markdown( + """
πŸ§ͺ Demo Mode Active
- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # Show detailed debug information in demo mode auth = get_auth_service() @@ -290,81 +349,118 @@ def main(): st.markdown("**Current URL Parameters**: None") # Show user session info if authenticated - if 'user' in st.session_state: + if "user" in st.session_state: user = st.session_state.user st.markdown("**User Session:**") st.markdown(f"- **Email**: {user.get('email', 'N/A')}") st.markdown(f"- **Name**: {user.get('name', 'N/A')}") - st.markdown(f"- **Auth Time**: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(user.get('authenticated_at', 0)))}") + st.markdown( + f"- **Auth Time**: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(user.get('authenticated_at', 0)))}" + ) # Configuration status st.markdown("**Configuration:**") st.markdown(f"- **Google Client ID**: {'βœ… Set' if os.getenv('GOOGLE_CLIENT_ID') else '❌ Missing'}") - st.markdown(f"- **Google Client Secret**: {'βœ… Set' if os.getenv('GOOGLE_CLIENT_SECRET') else '❌ Missing'}") + st.markdown( + f"- **Google Client Secret**: {'βœ… Set' if os.getenv('GOOGLE_CLIENT_SECRET') else '❌ Missing'}" + ) st.markdown(f"- **Enable Auth**: {os.getenv('ENABLE_AUTH', 'true')}") - - st.markdown("
", unsafe_allow_html=True) - + + st.markdown( + "
", + unsafe_allow_html=True, + ) + # Professional data tables section with st.expander("πŸ“‹ Available Tables", expanded=False): - parquet_files = st.session_state.get('parquet_files', []) + parquet_files = st.session_state.get("parquet_files", []) if parquet_files: for file_path in parquet_files: table_name = os.path.splitext(os.path.basename(file_path))[0] - st.markdown(f"
β€’ {table_name}
", unsafe_allow_html=True) + st.markdown( + f"
β€’ {table_name}
", + unsafe_allow_html=True, + ) else: - st.markdown("
No tables loaded
", unsafe_allow_html=True) - + st.markdown( + "
No tables loaded
", + unsafe_allow_html=True, + ) + # Professional quick stats section with st.expander("πŸ“ˆ Portfolio Overview", expanded=True): if st.session_state.parquet_files: try: import duckdb - conn = duckdb.connect() - - # Get record count - total = conn.execute("SELECT COUNT(*) FROM 'data/processed/data.parquet'").fetchone()[0] - - # Get total file size - total_size = sum(os.path.getsize(f) for f in st.session_state.parquet_files if os.path.exists(f)) - - # Clean metrics display - one per row for readability - st.metric("πŸ“Š Total Records", f"{total:,}") - st.metric("πŸ’Ύ Data Size", format_file_size(total_size)) - st.metric("πŸ“ Data Files", len(st.session_state.parquet_files)) - if total > 0 and total_size > 0: - records_per_mb = int(total / (total_size / (1024*1024))) - st.metric("⚑ Record Density", f"{records_per_mb:,} per MB") - - conn.close() - + + # Use in-memory connection for stats only + with duckdb.connect() as conn: + # Get record count + total = conn.execute("SELECT COUNT(*) FROM 'data/processed/data.parquet'").fetchone()[0] + + # Get total file size (cached calculation) + if "total_data_size" not in st.session_state: + st.session_state.total_data_size = sum( + os.path.getsize(f) for f in st.session_state.parquet_files if os.path.exists(f) + ) + total_size = st.session_state.total_data_size + + # Clean metrics display - one per row for readability + st.metric("πŸ“Š Total Records", f"{total:,}") + st.metric("πŸ’Ύ Data Size", format_file_size(total_size)) + st.metric("πŸ“ Data Files", len(st.session_state.parquet_files)) + if total > 0 and total_size > 0: + records_per_mb = int(total / (total_size / (1024 * 1024))) + st.metric("⚑ Record Density", f"{records_per_mb:,} per MB") + except Exception: # Fallback stats - clean single column layout - total_size = sum(os.path.getsize(f) for f in st.session_state.parquet_files if os.path.exists(f)) + if "total_data_size" not in st.session_state: + st.session_state.total_data_size = sum( + os.path.getsize(f) for f in st.session_state.parquet_files if os.path.exists(f) + ) st.metric("πŸ“ Data Files", len(st.session_state.parquet_files)) - st.metric("πŸ’Ύ Data Size", format_file_size(total_size)) + st.metric( + "πŸ’Ύ Data Size", + format_file_size(st.session_state.total_data_size), + ) else: - st.markdown("
No data loaded
", unsafe_allow_html=True) - + st.markdown( + "
No data loaded
", + unsafe_allow_html=True, + ) + # Professional header with subtle styling - st.markdown(""" + st.markdown( + """

- 🏠 Single Family Loan Analytics + πŸ’¬ converSQL

- AI-Powered Loan Portfolio Intelligence + Natural Language to SQL Query Generation

+

+ Multi-Provider AI Intelligence +

+
+ Dataset: + 🏠 Single Family Loan Analytics +
- """, unsafe_allow_html=True) - + """, + unsafe_allow_html=True, + ) + # Enhanced tab layout with ontology exploration tab1, tab2, tab3 = st.tabs(["πŸ” Query Builder", "πŸ—ΊοΈ Data Ontology", "πŸ”§ Advanced"]) - + st.markdown("
", unsafe_allow_html=True) - + with tab1: - st.markdown(""" + st.markdown( + """

Ask Questions About Your Loan Data @@ -373,88 +469,99 @@ def main(): Use natural language to query your loan portfolio data

- """, unsafe_allow_html=True) - + """, + unsafe_allow_html=True, + ) + # More compact analyst question dropdown analyst_questions = get_analyst_questions() - + col1, col2 = st.columns([4, 1]) with col1: selected_question = st.selectbox( "πŸ’‘ **Common Questions:**", [""] + list(analyst_questions.keys()), - help="Select a pre-defined question" + help="Select a pre-defined question", ) - + with col2: st.write("") # Add spacing to align button if st.button("🎯 Use", disabled=not selected_question): if selected_question in analyst_questions: st.session_state.user_question = analyst_questions[selected_question] st.rerun() - + # Professional question input with better styling - st.markdown("
", unsafe_allow_html=True) + st.markdown( + "
", + unsafe_allow_html=True, + ) user_question = st.text_area( "Your Question", - value=st.session_state.get('user_question', ''), + value=st.session_state.get("user_question", ""), placeholder="e.g., What are the top 10 states by loan volume and their average interest rates?", help="Ask your question in natural language - be specific for better results", height=100, - label_visibility="collapsed" + label_visibility="collapsed", ) - + # AI Generation - Always show button, disable if conditions not met - ai_provider = st.session_state.get('ai_provider') - provider_name = ai_provider.title() if ai_provider else "AI" - ai_available = st.session_state.get('ai_available', False) + ai_service = st.session_state.get("ai_service") + ai_provider = ai_service.get_active_provider() if ai_service else None + + # Get provider display name + if ai_service and ai_provider: + available_providers = ai_service.get_available_providers() + provider_name = available_providers.get(ai_provider, ai_provider.title()) + else: + provider_name = "AI" + + ai_available = st.session_state.get("ai_available", False) is_ai_ready = ai_available and user_question.strip() - + generate_button = st.button( f"πŸ€– Generate SQL with {provider_name}", type="primary", width="stretch", disabled=not is_ai_ready, - help="Enter a question above to generate SQL" if not is_ai_ready else None + help="Enter a question above to generate SQL" if not is_ai_ready else None, ) - + if generate_button and is_ai_ready: with st.spinner(f"🧠 {provider_name} is analyzing your question..."): start_time = time.time() - sql_query, error_msg = generate_sql_with_bedrock( - user_question, - st.session_state.get('schema_context', ''), - st.session_state.get('ai_client') - ) + sql_query, error_msg = generate_sql_with_ai(user_question, st.session_state.get("schema_context", "")) ai_generation_time = time.time() - start_time st.session_state.generated_sql = sql_query - st.session_state.bedrock_error = error_msg - + st.session_state.ai_error = error_msg + # Log query for authenticated users auth = get_auth_service() if auth.is_authenticated() and sql_query and not error_msg: auth.log_query(user_question, sql_query, provider_name, ai_generation_time) - + if sql_query and not error_msg: st.info(f"πŸ€– {provider_name} generated SQL in {ai_generation_time:.2f} seconds") - + # Show warning only if AI is unavailable but user entered text - if user_question.strip() and not st.session_state.get('ai_available', False): - st.warning("πŸ€– AI Assistant unavailable. Please configure Claude API or AWS Bedrock access, or use Manual SQL in the Advanced tab.") - + if user_question.strip() and not st.session_state.get("ai_available", False): + st.warning( + "πŸ€– AI Assistant unavailable. Please configure Claude API or AWS Bedrock access, or use Manual SQL in the Advanced tab." 
+ ) + # Display AI errors - if st.session_state.bedrock_error: - st.error(st.session_state.bedrock_error) - st.session_state.bedrock_error = "" - + if st.session_state.ai_error: + st.error(st.session_state.ai_error) + st.session_state.ai_error = "" + # Always show execute section, but conditionally enable st.markdown("---") - + # Show generated SQL if available if st.session_state.generated_sql: st.markdown("### 🧠 AI-Generated SQL") st.code(st.session_state.generated_sql, language="sql") - + # Always show buttons, disable based on state col1, col2 = st.columns([3, 1]) with col1: @@ -464,50 +571,65 @@ def main(): type="primary", width="stretch", disabled=not has_sql, - help="Generate SQL first to execute" if not has_sql else None + help="Generate SQL first to execute" if not has_sql else None, ) if execute_button and has_sql: with st.spinner("⚑ Running query..."): - start_time = time.time() - result_df = execute_sql_query(st.session_state.generated_sql, st.session_state.get('parquet_files', [])) - execution_time = time.time() - start_time - display_results(result_df, "AI Query Results", execution_time) - + try: + start_time = time.time() + result_df = execute_sql_query( + st.session_state.generated_sql, + st.session_state.get("parquet_files", []), + ) + execution_time = time.time() - start_time + display_results(result_df, "AI Query Results", execution_time) + except Exception as e: + st.error(f"❌ Query execution failed: {str(e)}") + st.info("πŸ’‘ Try editing the SQL or rephrasing your question") + with col2: edit_button = st.button( "✏️ Edit", width="stretch", disabled=not has_sql, - help="Generate SQL first to edit" if not has_sql else None + help="Generate SQL first to edit" if not has_sql else None, ) if edit_button and has_sql: st.session_state.show_edit_sql = True - + # Edit SQL interface - if st.session_state.get('show_edit_sql', False): + if st.session_state.get("show_edit_sql", False): st.markdown("### ✏️ Edit SQL Query") edited_sql = st.text_area( "Modify the query:", value=st.session_state.generated_sql, height=150, - key="edit_sql" + key="edit_sql", ) - + col1, col2 = st.columns([3, 1]) with col1: if st.button("πŸš€ Run Edited Query", type="primary", width="stretch"): with st.spinner("⚑ Running edited query..."): - start_time = time.time() - result_df = execute_sql_query(edited_sql, st.session_state.get('parquet_files', [])) - execution_time = time.time() - start_time - display_results(result_df, "Edited Query Results", execution_time) + try: + start_time = time.time() + result_df = execute_sql_query( + edited_sql, + st.session_state.get("parquet_files", []), + ) + execution_time = time.time() - start_time + display_results(result_df, "Edited Query Results", execution_time) + except Exception as e: + st.error(f"❌ Query execution failed: {str(e)}") + st.info("πŸ’‘ Check your SQL syntax and try again") with col2: if st.button("❌ Cancel", width="stretch"): st.session_state.show_edit_sql = False st.rerun() - + with tab2: - st.markdown(""" + st.markdown( + """

πŸ—ΊοΈ Data Ontology Explorer @@ -516,14 +638,15 @@ def main(): Explore the structured organization of all 110+ data fields across 15 business domains

- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # Import ontology data from src.data_dictionary import LOAN_ONTOLOGY, PORTFOLIO_CONTEXT # Portfolio Overview # st.markdown("### πŸ“Š Portfolio Overview") - # col1, col2, col3 = st.columns(3) # with col1: # st.metric( @@ -551,14 +674,15 @@ def main(): selected_domain = st.selectbox( "Choose a domain to explore:", options=domain_names, - format_func=lambda x: f"{x.replace('_', ' ').title()} ({len(LOAN_ONTOLOGY[x]['fields'])} fields)" + format_func=lambda x: f"{x.replace('_', ' ').title()} ({len(LOAN_ONTOLOGY[x]['fields'])} fields)", ) if selected_domain: domain_info = LOAN_ONTOLOGY[selected_domain] # Domain header - st.markdown(f""" + st.markdown( + f"""

@@ -568,21 +692,29 @@ def main(): {domain_info['domain_description']}

- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # Fields in this domain st.markdown("#### πŸ“‹ Fields in this Domain") fields_data = [] - for field_name, field_meta in domain_info['fields'].items(): + for field_name, field_meta in domain_info["fields"].items(): risk_indicator = "πŸ”΄" if field_meta.risk_impact else "🟒" - fields_data.append({ - "Field": field_name, - "Risk": risk_indicator, - "Description": field_meta.description, - "Business Context": field_meta.business_context[:100] + "..." if len(field_meta.business_context) > 100 else field_meta.business_context - }) + fields_data.append( + { + "Field": field_name, + "Risk": risk_indicator, + "Description": field_meta.description, + "Business Context": ( + field_meta.business_context[:100] + "..." + if len(field_meta.business_context) > 100 + else field_meta.business_context + ), + } + ) # Display fields table fields_df = pd.DataFrame(fields_data) @@ -594,24 +726,25 @@ def main(): "Field": st.column_config.TextColumn(width="medium"), "Risk": st.column_config.TextColumn(width="small"), "Description": st.column_config.TextColumn(width="large"), - "Business Context": st.column_config.TextColumn(width="large") - } + "Business Context": st.column_config.TextColumn(width="large"), + }, ) # Field detail explorer st.markdown("#### πŸ” Field Details") - field_names = list(domain_info['fields'].keys()) + field_names = list(domain_info["fields"].keys()) selected_field = st.selectbox( "Select a field for detailed information:", options=field_names, - key=f"field_select_{selected_domain}" + key=f"field_select_{selected_domain}", ) if selected_field: - field_meta = domain_info['fields'][selected_field] + field_meta = domain_info["fields"][selected_field] # Field details card - st.markdown(f""" + st.markdown( + f"""
{selected_field}

Domain: {field_meta.domain}

@@ -619,7 +752,9 @@ def main():

Description: {field_meta.description}

Business Context: {field_meta.business_context}

- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # Risk impact if present if field_meta.risk_impact: @@ -637,7 +772,8 @@ def main(): # Risk Framework Summary st.markdown("### βš–οΈ Risk Assessment Framework") - st.markdown(f""" + st.markdown( + f"""

Credit Triangle: {PORTFOLIO_CONTEXT['risk_framework']['credit_triangle']}

    @@ -646,10 +782,13 @@ def main():
  • Alt-A: {PORTFOLIO_CONTEXT['risk_framework']['risk_tiers']['alt_a']}
- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) with tab3: - st.markdown(""" + st.markdown( + """

πŸ”§ Advanced Options @@ -658,32 +797,34 @@ def main(): Manual SQL queries and database schema exploration

- """, unsafe_allow_html=True) - + """, + unsafe_allow_html=True, + ) + col1, col2 = st.columns([2, 1]) - + with col1: st.markdown("### πŸ› οΈ Manual SQL Query") - + # Sample queries for manual use sample_queries = { "": "", "Total Portfolio": "SELECT COUNT(*) as total_loans, ROUND(SUM(ORIG_UPB)/1000000, 2) as total_upb_millions FROM data", "Geographic Analysis": "SELECT STATE, COUNT(*) as loan_count, ROUND(AVG(ORIG_UPB), 0) as avg_upb, ROUND(AVG(ORIG_RATE), 2) as avg_rate FROM data WHERE STATE IS NOT NULL GROUP BY STATE ORDER BY loan_count DESC LIMIT 10", "Credit Risk": "SELECT CASE WHEN CSCORE_B < 620 THEN 'Subprime' WHEN CSCORE_B < 680 THEN 'Near Prime' WHEN CSCORE_B < 740 THEN 'Prime' ELSE 'Super Prime' END as credit_tier, COUNT(*) as loans, ROUND(AVG(OLTV), 1) as avg_ltv FROM data WHERE CSCORE_B IS NOT NULL GROUP BY credit_tier ORDER BY MIN(CSCORE_B)", - "High LTV Analysis": "SELECT STATE, COUNT(*) as high_ltv_loans, ROUND(AVG(CSCORE_B), 0) as avg_credit_score FROM data WHERE OLTV > 90 AND STATE IS NOT NULL GROUP BY STATE HAVING COUNT(*) > 100 ORDER BY high_ltv_loans DESC" + "High LTV Analysis": "SELECT STATE, COUNT(*) as high_ltv_loans, ROUND(AVG(CSCORE_B), 0) as avg_credit_score FROM data WHERE OLTV > 90 AND STATE IS NOT NULL GROUP BY STATE HAVING COUNT(*) > 100 ORDER BY high_ltv_loans DESC", } - + selected_sample = st.selectbox("πŸ“‹ Choose a sample query:", list(sample_queries.keys())) - + manual_sql = st.text_area( "Write your SQL query:", value=sample_queries[selected_sample], height=200, placeholder="SELECT * FROM data LIMIT 10", - help="Use 'data' as the table name" + help="Use 'data' as the table name", ) - + # Always show execute button, disable if no query has_manual_sql = bool(manual_sql.strip()) execute_manual = st.button( @@ -691,16 +832,16 @@ def main(): type="primary", width="stretch", disabled=not has_manual_sql, - help="Enter SQL query above to execute" if not has_manual_sql else None + help="Enter SQL query above to execute" if not has_manual_sql else None, ) - + if execute_manual and has_manual_sql: with st.spinner("⚑ Running manual query..."): start_time = time.time() - result_df = execute_sql_query(manual_sql, st.session_state.get('parquet_files', [])) + result_df = execute_sql_query(manual_sql, st.session_state.get("parquet_files", [])) execution_time = time.time() - start_time display_results(result_df, "Manual Query Results", execution_time) - + with col2: st.markdown("### πŸ“Š Database Schema") @@ -708,10 +849,10 @@ def main(): schema_view = st.radio( "Choose schema view:", ["🎯 Quick Reference", "πŸ“‹ Ontological Schema", "πŸ’» Raw SQL"], - horizontal=True + horizontal=True, ) - schema_context = st.session_state.get('schema_context', '') + schema_context = st.session_state.get("schema_context", "") if schema_view == "🎯 Quick Reference": # Quick reference with domain summary @@ -722,17 +863,24 @@ def main(): # Create a compact domain overview for i in range(0, len(LOAN_ONTOLOGY), 3): # Display in rows of 3 cols = st.columns(3) - domains = list(LOAN_ONTOLOGY.items())[i:i+3] + domains = list(LOAN_ONTOLOGY.items())[i : i + 3] for j, (domain_name, domain_info) in enumerate(domains): with cols[j]: - field_count = len(domain_info['fields']) + field_count = len(domain_info["fields"]) # Create colored cards for each domain - colors = ["#3498db", "#e74c3c", "#f39c12", "#2ecc71", "#9b59b6"] - color = colors[i//3 % len(colors)] - - st.markdown(f""" + colors = [ + "#3498db", + "#e74c3c", + "#f39c12", + "#2ecc71", + "#9b59b6", + ] + color = colors[i // 3 % len(colors)] + 
+ st.markdown( + f"""
{domain_name.replace('_', ' ').title()}
@@ -740,7 +888,9 @@ def main(): {field_count} fields

- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # Sample fields reference st.markdown("#### πŸ” Common Fields") @@ -753,7 +903,7 @@ def main(): "DTI": "Debt-to-income ratio (%)", "ORIG_UPB": "Original unpaid balance ($)", "CURRENT_UPB": "Current unpaid balance ($)", - "PURPOSE": "P=Purchase, R=Refi, C=CashOut" + "PURPOSE": "P=Purchase, R=Refi, C=CashOut", } field_cols = st.columns(2) @@ -767,32 +917,32 @@ def main(): # Organized schema by domains if schema_context: # Extract the organized parts of the schema - lines = schema_context.split('\n') + lines = schema_context.split("\n") in_create_table = False current_section = [] sections = [] for line in lines: - if 'CREATE TABLE' in line: + if "CREATE TABLE" in line: if current_section: - sections.append('\n'.join(current_section)) + sections.append("\n".join(current_section)) current_section = [line] in_create_table = True elif in_create_table: current_section.append(line) - if line.strip() == ');': + if line.strip() == ");": in_create_table = False elif not in_create_table and line.strip(): current_section.append(line) if current_section: - sections.append('\n'.join(current_section)) + sections.append("\n".join(current_section)) # Display each section with better formatting for i, section in enumerate(sections): - if 'CREATE TABLE' in section: - table_name = section.split('CREATE TABLE ')[1].split(' (')[0] - with st.expander(f"πŸ“Š Table: {table_name.upper()}", expanded=i==0): + if "CREATE TABLE" in section: + table_name = section.split("CREATE TABLE ")[1].split(" (")[0] + with st.expander(f"πŸ“Š Table: {table_name.upper()}", expanded=i == 0): st.code(section, language="sql") elif section.strip(): with st.expander("πŸ“š Business Intelligence Context", expanded=False): @@ -807,44 +957,47 @@ def main(): st.code(schema_context, language="sql") else: st.warning("Schema not available") - + # Professional footer with enhanced styling st.markdown("
", unsafe_allow_html=True) - + # Footer content with professional design ai_status = get_ai_service_status() ai_provider_text = "" - if ai_status['available']: - provider = ai_status['active_provider'] - if provider == 'claude': + if ai_status["available"]: + provider = ai_status["active_provider"] + if provider == "claude": ai_provider_text = "Claude API (Anthropic)" - elif provider == 'bedrock': + elif provider == "bedrock": ai_provider_text = "Amazon Bedrock" else: ai_provider_text = "AI Assistant" else: ai_provider_text = "Manual Analysis Mode" - - st.markdown(f""" + + st.markdown( + f"""
- 🏠 Single Family Loan Analytics Platform + πŸ’¬ converSQL - Natural Language to SQL Query Generation Platform
- Powered by Streamlit β€’ DuckDB β€’ Google OAuth β€’ {ai_provider_text}
+ Powered by Streamlit β€’ DuckDB β€’ {ai_provider_text} β€’ Ontological Data Intelligence
- Single Family Loan Performance Data + Implementation Showcase: Single Family Loan Analytics
- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) if __name__ == "__main__": # Initialize app data before authentication initialize_app_data() - + # Wrap main function with authentication - simple_auth_wrapper(main)() \ No newline at end of file + simple_auth_wrapper(main)() diff --git a/docs/AI_ENGINES.md b/docs/AI_ENGINES.md new file mode 100644 index 0000000..65f26eb --- /dev/null +++ b/docs/AI_ENGINES.md @@ -0,0 +1,867 @@ +# AI Engine Development Guide + +## Overview + +converSQL uses a modular **adapter pattern** for AI engine integration, making it easy to add support for new AI providers. This guide walks you through creating a new AI engine adapter from scratch. + +--- + +## Architecture + +### The Adapter Pattern + +converSQL separates AI provider logic into independent adapters: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ AIService (Orchestrator) β”‚ +β”‚ - Manages multiple adapters β”‚ +β”‚ - Determines active provider β”‚ +β”‚ - Handles fallback and caching β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β” + β”‚ Bedrock β”‚ β”‚ Claude β”‚ β”‚ Gemini β”‚ β”‚ Ollama β”‚ + β”‚ Adapter β”‚ β”‚ Adapter β”‚ β”‚ Adapter β”‚ β”‚ Adapter β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### Key Benefits + +- **Modularity**: Each adapter is self-contained +- **Extensibility**: Add new providers without changing core logic +- **Flexibility**: Easy to switch between providers +- **Testing**: Mock adapters for testing without API calls +- **Community**: Contributors can add providers independently + +--- + +## Quick Start: Adding a New Engine + +### Step 1: Understand the Interface + +Every AI engine adapter must implement these methods: + +```python +class AIEngineAdapter: + """Base interface for AI engine adapters.""" + + def __init__(self): + """Initialize the adapter and its client.""" + pass + + def _initialize(self): + """ + Set up the AI client with API keys, configuration, etc. + Should set self.client to the initialized client or None if unavailable. + """ + raise NotImplementedError + + def is_available(self) -> bool: + """ + Check if this AI engine is available and configured. + + Returns: + True if the engine can be used, False otherwise + """ + raise NotImplementedError + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """ + Generate SQL from a prompt. 
+ + Args: + prompt: Complete prompt including user question, schema, and instructions + + Returns: + Tuple of (sql_query, error_message) + - If successful: (generated_sql, "") + - If failed: ("", error_description) + """ + raise NotImplementedError +``` + +### Step 2: Create Your Adapter + +Let's implement a Gemini adapter as an example: + +```python +# src/ai_engines/gemini_adapter.py + +import os +from typing import Tuple + +class GeminiAdapter: + """Gemini AI adapter for SQL generation.""" + + def __init__(self): + self.client = None + self.model_name = os.getenv('GEMINI_MODEL', 'gemini-pro') + self._initialize() + + def _initialize(self): + """Initialize Gemini client.""" + api_key = os.getenv('GEMINI_API_KEY') + + if not api_key: + print("⚠️ GEMINI_API_KEY not found in environment") + return + + try: + import google.generativeai as genai + + genai.configure(api_key=api_key) + self.client = genai.GenerativeModel(self.model_name) + + # Test with a simple request + response = self.client.generate_content("test") + print(f"βœ… Gemini adapter initialized ({self.model_name})") + + except ImportError: + print("⚠️ google-generativeai package not installed") + print(" Run: pip install google-generativeai") + self.client = None + except Exception as e: + print(f"⚠️ Gemini initialization failed: {e}") + self.client = None + + def is_available(self) -> bool: + """Check if Gemini is available.""" + return self.client is not None + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """Generate SQL using Gemini.""" + if not self.client: + return "", "Gemini client not available" + + try: + response = self.client.generate_content(prompt) + sql_query = response.text.strip() + + # Clean up response (remove markdown formatting if present) + if sql_query.startswith("```sql"): + sql_query = sql_query.replace("```sql", "").replace("```", "").strip() + + return sql_query, "" + + except Exception as e: + error_msg = f"Gemini error: {str(e)}" + return "", error_msg +``` + +### Step 3: Register the Adapter + +Add your adapter to the AIService class in `src/ai_service.py`: + +```python +# Import your new adapter +from src.ai_engines.gemini_adapter import GeminiAdapter + +class AIService: + """Main AI service that manages multiple providers.""" + + def __init__(self): + self.bedrock = BedrockClient() + self.claude = ClaudeClient() + self.gemini = GeminiAdapter() # Add your adapter + self.active_provider = None + self._determine_active_provider() + + def _determine_active_provider(self): + """Determine which AI provider to use.""" + ai_provider = os.getenv('AI_PROVIDER', 'bedrock').lower() + + # Check preferred provider first + if ai_provider == 'gemini' and self.gemini.is_available(): + self.active_provider = 'gemini' + elif ai_provider == 'claude' and self.claude.is_available(): + self.active_provider = 'claude' + elif ai_provider == 'bedrock' and self.bedrock.is_available(): + self.active_provider = 'bedrock' + # Fallback to any available provider + elif self.gemini.is_available(): + self.active_provider = 'gemini' + elif self.claude.is_available(): + self.active_provider = 'claude' + elif self.bedrock.is_available(): + self.active_provider = 'bedrock' + else: + self.active_provider = None + + def get_provider_status(self) -> Dict[str, bool]: + """Get status of all providers.""" + return { + 'bedrock': self.bedrock.is_available(), + 'claude': self.claude.is_available(), + 'gemini': self.gemini.is_available(), # Add your adapter + 'active': self.active_provider + } + + def generate_sql(self, 
user_question: str, schema_context: str) -> Tuple[str, str, str]: + """Generate SQL using active provider.""" + if not self.is_available(): + return "", "No AI providers available", "none" + + prompt = self._build_sql_prompt(user_question, schema_context) + + # Route to appropriate adapter + if self.active_provider == 'gemini': + sql_query, error_msg = self.gemini.generate_sql(prompt) + elif self.active_provider == 'claude': + sql_query, error_msg = self.claude.generate_sql(prompt) + elif self.active_provider == 'bedrock': + sql_query, error_msg = self.bedrock.generate_sql(prompt) + else: + return "", "No active provider", "none" + + return sql_query, error_msg, self.active_provider +``` + +### Step 4: Add Configuration + +Update `.env.example` and documentation: + +```bash +# AI Provider Configuration +AI_PROVIDER=gemini # Options: bedrock, claude, gemini, ollama + +# Gemini Configuration +GEMINI_API_KEY=your_gemini_api_key_here +GEMINI_MODEL=gemini-pro # or gemini-pro-vision, gemini-ultra +``` + +### Step 5: Add Dependencies + +Update `requirements.txt`: + +``` +# Gemini AI support +google-generativeai>=0.3.0 +``` + +### Step 6: Test Your Adapter + +Create tests in `tests/test_gemini_adapter.py`: + +```python +import pytest +from unittest.mock import Mock, patch +from src.ai_engines.gemini_adapter import GeminiAdapter + +def test_gemini_initialization(): + """Test Gemini adapter initialization.""" + adapter = GeminiAdapter() + # With mock API key, should initialize + assert adapter is not None + +def test_gemini_availability(): + """Test checking if Gemini is available.""" + adapter = GeminiAdapter() + # This will depend on environment configuration + is_available = adapter.is_available() + assert isinstance(is_available, bool) + +@patch('google.generativeai.GenerativeModel') +def test_gemini_sql_generation(mock_model): + """Test SQL generation with Gemini.""" + # Mock the Gemini response + mock_response = Mock() + mock_response.text = "SELECT * FROM data LIMIT 10" + mock_model.return_value.generate_content.return_value = mock_response + + adapter = GeminiAdapter() + adapter.client = mock_model.return_value + + prompt = "Test prompt" + sql, error = adapter.generate_sql(prompt) + + assert sql == "SELECT * FROM data LIMIT 10" + assert error == "" + +def test_gemini_error_handling(): + """Test error handling in Gemini adapter.""" + adapter = GeminiAdapter() + adapter.client = None # Simulate unavailable client + + sql, error = adapter.generate_sql("test") + + assert sql == "" + assert "not available" in error.lower() +``` + +Run tests: + +```bash +pytest tests/test_gemini_adapter.py -v +``` + +--- + +## Complete Example: Ollama Adapter + +Ollama is a popular self-hosted LLM platform. 
Here's a complete implementation: + +### Implementation + +```python +# src/ai_engines/ollama_adapter.py + +import os +import requests +from typing import Tuple + +class OllamaAdapter: + """Ollama adapter for local/self-hosted AI models.""" + + def __init__(self): + self.client = None + self.base_url = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434') + self.model_name = os.getenv('OLLAMA_MODEL', 'llama2') + self._initialize() + + def _initialize(self): + """Initialize Ollama connection.""" + try: + # Test connection to Ollama server + response = requests.get(f"{self.base_url}/api/tags", timeout=5) + + if response.status_code == 200: + models = response.json().get('models', []) + model_names = [m['name'] for m in models] + + if self.model_name in model_names: + self.client = True # Mark as available + print(f"βœ… Ollama adapter initialized ({self.model_name})") + else: + print(f"⚠️ Model '{self.model_name}' not found in Ollama") + print(f" Available models: {', '.join(model_names)}") + self.client = None + else: + print(f"⚠️ Ollama server returned status {response.status_code}") + self.client = None + + except requests.exceptions.RequestException as e: + print(f"⚠️ Could not connect to Ollama at {self.base_url}") + print(f" Make sure Ollama is running: ollama serve") + self.client = None + except Exception as e: + print(f"⚠️ Ollama initialization failed: {e}") + self.client = None + + def is_available(self) -> bool: + """Check if Ollama is available.""" + return self.client is not None + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """Generate SQL using Ollama.""" + if not self.client: + return "", "Ollama not available" + + try: + response = requests.post( + f"{self.base_url}/api/generate", + json={ + "model": self.model_name, + "prompt": prompt, + "stream": False, + "options": { + "temperature": 0.1, # Lower temperature for more consistent SQL + "top_p": 0.9, + } + }, + timeout=60 # Longer timeout for local processing + ) + + if response.status_code == 200: + result = response.json() + sql_query = result['response'].strip() + + # Clean up markdown formatting + if sql_query.startswith("```sql"): + sql_query = sql_query.replace("```sql", "").replace("```", "").strip() + + return sql_query, "" + else: + return "", f"Ollama returned status {response.status_code}" + + except requests.exceptions.Timeout: + return "", "Ollama request timed out (model may be too slow)" + except Exception as e: + return "", f"Ollama error: {str(e)}" +``` + +### Configuration + +```bash +# Ollama Configuration +OLLAMA_BASE_URL=http://localhost:11434 # Default local URL +OLLAMA_MODEL=llama2 # Options: llama2, codellama, mistral, etc. +``` + +### Setup Instructions + +Document setup in `docs/OLLAMA_SETUP.md`: + +```markdown +# Ollama Setup Guide + +## Installation + +1. **Install Ollama**: + ```bash + # macOS + brew install ollama + + # Linux + curl https://ollama.ai/install.sh | sh + + # Windows + # Download from https://ollama.ai/download + ``` + +2. **Pull a model**: + ```bash + ollama pull llama2 + # Or for code-focused model: + ollama pull codellama + ``` + +3. **Start Ollama server**: + ```bash + ollama serve + ``` + +## Configuration + +Add to your `.env`: +```bash +AI_PROVIDER=ollama +OLLAMA_BASE_URL=http://localhost:11434 +OLLAMA_MODEL=llama2 +``` + +## Testing + +Test your setup: +```bash +curl http://localhost:11434/api/tags +``` + +Should return list of installed models. 
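+
+You can also check from Python (a minimal sketch using the same `/api/tags`
+endpoint the adapter calls; assumes the default local URL and the `requests`
+package):
+```python
+import requests
+
+# List the models the local Ollama server has pulled
+resp = requests.get("http://localhost:11434/api/tags", timeout=5)
+resp.raise_for_status()
+print([m["name"] for m in resp.json().get("models", [])])
+```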
+``` + +--- + +## Best Practices + +### Error Handling + +Always handle errors gracefully: + +```python +def generate_sql(self, prompt: str) -> Tuple[str, str]: + """Generate SQL with comprehensive error handling.""" + if not self.client: + return "", "Client not initialized" + + try: + # API call logic + response = self.client.generate(prompt) + return response.text, "" + + except TimeoutError: + return "", "Request timed out - try again or check connectivity" + except AuthenticationError: + return "", "Authentication failed - check API key" + except RateLimitError: + return "", "Rate limit exceeded - try again later" + except APIError as e: + return "", f"API error: {str(e)}" + except Exception as e: + return "", f"Unexpected error: {str(e)}" +``` + +### Configuration Validation + +Validate configuration on initialization: + +```python +def _initialize(self): + """Initialize with validation.""" + api_key = os.getenv('NEW_ENGINE_API_KEY') + + # Check required configuration + if not api_key: + print("❌ NEW_ENGINE_API_KEY not set") + print(" Add to .env: NEW_ENGINE_API_KEY=your_key") + return + + if len(api_key) < 20: + print("⚠️ NEW_ENGINE_API_KEY appears invalid (too short)") + return + + # Validate other settings + model = os.getenv('NEW_ENGINE_MODEL', 'default-model') + valid_models = ['model-a', 'model-b', 'model-c'] + + if model not in valid_models: + print(f"⚠️ Invalid model: {model}") + print(f" Valid options: {', '.join(valid_models)}") + return + + # Continue with initialization... +``` + +### Prompt Optimization + +Different engines may need different prompt formats: + +```python +def generate_sql(self, prompt: str) -> Tuple[str, str]: + """Generate SQL with engine-specific prompt optimization.""" + + # Some engines work better with specific formatting + if self.engine_type == 'code-focused': + # Add code-specific instructions + prompt = f"```sql\n{prompt}\n```" + elif self.engine_type == 'chat-based': + # Format as conversation + prompt = f"User: {prompt}\n\nAssistant:" + + # Make API call + response = self.client.generate(prompt) + + # Clean up response based on engine behavior + sql = self._clean_response(response.text) + + return sql, "" + +def _clean_response(self, response: str) -> str: + """Clean engine-specific artifacts from response.""" + # Remove markdown code blocks + if "```sql" in response: + response = response.split("```sql")[1].split("```")[0] + + # Remove chat-style prefixes + if response.startswith("Assistant:"): + response = response.replace("Assistant:", "").strip() + + return response.strip() +``` + +--- + +## Testing Strategies + +### Unit Tests + +Test adapter logic without API calls: + +```python +@patch('your_engine.Client') +def test_adapter_logic(mock_client): + """Test adapter with mocked API.""" + mock_client.return_value.generate.return_value = "SELECT * FROM data" + + adapter = YourAdapter() + adapter.client = mock_client.return_value + + sql, error = adapter.generate_sql("test prompt") + + assert sql == "SELECT * FROM data" + assert error == "" + mock_client.return_value.generate.assert_called_once() +``` + +### Integration Tests + +Test with real API (mark as slow): + +```python +@pytest.mark.slow +@pytest.mark.integration +def test_real_api_call(): + """Test with real API (requires valid credentials).""" + adapter = YourAdapter() + + if not adapter.is_available(): + pytest.skip("API not configured") + + prompt = "SELECT COUNT(*) FROM test_table" + sql, error = adapter.generate_sql(prompt) + + assert sql + assert not error + assert "SELECT" in 
sql.upper() +``` + +### Mock Server for Testing + +Create a mock API server for testing: + +```python +import pytest +from flask import Flask, request, jsonify +import threading + +@pytest.fixture +def mock_api_server(): + """Start a mock API server for testing.""" + app = Flask(__name__) + + @app.route('/api/generate', methods=['POST']) + def generate(): + data = request.json + return jsonify({ + "response": "SELECT * FROM data LIMIT 10", + "model": data.get('model', 'test-model') + }) + + server = threading.Thread(target=lambda: app.run(port=5555)) + server.daemon = True + server.start() + + yield "http://localhost:5555" + + # Server stops when test ends + +def test_with_mock_server(mock_api_server): + """Test adapter with mock server.""" + os.environ['ENGINE_BASE_URL'] = mock_api_server + + adapter = YourAdapter() + sql, error = adapter.generate_sql("test") + + assert sql == "SELECT * FROM data LIMIT 10" +``` + +--- + +## Documentation + +### Setup Guide + +Create `docs/YOUR_ENGINE_SETUP.md`: + +```markdown +# [Your Engine] Setup Guide + +## Overview +Brief description of the AI engine and why users might choose it. + +## Prerequisites +- Requirements (API keys, installations, etc.) +- Cost considerations +- Account setup + +## Installation + +### 1. Get API Credentials +Steps to obtain API key... + +### 2. Install Dependencies +```bash +pip install your-engine-library +``` + +### 3. Configure converSQL +Add to `.env`: +```bash +AI_PROVIDER=your_engine +YOUR_ENGINE_API_KEY=xxx +YOUR_ENGINE_MODEL=model-name +``` + +## Testing +How to test the setup... + +## Troubleshooting +Common issues and solutions... +``` + +### Update Main Documentation + +Add to `README.md`: +- List your engine as supported +- Add badge if applicable +- Link to setup guide + +Update `docs/AI_ENGINES.md`: +- Add your engine to the list +- Provide implementation example +- Note any special considerations + +--- + +## Performance Considerations + +### Caching + +Implement caching for expensive operations: + +```python +from functools import lru_cache + +@lru_cache(maxsize=100) +def _cached_generate(prompt_hash: str, schema_hash: str) -> Tuple[str, str]: + """Cache SQL generation results.""" + # This will be called by generate_sql + pass + +def generate_sql(self, prompt: str) -> Tuple[str, str]: + """Generate SQL with caching.""" + prompt_hash = hashlib.md5(prompt.encode()).hexdigest() + + # Check cache + try: + return self._cached_generate(prompt_hash, schema_hash) + except: + pass + + # Generate if not cached + return self._make_api_call(prompt) +``` + +### Rate Limiting + +Implement rate limiting to avoid API throttling: + +```python +import time +from collections import deque + +class RateLimiter: + def __init__(self, max_calls: int, period: int): + self.max_calls = max_calls + self.period = period + self.calls = deque() + + def wait_if_needed(self): + """Wait if rate limit would be exceeded.""" + now = time.time() + + # Remove old calls outside the period + while self.calls and self.calls[0] < now - self.period: + self.calls.popleft() + + # Wait if at limit + if len(self.calls) >= self.max_calls: + sleep_time = self.period - (now - self.calls[0]) + if sleep_time > 0: + time.sleep(sleep_time) + + self.calls.append(now) + +class YourAdapter: + def __init__(self): + self.rate_limiter = RateLimiter(max_calls=10, period=60) # 10 calls per minute + self._initialize() + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """Generate SQL with rate limiting.""" + self.rate_limiter.wait_if_needed() + + # Make API 
call + return self._make_api_call(prompt) +``` + +### Async Support (Advanced) + +For high-throughput scenarios: + +```python +import asyncio +from typing import Tuple + +class AsyncAdapter: + async def generate_sql_async(self, prompt: str) -> Tuple[str, str]: + """Async SQL generation.""" + if not self.client: + return "", "Client not available" + + try: + response = await self.client.generate_async(prompt) + return response.text, "" + except Exception as e: + return "", str(e) + +# Usage +async def process_multiple_queries(queries): + adapter = AsyncAdapter() + tasks = [adapter.generate_sql_async(q) for q in queries] + results = await asyncio.gather(*tasks) + return results +``` + +--- + +## Community Checklist + +Before submitting your AI engine contribution: + +- [ ] **Implementation** + - [ ] Adapter class follows interface + - [ ] Error handling implemented + - [ ] Configuration validation + - [ ] Initialization feedback (console messages) + +- [ ] **Testing** + - [ ] Unit tests written + - [ ] Integration tests (optional) + - [ ] Manual testing completed + - [ ] Edge cases handled + +- [ ] **Documentation** + - [ ] Setup guide created + - [ ] Configuration documented + - [ ] README updated + - [ ] AI_ENGINES.md updated + - [ ] Code comments added + +- [ ] **Dependencies** + - [ ] Added to requirements.txt + - [ ] Version pinned appropriately + - [ ] No conflicts with existing packages + +- [ ] **Configuration** + - [ ] .env.example updated + - [ ] Default values sensible + - [ ] Environment variables documented + +- [ ] **Code Quality** + - [ ] Follows project style guidelines + - [ ] Type hints added + - [ ] Docstrings complete + - [ ] No hardcoded secrets + +--- + +## Getting Help + +- **Review existing adapters**: `src/ai_service.py` +- **Check documentation**: `docs/` +- **Ask in discussions**: GitHub Discussions +- **Open an issue**: Use "AI Engine Contribution" template + +--- + +## Examples of Good Contributions + +Look at these PRs for inspiration: +- (Future: Link to actual PRs once we have community contributions) + +--- + +## Thank You! + +Every new AI engine makes converSQL more flexible and accessible. Your contribution helps the entire community! + +**Questions?** Open an issue or discussion on GitHub. + +--- + +**Happy Coding! πŸš€** + +*Making converSQL support every AI engine, together.* diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..8bc5c25 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,696 @@ +# converSQL Architecture + +## Overview + +converSQL follows a **clean, layered architecture** designed for modularity, extensibility, and maintainability. This document provides a deep dive into the system design, component interactions, and architectural decisions. 
+ +--- + +## Architectural Layers + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Presentation Layer β”‚ +β”‚ β€’ Streamlit UI β”‚ +β”‚ β€’ Interactive Components β”‚ +β”‚ β€’ Visualization β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Application Layer β”‚ +β”‚ β€’ Query Builder Logic β”‚ +β”‚ β€’ User Session Management β”‚ +β”‚ β€’ Authentication β”‚ +β”‚ β€’ Caching β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ AI Engine Layer β”‚ +β”‚ β€’ Adapter Pattern β”‚ +β”‚ β€’ Multiple AI Providers (Bedrock, Claude, Gemini, Ollama) β”‚ +β”‚ β€’ Prompt Engineering β”‚ +β”‚ β€’ Response Parsing β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Intelligence Layer β”‚ +β”‚ β€’ Ontological Data Dictionary β”‚ +β”‚ β€’ Schema Context Generation β”‚ +β”‚ β€’ Business Rules Engine β”‚ +β”‚ β€’ Semantic Relationships β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Data Access Layer β”‚ +β”‚ β€’ DuckDB Query Engine β”‚ +β”‚ β€’ Parquet File Management β”‚ +β”‚ β€’ SQL Execution β”‚ +β”‚ β€’ Result Formatting β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Storage Layer β”‚ +β”‚ β€’ Parquet Files (local or R2) β”‚ +β”‚ β€’ Cloudflare D1 (logging) β”‚ +β”‚ β€’ Session State β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## Core Components + +### 1. 
Application Entry Point (`app.py`) + +**Responsibility**: Main Streamlit application and UI orchestration + +**Key Functions**: +- Initialize session state and caching +- Render UI components (tabs, forms, visualizations) +- Handle user interactions +- Coordinate between authentication, AI service, and data layers + +**Design Patterns**: +- **Singleton Pattern**: AI service and authentication service cached as singletons +- **Facade Pattern**: Wraps complex subsystems with simple interfaces +- **Observer Pattern**: Streamlit's reactive model for state changes + +**Code Structure**: +```python +# Initialization +initialize_app_data() # Load data, AI, schema + +# Authentication wrapper +simple_auth_wrapper(main)() # Protects main app + +# Main UI +def main(): + # Tabs: Query Builder, Ontology Explorer, Advanced + tab1, tab2, tab3 = st.tabs([...]) + + with tab1: # Query Builder + # Natural language input + # AI SQL generation + # Query execution + # Results display +``` + +### 2. Core Module (`src/core.py`) + +**Responsibility**: Business logic and data operations + +**Key Functions**: +```python +def scan_parquet_files() -> List[str] + """Discover available Parquet files.""" + +def get_table_schemas(parquet_files: List[str]) -> str + """Generate schema context with ontology.""" + +def execute_sql_query(sql_query: str, parquet_files: List[str]) -> pd.DataFrame + """Execute SQL using DuckDB.""" + +def get_analyst_questions() -> Dict[str, str] + """Provide pre-built analytical questions.""" +``` + +**Caching Strategy**: +- `@st.cache_data(ttl=3600)` for expensive operations +- File scans cached to reduce I/O +- Schema generation cached to reduce processing + +**Design Decisions**: +- **Separation of Concerns**: Data operations separate from UI +- **Dependency Injection**: Pass parquet_files explicitly +- **Fail-Safe Defaults**: Graceful degradation if data unavailable + +### 3. AI Service Module (`src/ai_service.py`) + +**Responsibility**: AI provider management and SQL generation + +**Architecture**: +``` +AIService (Orchestrator) +β”œβ”€β”€ BedrockClient (AWS Bedrock) +β”œβ”€β”€ ClaudeClient (Anthropic API) +β”œβ”€β”€ GeminiAdapter (Google Gemini) [Future] +└── OllamaAdapter (Local/Self-hosted) [Future] +``` + +**Key Classes**: + +```python +class AIService: + def __init__(self): + self.bedrock = BedrockClient() + self.claude = ClaudeClient() + self._determine_active_provider() + + def generate_sql(self, question: str, schema: str) -> Tuple[str, str, str]: + """Route to active provider, return (sql, error, provider).""" +``` + +**Provider Selection Logic**: +1. Check `AI_PROVIDER` environment variable +2. Verify provider is available (API key, connectivity) +3. Fallback to next available provider +4. Return error if none available + +**Prompt Engineering**: +- Context-rich prompts with ontology +- Domain-specific instructions (mortgage analytics) +- Business rule integration +- Output format specification + +**Caching**: +- `@st.cache_data` with prompt hashing +- Configurable TTL (default 1 hour) +- Cache invalidation on schema changes + +### 4. Data Dictionary Module (`src/data_dictionary.py`) + +**Responsibility**: Ontological data modeling + +**Structure**: +```python +LOAN_ONTOLOGY = { + "DOMAIN_NAME": { + "domain_description": "...", + "primary_key": "...", + "fields": { + "FIELD_NAME": FieldMetadata( + description="...", + domain="...", + data_type="...", + business_context="...", + risk_impact="...", + values={...}, + relationships=[...] 
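+                # descriptions, values, and relationships feed the AI's schema
+                # context (see generate_enhanced_schema_context below)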
+ ) + } + } +} +``` + +**Key Functions**: +```python +def generate_enhanced_schema_context(parquet_files: List[str]) -> str + """Generate schema with ontological enrichment.""" + +def get_field_metadata(field_name: str) -> FieldMetadata + """Retrieve metadata for a specific field.""" +``` + +**Design Benefits**: +- **Semantic Understanding**: AI understands business context +- **Relationship Mapping**: Cross-field dependencies documented +- **Business Rules**: Encoded once, used everywhere +- **Extensibility**: Easy to add new domains + +### 5. Authentication Module (`src/simple_auth.py`) + +**Responsibility**: User authentication and logging + +**Components**: +```python +class AuthService: + def is_enabled(self) -> bool + """Check if auth is configured.""" + + def is_authenticated(self) -> bool + """Check user authentication status.""" + + def handle_oauth_callback(self) + """Process Google OAuth callback.""" + + def log_query(self, question, sql, provider, time) + """Log query to Cloudflare D1.""" +``` + +**Flow**: +1. User accesses app +2. Check authentication status +3. If not authenticated β†’ OAuth redirect +4. Handle OAuth callback +5. Store user session +6. Log queries to D1 + +**Security**: +- Google OAuth 2.0 +- Session state in Streamlit +- No local password storage +- HTTPS required in production + +--- + +## Data Flow + +### Query Generation Flow + +``` +User Question + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Streamlit UI β”‚ +β”‚ (app.py) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Core Module β”‚ +β”‚ (src/core.py) β”‚ +β”‚ - Load schema β”‚ +β”‚ - Get ontology β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ AI Service β”‚ +β”‚ (src/ai_service.py) β”‚ +β”‚ - Build prompt β”‚ +β”‚ - Call AI provider β”‚ +β”‚ - Parse response β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ DuckDB Engine β”‚ +β”‚ (src/core.py) β”‚ +β”‚ - Execute SQL β”‚ +β”‚ - Return results β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Streamlit UI β”‚ +β”‚ - Format results β”‚ +β”‚ - Display table β”‚ +β”‚ - Show metrics β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### Data Pipeline Flow + +``` +Raw Data (Fannie Mae) + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Ingestion β”‚ +β”‚ (scripts/sync_data) β”‚ +β”‚ - Download from R2 β”‚ +β”‚ - Validate format β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Transformation β”‚ +β”‚ (notebooks/) β”‚ +β”‚ - Parse CSV/pipe β”‚ +β”‚ - Apply schema β”‚ +β”‚ - Cast types β”‚ +β”‚ - Validate data β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Parquet Storage β”‚ +β”‚ (data/processed/) β”‚ +β”‚ - Write Parquet β”‚ +β”‚ - SNAPPY compress β”‚ +β”‚ - Add metadata β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό 
+β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Query Engine β”‚ +β”‚ (DuckDB) β”‚ +β”‚ - Load Parquet β”‚ +β”‚ - Execute queries β”‚ +β”‚ - Return results β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## Design Patterns + +### 1. Adapter Pattern (AI Engines) + +**Problem**: Multiple AI providers with different APIs + +**Solution**: Unified interface with provider-specific adapters + +```python +class AIEngineAdapter: + def is_available(self) -> bool: ... + def generate_sql(self, prompt: str) -> Tuple[str, str]: ... + +class BedrockClient(AIEngineAdapter): ... +class ClaudeClient(AIEngineAdapter): ... +class GeminiAdapter(AIEngineAdapter): ... +``` + +**Benefits**: +- Add new providers without changing core logic +- Easy to test with mock adapters +- Clear separation of concerns + +### 2. Facade Pattern (Core Module) + +**Problem**: Complex subsystems (file I/O, DuckDB, caching) + +**Solution**: Simple unified interface + +```python +# Complex operations hidden behind simple functions +def execute_sql_query(sql: str, files: List[str]) -> pd.DataFrame: + # Handles: connection, table registration, execution, cleanup +``` + +**Benefits**: +- Simple API for UI layer +- Easy to refactor internals +- Reduced coupling + +### 3. Strategy Pattern (Ontology) + +**Problem**: Different domains need different business rules + +**Solution**: Ontology-driven strategy selection + +```python +# AI service selects strategies based on ontology +if domain == "CREDIT_RISK": + # Apply credit risk rules +elif domain == "GEOGRAPHIC": + # Apply geographic rules +``` + +**Benefits**: +- Domain-specific intelligence +- Easy to extend to new domains +- Centralized business logic + +### 4. Singleton Pattern (Services) + +**Problem**: Expensive initialization (AI clients, auth) + +**Solution**: Cached singleton instances + +```python +@st.cache_resource +def get_ai_service() -> AIService: + return AIService() # Created once, reused +``` + +**Benefits**: +- Avoid repeated API connections +- Faster subsequent requests +- Lower resource usage + +--- + +## Configuration Management + +### Environment Variables + +```bash +# Core Configuration +PROCESSED_DATA_DIR=data/processed/ +DEMO_MODE=false + +# AI Provider Selection +AI_PROVIDER=claude # bedrock, claude, gemini, ollama + +# Claude API +CLAUDE_API_KEY=sk-ant-... +CLAUDE_MODEL=claude-3-5-sonnet-20241022 + +# AWS Bedrock +AWS_DEFAULT_REGION=us-west-2 +BEDROCK_MODEL_ID=anthropic.claude-3-5-haiku-20241022-v1:0 + +# Authentication +ENABLE_AUTH=true +GOOGLE_CLIENT_ID=... +GOOGLE_CLIENT_SECRET=... + +# Cloudflare +R2_ACCOUNT_ID=... +R2_BUCKET_NAME=... +D1_DATABASE_ID=... + +# Performance +CACHE_TTL=3600 +ENABLE_PROMPT_CACHE=true +``` + +### Configuration Priority + +1. Environment variables (`.env`) +2. Default values in code +3. Fallback to safe defaults + +--- + +## Performance Optimizations + +### 1. Caching Strategy + +**Data Caching**: +```python +@st.cache_data(ttl=3600) +def scan_parquet_files(): ... + +@st.cache_data(ttl=3600) +def get_table_schemas(): ... +``` + +**Resource Caching**: +```python +@st.cache_resource +def load_ai_client(): ... + +@st.cache_resource +def get_auth_service(): ... +``` + +### 2. Lazy Loading + +- Parquet files loaded only when queried +- Schema generated on first request +- AI client initialized on first use + +### 3. 
DuckDB Optimizations + +- **Columnar reads**: Only requested columns loaded +- **Predicate pushdown**: Filters applied at file level +- **Statistics**: Parquet metadata for query planning +- **Zero-copy**: Direct Parquet file access + +### 4. UI Optimizations + +- **Component keys**: Prevent unnecessary re-renders +- **Session state**: Preserve expensive computations +- **Incremental updates**: Only re-render changed components + +--- + +## Security Considerations + +### 1. Authentication + +- Google OAuth 2.0 (industry standard) +- No password storage +- Session-based authentication +- HTTPS required + +### 2. Data Access + +- No user-uploaded SQL (prevents injection) +- AI-generated queries only +- Parameterized queries where applicable +- Read-only database access + +### 3. API Keys + +- Environment variables only +- Never committed to git +- Rotated regularly +- Separate keys per environment + +### 4. Logging + +- Query logging to D1 +- No sensitive data in logs +- User actions tracked for audit + +--- + +## Scalability + +### Current Capacity + +- **Data**: 10M+ rows performant +- **Users**: Single-instance, supports 10-50 concurrent +- **Queries**: Sub-second response for most queries + +### Scaling Strategies + +**Vertical Scaling** (current): +- More memory for larger datasets +- Faster CPU for query execution +- SSD for faster I/O + +**Horizontal Scaling** (future): +- Multiple Streamlit instances behind load balancer +- Shared R2 storage +- Distributed query engine (Spark) + +**Data Scaling**: +- Partition Parquet files by time/geography +- Incremental updates instead of full reloads +- Archive old data to cold storage + +--- + +## Testing Strategy + +### Unit Tests +```python +# Test individual functions +def test_execute_sql_query(): + result = execute_sql_query("SELECT COUNT(*) FROM data", files) + assert len(result) > 0 +``` + +### Integration Tests +```python +# Test component interactions +def test_ai_to_database_flow(): + sql, error = generate_sql_with_ai(question, schema) + result = execute_sql_query(sql, files) + assert not result.empty +``` + +### End-to-End Tests +```python +# Test complete user flows +def test_query_generation_flow(): + # User enters question + # AI generates SQL + # Query executes + # Results displayed +``` + +--- + +## Deployment Architecture + +### Local Development +``` +Developer Machine +β”œβ”€β”€ Python 3.11+ +β”œβ”€β”€ Streamlit +β”œβ”€β”€ DuckDB +β”œβ”€β”€ Parquet files (local) +└── Environment variables (.env) +``` + +### Production (Streamlit Cloud) +``` +Streamlit Cloud +β”œβ”€β”€ App container +β”œβ”€β”€ Environment secrets +β”œβ”€β”€ Cloudflare R2 (data storage) +β”œβ”€β”€ Cloudflare D1 (logging) +└── Google OAuth (authentication) +``` + +### Production (Self-Hosted) +``` +Server/Container +β”œβ”€β”€ Docker container +β”œβ”€β”€ Reverse proxy (nginx) +β”œβ”€β”€ HTTPS certificate +β”œβ”€β”€ Environment variables +└── Volume mounts (data) +``` + +--- + +## Error Handling + +### Graceful Degradation + +1. **AI unavailable** β†’ Show manual SQL option +2. **Data missing** β†’ Show helpful setup instructions +3. **Auth unavailable** β†’ Demo mode (if enabled) +4. **Query errors** β†’ Show error, suggest corrections + +### Error Boundaries + +```python +try: + result = execute_sql_query(sql, files) +except Exception as e: + logger.error(f"Query failed: {e}") + st.error("Query execution failed. 
Please check SQL syntax.") + return pd.DataFrame() # Empty result +``` + +--- + +## Monitoring & Observability + +### Logging + +- Application logs (stdout) +- Query logs (D1 database) +- Error logs (stderr) +- Performance metrics (execution times) + +### Metrics + +- Query success/failure rates +- Response times (AI, database) +- User activity patterns +- Resource utilization + +--- + +## Future Enhancements + +### Planned Improvements + +1. **Multi-table queries** - JOIN support with relationship intelligence +2. **Query explanation** - Visualize query plan and logic +3. **Historical learning** - Learn from past queries +4. **API mode** - Programmatic access without UI +5. **Real-time collaboration** - Multiple users sharing queries +6. **Advanced visualizations** - Interactive charts and dashboards + +--- + +## Related Documentation + +- **[Data Pipeline](DATA_PIPELINE.md)** - Data transformation architecture +- **[AI Engines](AI_ENGINES.md)** - AI adapter implementation +- **[Contributing](../CONTRIBUTING.md)** - Development guidelines + +--- + +## Questions? + +For architecture questions or suggestions: +- Open an issue with the "architecture" label +- Start a discussion on GitHub +- Review existing PRs for implementation patterns + +--- + +**Built with careful consideration for modularity, extensibility, and maintainability.** + +*Understanding the architecture helps you contribute effectively!* diff --git a/docs/D1_SETUP.md b/docs/D1_SETUP.md index 00ca511..93ed29e 100644 --- a/docs/D1_SETUP.md +++ b/docs/D1_SETUP.md @@ -10,12 +10,12 @@ This guide explains how to set up Cloudflare D1 for logging user activity (login ```bash npm install -g wrangler wrangler login -wrangler d1 create nlptosql-logs +wrangler d1 create converSQL-logs ``` ### 2. Initialize the Database Schema ```bash -wrangler d1 execute nlptosql-logs --file=scripts/d1_schema.sql +wrangler d1 execute converSQL-logs --file=scripts/d1_schema.sql ``` ### 3. Get Database Credentials @@ -70,7 +70,7 @@ Minimal tables: ## Testing ```bash -wrangler d1 execute nlptosql-logs --command="SELECT COUNT(*) FROM user_logins;" +wrangler d1 execute converSQL-logs --command="SELECT COUNT(*) FROM user_logins;" ``` --- diff --git a/docs/comprehensive_data_dictionary.md b/docs/DATA_DICTIONARY.md similarity index 99% rename from docs/comprehensive_data_dictionary.md rename to docs/DATA_DICTIONARY.md index 11a0d89..b4c1e5f 100644 --- a/docs/comprehensive_data_dictionary.md +++ b/docs/DATA_DICTIONARY.md @@ -3,7 +3,7 @@ **Dataset Overview:** - Total Columns: 110 - Total Rows: 9,091,836 -- File: `/Users/ravi/projects/git/nlptosql/data/processed/data.parquet` +- File: `/Users/ravi/projects/git/converSQL/data/processed/data.parquet` - Date Generated: September 15, 2025 ## Column Categories and Complete Listing diff --git a/docs/DATA_PIPELINE.md b/docs/DATA_PIPELINE.md new file mode 100644 index 0000000..c99704e --- /dev/null +++ b/docs/DATA_PIPELINE.md @@ -0,0 +1,595 @@ +# Data Pipeline Documentation + +## Overview + +The **converSQL Single Family Loan Analytics** showcase implementation demonstrates a complete, production-grade data engineering pipeline. This document details how we transform Fannie Mae's public loan performance data from raw pipe-separated files into a high-performance analytical data store. + +--- + +## Data Source: Fannie Mae Single Family Loan Performance Data + +### What is This Data? 
+ +Fannie Mae's [Single Family Loan Performance Data](https://capitalmarkets.fanniemae.com/tools-applications/data-dynamics) represents one of the most comprehensive public datasets on U.S. residential mortgage markets. Released under Fannie Mae's open data initiative, it provides loan-level detail on millions of mortgages, enabling researchers, analysts, and developers to study mortgage credit risk, housing finance, and loan performance trends. + +### Data Characteristics + +- **Scale**: 9+ million loan records in our sample dataset +- **Columns**: 110 fields covering identification, origination, property, borrower, and performance data +- **Time Span**: Vintages from 2000s through recent years, with monthly performance updates +- **Format**: Pipe-separated text files (`.txt` with `|` delimiter) +- **Updates**: Quarterly releases with new originations and updated performance data + +### Key Data Domains + +The dataset is organized into rich ontological domains: + +1. **Identification** (7 fields): Loan IDs, pool identifiers, servicer information +2. **Temporal** (15 fields): Origination dates, maturity dates, reporting periods +3. **Loan Terms** (13 fields): Interest rates, loan terms, product types +4. **Financial** (8 fields): Unpaid principal balances (UPB), accrued interest +5. **Borrower Profile** (8 fields): Credit scores (FICO), debt-to-income ratios +6. **Property** (8 fields): Property types, occupancy, geographic location +7. **Credit Risk** (3 fields): Loan-to-value ratios (LTV), combined LTV, mortgage insurance +8. **Performance** (6 fields): Delinquency status, default indicators +9. **Modifications** (7 fields): Loan modification history and terms +10. **Loss Events** (14 fields): Foreclosure, REO, and disposition tracking +11. **Geographic** (multiple): State, MSA, ZIP codes +12. **Pricing** (6 fields): Interest rate components and adjustments +13. **Servicing** (8 fields): Servicer transfers, fees, escrow balances +14. **Special Features** (multiple): High balance loans, first-time buyers, special programs +15. 
**Calculations** (derived): Loan age, remaining terms, vintage cohorts + +--- + +## Pipeline Architecture + +### High-Level Flow + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Raw Data Source β”‚ +β”‚ (Fannie Mae .txt) β”‚ +β”‚ Pipe-separated β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Ingestion Layer β”‚ +β”‚ - Download/Sync β”‚ +β”‚ - Validate Format β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Transformation β”‚ +β”‚ - Schema Mapping β”‚ +β”‚ - Type Casting β”‚ +β”‚ - Data Validation β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Parquet Output β”‚ +β”‚ - SNAPPY Compress β”‚ +β”‚ - Columnar Format β”‚ +β”‚ - Metadata Rich β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Analytical Layer β”‚ +β”‚ - DuckDB Engine β”‚ +β”‚ - Fast Queries β”‚ +β”‚ - Ontology-Aware β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## Stage 1: Data Ingestion + +### Source Data Format + +Fannie Mae distributes loan performance data as pipe-separated text files: + +``` +POOL_ID|LOAN_ID|ACT_PERIOD|CHANNEL|SELLER|SERVICER|... +ABC123|100000000001|012023|R|SELLER_A|SERVICER_X|... +ABC123|100000000002|012023|R|SELLER_B|SERVICER_Y|... +``` + +**Challenges:** +- **No explicit schema**: Field types are implicit +- **Large file sizes**: 1GB+ raw text files +- **Quarterly updates**: Need to merge/update existing data +- **Data quality**: Missing values, inconsistent formats + +### Ingestion Process + +Our ingestion layer (`scripts/sync_data.py`) handles: + +1. **Download/Sync from Cloudflare R2** (or local storage) +2. **File validation**: Check format, delimiter, basic structure +3. **Header parsing**: Extract column names from first row +4. **Chunked reading**: Process large files in manageable batches + +```python +# Example: Reading pipe-separated file +import pandas as pd + +df = pd.read_csv( + 'raw_data.txt', + sep='|', + dtype='str', # Read all as strings initially + na_values=['', 'NULL', 'NA'], + keep_default_na=False, + low_memory=False +) +``` + +--- + +## Stage 2: Schema Transformation + +### The Schema Challenge + +Fannie Mae provides data dictionaries, but the raw files have no enforced types. Columns like `ORIG_UPB` (unpaid principal balance) are stored as strings, and credit scores may mix integers with nulls or special codes. + +### Our Schema Solution + +We define an **explicit, typed schema** for all 110 columns: + +```python +SCHEMA = { + # Identifiers - VARCHAR for flexibility + 'LOAN_ID': 'VARCHAR', + 'POOL_ID': 'VARCHAR', + + # Financial - appropriate numeric types + 'ORIG_UPB': 'DOUBLE', # Large dollar amounts + 'CURRENT_UPB': 'DOUBLE', + + # Credit scores - INT16 (range: 300-850) + 'CSCORE_B': 'INT16', + 'CSCORE_C': 'INT16', + + # Ratios and percentages - FLOAT + 'OLTV': 'FLOAT', + 'DTI': 'FLOAT', + 'ORIG_RATE': 'FLOAT', + + # Dates - VARCHAR (MMYYYY format) + 'ORIG_DATE': 'VARCHAR', + 'ACT_PERIOD': 'VARCHAR', + + # Categorical - VARCHAR + 'STATE': 'VARCHAR', + 'PURPOSE': 'VARCHAR', + 'PROP': 'VARCHAR', + + # ... 
and 100+ more fields +} +``` + +### Type Casting Logic + +The transformation layer (`notebooks/pipeline_csv_to_parquet.ipynb`) applies sophisticated type casting: + +```python +def cast_columns(df, schema): + for col, dtype in schema.items(): + if col not in df.columns: + continue + + if dtype == 'DOUBLE' or dtype == 'FLOAT': + # Handle currency and percentages + df[col] = pd.to_numeric(df[col], errors='coerce') + + elif dtype.startswith('INT'): + # Handle integers with nulls + df[col] = pd.to_numeric(df[col], errors='coerce') + if dtype == 'INT16': + df[col] = df[col].astype('Int16') # Nullable integer + elif dtype == 'INT8': + df[col] = df[col].astype('Int8') + + elif dtype == 'VARCHAR': + # Keep as string, handle nulls + df[col] = df[col].astype('str') + df[col] = df[col].replace(['nan', 'None'], pd.NA) + + return df +``` + +### Data Validation + +Post-transformation, we validate: +- **Range checks**: Credit scores 300-850, LTV 0-200% +- **Referential integrity**: Valid state codes, property types +- **Business rules**: Origination date <= current period +- **Null handling**: Expected nulls (co-borrower fields) vs. data quality issues + +--- + +## Stage 3: Parquet Storage + +### Why Parquet? + +[Apache Parquet](https://parquet.apache.org/) is a columnar storage format ideal for analytical workloads: + +- **Columnar layout**: Read only the columns you need +- **Compression**: SNAPPY, GZIP, or ZSTD reduce file size 5-15x +- **Type preservation**: Native support for integers, floats, decimals +- **Metadata**: Embedded schema and statistics for query optimization +- **Ecosystem support**: Works with DuckDB, Pandas, Spark, etc. + +### Our Parquet Configuration + +```python +import pyarrow as pa +import pyarrow.parquet as pq + +# Define schema with explicit types +schema = pa.schema([ + ('LOAN_ID', pa.string()), + ('ORIG_UPB', pa.float64()), + ('CSCORE_B', pa.int16()), + ('STATE', pa.string()), + # ... all 110 fields +]) + +# Write with SNAPPY compression +table = pa.Table.from_pandas(df, schema=schema) +pq.write_table( + table, + 'data.parquet', + compression='SNAPPY', + use_dictionary=True, # Compress repetitive values + write_statistics=True # Enable query optimization +) +``` + +### Performance Gains + +| Metric | Raw .txt | Parquet (SNAPPY) | Improvement | +|--------|----------|------------------|-------------| +| File Size | 1.2 GB | 120 MB | **10x reduction** | +| Full Scan | 45 seconds | 2 seconds | **22x faster** | +| Column Select | 45 seconds | 0.3 seconds | **150x faster** | +| Memory Usage | 3.5 GB | 450 MB | **7.8x reduction** | + +--- + +## Stage 4: DuckDB Integration + +### Why DuckDB? 
+ +[DuckDB](https://duckdb.org/) is an embedded analytical database designed for fast OLAP queries: + +- **Zero-copy reads**: Queries Parquet files directly without loading into memory +- **Vectorized execution**: SIMD optimizations for blazing-fast aggregations +- **SQL support**: Full SQL-92 compliance with analytical extensions +- **Embedded**: No server setup, runs in-process with Python + +### Query Examples + +**Simple aggregation:** +```sql +SELECT STATE, COUNT(*) as loan_count, AVG(ORIG_UPB) as avg_balance +FROM 'data/processed/data.parquet' +GROUP BY STATE +ORDER BY loan_count DESC +LIMIT 10; +``` +*Execution time: 0.15 seconds for 9M rows* + +**Complex analytical query:** +```sql +SELECT + CASE + WHEN CSCORE_B >= 740 THEN 'Super Prime' + WHEN CSCORE_B >= 680 THEN 'Prime' + WHEN CSCORE_B >= 620 THEN 'Near Prime' + ELSE 'Subprime' + END as credit_tier, + COUNT(*) as loans, + ROUND(AVG(OLTV), 1) as avg_ltv, + ROUND(AVG(DTI), 1) as avg_dti, + SUM(CASE WHEN DLQ_STATUS > '00' THEN 1 ELSE 0 END) as delinquent, + ROUND(SUM(CURRENT_UPB)/1000000000, 2) as total_upb_billions +FROM 'data/processed/data.parquet' +WHERE CSCORE_B IS NOT NULL +GROUP BY credit_tier +ORDER BY MIN(CSCORE_B); +``` +*Execution time: 0.8 seconds with aggregations and conditional logic* + +### Integration with converSQL + +Our `src/core.py` module wraps DuckDB for seamless querying: + +```python +import duckdb + +def execute_sql_query(sql_query: str, parquet_files: List[str]): + """Execute SQL on Parquet files using DuckDB.""" + conn = duckdb.connect() + + # Register Parquet files as tables + for file_path in parquet_files: + table_name = os.path.splitext(os.path.basename(file_path))[0] + conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM '{file_path}'") + + # Execute query + result_df = conn.execute(sql_query).fetchdf() + conn.close() + + return result_df +``` + +--- + +## Data Ontology Integration + +### From Raw Data to Business Intelligence + +The pipeline doesn't just transform file formatsβ€”it infuses **business intelligence** through our ontological data dictionary. + +### Ontology Structure + +Located in `src/data_dictionary.py`, the ontology defines: + +1. **Domain organization**: 15 business domains (Credit Risk, Geographic, Temporal, etc.) +2. **Field metadata**: Description, business context, risk implications +3. **Relationships**: Cross-field dependencies and calculations +4. **Value codes**: Enumerated types (e.g., DLQ_STATUS codes, PURPOSE types) +5. **Business rules**: Credit score tiers, LTV risk bands, vintage cohorts + +### Example: Credit Risk Domain + +```python +LOAN_ONTOLOGY = { + "CREDIT_RISK": { + "domain_description": "Borrower credit quality and equity position indicators", + "fields": { + "CSCORE_B": FieldMetadata( + description="Primary borrower credit score (FICO)", + domain="Credit_Risk", + data_type="INT16", + business_context="Primary indicator of credit quality. Scores 740+ are super prime, 680-739 prime, 620-679 near prime, <620 subprime.", + risk_impact="Primary driver of default risk. Each 20-point decrease in score doubles default probability.", + relationships=["credit_tier", "default_risk", "pricing"] + ), + "OLTV": FieldMetadata( + description="Original loan-to-value ratio (%)", + domain="Credit_Risk", + data_type="FLOAT", + business_context="Measures borrower equity at origination. LTV >80% typically requires mortgage insurance. LTV >95% indicates minimal down payment.", + risk_impact="Higher LTV = lower equity cushion = higher default and loss severity risk. 
80% is key threshold.", + relationships=["equity_position", "mi_requirement", "loss_severity"] + ) + } + } +} +``` + +### How AI Uses Ontology + +When converSQL generates SQL, it leverages ontology to: + +1. **Map concepts to fields**: "high-risk loans" β†’ `CSCORE_B < 620 AND OLTV > 90` +2. **Apply business rules**: "super prime borrowers" β†’ `CSCORE_B >= 740` +3. **Generate context-aware calculations**: "portfolio concentration" includes grouping and ratio logic +4. **Provide semantic relationships**: Understanding that `OLTV` relates to `MI_PCT` and loss severity + +--- + +## Pipeline Execution + +### Running the Pipeline + +**Notebooks (Development):** +```bash +# For single file conversion +jupyter notebook notebooks/pipeline_csv_to_parquet.ipynb + +# For multi-file processing +jupyter notebook notebooks/pipeline_csv_to_parquet_multifile.ipynb +``` + +**Scripts (Production):** +```bash +# Sync data from R2 storage +python scripts/sync_data.py + +# Force re-download and re-process +python scripts/sync_data.py --force +``` + +### Configuration + +Set environment variables in `.env`: + +```bash +# Data directories +PROCESSED_DATA_DIR=data/processed/ +RAW_DATA_DIR=data/raw/ + +# Cloudflare R2 (optional) +R2_ACCOUNT_ID=your_account_id +R2_ACCESS_KEY_ID=your_access_key +R2_SECRET_ACCESS_KEY=your_secret_key +R2_BUCKET_NAME=your_bucket_name +``` + +--- + +## Data Quality & Monitoring + +### Quality Checks + +Our pipeline includes automated quality checks: + +```python +def validate_data(df): + """Validate transformed data quality.""" + issues = [] + + # Check for required fields + required_fields = ['LOAN_ID', 'ORIG_UPB', 'STATE'] + missing = [f for f in required_fields if f not in df.columns] + if missing: + issues.append(f"Missing required fields: {missing}") + + # Check credit score ranges + if 'CSCORE_B' in df.columns: + invalid_scores = df[(df['CSCORE_B'] < 300) | (df['CSCORE_B'] > 850)].shape[0] + if invalid_scores > 0: + issues.append(f"{invalid_scores} invalid credit scores (out of 300-850 range)") + + # Check LTV ranges + if 'OLTV' in df.columns: + invalid_ltv = df[(df['OLTV'] < 0) | (df['OLTV'] > 200)].shape[0] + if invalid_ltv > 0: + issues.append(f"{invalid_ltv} invalid LTV values (out of 0-200% range)") + + # Check for duplicates + dupes = df.duplicated(subset=['LOAN_ID', 'ACT_PERIOD']).sum() + if dupes > 0: + issues.append(f"{dupes} duplicate loan-period combinations") + + return issues +``` + +### Monitoring Metrics + +Track pipeline health with: +- **Processing time**: Target <5 minutes per 1M rows +- **Data volume**: Row counts match expected quarterly releases +- **Quality scores**: % of records passing validation +- **File sizes**: Compression ratios within expected range (8-12x) +- **Query performance**: Sample queries execute within SLA + +--- + +## Performance Optimization + +### Optimization Techniques + +1. **Chunked processing**: Read/write large files in 500K row chunks +2. **Column selection**: Only cast columns actually used in queries +3. **Compression tuning**: SNAPPY for speed, GZIP for size +4. **Partitioning**: Consider partitioning by year/quarter for very large datasets +5. **Indexing**: Parquet row groups serve as implicit indexes +6. **Caching**: Reuse Parquet files across application sessions + +### Scaling Considerations + +For datasets beyond 50M rows: + +- **Partitioning by time**: `data_2020Q1.parquet`, `data_2020Q2.parquet`, etc. 
+- **Delta updates**: Only process new/changed records +- **Distributed processing**: Use Spark or Dask for multi-machine transformation +- **Data warehousing**: Consider BigQuery, Snowflake, or Redshift for very large scale + +--- + +## Extending the Pipeline + +### Adapting to Your Data + +To use this pipeline for other datasets: + +1. **Update schema definition**: Define column types for your data +2. **Modify transformation logic**: Adjust type casting and validation +3. **Create custom ontology**: Define your domain-specific ontology +4. **Configure data source**: Point to your raw data location +5. **Test thoroughly**: Validate with sample data before production + +### Example: E-commerce Order Data + +```python +# Define schema +ECOMMERCE_SCHEMA = { + 'order_id': 'VARCHAR', + 'customer_id': 'VARCHAR', + 'order_date': 'DATE', + 'order_amount': 'DOUBLE', + 'product_category': 'VARCHAR', + 'quantity': 'INT16', + 'discount_pct': 'FLOAT' +} + +# Define ontology +ECOMMERCE_ONTOLOGY = { + "CUSTOMER": { + "domain_description": "Customer identification and segmentation", + "fields": { + "customer_id": FieldMetadata( + description="Unique customer identifier", + domain="Customer", + data_type="VARCHAR", + business_context="Primary key for customer analytics" + ) + } + }, + "TRANSACTION": { + "domain_description": "Order and revenue tracking", + "fields": { + "order_amount": FieldMetadata( + description="Total order value in USD", + domain="Transaction", + data_type="DOUBLE", + business_context="Revenue metric for analytics and reporting" + ) + } + } +} +``` + +--- + +## Troubleshooting + +### Common Issues + +**Issue: Out of memory during transformation** +- Solution: Reduce chunk size, process in batches + +**Issue: Parquet write fails with schema errors** +- Solution: Check for incompatible types (e.g., mixed int/float in same column) + +**Issue: Query performance degradation** +- Solution: Check file size, consider partitioning, verify compression + +**Issue: Data quality failures** +- Solution: Review raw data, adjust null handling, update validation rules + +--- + +## Resources + +### Documentation +- [Fannie Mae Data Portal](https://capitalmarkets.fanniemae.com/tools-applications/data-dynamics) +- [Apache Parquet Format](https://parquet.apache.org/docs/) +- [DuckDB Documentation](https://duckdb.org/docs/) +- [Pandas API Reference](https://pandas.pydata.org/docs/) + +### Related Files +- `notebooks/pipeline_csv_to_parquet.ipynb` β€” Single-file transformation notebook +- `notebooks/pipeline_csv_to_parquet_multifile.ipynb` β€” Multi-file batch processing +- `scripts/sync_data.py` β€” Production data sync script +- `src/data_dictionary.py` β€” Ontological data dictionary +- `docs/comprehensive_data_dictionary.md` β€” Complete field reference + +--- + +## Questions? + +For pipeline questions or issues: +- Open an issue on GitHub +- Check existing documentation in `docs/` +- Review notebook examples in `notebooks/` + +**Making data pipelines conversational, one transformation at a time.** πŸš€ diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md index b486d5d..287a5e6 100644 --- a/docs/DEPLOYMENT.md +++ b/docs/DEPLOYMENT.md @@ -7,8 +7,8 @@ Simple guide for deploying the NLP to SQL Streamlit application directly from Gi ### 1. 
Local Development ```bash # Clone repository -git clone https://github.com/ravishan16/nlptosql.git -cd nlptosql +git clone https://github.com/ravishan16/converSQL.git +cd converSQL # Install dependencies pip install -r requirements.txt @@ -22,7 +22,7 @@ streamlit run app.py ### 2. Streamlit Cloud Deployment 1. Visit [share.streamlit.io](https://share.streamlit.io) 2. Connect your GitHub account -3. Select your repository: `ravishan16/nlptosql` +3. Select your repository: `ravishan16/converSQL` 4. Set main file path: `app.py` 5. Configure environment variables (see below) 6. Deploy! @@ -86,7 +86,7 @@ DEMO_MODE=false 1. **Go to Streamlit Cloud**: Visit [share.streamlit.io](https://share.streamlit.io) 2. **Sign in**: Use your GitHub account 3. **New App**: Click "New app" -4. **Select Repository**: Choose `ravishan16/nlptosql` +4. **Select Repository**: Choose `ravishan16/converSQL` 5. **Set Main File**: Enter `app.py` 6. **Branch**: Select `main` (default) 7. **Click Deploy** @@ -219,4 +219,4 @@ echo $CLAUDE_API_KEY - [Streamlit Cloud Documentation](https://docs.streamlit.io/streamlit-cloud) - [Streamlit Community Forum](https://discuss.streamlit.io/) -- [GitHub Repository](https://github.com/ravishan16/nlptosql) \ No newline at end of file +- [GitHub Repository](https://github.com/ravishan16/converSQL) \ No newline at end of file diff --git a/docs/ENVIRONMENT_SETUP.md b/docs/ENVIRONMENT_SETUP.md index f870cff..9ecf4e3 100644 --- a/docs/ENVIRONMENT_SETUP.md +++ b/docs/ENVIRONMENT_SETUP.md @@ -25,18 +25,31 @@ AI_PROVIDER=bedrock AWS_ACCESS_KEY_ID=your_aws_access_key AWS_SECRET_ACCESS_KEY=your_aws_secret_key AWS_DEFAULT_REGION=us-east-1 -BEDROCK_MODEL_ID=anthropic.claude-3-sonnet-20240229-v1:0 +BEDROCK_MODEL_ID=anthropic.claude-3-5-haiku-20241022-v1:0 BEDROCK_MAX_TOKENS=4096 + +# Optional: Bedrock Guardrails for content filtering +BEDROCK_GUARDRAIL_ID=your-guardrail-id +BEDROCK_GUARDRAIL_VERSION=DRAFT # or specific version number ``` #### Anthropic Claude API ```bash AI_PROVIDER=claude -ANTHROPIC_API_KEY=sk-ant-your-api-key-here -CLAUDE_MODEL=claude-3-sonnet-20240229 +CLAUDE_API_KEY=sk-ant-your-api-key-here +CLAUDE_MODEL=claude-3-5-sonnet-20241022 CLAUDE_MAX_TOKENS=4096 ``` +#### Google Gemini +```bash +AI_PROVIDER=gemini +GOOGLE_API_KEY=your-google-api-key +# or +GEMINI_API_KEY=your-google-api-key +GEMINI_MODEL=gemini-1.5-pro +``` + ### Authentication (Optional) ```bash ENABLE_AUTH=false @@ -79,7 +92,7 @@ pip install -r requirements.txt 1. 
Clone the repository and set up your environment: ```bash git clone - cd nlptosql + cd converSQL cp .env.example .env pip install -r requirements.txt ``` diff --git a/notebooks/pipeline_csv_to_parquet multifile.ipynb b/notebooks/pipeline_csv_to_parquet multifile.ipynb new file mode 100644 index 0000000..991be5f --- /dev/null +++ b/notebooks/pipeline_csv_to_parquet multifile.ipynb @@ -0,0 +1,808 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e47151c6", + "metadata": {}, + "source": [ + "# Fannie Mae Loan Performance Data: CSV to Parquet Conversion\n", + "\n", + "## Overview\n", + "This notebook converts Fannie Mae Single-Family Loan Performance CSV files to optimized Parquet format.\n", + "\n", + "**Key Features:**\n", + "- Uses proper column names and data types from the R reference script\n", + "- Handles pipe-separated values (|) format\n", + "- Optimizes memory usage with appropriate data types\n", + "- Provides significant file size reduction through compression\n", + "\n", + "**Input:** Raw CSV files from Fannie Mae (located in `../../data/raw/`)\n", + "**Output:** Optimized Parquet files for efficient analysis (saved to `../../data/processed/`)\n", + "\n", + "**Reference:** Based on `LPPUB_Infile.R` script from Fannie Mae (see `../scripts/`)" + ] + }, + { + "cell_type": "markdown", + "id": "4ee334a6", + "metadata": {}, + "source": [ + "## 1. Import Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b5aacfcb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ“ pyarrow is available\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import os\n", + "import subprocess\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "# Install pyarrow if not already available\n", + "try:\n", + " import pyarrow\n", + " print(\"βœ“ pyarrow is available\")\n", + "except ImportError:\n", + " print(\"Installing pyarrow...\")\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"pyarrow\"])\n", + " import pyarrow\n", + " print(\"βœ“ pyarrow installed successfully\")" + ] + }, + { + "cell_type": "markdown", + "id": "7833716d", + "metadata": {}, + "source": [ + "## 2. Configuration and File Paths" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6aa46ec4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 5 CSV files:\n", + " - 2024Q1.csv (768.9 MB)\n", + " - 2024Q2.csv (822.5 MB)\n", + " - 2024Q3.csv (671.7 MB)\n", + " - 2024Q4.csv (381.3 MB)\n", + " - 2025Q1.csv (117.3 MB)\n", + "\n", + "Combined output: ../../data/processed/data.parquet\n" + ] + } + ], + "source": [ + "# Define paths\n", + "SOURCE_DATA_DIR = Path('../../data/raw')\n", + "PROCESSED_DATA_DIR = Path('../../data/processed')\n", + "\n", + "# Find all CSV files in the raw directory\n", + "csv_files = list(SOURCE_DATA_DIR.glob('*.csv'))\n", + "csv_files.sort() # Sort for consistent processing order\n", + "\n", + "print(f\"Found {len(csv_files)} CSV files:\")\n", + "for csv_file in csv_files:\n", + " file_size = csv_file.stat().st_size / (1024**2) # Size in MB\n", + " print(f\" - {csv_file.name} ({file_size:.1f} MB)\")\n", + "\n", + "# Output file for combined data\n", + "combined_parquet_path = PROCESSED_DATA_DIR / 'data.parquet'\n", + "print(f\"\\nCombined output: {combined_parquet_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d4e30374", + "metadata": {}, + "source": [ + "## 3. 
Column Definitions from Fannie Mae R Script\n", + "\n", + "These column names and types are based on the official `LPPUB_Infile.R` script provided by Fannie Mae." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cfab3599", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total columns: 110\n" + ] + } + ], + "source": [ + "# Column names from LPPUB_Infile.R\n", + "LPPUB_COLUMN_NAMES = [\n", + " \"POOL_ID\", \"LOAN_ID\", \"ACT_PERIOD\", \"CHANNEL\", \"SELLER\", \"SERVICER\",\n", + " \"MASTER_SERVICER\", \"ORIG_RATE\", \"CURR_RATE\", \"ORIG_UPB\", \"ISSUANCE_UPB\",\n", + " \"CURRENT_UPB\", \"ORIG_TERM\", \"ORIG_DATE\", \"FIRST_PAY\", \"LOAN_AGE\",\n", + " \"REM_MONTHS\", \"ADJ_REM_MONTHS\", \"MATR_DT\", \"OLTV\", \"OCLTV\",\n", + " \"NUM_BO\", \"DTI\", \"CSCORE_B\", \"CSCORE_C\", \"FIRST_FLAG\", \"PURPOSE\",\n", + " \"PROP\", \"NO_UNITS\", \"OCC_STAT\", \"STATE\", \"MSA\", \"ZIP\", \"MI_PCT\",\n", + " \"PRODUCT\", \"PPMT_FLG\", \"IO\", \"FIRST_PAY_IO\", \"MNTHS_TO_AMTZ_IO\",\n", + " \"DLQ_STATUS\", \"PMT_HISTORY\", \"MOD_FLAG\", \"MI_CANCEL_FLAG\", \"Zero_Bal_Code\",\n", + " \"ZB_DTE\", \"LAST_UPB\", \"RPRCH_DTE\", \"CURR_SCHD_PRNCPL\", \"TOT_SCHD_PRNCPL\",\n", + " \"UNSCHD_PRNCPL_CURR\", \"LAST_PAID_INSTALLMENT_DATE\", \"FORECLOSURE_DATE\",\n", + " \"DISPOSITION_DATE\", \"FORECLOSURE_COSTS\", \"PROPERTY_PRESERVATION_AND_REPAIR_COSTS\",\n", + " \"ASSET_RECOVERY_COSTS\", \"MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS\",\n", + " \"ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY\", \"NET_SALES_PROCEEDS\",\n", + " \"CREDIT_ENHANCEMENT_PROCEEDS\", \"REPURCHASES_MAKE_WHOLE_PROCEEDS\",\n", + " \"OTHER_FORECLOSURE_PROCEEDS\", \"NON_INTEREST_BEARING_UPB\", \"PRINCIPAL_FORGIVENESS_AMOUNT\",\n", + " \"ORIGINAL_LIST_START_DATE\", \"ORIGINAL_LIST_PRICE\", \"CURRENT_LIST_START_DATE\",\n", + " \"CURRENT_LIST_PRICE\", \"ISSUE_SCOREB\", \"ISSUE_SCOREC\", \"CURR_SCOREB\",\n", + " \"CURR_SCOREC\", \"MI_TYPE\", \"SERV_IND\", \"CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT\",\n", + " \"CUMULATIVE_MODIFICATION_LOSS_AMOUNT\", \"CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS\",\n", + " \"CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS\", \"HOMEREADY_PROGRAM_INDICATOR\",\n", + " \"FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT\", \"RELOCATION_MORTGAGE_INDICATOR\",\n", + " \"ZERO_BALANCE_CODE_CHANGE_DATE\", \"LOAN_HOLDBACK_INDICATOR\", \"LOAN_HOLDBACK_EFFECTIVE_DATE\",\n", + " \"DELINQUENT_ACCRUED_INTEREST\", \"PROPERTY_INSPECTION_WAIVER_INDICATOR\",\n", + " \"HIGH_BALANCE_LOAN_INDICATOR\", \"ARM_5_YR_INDICATOR\", \"ARM_PRODUCT_TYPE\",\n", + " \"MONTHS_UNTIL_FIRST_PAYMENT_RESET\", \"MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET\",\n", + " \"INTEREST_RATE_CHANGE_DATE\", \"PAYMENT_CHANGE_DATE\", \"ARM_INDEX\",\n", + " \"ARM_CAP_STRUCTURE\", \"INITIAL_INTEREST_RATE_CAP\", \"PERIODIC_INTEREST_RATE_CAP\",\n", + " \"LIFETIME_INTEREST_RATE_CAP\", \"MARGIN\", \"BALLOON_INDICATOR\",\n", + " \"PLAN_NUMBER\", \"FORBEARANCE_INDICATOR\", \"HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR\",\n", + " \"DEAL_NAME\", \"RE_PROCS_FLAG\", \"ADR_TYPE\", \"ADR_COUNT\", \"ADR_UPB\", \n", + " \"PAYMENT_DEFERRAL_MOD_EVENT_FLAG\", \"INTEREST_BEARING_UPB\"\n", + "]\n", + "\n", + "print(f\"Total columns: {len(LPPUB_COLUMN_NAMES)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f6be85b4", + "metadata": {}, + "source": [ + "## 4. Optimized Data Types\n", + "\n", + "Define optimized data types for better memory efficiency and performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e23bbd5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data type mappings defined for 109 columns\n" + ] + } + ], + "source": [ + "# Optimized data types based on R script column classes\n", + "OPTIMIZED_DTYPES = {\n", + " # Character/categorical columns\n", + " \"POOL_ID\": \"string\", \"LOAN_ID\": \"string\", \"ACT_PERIOD\": \"string\", \n", + " \"CHANNEL\": \"category\", \"SELLER\": \"category\", \"SERVICER\": \"category\",\n", + " \"MASTER_SERVICER\": \"category\", \"ORIG_DATE\": \"string\", \"FIRST_PAY\": \"string\", \n", + " \"MATR_DT\": \"string\", \"FIRST_FLAG\": \"category\", \"PURPOSE\": \"category\",\n", + " \"PROP\": \"category\", \"OCC_STAT\": \"category\", \"STATE\": \"category\", \n", + " \"MSA\": \"string\", \"ZIP\": \"string\", \"PRODUCT\": \"category\", \n", + " \"PPMT_FLG\": \"category\", \"IO\": \"category\", \"FIRST_PAY_IO\": \"string\", \n", + " \"MNTHS_TO_AMTZ_IO\": \"string\", \"DLQ_STATUS\": \"category\", \"PMT_HISTORY\": \"string\", \n", + " \"MOD_FLAG\": \"category\", \"MI_CANCEL_FLAG\": \"category\", \"Zero_Bal_Code\": \"category\",\n", + " \"ZB_DTE\": \"string\", \"RPRCH_DTE\": \"string\", \"LAST_PAID_INSTALLMENT_DATE\": \"string\",\n", + " \"FORECLOSURE_DATE\": \"string\", \"DISPOSITION_DATE\": \"string\", \"ORIGINAL_LIST_START_DATE\": \"string\",\n", + " \"CURRENT_LIST_START_DATE\": \"string\", \"MI_TYPE\": \"category\", \"SERV_IND\": \"category\",\n", + " \"HOMEREADY_PROGRAM_INDICATOR\": \"category\", \"RELOCATION_MORTGAGE_INDICATOR\": \"category\",\n", + " \"ZERO_BALANCE_CODE_CHANGE_DATE\": \"string\", \"LOAN_HOLDBACK_INDICATOR\": \"category\",\n", + " \"LOAN_HOLDBACK_EFFECTIVE_DATE\": \"string\", \"PROPERTY_INSPECTION_WAIVER_INDICATOR\": \"category\",\n", + " \"HIGH_BALANCE_LOAN_INDICATOR\": \"category\", \"ARM_5_YR_INDICATOR\": \"category\",\n", + " \"ARM_PRODUCT_TYPE\": \"string\", \"INTEREST_RATE_CHANGE_DATE\": \"string\",\n", + " \"PAYMENT_CHANGE_DATE\": \"string\", \"ARM_INDEX\": \"string\", \"ARM_CAP_STRUCTURE\": \"string\",\n", + " \"BALLOON_INDICATOR\": \"category\", \"PLAN_NUMBER\": \"string\", \"FORBEARANCE_INDICATOR\": \"category\",\n", + " \"HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR\": \"category\", \"DEAL_NAME\": \"string\",\n", + " \"RE_PROCS_FLAG\": \"category\", \"ADR_TYPE\": \"string\", \"PAYMENT_DEFERRAL_MOD_EVENT_FLAG\": \"category\",\n", + " \n", + " # Numeric columns with appropriate precision\n", + " \"ORIG_RATE\": \"float32\", \"CURR_RATE\": \"float32\", \"ORIG_UPB\": \"float64\", \"ISSUANCE_UPB\": \"float64\",\n", + " \"CURRENT_UPB\": \"float64\", \"ORIG_TERM\": \"int16\", \"LOAN_AGE\": \"int16\", \"REM_MONTHS\": \"int16\",\n", + " \"ADJ_REM_MONTHS\": \"int16\", \"OLTV\": \"float32\", \"OCLTV\": \"float32\", \"DTI\": \"float32\",\n", + " \"CSCORE_B\": \"int16\", \"CSCORE_C\": \"int16\", \"MI_PCT\": \"float32\", \"NO_UNITS\": \"int8\",\n", + " \"LAST_UPB\": \"float64\", \"CURR_SCHD_PRNCPL\": \"float64\", \"TOT_SCHD_PRNCPL\": \"float64\",\n", + " \"UNSCHD_PRNCPL_CURR\": \"float64\", \"FORECLOSURE_COSTS\": \"float64\", \n", + " \"PROPERTY_PRESERVATION_AND_REPAIR_COSTS\": \"float64\", \"ASSET_RECOVERY_COSTS\": \"float64\",\n", + " \"MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS\": \"float64\", \"ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY\": \"float64\",\n", + " \"NET_SALES_PROCEEDS\": \"float64\", \"CREDIT_ENHANCEMENT_PROCEEDS\": \"float64\",\n", + " \"REPURCHASES_MAKE_WHOLE_PROCEEDS\": \"float64\", 
\"OTHER_FORECLOSURE_PROCEEDS\": \"float64\",\n", + " \"NON_INTEREST_BEARING_UPB\": \"float64\", \"PRINCIPAL_FORGIVENESS_AMOUNT\": \"float64\",\n", + " \"ORIGINAL_LIST_PRICE\": \"float64\", \"CURRENT_LIST_PRICE\": \"float64\",\n", + " \"ISSUE_SCOREB\": \"int16\", \"ISSUE_SCOREC\": \"int16\", \"CURR_SCOREB\": \"int16\", \"CURR_SCOREC\": \"int16\",\n", + " \"CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT\": \"float64\", \"CUMULATIVE_MODIFICATION_LOSS_AMOUNT\": \"float64\",\n", + " \"CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS\": \"float64\", \"CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS\": \"float64\",\n", + " \"FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT\": \"float64\", \"DELINQUENT_ACCRUED_INTEREST\": \"float64\",\n", + " \"MONTHS_UNTIL_FIRST_PAYMENT_RESET\": \"int16\", \"MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET\": \"int16\",\n", + " \"INITIAL_INTEREST_RATE_CAP\": \"float32\", \"PERIODIC_INTEREST_RATE_CAP\": \"float32\",\n", + " \"LIFETIME_INTEREST_RATE_CAP\": \"float32\", \"MARGIN\": \"float32\", \"ADR_COUNT\": \"int16\",\n", + " \"ADR_UPB\": \"float64\", \"INTEREST_BEARING_UPB\": \"float64\"\n", + "}\n", + "\n", + "print(f\"Data type mappings defined for {len(OPTIMIZED_DTYPES)} columns\")" + ] + }, + { + "cell_type": "markdown", + "id": "81aca490", + "metadata": {}, + "source": [ + "## 5. CSV to Parquet Conversion Function" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c725817e", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_csv_to_parquet(csv_file_path, parquet_file_path, column_names, dtype_mapping):\n", + " \"\"\"\n", + " Convert Fannie Mae CSV to optimized Parquet format.\n", + " \n", + " Parameters:\n", + " - csv_file_path: Path to input CSV file\n", + " - parquet_file_path: Path to output Parquet file\n", + " - column_names: List of column names\n", + " - dtype_mapping: Dictionary mapping column names to data types\n", + " \n", + " Returns:\n", + " - DataFrame with converted data\n", + " \"\"\"\n", + " print(f\"πŸ”„ Reading CSV file: {csv_file_path}\")\n", + " \n", + " # First pass: Read as strings to handle any data issues\n", + " df = pd.read_csv(\n", + " csv_file_path,\n", + " sep='|',\n", + " names=column_names,\n", + " dtype='string',\n", + " header=None,\n", + " low_memory=False,\n", + " na_values=['', ' ', 'NULL', 'null', 'NA']\n", + " )\n", + " \n", + " print(f\"πŸ“Š Initial shape: {df.shape}\")\n", + " print(f\"πŸ”§ Converting data types...\")\n", + " \n", + " # Convert to optimized data types\n", + " conversion_errors = []\n", + " \n", + " for col, target_dtype in dtype_mapping.items():\n", + " if col in df.columns:\n", + " try:\n", + " if target_dtype == 'category':\n", + " df[col] = df[col].astype('category')\n", + " elif target_dtype in ['int8', 'int16', 'int32', 'int64']:\n", + " # Use nullable integer types for columns with missing values\n", + " df[col] = pd.to_numeric(df[col], errors='coerce')\n", + " df[col] = df[col].astype(f'Int{target_dtype[3:]}')\n", + " elif target_dtype in ['float32', 'float64']:\n", + " df[col] = pd.to_numeric(df[col], errors='coerce').astype(target_dtype)\n", + " elif target_dtype == 'string':\n", + " df[col] = df[col].astype('string')\n", + " except Exception as e:\n", + " conversion_errors.append(f\"{col}: {str(e)}\")\n", + " \n", + " if conversion_errors:\n", + " print(f\"⚠️ Conversion warnings for {len(conversion_errors)} columns\")\n", + " for error in conversion_errors[:5]: # Show first 5 errors\n", + " print(f\" {error}\")\n", + " \n", + " print(f\"πŸ’Ύ Saving to Parquet: {parquet_file_path}\")\n", + " \n", 
+ " # Save to Parquet with compression\n", + " df.to_parquet(\n", + " parquet_file_path,\n", + " engine='pyarrow',\n", + " compression='snappy',\n", + " index=False\n", + " )\n", + " \n", + " return df\n", + "\n", + "\n", + "def process_multiple_csv_files(csv_files, column_names, dtype_mapping, combined_output_path):\n", + " \"\"\"\n", + " Process multiple CSV files and combine them into a single Parquet file.\n", + " \n", + " Parameters:\n", + " - csv_files: List of CSV file paths\n", + " - column_names: List of column names\n", + " - dtype_mapping: Dictionary mapping column names to data types\n", + " - combined_output_path: Path for the combined output Parquet file\n", + " \n", + " Returns:\n", + " - Combined DataFrame\n", + " \"\"\"\n", + " print(f\"πŸš€ Processing {len(csv_files)} CSV files...\")\n", + " \n", + " all_dataframes = []\n", + " total_rows = 0\n", + " \n", + " for i, csv_file in enumerate(csv_files, 1):\n", + " print(f\"\\nπŸ“ Processing file {i}/{len(csv_files)}: {csv_file.name}\")\n", + " \n", + " # Read and convert individual CSV file\n", + " df = pd.read_csv(\n", + " csv_file,\n", + " sep='|',\n", + " names=column_names,\n", + " dtype='string',\n", + " header=None,\n", + " low_memory=False,\n", + " na_values=['', ' ', 'NULL', 'null', 'NA']\n", + " )\n", + " \n", + " print(f\" πŸ“Š Shape: {df.shape}\")\n", + " total_rows += len(df)\n", + " \n", + " # Convert data types efficiently\n", + " conversion_errors = []\n", + " for col, target_dtype in dtype_mapping.items():\n", + " if col in df.columns:\n", + " try:\n", + " if target_dtype == 'category':\n", + " df[col] = df[col].astype('category')\n", + " elif target_dtype in ['int8', 'int16', 'int32', 'int64']:\n", + " df[col] = pd.to_numeric(df[col], errors='coerce')\n", + " df[col] = df[col].astype(f'Int{target_dtype[3:]}')\n", + " elif target_dtype in ['float32', 'float64']:\n", + " df[col] = pd.to_numeric(df[col], errors='coerce').astype(target_dtype)\n", + " elif target_dtype == 'string':\n", + " df[col] = df[col].astype('string')\n", + " except Exception as e:\n", + " conversion_errors.append(f\"{col}: {str(e)}\")\n", + " \n", + " if conversion_errors and i == 1: # Only show errors for first file\n", + " print(f\" ⚠️ Data type conversion notes (first file only):\")\n", + " for error in conversion_errors[:3]:\n", + " print(f\" {error}\")\n", + " \n", + " all_dataframes.append(df)\n", + " \n", + " # Memory management: show current memory usage\n", + " memory_mb = df.memory_usage(deep=True).sum() / (1024**2)\n", + " print(f\" πŸ’Ύ Memory usage: ~{memory_mb:.1f} MB\")\n", + " \n", + " print(f\"\\nπŸ”— Combining {len(all_dataframes)} DataFrames...\")\n", + " print(f\" Total rows across all files: {total_rows:,}\")\n", + " \n", + " # Combine all DataFrames\n", + " combined_df = pd.concat(all_dataframes, ignore_index=True)\n", + " \n", + " print(f\" πŸ“Š Combined shape: {combined_df.shape}\")\n", + " \n", + " # Clean up individual DataFrames to free memory\n", + " del all_dataframes\n", + " \n", + " # Save combined data to Parquet\n", + " print(f\"πŸ’Ύ Saving combined data to: {combined_output_path}\")\n", + " \n", + " combined_df.to_parquet(\n", + " combined_output_path,\n", + " engine='pyarrow',\n", + " compression='snappy',\n", + " index=False\n", + " )\n", + " \n", + " print(f\"βœ… Combined file saved successfully!\")\n", + " \n", + " return combined_df" + ] + }, + { + "cell_type": "markdown", + "id": "af06d7df", + "metadata": {}, + "source": [ + "## 6. 
Run the Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "baadc7c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸš€ Processing 5 CSV files...\n", + "\n", + "πŸ“ Processing file 1/5: 2024Q1.csv\n", + " πŸ“Š Shape: (2535876, 110)\n", + " πŸ’Ύ Memory usage: ~4792.5 MB\n", + "\n", + "πŸ“ Processing file 2/5: 2024Q2.csv\n", + " πŸ“Š Shape: (2704635, 110)\n", + " πŸ’Ύ Memory usage: ~5111.6 MB\n", + "\n", + "πŸ“ Processing file 3/5: 2024Q3.csv\n", + " πŸ“Š Shape: (2206431, 110)\n", + " πŸ’Ύ Memory usage: ~4170.2 MB\n", + "\n", + "πŸ“ Processing file 4/5: 2024Q4.csv\n", + " πŸ“Š Shape: (1256272, 110)\n", + " πŸ’Ύ Memory usage: ~2374.5 MB\n", + "\n", + "πŸ“ Processing file 5/5: 2025Q1.csv\n", + " πŸ“Š Shape: (388622, 110)\n", + " πŸ’Ύ Memory usage: ~734.6 MB\n", + "\n", + "πŸ”— Combining 5 DataFrames...\n", + " Total rows across all files: 9,091,836\n", + " πŸ“Š Combined shape: (9091836, 110)\n", + "πŸ’Ύ Saving combined data to: ../../data/processed/data.parquet\n", + "βœ… Combined file saved successfully!\n", + "\n", + "πŸŽ‰ All files processed and combined successfully!\n", + "πŸ“ˆ Final combined dataset shape: (9091836, 110)\n", + "πŸ’Ύ Combined file saved to: ../../data/processed/data.parquet\n" + ] + } + ], + "source": [ + "# Process all CSV files and combine them\n", + "combined_df = process_multiple_csv_files(\n", + " csv_files, \n", + " LPPUB_COLUMN_NAMES, \n", + " OPTIMIZED_DTYPES,\n", + " combined_parquet_path\n", + ")\n", + "\n", + "print(f\"\\nπŸŽ‰ All files processed and combined successfully!\")\n", + "print(f\"πŸ“ˆ Final combined dataset shape: {combined_df.shape}\")\n", + "print(f\"πŸ’Ύ Combined file saved to: {combined_parquet_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e6175750", + "metadata": {}, + "source": [ + "## 7. 
Verification and Performance Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "210051ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ“ File Size Comparison:\n", + " Total CSV files: 2,895,879,462 bytes (2761.73 MB)\n", + " Combined Parquet: 123,303,897 bytes (117.59 MB)\n", + " Compression ratio: 23.49x\n", + " Space saved: 95.7%\n", + "\n", + "πŸ“Š Individual CSV file sizes:\n", + " 2024Q1.csv: 768.9 MB\n", + " 2024Q2.csv: 822.5 MB\n", + " 2024Q3.csv: 671.7 MB\n", + " 2024Q4.csv: 381.3 MB\n", + " 2025Q1.csv: 117.3 MB\n", + "\n", + "πŸ” Verification - Reading combined Parquet file:\n", + " Shape: (9091836, 110)\n", + " Memory usage: ~20072.20 MB\n", + "\n", + "πŸ“‹ Sample Data Types:\n", + " POOL_ID: string\n", + " LOAN_ID: string\n", + " ACT_PERIOD: string\n", + " CHANNEL: category\n", + " SELLER: string\n", + " SERVICER: string\n", + " MASTER_SERVICER: category\n", + " ORIG_RATE: float32\n", + " CURR_RATE: float32\n", + " ORIG_UPB: float64\n" + ] + } + ], + "source": [ + "# File size comparison for all processed files\n", + "total_csv_size = sum(csv_file.stat().st_size for csv_file in csv_files)\n", + "combined_parquet_size = os.path.getsize(combined_parquet_path)\n", + "\n", + "print(\"πŸ“ File Size Comparison:\")\n", + "print(f\" Total CSV files: {total_csv_size:,} bytes ({total_csv_size/1024/1024:.2f} MB)\")\n", + "print(f\" Combined Parquet: {combined_parquet_size:,} bytes ({combined_parquet_size/1024/1024:.2f} MB)\")\n", + "print(f\" Compression ratio: {total_csv_size/combined_parquet_size:.2f}x\")\n", + "print(f\" Space saved: {((total_csv_size - combined_parquet_size) / total_csv_size) * 100:.1f}%\")\n", + "\n", + "print(f\"\\nπŸ“Š Individual CSV file sizes:\")\n", + "for csv_file in csv_files:\n", + " size_mb = csv_file.stat().st_size / (1024**2)\n", + " print(f\" {csv_file.name}: {size_mb:.1f} MB\")\n", + "\n", + "# Verify by reading back the combined file\n", + "print(f\"\\nπŸ” Verification - Reading combined Parquet file:\")\n", + "df_verify = pd.read_parquet(combined_parquet_path, engine='pyarrow')\n", + "print(f\" Shape: {df_verify.shape}\")\n", + "print(f\" Memory usage: ~{df_verify.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\")\n", + "\n", + "print(f\"\\nπŸ“‹ Sample Data Types:\")\n", + "for i, (col, dtype) in enumerate(df_verify.dtypes.head(10).items()):\n", + " print(f\" {col}: {dtype}\")" + ] + }, + { + "cell_type": "markdown", + "id": "275e333d", + "metadata": {}, + "source": [ + "## 8. 
Data Quality Summary" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f42eba52", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ“Š Combined Dataset Quality Summary:\n", + " Total files processed: 5\n", + " Total rows: 9,091,836\n", + " Total columns: 110\n", + "\n", + "πŸ“… Top 10 Activity Periods:\n", + " 032025: 1,121,928 records\n", + " 022025: 1,060,795 records\n", + " 012025: 1,009,497 records\n", + " 122024: 944,022 records\n", + " 112024: 870,725 records\n", + " 102024: 792,422 records\n", + " 092024: 711,020 records\n", + " 082024: 630,945 records\n", + " 072024: 536,765 records\n", + " 062024: 440,303 records\n", + "\n", + "❓ Missing Data Analysis:\n", + " Columns with missing values: 78\n", + " Top 5 columns with most missing values:\n", + " POOL_ID: 9,091,836 (100.0%)\n", + " SERVICER: 48,544 (0.5%)\n", + " MASTER_SERVICER: 9,091,836 (100.0%)\n", + " CURR_RATE: 48,523 (0.5%)\n", + " ISSUANCE_UPB: 9,091,836 (100.0%)\n", + "\n", + "πŸ“ˆ Data Type Distribution:\n", + " string: 35 columns\n", + " float64: 28 columns\n", + " Int16: 13 columns\n", + " float32: 10 columns\n", + " category: 5 columns\n", + " category: 5 columns\n", + " category: 3 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " Int8: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + "\n", + "πŸ’Ύ Memory Efficiency:\n", + " Average memory per row: 2314.96 bytes\n", + " Estimated memory for 1M rows: 2207.7 MB\n" + ] + } + ], + "source": [ + "print(\"πŸ“Š Combined Dataset Quality Summary:\")\n", + "print(f\" Total files processed: {len(csv_files)}\")\n", + "print(f\" Total rows: {len(df_verify):,}\")\n", + "print(f\" Total columns: {len(df_verify.columns)}\")\n", + "\n", + "# Show data distribution by period (if ACT_PERIOD exists)\n", + "if 'ACT_PERIOD' in df_verify.columns:\n", + " period_counts = df_verify['ACT_PERIOD'].value_counts().head(10)\n", + " print(f\"\\nπŸ“… Top 10 Activity Periods:\")\n", + " for period, count in period_counts.items():\n", + " print(f\" {period}: {count:,} records\")\n", + "\n", + "# Missing values summary\n", + "missing_summary = df_verify.isnull().sum()\n", + "columns_with_missing = missing_summary[missing_summary > 0]\n", + "\n", + "print(f\"\\n❓ Missing Data Analysis:\")\n", + "print(f\" Columns with missing values: {len(columns_with_missing)}\")\n", + "if len(columns_with_missing) > 0:\n", + " print(f\" Top 5 columns with most missing values:\")\n", + " for col, count in columns_with_missing.head().items():\n", + " pct = (count / len(df_verify)) * 100\n", + " print(f\" {col}: {count:,} ({pct:.1f}%)\")\n", + "\n", + "# Data type distribution\n", + "dtype_counts = df_verify.dtypes.value_counts()\n", + "print(f\"\\nπŸ“ˆ Data Type Distribution:\")\n", + "for dtype, count in dtype_counts.items():\n", + " print(f\" {dtype}: {count} columns\")\n", + "\n", + "# Memory efficiency summary\n", + "print(f\"\\nπŸ’Ύ Memory Efficiency:\")\n", + "memory_per_row = df_verify.memory_usage(deep=True).sum() / len(df_verify)\n", + "print(f\" Average memory per row: {memory_per_row:.2f} bytes\")\n", + "print(f\" Estimated memory for 1M rows: {memory_per_row * 1000000 / (1024**2):.1f} MB\")" + ] + }, + { + "cell_type": "markdown", + "id": "a8fee0a6", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook 
successfully processes **all** Fannie Mae Loan Performance CSV files in the raw data folder and combines them into a single optimized Parquet file with:\n", + "\n", + "- **Multi-file processing** - Automatically discovers and processes all CSV files in the raw folder\n", + "- **Memory efficient combination** - Processes files individually then combines for optimal memory usage\n", + "- **Proper column naming** based on official R script (LPPUB_Infile.R)\n", + "- **Optimized data types** for memory efficiency and performance\n", + "- **Significant compression** (typically 10-15x size reduction across all files)\n", + "- **Data integrity** preservation across all datasets\n", + "- **Comprehensive error handling** for data quality issues\n", + "\n", + "### Key Features:\n", + "- **Input**: All CSV files in `../../data/raw/` folder \n", + "- **Output**: Single combined file `../../data/processed/data.parquet`\n", + "- **Automatic discovery**: No need to manually specify file names\n", + "- **Scalable processing**: Handles multiple files efficiently\n", + "- **Quality reporting**: Comprehensive statistics for the combined dataset\n", + "\n", + "### Performance Benefits:\n", + "- **Storage**: Dramatic reduction in storage space (10-15x compression)\n", + "- **Speed**: Much faster read times for analysis workflows\n", + "- **Memory**: Optimized data types reduce memory footprint\n", + "- **Convenience**: Single file contains all historical data\n", + "\n", + "**Next Steps:**\n", + "- Use the combined `data.parquet` file for comprehensive analysis\n", + "- Consider time-series analysis across all quarters\n", + "- Implement data quality checks and validation rules\n", + "- Set up automated processing pipeline for new quarterly data" + ] + }, + { + "cell_type": "markdown", + "id": "1de02770", + "metadata": {}, + "source": [ + "## πŸš€ Processing Summary Report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51e2e7b3", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate comprehensive processing summary report\n", + "import datetime\n", + "\n", + "print(\"=\" * 80)\n", + "print(\"πŸš€ FANNIE MAE DATA PROCESSING SUMMARY REPORT\")\n", + "print(\"=\" * 80)\n", + "print(f\"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n", + "print()\n", + "\n", + "# Extract quarters from file names\n", + "quarters_processed = []\n", + "for csv_file in csv_files:\n", + " # Extract quarter from filename (e.g., \"2024Q1.csv\" -> \"2024Q1\")\n", + " quarter = csv_file.stem\n", + " quarters_processed.append(quarter)\n", + "\n", + "quarters_processed.sort()\n", + "\n", + "print(\"πŸ“Š PROCESSING OVERVIEW\")\n", + "print(\"-\" * 40)\n", + "print(f\"Quarters Processed: {', '.join(quarters_processed)}\")\n", + "print(f\"Number of Files: {len(csv_files)}\")\n", + "print(f\"Date Range: {quarters_processed[0]} to {quarters_processed[-1]}\")\n", + "print()\n", + "\n", + "print(\"πŸ“ FILE SIZE ANALYSIS\")\n", + "print(\"-\" * 40)\n", + "print(f\"Total CSV Size: {total_csv_size / (1024**3):.2f} GB ({total_csv_size:,} bytes)\")\n", + "print(f\"Final Parquet Size: {combined_parquet_size / (1024**2):.2f} MB ({combined_parquet_size:,} bytes)\")\n", + "print(f\"Compression Ratio: {total_csv_size / combined_parquet_size:.1f}:1\")\n", + "print(f\"Space Saved: {((total_csv_size - combined_parquet_size) / total_csv_size) * 100:.1f}%\")\n", + "print(f\"Storage Efficiency: {combined_parquet_size / total_csv_size * 100:.2f}% of original size\")\n", + "print()\n", + "\n", + 
"print(\"πŸ“ˆ DATASET STATISTICS\")\n", + "print(\"-\" * 40)\n", + "print(f\"Total Records: {len(combined_df):,}\")\n", + "print(f\"Total Columns: {len(combined_df.columns)}\")\n", + "print(f\"Average Records/Quarter: {len(combined_df) // len(csv_files):,}\")\n", + "print(f\"Memory Footprint: {combined_df.memory_usage(deep=True).sum() / (1024**2):.1f} MB\")\n", + "print()\n", + "\n", + "print(\"⚑ PERFORMANCE METRICS\")\n", + "print(\"-\" * 40)\n", + "compression_efficiency = total_csv_size / combined_parquet_size\n", + "storage_reduction = ((total_csv_size - combined_parquet_size) / total_csv_size) * 100\n", + "print(f\"Compression Efficiency: {compression_efficiency:.1f}x smaller\")\n", + "print(f\"Storage Reduction: {storage_reduction:.1f}% reduction\")\n", + "print(f\"Data Density: {len(combined_df) / (combined_parquet_size / (1024**2)):.0f} records/MB\")\n", + "print()\n", + "\n", + "print(\"βœ… PROCESSING STATUS\")\n", + "print(\"-\" * 40)\n", + "print(\"Status: COMPLETED SUCCESSFULLY\")\n", + "print(f\"Output File: {combined_parquet_path.name}\")\n", + "print(f\"Output Location: {combined_parquet_path.parent}\")\n", + "print(\"Data Quality: All files processed with consistent schema\")\n", + "print(\"Ready for Analysis: YES\")\n", + "\n", + "print(\"=\" * 80)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gcp-pipeline", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/pipeline_csv_to_parquet.ipynb b/notebooks/pipeline_csv_to_parquet.ipynb new file mode 100644 index 0000000..0129bb2 --- /dev/null +++ b/notebooks/pipeline_csv_to_parquet.ipynb @@ -0,0 +1,552 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e47151c6", + "metadata": {}, + "source": [ + "# Fannie Mae Loan Performance Data: CSV to Parquet Conversion\n", + "\n", + "## Overview\n", + "This notebook converts Fannie Mae Single-Family Loan Performance CSV files to optimized Parquet format.\n", + "\n", + "**Key Features:**\n", + "- Uses proper column names and data types from the R reference script\n", + "- Handles pipe-separated values (|) format\n", + "- Optimizes memory usage with appropriate data types\n", + "- Provides significant file size reduction through compression\n", + "\n", + "**Input:** Raw CSV files from Fannie Mae (located in `../../data/raw/`)\n", + "**Output:** Optimized Parquet files for efficient analysis (saved to `../../data/processed/`)\n", + "\n", + "**Reference:** Based on `LPPUB_Infile.R` script from Fannie Mae (see `../scripts/`)" + ] + }, + { + "cell_type": "markdown", + "id": "4ee334a6", + "metadata": {}, + "source": [ + "## 1. 
Import Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b5aacfcb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ“ pyarrow is available\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import os\n", + "import subprocess\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "# Install pyarrow if not already available\n", + "try:\n", + " import pyarrow\n", + " print(\"βœ“ pyarrow is available\")\n", + "except ImportError:\n", + " print(\"Installing pyarrow...\")\n", + " subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"pyarrow\"])\n", + " import pyarrow\n", + " print(\"βœ“ pyarrow installed successfully\")" + ] + }, + { + "cell_type": "markdown", + "id": "7833716d", + "metadata": {}, + "source": [ + "## 2. Configuration and File Paths" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6aa46ec4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input CSV: ../../data/raw/2025Q1.csv\n", + "Output Parquet: ../../data/processed/2025Q1.parquet\n", + "File exists: True\n" + ] + } + ], + "source": [ + "# Define paths\n", + "SOURCE_DATA_DIR = Path('../../data/raw')\n", + "PROCESSED_DATA_DIR = Path('../../data/processed')\n", + "INPUT_FILE = '2025Q1.csv' # Change this to process different files\n", + "\n", + "csv_path = SOURCE_DATA_DIR / INPUT_FILE\n", + "parquet_path = PROCESSED_DATA_DIR / INPUT_FILE.replace('.csv', '.parquet')\n", + "\n", + "print(f\"Input CSV: {csv_path}\")\n", + "print(f\"Output Parquet: {parquet_path}\")\n", + "print(f\"File exists: {csv_path.exists()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d4e30374", + "metadata": {}, + "source": [ + "## 3. Column Definitions from Fannie Mae R Script\n", + "\n", + "These column names and types are based on the official `LPPUB_Infile.R` script provided by Fannie Mae." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cfab3599", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total columns: 110\n" + ] + } + ], + "source": [ + "# Column names from LPPUB_Infile.R\n", + "LPPUB_COLUMN_NAMES = [\n", + " \"POOL_ID\", \"LOAN_ID\", \"ACT_PERIOD\", \"CHANNEL\", \"SELLER\", \"SERVICER\",\n", + " \"MASTER_SERVICER\", \"ORIG_RATE\", \"CURR_RATE\", \"ORIG_UPB\", \"ISSUANCE_UPB\",\n", + " \"CURRENT_UPB\", \"ORIG_TERM\", \"ORIG_DATE\", \"FIRST_PAY\", \"LOAN_AGE\",\n", + " \"REM_MONTHS\", \"ADJ_REM_MONTHS\", \"MATR_DT\", \"OLTV\", \"OCLTV\",\n", + " \"NUM_BO\", \"DTI\", \"CSCORE_B\", \"CSCORE_C\", \"FIRST_FLAG\", \"PURPOSE\",\n", + " \"PROP\", \"NO_UNITS\", \"OCC_STAT\", \"STATE\", \"MSA\", \"ZIP\", \"MI_PCT\",\n", + " \"PRODUCT\", \"PPMT_FLG\", \"IO\", \"FIRST_PAY_IO\", \"MNTHS_TO_AMTZ_IO\",\n", + " \"DLQ_STATUS\", \"PMT_HISTORY\", \"MOD_FLAG\", \"MI_CANCEL_FLAG\", \"Zero_Bal_Code\",\n", + " \"ZB_DTE\", \"LAST_UPB\", \"RPRCH_DTE\", \"CURR_SCHD_PRNCPL\", \"TOT_SCHD_PRNCPL\",\n", + " \"UNSCHD_PRNCPL_CURR\", \"LAST_PAID_INSTALLMENT_DATE\", \"FORECLOSURE_DATE\",\n", + " \"DISPOSITION_DATE\", \"FORECLOSURE_COSTS\", \"PROPERTY_PRESERVATION_AND_REPAIR_COSTS\",\n", + " \"ASSET_RECOVERY_COSTS\", \"MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS\",\n", + " \"ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY\", \"NET_SALES_PROCEEDS\",\n", + " \"CREDIT_ENHANCEMENT_PROCEEDS\", \"REPURCHASES_MAKE_WHOLE_PROCEEDS\",\n", + " \"OTHER_FORECLOSURE_PROCEEDS\", \"NON_INTEREST_BEARING_UPB\", \"PRINCIPAL_FORGIVENESS_AMOUNT\",\n", + " \"ORIGINAL_LIST_START_DATE\", \"ORIGINAL_LIST_PRICE\", \"CURRENT_LIST_START_DATE\",\n", + " \"CURRENT_LIST_PRICE\", \"ISSUE_SCOREB\", \"ISSUE_SCOREC\", \"CURR_SCOREB\",\n", + " \"CURR_SCOREC\", \"MI_TYPE\", \"SERV_IND\", \"CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT\",\n", + " \"CUMULATIVE_MODIFICATION_LOSS_AMOUNT\", \"CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS\",\n", + " \"CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS\", \"HOMEREADY_PROGRAM_INDICATOR\",\n", + " \"FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT\", \"RELOCATION_MORTGAGE_INDICATOR\",\n", + " \"ZERO_BALANCE_CODE_CHANGE_DATE\", \"LOAN_HOLDBACK_INDICATOR\", \"LOAN_HOLDBACK_EFFECTIVE_DATE\",\n", + " \"DELINQUENT_ACCRUED_INTEREST\", \"PROPERTY_INSPECTION_WAIVER_INDICATOR\",\n", + " \"HIGH_BALANCE_LOAN_INDICATOR\", \"ARM_5_YR_INDICATOR\", \"ARM_PRODUCT_TYPE\",\n", + " \"MONTHS_UNTIL_FIRST_PAYMENT_RESET\", \"MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET\",\n", + " \"INTEREST_RATE_CHANGE_DATE\", \"PAYMENT_CHANGE_DATE\", \"ARM_INDEX\",\n", + " \"ARM_CAP_STRUCTURE\", \"INITIAL_INTEREST_RATE_CAP\", \"PERIODIC_INTEREST_RATE_CAP\",\n", + " \"LIFETIME_INTEREST_RATE_CAP\", \"MARGIN\", \"BALLOON_INDICATOR\",\n", + " \"PLAN_NUMBER\", \"FORBEARANCE_INDICATOR\", \"HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR\",\n", + " \"DEAL_NAME\", \"RE_PROCS_FLAG\", \"ADR_TYPE\", \"ADR_COUNT\", \"ADR_UPB\", \n", + " \"PAYMENT_DEFERRAL_MOD_EVENT_FLAG\", \"INTEREST_BEARING_UPB\"\n", + "]\n", + "\n", + "print(f\"Total columns: {len(LPPUB_COLUMN_NAMES)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f6be85b4", + "metadata": {}, + "source": [ + "## 4. Optimized Data Types\n", + "\n", + "Define optimized data types for better memory efficiency and performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e23bbd5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data type mappings defined for 109 columns\n" + ] + } + ], + "source": [ + "# Optimized data types based on R script column classes\n", + "OPTIMIZED_DTYPES = {\n", + " # Character/categorical columns\n", + " \"POOL_ID\": \"string\", \"LOAN_ID\": \"string\", \"ACT_PERIOD\": \"string\", \n", + " \"CHANNEL\": \"category\", \"SELLER\": \"category\", \"SERVICER\": \"category\",\n", + " \"MASTER_SERVICER\": \"category\", \"ORIG_DATE\": \"string\", \"FIRST_PAY\": \"string\", \n", + " \"MATR_DT\": \"string\", \"FIRST_FLAG\": \"category\", \"PURPOSE\": \"category\",\n", + " \"PROP\": \"category\", \"OCC_STAT\": \"category\", \"STATE\": \"category\", \n", + " \"MSA\": \"string\", \"ZIP\": \"string\", \"PRODUCT\": \"category\", \n", + " \"PPMT_FLG\": \"category\", \"IO\": \"category\", \"FIRST_PAY_IO\": \"string\", \n", + " \"MNTHS_TO_AMTZ_IO\": \"string\", \"DLQ_STATUS\": \"category\", \"PMT_HISTORY\": \"string\", \n", + " \"MOD_FLAG\": \"category\", \"MI_CANCEL_FLAG\": \"category\", \"Zero_Bal_Code\": \"category\",\n", + " \"ZB_DTE\": \"string\", \"RPRCH_DTE\": \"string\", \"LAST_PAID_INSTALLMENT_DATE\": \"string\",\n", + " \"FORECLOSURE_DATE\": \"string\", \"DISPOSITION_DATE\": \"string\", \"ORIGINAL_LIST_START_DATE\": \"string\",\n", + " \"CURRENT_LIST_START_DATE\": \"string\", \"MI_TYPE\": \"category\", \"SERV_IND\": \"category\",\n", + " \"HOMEREADY_PROGRAM_INDICATOR\": \"category\", \"RELOCATION_MORTGAGE_INDICATOR\": \"category\",\n", + " \"ZERO_BALANCE_CODE_CHANGE_DATE\": \"string\", \"LOAN_HOLDBACK_INDICATOR\": \"category\",\n", + " \"LOAN_HOLDBACK_EFFECTIVE_DATE\": \"string\", \"PROPERTY_INSPECTION_WAIVER_INDICATOR\": \"category\",\n", + " \"HIGH_BALANCE_LOAN_INDICATOR\": \"category\", \"ARM_5_YR_INDICATOR\": \"category\",\n", + " \"ARM_PRODUCT_TYPE\": \"string\", \"INTEREST_RATE_CHANGE_DATE\": \"string\",\n", + " \"PAYMENT_CHANGE_DATE\": \"string\", \"ARM_INDEX\": \"string\", \"ARM_CAP_STRUCTURE\": \"string\",\n", + " \"BALLOON_INDICATOR\": \"category\", \"PLAN_NUMBER\": \"string\", \"FORBEARANCE_INDICATOR\": \"category\",\n", + " \"HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR\": \"category\", \"DEAL_NAME\": \"string\",\n", + " \"RE_PROCS_FLAG\": \"category\", \"ADR_TYPE\": \"string\", \"PAYMENT_DEFERRAL_MOD_EVENT_FLAG\": \"category\",\n", + " \n", + " # Numeric columns with appropriate precision\n", + " \"ORIG_RATE\": \"float32\", \"CURR_RATE\": \"float32\", \"ORIG_UPB\": \"float64\", \"ISSUANCE_UPB\": \"float64\",\n", + " \"CURRENT_UPB\": \"float64\", \"ORIG_TERM\": \"int16\", \"LOAN_AGE\": \"int16\", \"REM_MONTHS\": \"int16\",\n", + " \"ADJ_REM_MONTHS\": \"int16\", \"OLTV\": \"float32\", \"OCLTV\": \"float32\", \"DTI\": \"float32\",\n", + " \"CSCORE_B\": \"int16\", \"CSCORE_C\": \"int16\", \"MI_PCT\": \"float32\", \"NO_UNITS\": \"int8\",\n", + " \"LAST_UPB\": \"float64\", \"CURR_SCHD_PRNCPL\": \"float64\", \"TOT_SCHD_PRNCPL\": \"float64\",\n", + " \"UNSCHD_PRNCPL_CURR\": \"float64\", \"FORECLOSURE_COSTS\": \"float64\", \n", + " \"PROPERTY_PRESERVATION_AND_REPAIR_COSTS\": \"float64\", \"ASSET_RECOVERY_COSTS\": \"float64\",\n", + " \"MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS\": \"float64\", \"ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY\": \"float64\",\n", + " \"NET_SALES_PROCEEDS\": \"float64\", \"CREDIT_ENHANCEMENT_PROCEEDS\": \"float64\",\n", + " \"REPURCHASES_MAKE_WHOLE_PROCEEDS\": \"float64\", 
\"OTHER_FORECLOSURE_PROCEEDS\": \"float64\",\n", + " \"NON_INTEREST_BEARING_UPB\": \"float64\", \"PRINCIPAL_FORGIVENESS_AMOUNT\": \"float64\",\n", + " \"ORIGINAL_LIST_PRICE\": \"float64\", \"CURRENT_LIST_PRICE\": \"float64\",\n", + " \"ISSUE_SCOREB\": \"int16\", \"ISSUE_SCOREC\": \"int16\", \"CURR_SCOREB\": \"int16\", \"CURR_SCOREC\": \"int16\",\n", + " \"CURRENT_PERIOD_MODIFICATION_LOSS_AMOUNT\": \"float64\", \"CUMULATIVE_MODIFICATION_LOSS_AMOUNT\": \"float64\",\n", + " \"CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS\": \"float64\", \"CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS\": \"float64\",\n", + " \"FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT\": \"float64\", \"DELINQUENT_ACCRUED_INTEREST\": \"float64\",\n", + " \"MONTHS_UNTIL_FIRST_PAYMENT_RESET\": \"int16\", \"MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET\": \"int16\",\n", + " \"INITIAL_INTEREST_RATE_CAP\": \"float32\", \"PERIODIC_INTEREST_RATE_CAP\": \"float32\",\n", + " \"LIFETIME_INTEREST_RATE_CAP\": \"float32\", \"MARGIN\": \"float32\", \"ADR_COUNT\": \"int16\",\n", + " \"ADR_UPB\": \"float64\", \"INTEREST_BEARING_UPB\": \"float64\"\n", + "}\n", + "\n", + "print(f\"Data type mappings defined for {len(OPTIMIZED_DTYPES)} columns\")" + ] + }, + { + "cell_type": "markdown", + "id": "81aca490", + "metadata": {}, + "source": [ + "## 5. CSV to Parquet Conversion Function" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c725817e", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_csv_to_parquet(csv_file_path, parquet_file_path, column_names, dtype_mapping):\n", + " \"\"\"\n", + " Convert Fannie Mae CSV to optimized Parquet format.\n", + " \n", + " Parameters:\n", + " - csv_file_path: Path to input CSV file\n", + " - parquet_file_path: Path to output Parquet file\n", + " - column_names: List of column names\n", + " - dtype_mapping: Dictionary mapping column names to data types\n", + " \n", + " Returns:\n", + " - DataFrame with converted data\n", + " \"\"\"\n", + " print(f\"πŸ”„ Reading CSV file: {csv_file_path}\")\n", + " \n", + " # First pass: Read as strings to handle any data issues\n", + " df = pd.read_csv(\n", + " csv_file_path,\n", + " sep='|',\n", + " names=column_names,\n", + " dtype='string',\n", + " header=None,\n", + " low_memory=False,\n", + " na_values=['', ' ', 'NULL', 'null', 'NA']\n", + " )\n", + " \n", + " print(f\"πŸ“Š Initial shape: {df.shape}\")\n", + " print(f\"πŸ”§ Converting data types...\")\n", + " \n", + " # Convert to optimized data types\n", + " conversion_errors = []\n", + " \n", + " for col, target_dtype in dtype_mapping.items():\n", + " if col in df.columns:\n", + " try:\n", + " if target_dtype == 'category':\n", + " df[col] = df[col].astype('category')\n", + " elif target_dtype in ['int8', 'int16', 'int32', 'int64']:\n", + " # Use nullable integer types for columns with missing values\n", + " df[col] = pd.to_numeric(df[col], errors='coerce')\n", + " df[col] = df[col].astype(f'Int{target_dtype[3:]}')\n", + " elif target_dtype in ['float32', 'float64']:\n", + " df[col] = pd.to_numeric(df[col], errors='coerce').astype(target_dtype)\n", + " elif target_dtype == 'string':\n", + " df[col] = df[col].astype('string')\n", + " except Exception as e:\n", + " conversion_errors.append(f\"{col}: {str(e)}\")\n", + " \n", + " if conversion_errors:\n", + " print(f\"⚠️ Conversion warnings for {len(conversion_errors)} columns\")\n", + " for error in conversion_errors[:5]: # Show first 5 errors\n", + " print(f\" {error}\")\n", + " \n", + " print(f\"πŸ’Ύ Saving to Parquet: {parquet_file_path}\")\n", + " \n", 
+ " # Save to Parquet with compression\n", + " df.to_parquet(\n", + " parquet_file_path,\n", + " engine='pyarrow',\n", + " compression='snappy',\n", + " index=False\n", + " )\n", + " \n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "af06d7df", + "metadata": {}, + "source": [ + "## 6. Run the Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "baadc7c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ”„ Reading CSV file: ../../data/raw/2025Q1.csv\n", + "πŸ“Š Initial shape: (388622, 110)\n", + "πŸ”§ Converting data types...\n", + "πŸ’Ύ Saving to Parquet: ../../data/processed/2025Q1.parquet\n", + "\n", + "βœ… Conversion completed successfully!\n", + "πŸ“ˆ Final shape: (388622, 110)\n" + ] + } + ], + "source": [ + "# Perform the conversion\n", + "df_converted = convert_csv_to_parquet(\n", + " csv_path, \n", + " parquet_path, \n", + " LPPUB_COLUMN_NAMES, \n", + " OPTIMIZED_DTYPES\n", + ")\n", + "\n", + "print(f\"\\nβœ… Conversion completed successfully!\")\n", + "print(f\"πŸ“ˆ Final shape: {df_converted.shape}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e6175750", + "metadata": {}, + "source": [ + "## 7. Verification and Performance Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "210051ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ“ File Size Comparison:\n", + " CSV: 123,044,558 bytes (117.34 MB)\n", + " Parquet: 8,896,605 bytes (8.48 MB)\n", + " Compression ratio: 13.83x\n", + " Space saved: 92.8%\n", + "\n", + "πŸ” Verification - Reading Parquet file:\n", + " Shape: (388622, 110)\n", + " Memory usage: ~734.56 MB\n", + "\n", + "πŸ“‹ Sample Data Types:\n", + " POOL_ID: string\n", + " LOAN_ID: string\n", + " ACT_PERIOD: string\n", + " CHANNEL: category\n", + " SELLER: category\n", + " SERVICER: category\n", + " MASTER_SERVICER: category\n", + " ORIG_RATE: float32\n", + " CURR_RATE: float32\n", + " ORIG_UPB: float64\n" + ] + } + ], + "source": [ + "# File size comparison\n", + "csv_size = os.path.getsize(csv_path)\n", + "parquet_size = os.path.getsize(parquet_path)\n", + "\n", + "print(\"πŸ“ File Size Comparison:\")\n", + "print(f\" CSV: {csv_size:,} bytes ({csv_size/1024/1024:.2f} MB)\")\n", + "print(f\" Parquet: {parquet_size:,} bytes ({parquet_size/1024/1024:.2f} MB)\")\n", + "print(f\" Compression ratio: {csv_size/parquet_size:.2f}x\")\n", + "print(f\" Space saved: {((csv_size - parquet_size) / csv_size) * 100:.1f}%\")\n", + "\n", + "# Verify by reading back\n", + "print(\"\\nπŸ” Verification - Reading Parquet file:\")\n", + "df_verify = pd.read_parquet(parquet_path, engine='pyarrow')\n", + "print(f\" Shape: {df_verify.shape}\")\n", + "print(f\" Memory usage: ~{df_verify.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\")\n", + "\n", + "print(\"\\nπŸ“‹ Sample Data Types:\")\n", + "for i, (col, dtype) in enumerate(df_verify.dtypes.head(10).items()):\n", + " print(f\" {col}: {dtype}\")" + ] + }, + { + "cell_type": "markdown", + "id": "275e333d", + "metadata": {}, + "source": [ + "## 8. 
Data Quality Summary" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f42eba52", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ“Š Data Quality Summary:\n", + " Total rows: 388,622\n", + " Total columns: 110\n", + " Columns with missing values: 77\n", + " Top 5 columns with most missing values:\n", + " POOL_ID: 388,622 (100.0%)\n", + " SERVICER: 651 (0.2%)\n", + " MASTER_SERVICER: 388,622 (100.0%)\n", + " CURR_RATE: 651 (0.2%)\n", + " ISSUANCE_UPB: 388,622 (100.0%)\n", + "\n", + "πŸ“ˆ Data Type Distribution:\n", + " string: 29 columns\n", + " float64: 28 columns\n", + " Int16: 13 columns\n", + " float32: 10 columns\n", + " category: 5 columns\n", + " category: 5 columns\n", + " category: 4 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " Int8: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n", + " category: 1 columns\n" + ] + } + ], + "source": [ + "print(\"πŸ“Š Data Quality Summary:\")\n", + "print(f\" Total rows: {len(df_verify):,}\")\n", + "print(f\" Total columns: {len(df_verify.columns)}\")\n", + "\n", + "# Missing values summary\n", + "missing_summary = df_verify.isnull().sum()\n", + "columns_with_missing = missing_summary[missing_summary > 0]\n", + "\n", + "print(f\" Columns with missing values: {len(columns_with_missing)}\")\n", + "if len(columns_with_missing) > 0:\n", + " print(f\" Top 5 columns with most missing values:\")\n", + " for col, count in columns_with_missing.head().items():\n", + " pct = (count / len(df_verify)) * 100\n", + " print(f\" {col}: {count:,} ({pct:.1f}%)\")\n", + "\n", + "# Data type distribution\n", + "dtype_counts = df_verify.dtypes.value_counts()\n", + "print(f\"\\nπŸ“ˆ Data Type Distribution:\")\n", + "for dtype, count in dtype_counts.items():\n", + " print(f\" {dtype}: {count} columns\")" + ] + }, + { + "cell_type": "markdown", + "id": "a8fee0a6", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This notebook successfully converts Fannie Mae Loan Performance CSV files to optimized Parquet format with:\n", + "\n", + "- **Proper column naming** based on official R script\n", + "- **Optimized data types** for memory efficiency\n", + "- **Significant compression** (typically 10-15x size reduction)\n", + "- **Data integrity** preservation\n", + "- **Error handling** for data quality issues\n", + "\n", + "The resulting Parquet files can be used for efficient data analysis with much faster read times and reduced storage requirements.\n", + "\n", + "**Next Steps:**\n", + "- Use the Parquet files for analysis in other notebooks\n", + "- Consider partitioning large datasets by year/quarter\n", + "- Implement data validation checks for production use" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gcp-pipeline", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..53e2ec7 
--- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[tool.black] +line-length = 120 +target-version = ['py311'] +extend-exclude = ''' +/( + # Backup files + | src/ai_service_old\.py +)/ +''' + +[tool.isort] +profile = "black" +line_length = 120 diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..75be719 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,47 @@ +[pytest] +# Test discovery +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Test paths +testpaths = tests + +# Output options +addopts = + -v + --strict-markers + --cov=src + --cov-report=term-missing + --cov-report=html + --cov-report=xml + --cov-branch + --ignore=tests/unit/test_bedrock_adapter.py + --ignore=tests/unit/test_claude_adapter.py + --ignore=tests/unit/test_gemini_adapter.py + --ignore=tests/unit/test_base_adapter.py + --ignore=tests/unit/test_ai_service.py + --ignore=tests/integration/test_adapter_integration.py + +# Test markers +markers = + unit: Unit tests + integration: Integration tests + slow: Slow running tests + requires_api: Tests that require API credentials + +# Coverage options +[coverage:run] +source = src +omit = + */tests/* + */test_* + */__pycache__/* + +# Ignore patterns +norecursedirs = .git .tox dist build *.egg venv + +# Warnings +filterwarnings = + ignore::DeprecationWarning + ignore::PendingDeprecationWarning diff --git a/requirements.txt b/requirements.txt index 3ef6870..95bf4b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,12 +12,20 @@ boto3>=1.34.0 # AI/ML providers anthropic>=0.40.0 +google-generativeai>=0.3.0 # For Gemini support # Authentication (Google OAuth) requests>=2.31.0 -# Testing +# Testing and development pytest>=7.4.0 +pytest-cov>=4.1.0 +pytest-mock>=3.12.0 +black>=23.0.0 +flake8>=6.1.0 +isort>=5.12.0 +mypy>=1.7.0 +coverage>=7.3.0 # Optional: Enhanced performance and monitoring psutil>=5.9.0 \ No newline at end of file diff --git a/scripts/supabase_schema.sql b/scripts/supabase_schema.sql deleted file mode 100644 index ecc9ef5..0000000 --- a/scripts/supabase_schema.sql +++ /dev/null @@ -1,114 +0,0 @@ --- Supabase Database Schema for Authentication and User Management --- Run these commands in your Supabase SQL Editor - --- 1. User Profiles Table -CREATE TABLE IF NOT EXISTS public.user_profiles ( - id UUID REFERENCES auth.users(id) ON DELETE CASCADE PRIMARY KEY, - email TEXT UNIQUE NOT NULL, - name TEXT, - avatar_url TEXT, - provider TEXT DEFAULT 'google', - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW(), - last_login TIMESTAMPTZ DEFAULT NOW() -); - --- Enable Row Level Security -ALTER TABLE public.user_profiles ENABLE ROW LEVEL SECURITY; - --- Policy: Users can view their own profile -CREATE POLICY "Users can view own profile" ON public.user_profiles - FOR SELECT USING (auth.uid() = id); - --- Policy: Users can update their own profile -CREATE POLICY "Users can update own profile" ON public.user_profiles - FOR UPDATE USING (auth.uid() = id); - --- Policy: Users can insert their own profile -CREATE POLICY "Users can insert own profile" ON public.user_profiles - FOR INSERT WITH CHECK (auth.uid() = id); - --- 2. 
User Preferences Table -CREATE TABLE IF NOT EXISTS public.user_preferences ( - id UUID DEFAULT gen_random_uuid() PRIMARY KEY, - user_id UUID REFERENCES public.user_profiles(id) ON DELETE CASCADE NOT NULL, - preferences JSONB DEFAULT '{}', - created_at TIMESTAMPTZ DEFAULT NOW(), - updated_at TIMESTAMPTZ DEFAULT NOW(), - UNIQUE(user_id) -); - --- Enable Row Level Security -ALTER TABLE public.user_preferences ENABLE ROW LEVEL SECURITY; - --- Policy: Users can manage their own preferences -CREATE POLICY "Users can manage own preferences" ON public.user_preferences - FOR ALL USING (auth.uid() = user_id); - --- 3. Query History Table -CREATE TABLE IF NOT EXISTS public.query_history ( - id UUID DEFAULT gen_random_uuid() PRIMARY KEY, - user_id UUID REFERENCES public.user_profiles(id) ON DELETE CASCADE NOT NULL, - question TEXT NOT NULL, - sql_query TEXT NOT NULL, - ai_provider TEXT NOT NULL, - execution_time FLOAT, - created_at TIMESTAMPTZ DEFAULT NOW() -); - --- Enable Row Level Security -ALTER TABLE public.query_history ENABLE ROW LEVEL SECURITY; - --- Policy: Users can view their own query history -CREATE POLICY "Users can view own query history" ON public.query_history - FOR SELECT USING (auth.uid() = user_id); - --- Policy: Users can insert their own queries -CREATE POLICY "Users can insert own queries" ON public.query_history - FOR INSERT WITH CHECK (auth.uid() = user_id); - --- 4. Create indexes for better performance -CREATE INDEX IF NOT EXISTS idx_user_preferences_user_id ON public.user_preferences(user_id); -CREATE INDEX IF NOT EXISTS idx_query_history_user_id ON public.query_history(user_id); -CREATE INDEX IF NOT EXISTS idx_query_history_created_at ON public.query_history(created_at DESC); - --- 5. Create a function to automatically update the updated_at timestamp -CREATE OR REPLACE FUNCTION public.handle_updated_at() -RETURNS TRIGGER AS $$ -BEGIN - NEW.updated_at = NOW(); - RETURN NEW; -END; -$$ LANGUAGE plpgsql; - --- 6. Create triggers to automatically update timestamps -CREATE TRIGGER handle_updated_at_user_profiles - BEFORE UPDATE ON public.user_profiles - FOR EACH ROW EXECUTE FUNCTION public.handle_updated_at(); - -CREATE TRIGGER handle_updated_at_user_preferences - BEFORE UPDATE ON public.user_preferences - FOR EACH ROW EXECUTE FUNCTION public.handle_updated_at(); - --- 7. Grant necessary permissions (if needed) -GRANT USAGE ON SCHEMA public TO anon, authenticated; -GRANT ALL ON ALL TABLES IN SCHEMA public TO authenticated; -GRANT ALL ON ALL SEQUENCES IN SCHEMA public TO authenticated; - --- 8. 
Optional: Create a view for user analytics (admin only) -CREATE OR REPLACE VIEW public.user_analytics AS -SELECT - up.email, - up.name, - up.provider, - up.created_at as user_created_at, - up.last_login, - COUNT(qh.id) as total_queries, - MAX(qh.created_at) as last_query_at, - AVG(qh.execution_time) as avg_execution_time -FROM public.user_profiles up -LEFT JOIN public.query_history qh ON up.id = qh.user_id -GROUP BY up.id, up.email, up.name, up.provider, up.created_at, up.last_login; - --- Note: This view should only be accessible to admin users --- You may want to create additional RLS policies for admin access \ No newline at end of file diff --git a/scripts/sync_data.py b/scripts/sync_data.py index 099681f..357428e 100644 --- a/scripts/sync_data.py +++ b/scripts/sync_data.py @@ -30,7 +30,7 @@ def get_file_md5(file_path): """Calculate MD5 hash of a file.""" if not os.path.exists(file_path): return None - + hash_md5 = hashlib.md5() with open(file_path, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): @@ -43,7 +43,7 @@ def create_r2_client(): if not R2_ACCESS_KEY_ID or not R2_SECRET_ACCESS_KEY: print("❌ R2 credentials not found. Please set R2_ACCESS_KEY_ID and R2_SECRET_ACCESS_KEY") return None - + try: client = boto3.client( 's3', @@ -52,12 +52,12 @@ def create_r2_client(): aws_secret_access_key=R2_SECRET_ACCESS_KEY, region_name='auto' # Cloudflare R2 uses 'auto' ) - + # Test connection client.head_bucket(Bucket=R2_BUCKET_NAME) print(f"βœ… Connected to R2 bucket: {R2_BUCKET_NAME}") return client - + except (ClientError, NoCredentialsError) as e: print(f"❌ Failed to connect to R2: {e}") return None @@ -67,11 +67,11 @@ def list_r2_objects(client): """List all objects in the R2 bucket.""" try: response = client.list_objects_v2(Bucket=R2_BUCKET_NAME) - + if 'Contents' not in response: print("⚠️ No objects found in R2 bucket") return [] - + objects = [] for obj in response['Contents']: # Focus on parquet files @@ -82,10 +82,10 @@ def list_r2_objects(client): 'last_modified': obj['LastModified'], 'etag': obj['ETag'].strip('"') }) - + print(f"πŸ“Š Found {len(objects)} parquet files in R2") return objects - + except ClientError as e: print(f"❌ Error listing R2 objects: {e}") return [] @@ -96,10 +96,10 @@ def download_file(client, r2_key, local_path): try: # Ensure local directory exists os.makedirs(os.path.dirname(local_path), exist_ok=True) - + print(f"πŸ“₯ Downloading {r2_key} to {local_path}") client.download_file(R2_BUCKET_NAME, r2_key, local_path) - + # Verify download if os.path.exists(local_path) and os.path.getsize(local_path) > 0: print(f"βœ… Successfully downloaded {r2_key}") @@ -107,7 +107,7 @@ def download_file(client, r2_key, local_path): else: print(f"❌ Download failed or file is empty: {r2_key}") return False - + except ClientError as e: print(f"❌ Error downloading {r2_key}: {e}") return False @@ -116,33 +116,33 @@ def download_file(client, r2_key, local_path): def sync_data(): """Main sync function.""" print("πŸš€ Starting R2 data sync...") - + # Create local data directory Path(LOCAL_DATA_DIR).mkdir(parents=True, exist_ok=True) - + # Create R2 client client = create_r2_client() if not client: print("❌ Cannot proceed without R2 connection") return False - + # List R2 objects r2_objects = list_r2_objects(client) if not r2_objects: print("❌ No data files found in R2") return False - + # Sync each file success_count = 0 total_files = len(r2_objects) - + for obj in r2_objects: r2_key = obj['key'] local_file = os.path.join(LOCAL_DATA_DIR, os.path.basename(r2_key)) - + # Check 
if we need to download should_download = FORCE_REFRESH - + if not should_download: if not os.path.exists(local_file): should_download = True @@ -153,7 +153,7 @@ def sync_data(): if local_size != obj['size']: should_download = True print(f"πŸ“„ Size mismatch for {r2_key}: local={local_size}, R2={obj['size']}") - + if should_download: if download_file(client, r2_key, local_file): success_count += 1 @@ -162,12 +162,12 @@ def sync_data(): else: print(f"βœ… File up to date: {r2_key}") success_count += 1 - + print(f"\nπŸ“Š Sync Summary:") print(f" Total files: {total_files}") print(f" Successful: {success_count}") print(f" Failed: {total_files - success_count}") - + if success_count == total_files: print("πŸŽ‰ Data sync completed successfully!") return True @@ -180,13 +180,13 @@ def check_data_availability(): """Check if data is available locally.""" if not os.path.exists(LOCAL_DATA_DIR): return False - + parquet_files = [f for f in os.listdir(LOCAL_DATA_DIR) if f.endswith('.parquet')] - + if not parquet_files: print(f"πŸ“‚ No parquet files found in {LOCAL_DATA_DIR}") return False - + print(f"βœ… Found {len(parquet_files)} parquet files locally") return True diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..16e28be --- /dev/null +++ b/setup.cfg @@ -0,0 +1,51 @@ +[coverage:run] +source = src +omit = + */tests/* + */test_*.py + */__pycache__/* + */venv/* + */env/* + */ai_service_old.py + +[coverage:report] +precision = 2 +show_missing = True +skip_covered = False + +[coverage:html] +directory = htmlcov + +[flake8] +max-line-length = 120 +exclude = + .git, + __pycache__, + venv, + env, + .venv, + *.egg-info, + .pytest_cache, + htmlcov, + src/ai_service_old.py +ignore = E203,E501,W503 +per-file-ignores = + __init__.py:F401 + +[isort] +profile = black +line_length = 120 +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +use_parentheses = True +ensure_newline_before_comments = True + +[mypy] +python_version = 3.11 +warn_return_any = False +warn_unused_configs = True +disallow_untyped_defs = False +ignore_missing_imports = True +check_untyped_defs = False +no_implicit_optional = False diff --git a/src/__init__.py b/src/__init__.py index 192009a..8fd3540 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1 +1 @@ -# Single Family Loan Analytics Platform Package \ No newline at end of file +# Single Family Loan Analytics Platform Package diff --git a/src/ai_engines/__init__.py b/src/ai_engines/__init__.py new file mode 100644 index 0000000..625341b --- /dev/null +++ b/src/ai_engines/__init__.py @@ -0,0 +1,16 @@ +""" +converSQL AI Engine Adapters +Modular adapter pattern for multiple AI providers. +""" + +from .base import AIEngineAdapter +from .bedrock_adapter import BedrockAdapter +from .claude_adapter import ClaudeAdapter +from .gemini_adapter import GeminiAdapter + +__all__ = [ + "AIEngineAdapter", + "BedrockAdapter", + "ClaudeAdapter", + "GeminiAdapter", +] diff --git a/src/ai_engines/base.py b/src/ai_engines/base.py new file mode 100644 index 0000000..4f67d3f --- /dev/null +++ b/src/ai_engines/base.py @@ -0,0 +1,210 @@ +""" +Base AI Engine Adapter Interface +Defines the contract for all AI engine adapters in converSQL. +""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional, Tuple + + +class AIEngineAdapter(ABC): + """ + Abstract base class for AI engine adapters. + + All AI providers (Bedrock, Claude, Gemini, Ollama, etc.) must implement + this interface to integrate with converSQL's AI service layer. 
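+
+    A minimal subclass sketch (illustrative only; ``EchoAdapter`` is a
+    hypothetical engine used purely for demonstration):
+
+        class EchoAdapter(AIEngineAdapter):
+            def _initialize(self) -> None:
+                # nothing to set up for this toy engine
+                self._ready = True
+
+            def is_available(self) -> bool:
+                return self._ready
+
+            def generate_sql(self, prompt: str) -> Tuple[str, str]:
+                # always answers with a trivial query
+                return "SELECT 1", ""
+
+            @property
+            def name(self) -> str:
+                return "Echo (demo)"
+
+            @property
+            def provider_id(self) -> str:
+                return "echo"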
+ + The adapter pattern allows converSQL to support multiple AI providers + with a unified interface, making it easy to add new engines or switch + between providers based on availability and user preference. + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None): + """ + Initialize the AI engine adapter. + + Args: + config: Optional configuration dictionary containing provider-specific + settings (API keys, model IDs, region, endpoint, etc.) + """ + self.config = config or {} + self._initialize() + + @abstractmethod + def _initialize(self) -> None: + """ + Initialize the AI provider client/connection. + + This method should: + - Set up API clients or connections + - Validate credentials + - Configure provider-specific settings + - Handle initialization errors gracefully + + Implementation should not raise exceptions; instead, set internal + state that can be checked via is_available(). + """ + pass + + @abstractmethod + def is_available(self) -> bool: + """ + Check if the AI engine is available and properly configured. + + Returns: + bool: True if the engine is ready to generate SQL, False otherwise + + This method should check: + - API credentials are present + - Network connectivity (if applicable) + - Provider service is accessible + - All required configuration is present + """ + pass + + @abstractmethod + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """ + Generate SQL query from natural language prompt. + + Args: + prompt: Complete prompt including schema context, business rules, + ontological information, and user question + + Returns: + Tuple[str, str]: (sql_query, error_message) + - On success: (generated_sql, "") + - On failure: ("", error_description) + + The generated SQL should: + - Be syntactically valid + - Reference only tables/columns in the provided schema + - Follow best practices for query optimization + - Include helpful comments when appropriate + + Error messages should be user-friendly and actionable. + """ + pass + + @property + @abstractmethod + def name(self) -> str: + """ + Get the display name of this AI engine. + + Returns: + str: Human-readable name (e.g., "Claude API", "AWS Bedrock", "Gemini") + """ + pass + + @property + @abstractmethod + def provider_id(self) -> str: + """ + Get the unique identifier for this provider. + + Returns: + str: Lowercase identifier (e.g., "claude", "bedrock", "gemini") + """ + pass + + def get_model_info(self) -> Dict[str, Any]: + """ + Get information about the active model configuration. + + Returns: + Dict with model information: + - model_id: Model identifier + - version: Model version + - capabilities: List of capabilities + - max_tokens: Maximum token limit + - etc. + + Default implementation returns empty dict; override for model-specific info. + """ + return {} + + def validate_response(self, sql: str) -> Tuple[bool, str]: + """ + Validate the generated SQL response. + + Args: + sql: Generated SQL query string + + Returns: + Tuple[bool, str]: (is_valid, error_message) + - On valid: (True, "") + - On invalid: (False, error_description) + + Default implementation performs basic validation. + Override for provider-specific validation logic. 
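+
+        Illustrative outcomes of the default checks (hypothetical inputs):
+
+            validate_response("SELECT * FROM loans")  -> (True, "")
+            validate_response("DROP TABLE loans")     -> (True, "Warning: ...destructive operations")
+            validate_response("Here is your query")   -> (False, "Generated text does not appear to be valid SQL")
+            validate_response("")                     -> (False, "Empty SQL query generated")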
+ """ + if not sql or not sql.strip(): + return False, "Empty SQL query generated" + + sql_lower = sql.lower().strip() + + # Check for common SQL keywords + sql_keywords = ["select", "insert", "update", "delete", "create", "drop", "with"] + has_sql_keyword = any(sql_lower.startswith(keyword) for keyword in sql_keywords) + + if not has_sql_keyword: + return False, "Generated text does not appear to be valid SQL" + + # Warn about dangerous operations (but don't block) + dangerous_keywords = ["drop", "delete", "truncate", "alter"] + if any(keyword in sql_lower for keyword in dangerous_keywords): + return True, "Warning: Query contains potentially destructive operations" + + return True, "" + + def clean_sql_response(self, response: str) -> str: + """ + Clean and extract SQL from AI response. + + Args: + response: Raw response text from AI provider + + Returns: + str: Cleaned SQL query + + Handles common patterns: + - Removes markdown code blocks (```sql ... ```) + - Strips leading/trailing whitespace + - Removes explanatory text before/after SQL + - Extracts SQL from mixed content + """ + sql = response.strip() + + # Remove markdown code blocks + if "```" in sql: + # Extract content between ```sql and ``` or ``` and ``` + parts = sql.split("```") + for i, part in enumerate(parts): + part = part.strip() + if part.startswith("sql"): + sql = part[3:].strip() + break + elif i > 0 and (part.upper().startswith("SELECT") or part.upper().startswith("WITH")): + sql = part.strip() + break + + # Remove common AI response patterns + prefixes_to_remove = [ + "here's the sql query:", + "here is the sql query:", + "sql query:", + "query:", + ] + + sql_lower = sql.lower() + for prefix in prefixes_to_remove: + if sql_lower.startswith(prefix): + sql = sql[len(prefix) :].strip() + break + + return sql + + def __repr__(self) -> str: + """String representation of the adapter.""" + return f"<{self.__class__.__name__} provider={self.provider_id} available={self.is_available()}>" diff --git a/src/ai_engines/bedrock_adapter.py b/src/ai_engines/bedrock_adapter.py new file mode 100644 index 0000000..0492c1a --- /dev/null +++ b/src/ai_engines/bedrock_adapter.py @@ -0,0 +1,212 @@ +""" +AWS Bedrock AI Engine Adapter +Implements converSQL adapter interface for Amazon Bedrock. +""" + +import json +import os +from typing import Any, Dict, Tuple + +from .base import AIEngineAdapter + + +class BedrockAdapter(AIEngineAdapter): + """ + Amazon Bedrock AI engine adapter. + + Supports Claude models through AWS Bedrock infrastructure. + Requires AWS credentials and appropriate IAM permissions. + """ + + def __init__(self, config: Dict[str, Any] = None): + """ + Initialize Bedrock adapter. 
+ + Args: + config: Configuration dict with keys: + - model_id: Bedrock model ID (default from env) + - region: AWS region (default from env) + - enable: Whether Bedrock is enabled (default True) + - guardrail_id: Optional Bedrock Guardrail ID + - guardrail_version: Optional Bedrock Guardrail version + """ + self.client = None + self.model_id = None + self.region = None + self.guardrail_id = None + self.guardrail_version = None + super().__init__(config) + + def _initialize(self) -> None: + """Initialize Bedrock client with AWS SDK.""" + # Check if Bedrock is enabled + enable_bedrock = self.config.get("enable", True) + if not enable_bedrock: + return + + # Get configuration + self.model_id = self.config.get( + "model_id", os.getenv("BEDROCK_MODEL_ID", "anthropic.claude-3-5-haiku-20241022-v1:0") + ) + self.region = self.config.get("region", os.getenv("AWS_DEFAULT_REGION", "us-west-2")) + self.guardrail_id = self.config.get("guardrail_id", os.getenv("BEDROCK_GUARDRAIL_ID")) + self.guardrail_version = self.config.get("guardrail_version", os.getenv("BEDROCK_GUARDRAIL_VERSION", "DRAFT")) + + try: + import boto3 + from botocore.exceptions import NoCredentialsError, PartialCredentialsError + + # Check if AWS credentials are available + try: + session = boto3.Session() + credentials = session.get_credentials() + if credentials is None: + print("⚠️ No AWS credentials found for Bedrock") + print(" Configure credentials via AWS CLI, environment variables, or IAM role") + self.client = None + return + + # Verify credentials are actually usable (not just present but frozen/expired) + frozen_creds = credentials.get_frozen_credentials() + if not frozen_creds.access_key: + print("⚠️ AWS credentials are invalid or expired") + self.client = None + return + + except (NoCredentialsError, PartialCredentialsError) as e: + print(f"⚠️ AWS credentials error: {e}") + self.client = None + return + except Exception as e: + print(f"⚠️ Error checking AWS credentials: {e}") + self.client = None + return + + # Initialize Bedrock runtime client + self.client = boto3.client("bedrock-runtime", region_name=self.region) + + # Test credentials with a simple API call to verify they actually work + try: + # Try to list models - this will fail fast if credentials are invalid + # We use the bedrock (not runtime) client for this test + bedrock_client = boto3.client("bedrock", region_name=self.region) + # Just call the API - if credentials are bad, this will raise an exception + bedrock_client.list_foundation_models(maxResults=1) + print(f"βœ… AWS Bedrock initialized successfully in {self.region}") + except Exception as e: + # Credentials exist but are invalid (expired, wrong permissions, etc) + print(f"⚠️ AWS Bedrock credentials are invalid: {e}") + print(" Your AWS credentials exist but cannot access Bedrock services") + self.client = None + return + + except ImportError: + print("⚠️ boto3 not installed. Run: pip install boto3") + self.client = None + except Exception as e: + print(f"⚠️ Bedrock initialization failed: {e}") + print(" Check AWS credentials and region configuration") + self.client = None + + def is_available(self) -> bool: + """Check if Bedrock client is initialized and ready.""" + return self.client is not None and self.model_id is not None + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """ + Generate SQL using Amazon Bedrock. 
+ + Args: + prompt: Complete prompt with schema and question + + Returns: + Tuple[str, str]: (sql_query, error_message) + """ + if not self.is_available(): + return "", "Bedrock client not available. Check AWS credentials and configuration." + + try: + # Build Bedrock request body + request_body = { + "anthropic_version": "bedrock-2023-05-31", + "max_tokens": 4000, + "temperature": 0.0, # Deterministic for SQL generation + "messages": [{"role": "user", "content": prompt}], + } + + # Prepare invoke_model parameters + invoke_params = { + "modelId": self.model_id, + "body": json.dumps(request_body), + "contentType": "application/json", + "accept": "application/json", + } + + # Add guardrails if configured + if self.guardrail_id: + invoke_params["guardrailIdentifier"] = self.guardrail_id + invoke_params["guardrailVersion"] = self.guardrail_version + + # Call Bedrock API + response = self.client.invoke_model(**invoke_params) + + # Parse response + response_body = json.loads(response["body"].read()) + + # Extract SQL from response + if "content" in response_body and len(response_body["content"]) > 0: + raw_sql = response_body["content"][0]["text"] + sql_query = self.clean_sql_response(raw_sql) + + # Validate response + is_valid, validation_msg = self.validate_response(sql_query) + if not is_valid: + return "", f"Invalid SQL generated: {validation_msg}" + + return sql_query, "" + else: + return "", "Bedrock returned empty response" + + except Exception as e: + error_msg = f"Bedrock API error: {str(e)}" + + # Provide helpful error messages for common issues + error_lower = str(e).lower() + if "credentials" in error_lower or "access denied" in error_lower: + error_msg += "\nCheck AWS credentials (aws configure) or IAM permissions" + elif "throttling" in error_lower or "rate" in error_lower: + error_msg += "\nAPI rate limit exceeded. Try again in a moment" + elif "model" in error_lower: + error_msg += f"\nModel {self.model_id} may not be available in {self.region}" + + return "", error_msg + + @property + def name(self) -> str: + """Display name for this engine.""" + return "Amazon Bedrock" + + @property + def provider_id(self) -> str: + """Unique provider identifier.""" + return "bedrock" + + def get_model_info(self) -> Dict[str, Any]: + """Get Bedrock model configuration details.""" + info = { + "provider": "Amazon Bedrock", + "model_id": self.model_id, + "region": self.region, + "service": "bedrock-runtime", + "max_tokens": 4000, + "temperature": 0.0, + "capabilities": ["SQL generation", "Natural language understanding", "Schema comprehension"], + } + + # Add guardrail info if configured + if self.guardrail_id: + info["guardrail_id"] = self.guardrail_id + info["guardrail_version"] = self.guardrail_version + info["capabilities"].append("Content filtering with Bedrock Guardrails") + + return info diff --git a/src/ai_engines/claude_adapter.py b/src/ai_engines/claude_adapter.py new file mode 100644 index 0000000..d3f4985 --- /dev/null +++ b/src/ai_engines/claude_adapter.py @@ -0,0 +1,148 @@ +""" +Claude API AI Engine Adapter +Implements converSQL adapter interface for Anthropic Claude API. +""" + +import os +from typing import Any, Dict, Tuple + +from .base import AIEngineAdapter + + +class ClaudeAdapter(AIEngineAdapter): + """ + Anthropic Claude API engine adapter. + + Supports direct API access to Claude models via Anthropic API. + Requires CLAUDE_API_KEY environment variable. + """ + + def __init__(self, config: Dict[str, Any] = None): + """ + Initialize Claude adapter. 
+ + Args: + config: Configuration dict with keys: + - api_key: Claude API key (default from env) + - model: Model name (default from env) + - max_tokens: Maximum response tokens + """ + self.client = None + self.api_key = None + self.model = None + self.max_tokens = None + super().__init__(config) + + def _initialize(self) -> None: + """Initialize Claude API client.""" + # Get configuration + self.api_key = self.config.get("api_key", os.getenv("CLAUDE_API_KEY")) + self.model = self.config.get("model", os.getenv("CLAUDE_MODEL", "claude-3-5-sonnet-20241022")) + self.max_tokens = self.config.get("max_tokens", 4000) + + if not self.api_key: + return + + try: + import anthropic + + # Initialize Claude client + self.client = anthropic.Anthropic(api_key=self.api_key) + + # Optional: Test connection with minimal request + # If test fails, still keep client - it might work for actual requests + try: + self.client.messages.create( + model=self.model, max_tokens=10, messages=[{"role": "user", "content": "test"}] + ) + except Exception: + # Keep client - API key might be valid but test failed + pass + + except ImportError: + print("⚠️ anthropic package not installed. Run: pip install anthropic") + self.client = None + except Exception as e: + print(f"⚠️ Claude initialization failed: {e}") + print(" Check CLAUDE_API_KEY environment variable") + self.client = None + + def is_available(self) -> bool: + """Check if Claude API client is initialized and ready.""" + return self.client is not None and self.api_key is not None + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """ + Generate SQL using Claude API. + + Args: + prompt: Complete prompt with schema and question + + Returns: + Tuple[str, str]: (sql_query, error_message) + """ + if not self.is_available(): + return "", "Claude API not available. Check CLAUDE_API_KEY configuration." 
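+
+        # The Anthropic Messages API is called below with temperature 0.0 so that
+        # repeated questions produce stable SQL; the raw completion is then cleaned
+        # and validated with the shared base-class helpers before it is returned.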
+ + try: + # Call Claude API + response = self.client.messages.create( + model=self.model, + max_tokens=self.max_tokens, + temperature=0.0, # Deterministic for SQL generation + messages=[{"role": "user", "content": prompt}], + ) + + # Extract SQL from response + if response.content and len(response.content) > 0: + raw_sql = response.content[0].text + sql_query = self.clean_sql_response(raw_sql) + + # Validate response + is_valid, validation_msg = self.validate_response(sql_query) + if not is_valid: + return "", f"Invalid SQL generated: {validation_msg}" + + return sql_query, "" + else: + return "", "Claude returned empty response" + + except Exception as e: + error_msg = f"Claude API error: {str(e)}" + + # Provide helpful error messages for common issues + error_lower = str(e).lower() + if "api_key" in error_lower or "authentication" in error_lower: + error_msg += "\nCheck CLAUDE_API_KEY environment variable" + elif "rate" in error_lower or "quota" in error_lower: + error_msg += "\nAPI rate limit or quota exceeded" + elif "model" in error_lower: + error_msg += f"\nModel {self.model} may not be available or accessible" + + return "", error_msg + + @property + def name(self) -> str: + """Display name for this engine.""" + return "Claude API" + + @property + def provider_id(self) -> str: + """Unique provider identifier.""" + return "claude" + + def get_model_info(self) -> Dict[str, Any]: + """Get Claude model configuration details.""" + return { + "provider": "Anthropic Claude", + "model": self.model, + "max_tokens": self.max_tokens, + "temperature": 0.0, + "api_version": "messages-2023-12-15", + "capabilities": [ + "SQL generation", + "Natural language understanding", + "Schema comprehension", + "Business domain reasoning", + ], + } diff --git a/src/ai_engines/gemini_adapter.py b/src/ai_engines/gemini_adapter.py new file mode 100644 index 0000000..ff486d3 --- /dev/null +++ b/src/ai_engines/gemini_adapter.py @@ -0,0 +1,203 @@ +""" +Google Gemini AI Engine Adapter +Implements converSQL adapter interface for Google Gemini. +""" + +import os +from typing import Any, Dict, Tuple + +from .base import AIEngineAdapter + + +class GeminiAdapter(AIEngineAdapter): + """ + Google Gemini AI engine adapter. + + Supports Gemini Pro and other models through Google's Generative AI API. + Requires GOOGLE_API_KEY environment variable. + """ + + def __init__(self, config: Dict[str, Any] = None): + """ + Initialize Gemini adapter. 
+ + Args: + config: Configuration dict with keys: + - api_key: Google API key (default from env) + - model: Model name (default 'gemini-1.5-pro') + - max_tokens: Maximum response tokens + - temperature: Temperature for generation (0.0-1.0) + """ + self.client = None + self.model = None + self.api_key = None + self.max_output_tokens = None + self.temperature = None + super().__init__(config) + + def _initialize(self) -> None: + """Initialize Gemini client with Google Generative AI SDK.""" + # Get configuration + self.api_key = self.config.get("api_key", os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")) + model_name = self.config.get("model", os.getenv("GEMINI_MODEL", "gemini-1.5-pro")) + self.max_output_tokens = self.config.get("max_output_tokens", 4000) + self.temperature = self.config.get("temperature", 0.0) + + if not self.api_key: + return + + try: + import google.generativeai as genai + + # Configure API key + genai.configure(api_key=self.api_key) + + # Initialize model with generation config + generation_config = { + "temperature": self.temperature, + "max_output_tokens": self.max_output_tokens, + "top_p": 0.95, + "top_k": 40, + } + + self.model = genai.GenerativeModel(model_name=model_name, generation_config=generation_config) + + # Optional: Test with minimal request to verify model initialization + try: + self.model.generate_content("test") + # If we get here, the model is working + except Exception: + # Keep model - might work for actual requests + pass + + except ImportError: + print("⚠️ google-generativeai package not installed.") + print(" Run: pip install google-generativeai") + self.model = None + except Exception as e: + print(f"⚠️ Gemini initialization failed: {e}") + print(" Check GOOGLE_API_KEY or GEMINI_API_KEY environment variable") + self.model = None + + def is_available(self) -> bool: + """Check if Gemini client is initialized and ready.""" + return self.model is not None and self.api_key is not None + + def generate_sql(self, prompt: str) -> Tuple[str, str]: + """ + Generate SQL using Google Gemini. + + Args: + prompt: Complete prompt with schema and question + + Returns: + Tuple[str, str]: (sql_query, error_message) + """ + if not self.is_available(): + return "", "Gemini not available. Check GOOGLE_API_KEY configuration." 
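+
+        # generate_content() returns a response whose .text carries the model
+        # output; when .text is empty, the code below inspects prompt_feedback
+        # for a safety block reason before reporting an empty response.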
+ + try: + # Generate content + response = self.model.generate_content(prompt) + + # Extract text from response + if response.text: + raw_sql = response.text + sql_query = self.clean_sql_response(raw_sql) + + # Validate response + is_valid, validation_msg = self.validate_response(sql_query) + if not is_valid: + return "", f"Invalid SQL generated: {validation_msg}" + + return sql_query, "" + else: + # Check for safety ratings that blocked the response + if hasattr(response, "prompt_feedback"): + feedback = response.prompt_feedback + if hasattr(feedback, "block_reason"): + return "", f"Response blocked: {feedback.block_reason}" + + return "", "Gemini returned empty response" + + except Exception as e: + error_msg = f"Gemini API error: {str(e)}" + + # Provide helpful error messages for common issues + error_lower = str(e).lower() + if "api key" in error_lower or "api_key" in error_lower: + error_msg += "\nCheck GOOGLE_API_KEY or GEMINI_API_KEY environment variable" + elif "quota" in error_lower or "rate limit" in error_lower: + error_msg += "\nAPI quota exceeded or rate limited" + elif "safety" in error_lower or "blocked" in error_lower: + error_msg += "\nContent was blocked by safety filters" + elif "model" in error_lower: + error_msg += "\nModel may not be available or accessible" + + return "", error_msg + + @property + def name(self) -> str: + """Display name for this engine.""" + return "Google Gemini" + + @property + def provider_id(self) -> str: + """Unique provider identifier.""" + return "gemini" + + def get_model_info(self) -> Dict[str, Any]: + """Get Gemini model configuration details.""" + model_name = self.model.model_name if self.model and hasattr(self.model, "model_name") else "gemini-1.5-pro" + + return { + "provider": "Google Gemini", + "model": model_name, + "max_output_tokens": self.max_output_tokens, + "temperature": self.temperature, + "top_p": 0.95, + "top_k": 40, + "capabilities": [ + "SQL generation", + "Natural language understanding", + "Schema comprehension", + "Multi-turn conversation", + "Safety filtering", + ], + } + + def set_safety_settings(self, safety_settings: Dict[str, Any]) -> None: + """ + Update safety settings for content generation. + + Args: + safety_settings: Dictionary of safety settings + Example: + { + 'HARM_CATEGORY_HARASSMENT': 'BLOCK_MEDIUM_AND_ABOVE', + 'HARM_CATEGORY_HATE_SPEECH': 'BLOCK_MEDIUM_AND_ABOVE', + } + """ + if not self.model: + print("⚠️ Cannot set safety settings: Model not initialized") + return + + try: + import google.generativeai as genai + + # Reconstruct model with new safety settings + generation_config = { + "temperature": self.temperature, + "max_output_tokens": self.max_output_tokens, + "top_p": 0.95, + "top_k": 40, + } + + model_name = self.model.model_name if hasattr(self.model, "model_name") else "gemini-1.5-pro" + + self.model = genai.GenerativeModel( + model_name=model_name, generation_config=generation_config, safety_settings=safety_settings + ) + + except Exception as e: + print(f"⚠️ Failed to update safety settings: {e}") diff --git a/src/ai_service.py b/src/ai_service.py index 0f84214..70df333 100644 --- a/src/ai_service.py +++ b/src/ai_service.py @@ -1,198 +1,118 @@ #!/usr/bin/env python3 """ AI Service Module -Handles Bedrock and Claude API connections, prompt caching, and SQL generation. +Manages AI providers using the adapter pattern for SQL generation. 
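+
+Typical usage (illustrative sketch):
+
+    service = get_ai_service()
+    sql, error, provider = service.generate_sql(question, schema_context)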
""" -import json import hashlib -import streamlit as st -from typing import Optional, Tuple, Dict, Any import os +from typing import Any, Dict, Optional, Tuple + +import streamlit as st from dotenv import load_dotenv +# Import new adapters +from src.ai_engines import BedrockAdapter, ClaudeAdapter, GeminiAdapter + # Load environment variables load_dotenv() # AI Configuration -AI_PROVIDER = os.getenv('AI_PROVIDER', 'bedrock').lower() -CLAUDE_API_KEY = os.getenv('CLAUDE_API_KEY') -CLAUDE_MODEL = os.getenv('CLAUDE_MODEL', 'claude-3-5-sonnet-20241022') -BEDROCK_MODEL_ID = os.getenv('BEDROCK_MODEL_ID', 'anthropic.claude-3-5-haiku-20241022-v1:0') -ENABLE_BEDROCK = os.getenv('ENABLE_BEDROCK', 'true').lower() == 'true' -ENABLE_PROMPT_CACHE = os.getenv('ENABLE_PROMPT_CACHE', 'true').lower() == 'true' -PROMPT_CACHE_TTL = int(os.getenv('PROMPT_CACHE_TTL', '3600')) +AI_PROVIDER = os.getenv("AI_PROVIDER", "claude").lower() +ENABLE_PROMPT_CACHE = os.getenv("ENABLE_PROMPT_CACHE", "true").lower() == "true" +PROMPT_CACHE_TTL = int(os.getenv("PROMPT_CACHE_TTL", "3600")) class AIServiceError(Exception): """Custom exception for AI service errors.""" - pass - -class BedrockClient: - """Bedrock AI client wrapper.""" - - def __init__(self): - self.client = None - self._initialize() - - def _initialize(self): - """Initialize Bedrock client.""" - if not ENABLE_BEDROCK: - return - - try: - import boto3 - self.client = boto3.client( - 'bedrock-runtime', - region_name=os.getenv('AWS_DEFAULT_REGION', 'us-west-2') - ) - - # Test connection - try: - self.client.list_foundation_models() - except Exception: - # Keep client - might work for invoke_model - pass - - except Exception as e: - print(f"⚠️ Bedrock initialization failed: {e}") - self.client = None - - def is_available(self) -> bool: - """Check if Bedrock is available.""" - return self.client is not None - - def generate_sql(self, prompt: str) -> Tuple[str, str]: - """Generate SQL using Bedrock.""" - if not self.client: - return "", "Bedrock client not available" - - try: - request_body = { - "anthropic_version": "bedrock-2023-05-31", - "max_tokens": 4000, - "messages": [{"role": "user", "content": prompt}] - } - - response = self.client.invoke_model( - modelId=BEDROCK_MODEL_ID, - body=json.dumps(request_body), - contentType="application/json", - accept="application/json" - ) - - response_body = json.loads(response['body'].read()) - sql_query = response_body['content'][0]['text'].strip() - - return sql_query, "" - - except Exception as e: - error_msg = f"Bedrock error: {str(e)}" - return "", error_msg - - -class ClaudeClient: - """Claude API client wrapper.""" - - def __init__(self): - self.client = None - self._initialize() - - def _initialize(self): - """Initialize Claude client.""" - if not CLAUDE_API_KEY: - return - - try: - import anthropic - self.client = anthropic.Anthropic(api_key=CLAUDE_API_KEY) - - # Test connection with minimal request - try: - self.client.messages.create( - model=CLAUDE_MODEL, - max_tokens=10, - messages=[{"role": "user", "content": "test"}] - ) - except Exception: - # Keep client - might work for other requests - pass - - except ImportError: - print("⚠️ Anthropic package not installed. 
Run: pip install anthropic") - self.client = None - except Exception as e: - print(f"⚠️ Claude initialization failed: {e}") - self.client = None - - def is_available(self) -> bool: - """Check if Claude API is available.""" - return self.client is not None - - def generate_sql(self, prompt: str) -> Tuple[str, str]: - """Generate SQL using Claude API.""" - if not self.client: - return "", "Claude API client not available" - - try: - response = self.client.messages.create( - model=CLAUDE_MODEL, - max_tokens=4000, - messages=[{"role": "user", "content": prompt}] - ) - - sql_query = response.content[0].text.strip() - return sql_query, "" - - except Exception as e: - error_msg = f"Claude API error: {str(e)}" - return "", error_msg + pass class AIService: - """Main AI service that manages multiple providers.""" - + """Main AI service that manages multiple AI providers using adapter pattern.""" + def __init__(self): - self.bedrock = BedrockClient() - self.claude = ClaudeClient() + """Initialize AI service with all available adapters.""" + # Initialize all adapters + self.adapters = { + "bedrock": BedrockAdapter(), + "claude": ClaudeAdapter(), + "gemini": GeminiAdapter(), + } + self.active_provider = None self._determine_active_provider() - + def _determine_active_provider(self): - """Determine which AI provider to use.""" - if AI_PROVIDER == 'claude' and self.claude.is_available(): - self.active_provider = 'claude' - elif AI_PROVIDER == 'bedrock' and self.bedrock.is_available(): - self.active_provider = 'bedrock' - elif self.claude.is_available(): - self.active_provider = 'claude' - elif self.bedrock.is_available(): - self.active_provider = 'bedrock' - else: - self.active_provider = None - + """Determine which AI provider to use based on configuration and availability.""" + # First, try the configured provider + if AI_PROVIDER in self.adapters and self.adapters[AI_PROVIDER].is_available(): + self.active_provider = AI_PROVIDER + return + + # Fallback to first available provider + for provider_id, adapter in self.adapters.items(): + if adapter.is_available(): + self.active_provider = provider_id + print(f"ℹ️ Using {adapter.name} (fallback from {AI_PROVIDER})") + return + + # No providers available + self.active_provider = None + def is_available(self) -> bool: """Check if any AI provider is available.""" return self.active_provider is not None - + def get_active_provider(self) -> Optional[str]: - """Get the currently active provider.""" + """Get the currently active provider ID.""" return self.active_provider - - def get_provider_status(self) -> Dict[str, bool]: + + def get_active_adapter(self): + """Get the active adapter instance.""" + if self.active_provider: + return self.adapters.get(self.active_provider) + return None + + def get_available_providers(self) -> Dict[str, str]: + """Get list of available providers with their display names.""" + available = {} + for provider_id, adapter in self.adapters.items(): + if adapter.is_available(): + available[provider_id] = adapter.name + return available + + def set_active_provider(self, provider_id: str) -> bool: + """Manually set the active provider if available. 
+ + Args: + provider_id: The provider ID to set as active + + Returns: + bool: True if provider was set successfully, False otherwise + """ + if provider_id in self.adapters and self.adapters[provider_id].is_available(): + self.active_provider = provider_id + return True + return False + + def get_provider_status(self) -> Dict[str, Any]: """Get status of all providers.""" - return { - 'bedrock': self.bedrock.is_available(), - 'claude': self.claude.is_available(), - 'active': self.active_provider + status = { + "active": self.active_provider, } - + + for provider_id, adapter in self.adapters.items(): + status[provider_id] = adapter.is_available() + + return status + def _create_prompt_hash(self, user_question: str, schema_context: str) -> str: """Create hash for prompt caching.""" combined = f"{user_question}|{schema_context}|{self.active_provider}" return hashlib.md5(combined.encode()).hexdigest() - + def _build_sql_prompt(self, user_question: str, schema_context: str) -> str: """Build the SQL generation prompt.""" return f"""You are an expert Single Family Loan loan performance data analyst. Write a single, clean DuckDB-compatible SQL query. @@ -216,7 +136,7 @@ def _build_sql_prompt(self, user_question: str, schema_context: str) -> str: **Credit Risk Tiers (Use these exact breakpoints):** - CSCORE_B Credit Scores: * 740+ = Super Prime (premium pricing, <1% default risk) - * 680-739 = Prime (standard pricing, moderate risk) + * 680-739 = Prime (standard pricing, moderate risk) * 620-679 = Near Prime (risk-based pricing, elevated risk) * <620 = Subprime (highest risk, limited origination post-2008) - OLTV/CLTV Loan-to-Value: @@ -236,7 +156,7 @@ def _build_sql_prompt(self, user_question: str, schema_context: str) -> str: - ORIG_DATE: Key vintages = 2008-2012 (post-crisis), 2020-2021 (refi boom), 2022+ (rising rates) **Product & Channel Analysis:** -- PURPOSE: P=Purchase (portfolio growth), R=Refinance (rate optimization), C=Cash-out (credit event) +- PURPOSE: P=Purchase (portfolio growth), R=Refinance (rate optimization), C=Cash-out (credit event) - PROP: SF=Single Family (85%+), PU=Planned Development (10%), CO=Condo (5%), others minimal - CHANNEL: R=Retail (direct), C=Correspondent (volume), B=Broker (specialized) - SELLER: Top institutions drive volume - use for counterparty analysis @@ -272,54 +192,68 @@ def _build_sql_prompt(self, user_question: str, schema_context: str) -> str: - Use LIMIT 20 for top analyses unless specified otherwise Write ONLY the SQL query - no explanations:""" - + @st.cache_data(ttl=PROMPT_CACHE_TTL) def _cached_generate_sql(_self, user_question: str, schema_context: str, provider: str) -> Tuple[str, str]: """Cached SQL generation to reduce API calls.""" - # This is a placeholder for the cache decorator - # Actual generation happens in generate_sql + # Cache decorator handles the caching + # The actual generation happens in generate_sql return "", "" - + def generate_sql(self, user_question: str, schema_context: str) -> Tuple[str, str, str]: """ Generate SQL query using available AI provider. - Returns: (sql_query, error_message, provider_used) + + Args: + user_question: Natural language question + schema_context: Database schema context + + Returns: + Tuple[str, str, str]: (sql_query, error_message, provider_used) """ if not self.is_available(): error_msg = """🚫 **AI SQL Generation Unavailable** No AI providers are configured or available. 
This could be due to: -- Missing API keys (Claude API key or AWS credentials) +- Missing API keys (Claude API key, AWS credentials, or Google API key) - Network connectivity issues - Service configuration problems **You can still use the application by:** - Writing SQL queries manually in the Advanced tab - Using the sample queries provided -- Referring to the database schema for guidance""" +- Referring to the database schema for guidance + +**To configure an AI provider:** +- Claude: Set CLAUDE_API_KEY in .env +- Bedrock: Configure AWS credentials +- Gemini: Set GOOGLE_API_KEY in .env""" return "", error_msg, "none" - + # Check cache if enabled if ENABLE_PROMPT_CACHE: - cache_key = self._create_prompt_hash(user_question, schema_context) try: cached_result = self._cached_generate_sql(user_question, schema_context, self.active_provider) if cached_result[0]: # If cached result exists - return cached_result[0], cached_result[1], f"{self.active_provider} (cached)" + return ( + cached_result[0], + cached_result[1], + f"{self.active_provider} (cached)", + ) except Exception: pass # Cache miss or error, continue with API call - + # Build prompt prompt = self._build_sql_prompt(user_question, schema_context) - - # Generate SQL using active provider - if self.active_provider == 'claude': - sql_query, error_msg = self.claude.generate_sql(prompt) - elif self.active_provider == 'bedrock': - sql_query, error_msg = self.bedrock.generate_sql(prompt) - else: - return "", "No AI provider available", "none" - + + # Get active adapter + adapter = self.get_active_adapter() + if not adapter: + return "", "No AI adapter available", "none" + + # Generate SQL using adapter + sql_query, error_msg = adapter.generate_sql(prompt) + # Cache the result if successful and caching is enabled if ENABLE_PROMPT_CACHE and sql_query and not error_msg: try: @@ -327,24 +261,19 @@ def generate_sql(self, user_question: str, schema_context: str) -> Tuple[str, st self._cached_generate_sql(user_question, schema_context, self.active_provider) except Exception: pass # Cache update failed, but we have the result - - return sql_query, error_msg, self.active_provider - -# Global AI service instance -_ai_service = None + return sql_query, error_msg, self.active_provider +# Global AI service instance (cached) +@st.cache_resource def get_ai_service() -> AIService: - """Get or create global AI service instance.""" - global _ai_service - if _ai_service is None: - _ai_service = AIService() - return _ai_service + """Get or create global AI service instance (cached).""" + return AIService() # Convenience functions for backward compatibility -def initialize_ai_client() -> Tuple[Optional[object], str]: +def initialize_ai_client() -> Tuple[Optional[AIService], str]: """Initialize AI client - backward compatibility.""" service = get_ai_service() if service.is_available(): @@ -356,4 +285,4 @@ def generate_sql_with_ai(user_question: str, schema_context: str) -> Tuple[str, """Generate SQL with AI - backward compatibility.""" service = get_ai_service() sql_query, error_msg, provider = service.generate_sql(user_question, schema_context) - return sql_query, error_msg \ No newline at end of file + return sql_query, error_msg diff --git a/src/core.py b/src/core.py index f8c825c..44b1d48 100644 --- a/src/core.py +++ b/src/core.py @@ -4,23 +4,25 @@ Enhanced with caching, AI service integration, and R2 support. 
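
Typical flow (illustrative sketch):

    files = scan_parquet_files()
    # each parquet file is registered as a table named after its basename
    df = execute_sql_query("SELECT 42 AS answer", files)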
""" +import glob +import os +from typing import Dict, List, Optional, Tuple + import duckdb import pandas as pd -import os -import glob import streamlit as st -from typing import List, Dict, Optional, Tuple from dotenv import load_dotenv + +from .ai_service import generate_sql_with_ai, get_ai_service from .data_dictionary import generate_enhanced_schema_context -from .ai_service import get_ai_service, generate_sql_with_ai # Load environment variables load_dotenv() # Configuration from environment variables -PROCESSED_DATA_DIR = os.getenv('PROCESSED_DATA_DIR', 'data/processed/') -DEMO_MODE = os.getenv('DEMO_MODE', 'false').lower() == 'true' -CACHE_TTL = int(os.getenv('CACHE_TTL', '3600')) # 1 hour default +PROCESSED_DATA_DIR = os.getenv("PROCESSED_DATA_DIR", "data/processed/") +DEMO_MODE = os.getenv("DEMO_MODE", "false").lower() == "true" +CACHE_TTL = int(os.getenv("CACHE_TTL", "3600")) # 1 hour default @st.cache_data(ttl=CACHE_TTL) @@ -28,13 +30,13 @@ def scan_parquet_files() -> List[str]: """Scan the processed directory for Parquet files. Cached for performance.""" # Check if data sync is needed sync_data_if_needed() - + if not os.path.exists(PROCESSED_DATA_DIR): return [] - + pattern = os.path.join(PROCESSED_DATA_DIR, "*.parquet") parquet_files = glob.glob(pattern) - + return parquet_files @@ -55,6 +57,7 @@ def sync_data_if_needed(force: bool = False) -> bool: # Verify files are not empty/corrupted try: import duckdb + conn = duckdb.connect() # Quick validation - try to read first file test_query = f"SELECT COUNT(*) FROM '{parquet_files[0]}'" @@ -77,9 +80,9 @@ def sync_data_if_needed(force: bool = False) -> bool: import subprocess import sys - sync_args = [sys.executable, 'scripts/sync_data.py'] + sync_args = [sys.executable, "scripts/sync_data.py"] if force: - sync_args.append('--force') + sync_args.append("--force") result = subprocess.run(sync_args, capture_output=True, text=True) @@ -102,7 +105,7 @@ def get_table_schemas(parquet_files: List[str]) -> str: """Generate enhanced CREATE TABLE statements with rich metadata. 
Cached for performance.""" if not parquet_files: return "" - + try: return generate_enhanced_schema_context(parquet_files) except Exception: @@ -114,29 +117,29 @@ def get_basic_table_schemas(parquet_files: List[str]) -> str: """Fallback basic schema generation.""" if not parquet_files: return "" - + create_statements = [] - + try: conn = duckdb.connect() - + for file_path in parquet_files: table_name = os.path.splitext(os.path.basename(file_path))[0] query = f"DESCRIBE SELECT * FROM '{file_path}' LIMIT 1" schema_df = conn.execute(query).fetchdf() - + columns = [] for _, row in schema_df.iterrows(): - column_name = row['column_name'] - column_type = row['column_type'] + column_name = row["column_name"] + column_type = row["column_type"] columns.append(f" {column_name} {column_type}") - + create_statement = f"CREATE TABLE {table_name} (\n" + ",\n".join(columns) + "\n);" create_statements.append(create_statement) - + conn.close() return "\n\n".join(create_statements) - + except Exception: return "" @@ -158,18 +161,18 @@ def execute_sql_query(sql_query: str, parquet_files: List[str]) -> pd.DataFrame: """Execute SQL query using DuckDB.""" try: conn = duckdb.connect() - + # Register each Parquet file as a table for file_path in parquet_files: table_name = os.path.splitext(os.path.basename(file_path))[0] conn.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM '{file_path}'") - + # Execute the user's query result_df = conn.execute(sql_query).fetchdf() conn.close() - + return result_df - + except Exception: return pd.DataFrame() @@ -186,7 +189,7 @@ def get_analyst_questions() -> Dict[str, str]: "πŸ“Š Market Share by Channel": "Show origination volume and average loan characteristics by channel (Retail, Correspondent, Broker) for top 5 volume states", "πŸ” Credit Migration Analysis": "For loans aged 24-48 months (2020-2021 vintage), show how many have migrated from current to 30+ day delinquent status by original credit score", "🌟 Super Prime Performance": "Analyze our Super Prime segment (740+ credit scores) - show portfolio share, average UPB, geographic distribution, and performance metrics", - "🎲 Rate Sensitivity Analysis": "Compare current portfolio performance between ultra-low rate loans (2-4%) vs higher rate loans (5%+) - show delinquency rates and paydown behavior" + "🎲 Rate Sensitivity Analysis": "Compare current portfolio performance between ultra-low rate loans (2-4%) vs higher rate loans (5%+) - show delinquency rates and paydown behavior", } @@ -194,7 +197,7 @@ def get_ai_service_status() -> Dict[str, any]: """Get AI service status for UI display.""" service = get_ai_service() return { - 'available': service.is_available(), - 'active_provider': service.get_active_provider(), - 'provider_status': service.get_provider_status() - } \ No newline at end of file + "available": service.is_available(), + "active_provider": service.get_active_provider(), + "provider_status": service.get_provider_status(), + } diff --git a/src/d1_logger.py b/src/d1_logger.py index 5b8c102..00784e5 100644 --- a/src/d1_logger.py +++ b/src/d1_logger.py @@ -4,22 +4,22 @@ Lightweight logging service for user activity and queries. 
""" -import requests -import json import os -import time -from typing import Optional, Dict, Any +from typing import Dict, Optional + +import requests from dotenv import load_dotenv load_dotenv() + class D1Logger: """Minimal Cloudflare D1 database logger.""" def __init__(self): - self.account_id = os.getenv('CLOUDFLARE_ACCOUNT_ID') - self.database_id = os.getenv('CLOUDFLARE_D1_DATABASE_ID') - self.api_token = os.getenv('CLOUDFLARE_API_TOKEN') + self.account_id = os.getenv("CLOUDFLARE_ACCOUNT_ID") + self.database_id = os.getenv("CLOUDFLARE_D1_DATABASE_ID") + self.api_token = os.getenv("CLOUDFLARE_API_TOKEN") self.enabled = bool(self.account_id and self.database_id and self.api_token) def is_enabled(self) -> bool: @@ -33,17 +33,12 @@ def _execute_query(self, sql: str, params: list = None) -> Optional[Dict]: url = f"https://api.cloudflare.com/client/v4/accounts/{self.account_id}/d1/database/{self.database_id}/query" - headers = { - 'Authorization': f'Bearer {self.api_token}', - 'Content-Type': 'application/json' - } + headers = {"Authorization": f"Bearer {self.api_token}", "Content-Type": "application/json"} - payload = { - 'sql': sql - } + payload = {"sql": sql} if params: - payload['params'] = params + payload["params"] = params try: response = requests.post(url, headers=headers, json=payload) @@ -66,10 +61,11 @@ def log_user_login(self, user_id: str, email: str, user_agent: str = None): VALUES (?, ?, ?) """ - self._execute_query(sql, [user_id, email, user_agent or '']) + self._execute_query(sql, [user_id, email, user_agent or ""]) - def log_user_query(self, user_id: str, email: str, question: str, - sql_query: str, ai_provider: str, execution_time: float): + def log_user_query( + self, user_id: str, email: str, question: str, sql_query: str, ai_provider: str, execution_time: float + ): """Log user query event.""" if not self.enabled: return @@ -96,16 +92,18 @@ def get_user_stats(self, user_id: str) -> Dict: """ result = self._execute_query(sql, [user_id]) - if result and result.get('success') and result.get('result'): - return result['result'][0] if result['result'] else {} + if result and result.get("success") and result.get("result"): + return result["result"][0] if result["result"] else {} return {} + # Global logger instance _d1_logger = None + def get_d1_logger() -> D1Logger: """Get or create global D1 logger instance.""" global _d1_logger if _d1_logger is None: _d1_logger = D1Logger() - return _d1_logger \ No newline at end of file + return _d1_logger diff --git a/src/data_dictionary.py b/src/data_dictionary.py index 3e1feb4..d1ac49f 100644 --- a/src/data_dictionary.py +++ b/src/data_dictionary.py @@ -5,20 +5,21 @@ Comprehensive coverage of all 110 columns with ACTUAL field names from data. 
""" -import pandas as pd -import duckdb import os -from typing import Dict, List, Any, Optional from dataclasses import dataclass +from typing import Dict, List, Optional +import duckdb # ============================================================================= # FIELD METADATA STRUCTURE # ============================================================================= + @dataclass class FieldMetadata: """Standard field metadata structure for ontological organization.""" + description: str domain: str data_type: str @@ -43,48 +44,47 @@ class FieldMetadata: domain="Identification", data_type="VARCHAR", business_context="Groups loans for portfolio management and risk analysis", - relationships=["groups_loans_by_characteristics"] + relationships=["groups_loans_by_characteristics"], ), "LOAN_ID": FieldMetadata( description="Unique loan sequence number", domain="Identification", data_type="VARCHAR", business_context="Primary key for loan-level analysis and cross-file linkage", - relationships=["primary_key", "links_acquisition_to_performance"] + relationships=["primary_key", "links_acquisition_to_performance"], ), "CHANNEL": FieldMetadata( description="Current channel designation", domain="Identification", data_type="VARCHAR", - business_context="Current portfolio channel classification" + business_context="Current portfolio channel classification", ), "SELLER": FieldMetadata( description="Original seller/lender identifier", domain="Identification", data_type="VARCHAR", - business_context="Originator identification for quality assessment" + business_context="Originator identification for quality assessment", ), "SERVICER": FieldMetadata( description="Current servicer identifier", domain="Identification", data_type="VARCHAR", - business_context="Current servicer for performance attribution" + business_context="Current servicer for performance attribution", ), "MASTER_SERVICER": FieldMetadata( description="Master servicer identifier", domain="Identification", data_type="VARCHAR", - business_context="Master servicer overseeing loan administration" + business_context="Master servicer overseeing loan administration", ), "DEAL_NAME": FieldMetadata( description="Securitization deal name", domain="Identification", data_type="VARCHAR", - business_context="MBS pool or deal identification" - ) - } + business_context="MBS pool or deal identification", + ), + }, }, - "TEMPORAL": { "domain_description": "Time dimensions covering loan lifecycle and performance periods", "fields": { @@ -93,96 +93,95 @@ class FieldMetadata: domain="Temporal", data_type="VARCHAR", business_context="Reporting period for loan performance tracking", - relationships=["time_series_key"] + relationships=["time_series_key"], ), "ORIG_DATE": FieldMetadata( description="Original loan date (MMYYYY)", domain="Temporal", data_type="VARCHAR", business_context="Loan origination date for vintage analysis", - relationships=["determines_vintage", "key_for_cohort_analysis"] + relationships=["determines_vintage", "key_for_cohort_analysis"], ), "FIRST_PAY": FieldMetadata( description="First payment date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="First scheduled payment for age calculations" + business_context="First scheduled payment for age calculations", ), "MATR_DT": FieldMetadata( description="Loan maturity date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Final scheduled payment date" + business_context="Final scheduled payment date", ), "LAST_PAID_INSTALLMENT_DATE": FieldMetadata( 
description="Last paid installment date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Most recent payment received date" + business_context="Most recent payment received date", ), "FORECLOSURE_DATE": FieldMetadata( description="Foreclosure date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Date foreclosure proceedings started" + business_context="Date foreclosure proceedings started", ), "DISPOSITION_DATE": FieldMetadata( description="Property disposition date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="REO property sale completion date" + business_context="REO property sale completion date", ), "ZB_DTE": FieldMetadata( description="Zero balance date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Date loan balance reached zero" + business_context="Date loan balance reached zero", ), "RPRCH_DTE": FieldMetadata( description="Repurchase date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Date loan was repurchased from pool" + business_context="Date loan was repurchased from pool", ), "ORIGINAL_LIST_START_DATE": FieldMetadata( description="Original REO listing start date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Initial REO property listing date" + business_context="Initial REO property listing date", ), "CURRENT_LIST_START_DATE": FieldMetadata( description="Current REO listing start date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Current REO listing period start date" + business_context="Current REO listing period start date", ), "INTEREST_RATE_CHANGE_DATE": FieldMetadata( description="Interest rate change date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Date of ARM rate adjustment" + business_context="Date of ARM rate adjustment", ), "PAYMENT_CHANGE_DATE": FieldMetadata( description="Payment change date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Date of ARM payment adjustment" + business_context="Date of ARM payment adjustment", ), "ZERO_BALANCE_CODE_CHANGE_DATE": FieldMetadata( description="Zero balance code change date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Date zero balance code was updated" + business_context="Date zero balance code was updated", ), "LOAN_HOLDBACK_EFFECTIVE_DATE": FieldMetadata( description="Loan holdback effective date (MMYYYY)", domain="Temporal", data_type="VARCHAR", - business_context="Effective date of loan holdback" - ) - } + business_context="Effective date of loan holdback", + ), + }, }, - "LOAN_TERMS": { "domain_description": "Core loan structure, terms, and product characteristics", "fields": { @@ -191,39 +190,39 @@ class FieldMetadata: domain="Loan Terms", data_type="FLOAT", business_context="Origination rate affecting refinance propensity", - risk_impact="Low rates create refinance risk when rates rise" + risk_impact="Low rates create refinance risk when rates rise", ), "CURR_RATE": FieldMetadata( description="Current interest rate (%)", domain="Loan Terms", data_type="FLOAT", - business_context="Current rate for ARM products" + business_context="Current rate for ARM products", ), "ORIG_TERM": FieldMetadata( description="Original loan term (months)", domain="Loan Terms", data_type="SMALLINT", business_context="Original amortization period", - values={"360": "30-year", "180": "15-year", "240": "20-year"} + values={"360": "30-year", "180": "15-year", "240": "20-year"}, ), "REM_MONTHS": 
FieldMetadata( description="Remaining months to maturity", domain="Loan Terms", data_type="SMALLINT", - business_context="Remaining term affects prepayment behavior" + business_context="Remaining term affects prepayment behavior", ), "ADJ_REM_MONTHS": FieldMetadata( description="Adjusted remaining months", domain="Loan Terms", data_type="SMALLINT", - business_context="Adjusted remaining term calculation" + business_context="Adjusted remaining term calculation", ), "LOAN_AGE": FieldMetadata( description="Loan age in months since origination", domain="Loan Terms", data_type="SMALLINT", business_context="Seasoning indicator - defaults peak at 12-60 months", - risk_impact="Early payment default risk highest in first 12 months" + risk_impact="Early payment default risk highest in first 12 months", ), "PURPOSE": FieldMetadata( description="Loan purpose code", @@ -231,57 +230,52 @@ class FieldMetadata: data_type="VARCHAR", business_context="Transaction type affecting risk", risk_impact="Purchase < Rate/Term Refi < Cash-Out Refi", - values={ - "P": "Purchase - Home acquisition", - "R": "Rate/Term Refinance", - "C": "Cash-Out Refinance" - } + values={"P": "Purchase - Home acquisition", "R": "Rate/Term Refinance", "C": "Cash-Out Refinance"}, ), "PRODUCT": FieldMetadata( description="Loan product type", domain="Loan Terms", data_type="VARCHAR", - business_context="Product classification (FRM/ARM)" + business_context="Product classification (FRM/ARM)", ), "IO": FieldMetadata( description="Interest-only indicator", domain="Loan Terms", data_type="VARCHAR", - business_context="Interest-only payment feature" + business_context="Interest-only payment feature", ), "FIRST_PAY_IO": FieldMetadata( description="First payment IO date", domain="Loan Terms", data_type="VARCHAR", - business_context="First interest-only payment date" + business_context="First interest-only payment date", ), "MNTHS_TO_AMTZ_IO": FieldMetadata( description="Months to amortize IO", domain="Loan Terms", data_type="VARCHAR", - business_context="Months until IO converts to amortizing" + business_context="Months until IO converts to amortizing", ), "PPMT_FLG": FieldMetadata( description="Prepayment penalty flag", domain="Loan Terms", data_type="VARCHAR", - business_context="Indicates presence of prepayment penalty" + business_context="Indicates presence of prepayment penalty", ), "BALLOON_INDICATOR": FieldMetadata( description="Balloon payment indicator", domain="Loan Terms", data_type="VARCHAR", - business_context="Indicates balloon payment feature" + business_context="Indicates balloon payment feature", ), "PLAN_NUMBER": FieldMetadata( description="Plan number identifier", domain="Loan Terms", data_type="VARCHAR", - business_context="Internal plan classification" - ) - } + business_context="Internal plan classification", + ), + }, }, - "UNPAID_BALANCES": { "domain_description": "All unpaid principal balance measurements and calculations", "fields": { @@ -290,41 +284,40 @@ class FieldMetadata: domain="Unpaid Balances", data_type="DOUBLE", business_context="Original loan amount determining jumbo status", - relationships=["baseline_for_balance_tracking"] + relationships=["baseline_for_balance_tracking"], ), "CURRENT_UPB": FieldMetadata( description="Current unpaid principal balance ($)", domain="Unpaid Balances", data_type="DOUBLE", - business_context="Current outstanding balance for exposure calculation" + business_context="Current outstanding balance for exposure calculation", ), "ISSUANCE_UPB": FieldMetadata( description="UPB at MBS 
issuance ($)", domain="Unpaid Balances", data_type="DOUBLE", - business_context="Balance when securitized into MBS" + business_context="Balance when securitized into MBS", ), "LAST_UPB": FieldMetadata( description="Last unpaid principal balance ($)", domain="Unpaid Balances", data_type="DOUBLE", - business_context="Final UPB before zero balance" + business_context="Final UPB before zero balance", ), "NON_INTEREST_BEARING_UPB": FieldMetadata( description="Non-interest bearing UPB ($)", domain="Unpaid Balances", data_type="DOUBLE", - business_context="Principal not accruing interest" + business_context="Principal not accruing interest", ), "INTEREST_BEARING_UPB": FieldMetadata( description="Interest bearing UPB ($)", domain="Unpaid Balances", data_type="DOUBLE", - business_context="Principal accruing interest" - ) - } + business_context="Principal accruing interest", + ), + }, }, - "BORROWER_PROFILE": { "domain_description": "Borrower characteristics and creditworthiness indicators", "fields": { @@ -334,42 +327,37 @@ class FieldMetadata: data_type="SMALLINT", business_context="Primary credit quality indicator", risk_impact="740+ Super Prime vs <620 Subprime - 10x default difference", - values={ - "740+": "Super Prime", - "680-739": "Prime", - "620-679": "Near Prime", - "<620": "Subprime" - } + values={"740+": "Super Prime", "680-739": "Prime", "620-679": "Near Prime", "<620": "Subprime"}, ), "CSCORE_C": FieldMetadata( description="Co-borrower credit score", domain="Borrower Profile", data_type="SMALLINT", - business_context="Secondary borrower credit quality" + business_context="Secondary borrower credit quality", ), "CURR_SCOREB": FieldMetadata( description="Current primary borrower credit score", domain="Borrower Profile", data_type="SMALLINT", - business_context="Updated primary borrower credit score" + business_context="Updated primary borrower credit score", ), "CURR_SCOREC": FieldMetadata( description="Current co-borrower credit score", domain="Borrower Profile", data_type="SMALLINT", - business_context="Updated co-borrower credit score" + business_context="Updated co-borrower credit score", ), "ISSUE_SCOREB": FieldMetadata( description="Issue date primary borrower credit score", domain="Borrower Profile", data_type="SMALLINT", - business_context="Primary borrower score at MBS issuance" + business_context="Primary borrower score at MBS issuance", ), "ISSUE_SCOREC": FieldMetadata( description="Issue date co-borrower credit score", domain="Borrower Profile", data_type="SMALLINT", - business_context="Co-borrower score at MBS issuance" + business_context="Co-borrower score at MBS issuance", ), "DTI": FieldMetadata( description="Debt-to-income ratio (%)", @@ -381,24 +369,23 @@ class FieldMetadata: "≀28%": "Conservative capacity", "29-36%": "Standard capacity", "37-43%": "Stretched capacity", - ">43%": "Aggressive capacity" - } + ">43%": "Aggressive capacity", + }, ), "NUM_BO": FieldMetadata( description="Number of borrowers", domain="Borrower Profile", data_type="VARCHAR", - business_context="Count of borrowers on loan (1 or 2)" + business_context="Count of borrowers on loan (1 or 2)", ), "FIRST_FLAG": FieldMetadata( description="First-time homebuyer flag", domain="Borrower Profile", data_type="VARCHAR", - business_context="Indicates first-time homebuyer status" - ) - } + business_context="Indicates first-time homebuyer status", + ), + }, }, - "PROPERTY_INFO": { "domain_description": "Property characteristics, location, and collateral information", "fields": { @@ -411,14 +398,14 @@ class 
FieldMetadata: "SF": "Single Family - Lowest risk", "PU": "Planned Unit Development", "CO": "Condominium", - "MH": "Manufactured Housing - Highest risk" - } + "MH": "Manufactured Housing - Highest risk", + }, ), "NO_UNITS": FieldMetadata( description="Number of units in property", domain="Property Info", data_type="TINYINT", - business_context="Property unit count affects classification" + business_context="Property unit count affects classification", ), "OCC_STAT": FieldMetadata( description="Occupancy status code", @@ -428,30 +415,29 @@ class FieldMetadata: values={ "P": "Primary residence - Lowest risk", "S": "Second home", - "I": "Investment property - Highest risk" - } + "I": "Investment property - Highest risk", + }, ), "STATE": FieldMetadata( description="State code", domain="Property Info", data_type="VARCHAR", - business_context="State location for geographic risk assessment" + business_context="State location for geographic risk assessment", ), "MSA": FieldMetadata( description="Metropolitan Statistical Area code", domain="Property Info", data_type="VARCHAR", - business_context="MSA classification for market analysis" + business_context="MSA classification for market analysis", ), "ZIP": FieldMetadata( description="Property ZIP code (3 digits)", domain="Property Info", data_type="VARCHAR", - business_context="Geographic location (privacy protected to 3 digits)" - ) - } + business_context="Geographic location (privacy protected to 3 digits)", + ), + }, }, - "LTV_RATIOS": { "domain_description": "Loan-to-value ratios and equity position metrics", "fields": { @@ -465,24 +451,23 @@ class FieldMetadata: "≀80%": "Conservative equity", "81-90%": "Standard equity", "91-95%": "Minimal equity", - ">95%": "High risk - very limited equity" - } + ">95%": "High risk - very limited equity", + }, ), "OCLTV": FieldMetadata( description="Original combined LTV (%)", domain="LTV Ratios", data_type="FLOAT", - business_context="Combined LTV including subordinate liens" + business_context="Combined LTV including subordinate liens", ), "MI_PCT": FieldMetadata( description="Mortgage insurance percentage (%)", domain="LTV Ratios", data_type="FLOAT", - business_context="MI coverage percentage for high LTV loans" - ) - } + business_context="MI coverage percentage for high LTV loans", + ), + }, }, - "PAYMENT_STATUS": { "domain_description": "Payment performance and delinquency status tracking", "fields": { @@ -495,42 +480,41 @@ class FieldMetadata: "00": "Current - No missed payments", "01": "30-59 Days - First delinquency", "02": "60-89 Days - Serious delinquency", - "03": "90+ Days - Severe delinquency" - } + "03": "90+ Days - Severe delinquency", + }, ), "PMT_HISTORY": FieldMetadata( description="Payment history string", domain="Payment Status", data_type="VARCHAR", - business_context="Coded payment performance history" + business_context="Coded payment performance history", ), "MOD_FLAG": FieldMetadata( description="Modification flag", domain="Payment Status", data_type="VARCHAR", - business_context="Loan modification indicator" + business_context="Loan modification indicator", ), "DELINQUENT_ACCRUED_INTEREST": FieldMetadata( description="Delinquent accrued interest amount ($)", domain="Payment Status", data_type="DOUBLE", - business_context="Unpaid interest accumulation" + business_context="Unpaid interest accumulation", ), "FORBEARANCE_INDICATOR": FieldMetadata( description="Forbearance indicator", domain="Payment Status", data_type="VARCHAR", - business_context="Temporary payment relief status" + 
business_context="Temporary payment relief status", ), "PAYMENT_DEFERRAL_MOD_EVENT_FLAG": FieldMetadata( description="Payment deferral modification event flag", domain="Payment Status", data_type="VARCHAR", - business_context="Payment deferral modification indicator" - ) - } + business_context="Payment deferral modification indicator", + ), + }, }, - "PRINCIPAL_PAYMENTS": { "domain_description": "Scheduled and unscheduled principal payment tracking", "fields": { @@ -538,29 +522,28 @@ class FieldMetadata: description="Current scheduled principal ($)", domain="Principal Payments", data_type="DOUBLE", - business_context="Current month scheduled principal payment" + business_context="Current month scheduled principal payment", ), "TOT_SCHD_PRNCPL": FieldMetadata( description="Total scheduled principal ($)", domain="Principal Payments", data_type="DOUBLE", - business_context="Cumulative scheduled principal payments" + business_context="Cumulative scheduled principal payments", ), "UNSCHD_PRNCPL_CURR": FieldMetadata( description="Current unscheduled principal ($)", domain="Principal Payments", data_type="DOUBLE", - business_context="Extra principal payments this period" + business_context="Extra principal payments this period", ), "PRINCIPAL_FORGIVENESS_AMOUNT": FieldMetadata( description="Principal forgiveness amount ($)", domain="Principal Payments", data_type="DOUBLE", - business_context="Principal amount forgiven in modification" - ) - } + business_context="Principal amount forgiven in modification", + ), + }, }, - "FORECLOSURE_COSTS": { "domain_description": "All foreclosure-related costs and expenses", "fields": { @@ -568,41 +551,40 @@ class FieldMetadata: description="Total foreclosure costs ($)", domain="Foreclosure Costs", data_type="DOUBLE", - business_context="Direct foreclosure legal and processing costs" + business_context="Direct foreclosure legal and processing costs", ), "PROPERTY_PRESERVATION_AND_REPAIR_COSTS": FieldMetadata( description="Property preservation and repair costs ($)", domain="Foreclosure Costs", data_type="DOUBLE", - business_context="Maintenance and repair costs for REO properties" + business_context="Maintenance and repair costs for REO properties", ), "ASSET_RECOVERY_COSTS": FieldMetadata( description="Asset recovery costs ($)", domain="Foreclosure Costs", data_type="DOUBLE", - business_context="Costs to recover and sell REO property" + business_context="Costs to recover and sell REO property", ), "MISCELLANEOUS_HOLDING_EXPENSES_AND_CREDITS": FieldMetadata( description="Miscellaneous holding expenses and credits ($)", domain="Foreclosure Costs", data_type="DOUBLE", - business_context="Other holding costs and credits" + business_context="Other holding costs and credits", ), "ASSOCIATED_TAXES_FOR_HOLDING_PROPERTY": FieldMetadata( description="Associated taxes for holding property ($)", domain="Foreclosure Costs", data_type="DOUBLE", - business_context="Property taxes while REO" + business_context="Property taxes while REO", ), "FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT": FieldMetadata( description="Foreclosure principal write-off amount ($)", domain="Foreclosure Costs", data_type="DOUBLE", - business_context="Principal amount written off in foreclosure" - ) - } + business_context="Principal amount written off in foreclosure", + ), + }, }, - "PROPERTY_DISPOSITION": { "domain_description": "REO property listing, sale, and disposition information", "fields": { @@ -610,47 +592,46 @@ class FieldMetadata: description="Original REO listing price ($)", domain="Property 
Disposition", data_type="DOUBLE", - business_context="Initial REO property listing price" + business_context="Initial REO property listing price", ), "CURRENT_LIST_PRICE": FieldMetadata( description="Current REO listing price ($)", domain="Property Disposition", data_type="DOUBLE", - business_context="Current REO property listing price" + business_context="Current REO property listing price", ), "NET_SALES_PROCEEDS": FieldMetadata( description="Net sales proceeds ($)", domain="Property Disposition", data_type="DOUBLE", - business_context="Net proceeds after costs from REO sale" + business_context="Net proceeds after costs from REO sale", ), "CREDIT_ENHANCEMENT_PROCEEDS": FieldMetadata( description="Credit enhancement proceeds ($)", domain="Property Disposition", data_type="DOUBLE", - business_context="Proceeds from credit enhancement" + business_context="Proceeds from credit enhancement", ), "REPURCHASES_MAKE_WHOLE_PROCEEDS": FieldMetadata( description="Repurchase make-whole proceeds ($)", domain="Property Disposition", data_type="DOUBLE", - business_context="Make-whole payments from repurchases" + business_context="Make-whole payments from repurchases", ), "OTHER_FORECLOSURE_PROCEEDS": FieldMetadata( description="Other foreclosure proceeds ($)", domain="Property Disposition", data_type="DOUBLE", - business_context="Other recovery proceeds from foreclosure" + business_context="Other recovery proceeds from foreclosure", ), "RE_PROCS_FLAG": FieldMetadata( description="REO proceeds flag", domain="Property Disposition", data_type="VARCHAR", - business_context="REO proceeds processing indicator" - ) - } + business_context="REO proceeds processing indicator", + ), + }, }, - "MORTGAGE_INSURANCE": { "domain_description": "Mortgage insurance coverage and cancellation tracking", "fields": { @@ -659,21 +640,16 @@ class FieldMetadata: domain="Mortgage Insurance", data_type="VARCHAR", business_context="Type of mortgage insurance coverage", - values={ - "1": "Borrower-paid monthly MI", - "2": "Lender-paid MI", - "3": "Split premium MI" - } + values={"1": "Borrower-paid monthly MI", "2": "Lender-paid MI", "3": "Split premium MI"}, ), "MI_CANCEL_FLAG": FieldMetadata( description="MI cancellation flag", domain="Mortgage Insurance", data_type="VARCHAR", - business_context="Mortgage insurance cancellation status" - ) - } + business_context="Mortgage insurance cancellation status", + ), + }, }, - "MODIFICATIONS_LOSSES": { "domain_description": "Loan modifications and realized loss amounts", "fields": { @@ -681,29 +657,28 @@ class FieldMetadata: description="Current period modification loss amount ($)", domain="Modifications & Losses", data_type="DOUBLE", - business_context="Modification loss recognized this period" + business_context="Modification loss recognized this period", ), "CUMULATIVE_MODIFICATION_LOSS_AMOUNT": FieldMetadata( description="Cumulative modification loss amount ($)", domain="Modifications & Losses", data_type="DOUBLE", - business_context="Total modification losses to date" + business_context="Total modification losses to date", ), "CURRENT_PERIOD_CREDIT_EVENT_NET_GAIN_OR_LOSS": FieldMetadata( description="Current period credit event net gain/loss ($)", domain="Modifications & Losses", data_type="DOUBLE", - business_context="Net credit event impact this period" + business_context="Net credit event impact this period", ), "CUMULATIVE_CREDIT_EVENT_NET_GAIN_OR_LOSS": FieldMetadata( description="Cumulative credit event net gain/loss ($)", domain="Modifications & Losses", data_type="DOUBLE", - 
business_context="Cumulative credit event net impact" - ) - } + business_context="Cumulative credit event net impact", + ), + }, }, - "ARM_FEATURES": { "domain_description": "Adjustable Rate Mortgage specific characteristics", "fields": { @@ -711,65 +686,64 @@ class FieldMetadata: description="ARM 5-year indicator", domain="ARM Features", data_type="VARCHAR", - business_context="Indicates 5-year ARM product" + business_context="Indicates 5-year ARM product", ), "ARM_PRODUCT_TYPE": FieldMetadata( description="ARM product type", domain="ARM Features", data_type="VARCHAR", - business_context="Specific ARM product classification" + business_context="Specific ARM product classification", ), "MONTHS_UNTIL_FIRST_PAYMENT_RESET": FieldMetadata( description="Months until first payment reset", domain="ARM Features", data_type="SMALLINT", - business_context="Months until first ARM payment adjustment" + business_context="Months until first ARM payment adjustment", ), "MONTHS_BETWEEN_SUBSEQUENT_PAYMENT_RESET": FieldMetadata( description="Months between subsequent payment resets", domain="ARM Features", data_type="SMALLINT", - business_context="Months between subsequent ARM adjustments" + business_context="Months between subsequent ARM adjustments", ), "ARM_INDEX": FieldMetadata( description="ARM index type", domain="ARM Features", data_type="VARCHAR", - business_context="Reference rate index for ARM adjustments" + business_context="Reference rate index for ARM adjustments", ), "ARM_CAP_STRUCTURE": FieldMetadata( description="ARM cap structure", domain="ARM Features", data_type="VARCHAR", - business_context="ARM rate cap structure definition" + business_context="ARM rate cap structure definition", ), "INITIAL_INTEREST_RATE_CAP": FieldMetadata( description="Initial interest rate cap (%)", domain="ARM Features", data_type="FLOAT", - business_context="Initial rate adjustment cap limit" + business_context="Initial rate adjustment cap limit", ), "PERIODIC_INTEREST_RATE_CAP": FieldMetadata( description="Periodic interest rate cap (%)", domain="ARM Features", data_type="FLOAT", - business_context="Periodic rate adjustment cap limit" + business_context="Periodic rate adjustment cap limit", ), "LIFETIME_INTEREST_RATE_CAP": FieldMetadata( description="Lifetime interest rate cap (%)", domain="ARM Features", data_type="FLOAT", - business_context="Maximum rate increase over loan life" + business_context="Maximum rate increase over loan life", ), "MARGIN": FieldMetadata( description="ARM margin (%)", domain="ARM Features", data_type="FLOAT", - business_context="Fixed margin added to ARM index rate" - ) - } + business_context="Fixed margin added to ARM index rate", + ), + }, }, - "SPECIAL_INDICATORS": { "domain_description": "Special programs and servicing indicators", "fields": { @@ -777,43 +751,43 @@ class FieldMetadata: description="Servicing indicator", domain="Special Indicators", data_type="VARCHAR", - business_context="Special servicing status or programs" + business_context="Special servicing status or programs", ), "HOMEREADY_PROGRAM_INDICATOR": FieldMetadata( description="HomeReady program indicator", domain="Special Indicators", data_type="VARCHAR", - business_context="Affordable housing program participation" + business_context="Affordable housing program participation", ), "RELOCATION_MORTGAGE_INDICATOR": FieldMetadata( description="Relocation mortgage indicator", domain="Special Indicators", data_type="VARCHAR", - business_context="Employee relocation benefit program" + business_context="Employee 
relocation benefit program", ), "LOAN_HOLDBACK_INDICATOR": FieldMetadata( description="Loan holdback indicator", domain="Special Indicators", data_type="VARCHAR", - business_context="Loan holdback status indicator" + business_context="Loan holdback status indicator", ), "PROPERTY_INSPECTION_WAIVER_INDICATOR": FieldMetadata( description="Property inspection waiver indicator", domain="Special Indicators", data_type="VARCHAR", - business_context="Property inspection waiver status" + business_context="Property inspection waiver status", ), "HIGH_BALANCE_LOAN_INDICATOR": FieldMetadata( description="High balance loan indicator", domain="Special Indicators", data_type="VARCHAR", - business_context="High-cost area conforming loan designation" + business_context="High-cost area conforming loan designation", ), "HIGH_LOAN_TO_VALUE_HLTV_REFINANCE_OPTION_INDICATOR": FieldMetadata( description="High LTV refinance option indicator", domain="Special Indicators", data_type="VARCHAR", - business_context="HLTV refinance program indicator" + business_context="HLTV refinance program indicator", ), "Zero_Bal_Code": FieldMetadata( description="Zero balance code", @@ -824,29 +798,29 @@ class FieldMetadata: "01": "Prepayment - Normal payoff", "02": "Third party sale - Foreclosure auction", "03": "Short sale - Negotiated sale", - "09": "REO disposition - Bank-owned sale" - } + "09": "REO disposition - Bank-owned sale", + }, ), "ADR_TYPE": FieldMetadata( description="Alternative disposition type", domain="Special Indicators", data_type="VARCHAR", - business_context="Alternative disposition method" + business_context="Alternative disposition method", ), "ADR_COUNT": FieldMetadata( description="Alternative disposition count", domain="Special Indicators", data_type="SMALLINT", - business_context="Count of alternative dispositions" + business_context="Count of alternative dispositions", ), "ADR_UPB": FieldMetadata( description="Alternative disposition UPB ($)", domain="Special Indicators", data_type="DOUBLE", - business_context="UPB for alternative disposition" - ) - } - } + business_context="UPB for alternative disposition", + ), + }, + }, } @@ -860,44 +834,40 @@ class FieldMetadata: "coverage": "56.8+ million loans worth $12.4+ trillion original UPB", "vintage_range": "1999-2025 loan originations", "geographic_scope": "All 50 states plus District of Columbia", - "update_frequency": "Monthly performance data through March 2025" + "update_frequency": "Monthly performance data through March 2025", }, - "performance_summary": { "lifetime_loss_rate": "0.3% of original UPB", "current_performance": "~98% of active loans current on payments", "credit_quality": "Average FICO 762 (borrower), 758 (co-borrower)", "leverage_metrics": "Average LTV 71.8%, Average DTI 34.5%", - "seasoning_impact": "Default risk peaks at 12-60 months loan age" + "seasoning_impact": "Default risk peaks at 12-60 months loan age", }, - "risk_framework": { "credit_triangle": "CSCORE_B (Credit) + OLTV (Collateral) + DTI (Capacity)", "risk_tiers": { "super_prime": "FICO 740+ AND LTV ≀80% AND DTI ≀36%", "prime": "FICO 680-739 with compensating factors", "alt_a": "One degraded factor with compensating strengths", - "high_risk": "Multiple degraded factors (rare post-2008)" - } + "high_risk": "Multiple degraded factors (rare post-2008)", + }, }, - "analytical_dimensions": [ "Vintage cohort analysis by ORIG_DATE", "Geographic concentration by STATE (state-level risk)", "Credit migration tracking via PMT_HISTORY", "Loss severity prediction by Zero_Bal_Code", 
"Prepayment modeling using rate differential", - "ARM performance during rate cycles" + "ARM performance during rate cycles", ], - "key_relationships": { "credit_collateral": "CSCORE_B inversely correlated with OLTV", "capacity_leverage": "DTI directly impacts delinquency probability", "geography_performance": "STATE concentration affects portfolio risk", "vintage_cycles": "ORIG_DATE correlates with economic cycle", "seasoning_curves": "Default probability varies by LOAN_AGE", - "rate_sensitivity": "ARM products sensitive to index movements" - } + "rate_sensitivity": "ARM products sensitive to index movements", + }, } @@ -951,11 +921,11 @@ def generate_enhanced_schema_context(parquet_files): for domain_name, domain_info in LOAN_ONTOLOGY.items(): domain_columns = [] for _, row in schema_df.iterrows(): - column_name = row['column_name'] - column_type = row['column_type'] + column_name = row["column_name"] + column_type = row["column_type"] - if column_name in domain_info['fields']: - field_meta = domain_info['fields'][column_name] + if column_name in domain_info["fields"]: + field_meta = domain_info["fields"][column_name] comment = f"-- {field_meta.description} | {field_meta.domain}" if field_meta.risk_impact: comment += f" | Risk: {field_meta.risk_impact}" @@ -968,18 +938,20 @@ def generate_enhanced_schema_context(parquet_files): # Add any unmapped columns for _, row in schema_df.iterrows(): - column_name = row['column_name'] - column_type = row['column_type'] + column_name = row["column_name"] + column_type = row["column_type"] # Check if column exists in any domain found = False for domain_info in LOAN_ONTOLOGY.values(): - if column_name in domain_info['fields']: + if column_name in domain_info["fields"]: found = True break if not found: - unmapped_columns.append(f" {column_name} {column_type} -- {column_name.replace('_', ' ').title()}") + unmapped_columns.append( + f" {column_name} {column_type} -- {column_name.replace('_', ' ').title()}" + ) if unmapped_columns: organized_columns.append(" -- OTHER FIELDS:") @@ -1026,15 +998,16 @@ def generate_enhanced_schema_context(parquet_files): def get_field_context(field_name): """Get context for a specific field (legacy compatibility).""" for domain in LOAN_ONTOLOGY.values(): - if field_name in domain['fields']: - return domain['fields'][field_name].__dict__ + if field_name in domain["fields"]: + return domain["fields"][field_name].__dict__ return {} + def get_analysis_suggestions(field_names): """Get analysis suggestions based on field names (legacy compatibility).""" suggestions = [] for field in field_names: field_info = get_field_context(field) - if field_info and 'business_context' in field_info: + if field_info and "business_context" in field_info: suggestions.append(f"{field}: {field_info['business_context']}") - return suggestions[:10] # Limit to 10 suggestions \ No newline at end of file + return suggestions[:10] # Limit to 10 suggestions diff --git a/src/simple_auth.py b/src/simple_auth.py index ffe7f2b..7144a7f 100644 --- a/src/simple_auth.py +++ b/src/simple_auth.py @@ -4,48 +4,57 @@ Clean implementation without external database dependencies. 
""" -import streamlit as st -import requests import os -import time -import json -import base64 -import hashlib import secrets -from typing import Dict, Optional, Any -from urllib.parse import urlencode, parse_qs, urlparse +import time +from typing import Any, Dict, Optional +from urllib.parse import urlencode + +import requests +import streamlit as st from dotenv import load_dotenv + from .d1_logger import get_d1_logger # Load environment variables load_dotenv() # Google OAuth Configuration -GOOGLE_CLIENT_ID = os.getenv('GOOGLE_CLIENT_ID') -GOOGLE_CLIENT_SECRET = os.getenv('GOOGLE_CLIENT_SECRET') -ENABLE_AUTH = os.getenv('ENABLE_AUTH', 'true').lower() == 'true' -DEMO_MODE = os.getenv('DEMO_MODE', 'false').lower() == 'true' +GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID") +GOOGLE_CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET") +ENABLE_AUTH = os.getenv("ENABLE_AUTH", "true").lower() == "true" +DEMO_MODE = os.getenv("DEMO_MODE", "false").lower() == "true" # OAuth URLs GOOGLE_AUTH_URL = "https://accounts.google.com/o/oauth2/v2/auth" GOOGLE_TOKEN_URL = "https://oauth2.googleapis.com/token" GOOGLE_USERINFO_URL = "https://www.googleapis.com/oauth2/v2/userinfo" + def get_current_url() -> str: """Get the current app URL for OAuth redirects.""" try: - if hasattr(st, 'context') and hasattr(st.context, 'headers'): + if hasattr(st, "context") and hasattr(st.context, "headers"): headers = st.context.headers - if 'host' in headers: - host = headers['host'] - protocol = 'https' if '.streamlit.app' in host or '.repl.co' in host or '.replit.dev' in host or '.replit.app' in host or 'ravishankars.com' in host else 'http' - return f'{protocol}://{host}' - except: + if "host" in headers: + host = headers["host"] + protocol = ( + "https" + if ".streamlit.app" in host + or ".repl.co" in host + or ".replit.dev" in host + or ".replit.app" in host + or "ravishankars.com" in host + else "http" + ) + return f"{protocol}://{host}" + except Exception: pass # Fallback to localhost - port = os.getenv('STREAMLIT_SERVER_PORT', '8501') - return f'http://localhost:{port}' + port = os.getenv("STREAMLIT_SERVER_PORT", "8501") + return f"http://localhost:{port}" + def generate_auth_url() -> str: """Generate Google OAuth authorization URL.""" @@ -63,13 +72,13 @@ def generate_auth_url() -> str: redirect_uri = get_current_url() params = { - 'client_id': GOOGLE_CLIENT_ID, - 'redirect_uri': redirect_uri, - 'scope': 'openid email profile', - 'response_type': 'code', - 'state': state, - 'access_type': 'offline', - 'prompt': 'select_account' + "client_id": GOOGLE_CLIENT_ID, + "redirect_uri": redirect_uri, + "scope": "openid email profile", + "response_type": "code", + "state": state, + "access_type": "offline", + "prompt": "select_account", } if DEMO_MODE: @@ -85,6 +94,7 @@ def generate_auth_url() -> str: return auth_url + def exchange_code_for_token(code: str, state: str) -> Optional[Dict[str, Any]]: """Exchange authorization code for access token.""" if not GOOGLE_CLIENT_ID or not GOOGLE_CLIENT_SECRET: @@ -93,9 +103,11 @@ def exchange_code_for_token(code: str, state: str) -> Optional[Dict[str, Any]]: return None # Verify state to prevent CSRF attacks - stored_state = st.session_state.get('oauth_state') + stored_state = st.session_state.get("oauth_state") if DEMO_MODE: - st.info(f"πŸ”’ State verification: Received={state[:10]}..., Stored={stored_state[:10] if stored_state else 'None'}...") + st.info( + f"πŸ”’ State verification: Received={state[:10]}..., Stored={stored_state[:10] if stored_state else 'None'}..." 
+ ) # More lenient state verification - session state can be cleared during redirects if state != stored_state: @@ -113,11 +125,11 @@ def exchange_code_for_token(code: str, state: str) -> Optional[Dict[str, Any]]: redirect_uri = get_current_url() data = { - 'client_id': GOOGLE_CLIENT_ID, - 'client_secret': GOOGLE_CLIENT_SECRET, - 'code': code, - 'grant_type': 'authorization_code', - 'redirect_uri': redirect_uri + "client_id": GOOGLE_CLIENT_ID, + "client_secret": GOOGLE_CLIENT_SECRET, + "code": code, + "grant_type": "authorization_code", + "redirect_uri": redirect_uri, } try: @@ -133,9 +145,10 @@ def exchange_code_for_token(code: str, state: str) -> Optional[Dict[str, Any]]: st.error(f"❌ Token exchange error: {str(e)}") return None + def get_user_info(access_token: str) -> Optional[Dict[str, Any]]: """Get user information from Google.""" - headers = {'Authorization': f'Bearer {access_token}'} + headers = {"Authorization": f"Bearer {access_token}"} try: response = requests.get(GOOGLE_USERINFO_URL, headers=headers) @@ -150,6 +163,7 @@ def get_user_info(access_token: str) -> Optional[Dict[str, Any]]: st.error(f"❌ User info error: {str(e)}") return None + def handle_oauth_callback(): """Handle OAuth callback and authenticate user.""" if not ENABLE_AUTH: @@ -158,39 +172,43 @@ def handle_oauth_callback(): # Get URL parameters query_params = dict(st.query_params) - if 'code' in query_params and 'state' in query_params: + if "code" in query_params and "state" in query_params: if DEMO_MODE: - st.info(f"πŸ”— Processing OAuth callback") + st.info("πŸ”— Processing OAuth callback") - code = query_params['code'] - state = query_params['state'] + code = query_params["code"] + state = query_params["state"] # Exchange code for token token_data = exchange_code_for_token(code, state) - if not token_data or 'access_token' not in token_data: + if not token_data or "access_token" not in token_data: st.error("❌ Authentication failed. Please try again.") return # Get user information - user_info = get_user_info(token_data['access_token']) + user_info = get_user_info(token_data["access_token"]) if not user_info: st.error("❌ Failed to get user information. 
Please try again.") return # Store user in session st.session_state.user = { - 'id': user_info['id'], - 'email': user_info['email'], - 'name': user_info.get('name', 'Unknown'), - 'picture': user_info.get('picture'), - 'authenticated_at': time.time() + "id": user_info["id"], + "email": user_info["email"], + "name": user_info.get("name", "Unknown"), + "picture": user_info.get("picture"), + "authenticated_at": time.time(), } # Log login to D1 d1_logger = get_d1_logger() if d1_logger.is_enabled(): - user_agent = st.context.headers.get('user-agent', '') if hasattr(st, 'context') and hasattr(st.context, 'headers') else '' - d1_logger.log_user_login(user_info['id'], user_info['email'], user_agent) + user_agent = ( + st.context.headers.get("user-agent", "") + if hasattr(st, "context") and hasattr(st.context, "headers") + else "" + ) + d1_logger.log_user_login(user_info["id"], user_info["email"], user_agent) if DEMO_MODE: st.success(f"βœ… Authenticated: {user_info['email']}") @@ -199,8 +217,8 @@ def handle_oauth_callback(): st.query_params.clear() st.rerun() - elif 'error' in query_params: - error = query_params.get('error', 'Unknown error') + elif "error" in query_params: + error = query_params.get("error", "Unknown error") if DEMO_MODE: st.error(f"❌ OAuth error: {error}") else: @@ -209,6 +227,7 @@ def handle_oauth_callback(): # Clear URL parameters st.query_params.clear() + class SimpleAuth: """Simple authentication service using Google OAuth.""" @@ -223,13 +242,13 @@ def is_authenticated(self) -> bool: """Check if user is authenticated.""" if not self.enabled: return True # Skip auth if disabled - return 'user' in st.session_state and st.session_state.user is not None + return "user" in st.session_state and st.session_state.user is not None def get_current_user(self) -> Optional[Dict[str, Any]]: """Get current authenticated user.""" if not self.is_authenticated(): return None - return st.session_state.get('user') + return st.session_state.get("user") def get_auth_url(self) -> str: """Get Google OAuth URL.""" @@ -247,15 +266,15 @@ def log_query(self, question: str, sql_query: str, provider: str, execution_time return # Store in session for immediate access - if 'query_history' not in st.session_state: + if "query_history" not in st.session_state: st.session_state.query_history = [] query_log = { - 'question': question, - 'sql_query': sql_query, - 'ai_provider': provider, - 'execution_time': execution_time, - 'timestamp': time.time() + "question": question, + "sql_query": sql_query, + "ai_provider": provider, + "execution_time": execution_time, + "timestamp": time.time(), } st.session_state.query_history.insert(0, query_log) @@ -265,47 +284,42 @@ def log_query(self, question: str, sql_query: str, provider: str, execution_time # Log to D1 database d1_logger = get_d1_logger() if d1_logger.is_enabled(): - d1_logger.log_user_query( - user['id'], - user['email'], - question, - sql_query, - provider, - execution_time - ) + d1_logger.log_user_query(user["id"], user["email"], question, sql_query, provider, execution_time) def get_user_query_history(self, limit: int = 10): """Get user's query history from session.""" if not self.is_authenticated(): return [] - return st.session_state.get('query_history', [])[:limit] + return st.session_state.get("query_history", [])[:limit] def sign_out(self): """Sign out current user.""" - if 'user' in st.session_state: + if "user" in st.session_state: del st.session_state.user # Clear other session data keys_to_clear = [ - 'oauth_state', - 'generated_sql', - 
'bedrock_error', - 'user_question', - 'show_edit_sql', - 'query_history' + "oauth_state", + "generated_sql", + "bedrock_error", + "user_question", + "show_edit_sql", + "query_history", ] for key in keys_to_clear: if key in st.session_state: del st.session_state[key] + # Global auth service instance _auth_service = None + def get_auth_service() -> SimpleAuth: """Get or create global auth service instance.""" global _auth_service if _auth_service is None: _auth_service = SimpleAuth() - return _auth_service \ No newline at end of file + return _auth_service diff --git a/src/simple_auth_components.py b/src/simple_auth_components.py index a3e1689..ae03d1d 100644 --- a/src/simple_auth_components.py +++ b/src/simple_auth_components.py @@ -4,11 +4,14 @@ Clean UI components for Google OAuth login and user management. """ -import streamlit as st -import time import os +import time + +import streamlit as st + from .simple_auth import get_auth_service, handle_oauth_callback + def render_login_page(): """Render the login page with Google OAuth.""" auth = get_auth_service() @@ -18,7 +21,8 @@ def render_login_page(): with col2: # Professional login header - st.markdown(""" + st.markdown( + """

🏠 Single Family Loan Analytics @@ -27,11 +31,14 @@ def render_login_page(): AI-Powered Loan Portfolio Intelligence

- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # Professional login card with st.container(): - st.markdown(""" + st.markdown( + """
@@ -42,7 +49,9 @@ def render_login_page(): Sign in with your Google account to access the loan analytics platform.

- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) st.markdown("
", unsafe_allow_html=True) @@ -51,30 +60,36 @@ def render_login_page(): auth_url = auth.get_auth_url() if auth_url: - DEMO_MODE = os.getenv('DEMO_MODE', 'false').lower() == 'true' + DEMO_MODE = os.getenv("DEMO_MODE", "false").lower() == "true" if DEMO_MODE: st.info("πŸ”— Redirecting to Google OAuth...") st.code(auth_url) st.markdown(f"[Click here if not redirected automatically]({auth_url})") # Multiple redirect methods for better compatibility - st.markdown(f""" + st.markdown( + f""" - """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # Also provide a direct link as backup - st.markdown(f""" + st.markdown( + f""" - """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) with st.spinner("Redirecting to Google..."): time.sleep(2) @@ -85,7 +100,8 @@ def render_login_page(): # Info section with st.expander("ℹ️ About This Application", expanded=False): - st.markdown(""" + st.markdown( + """ **Single Family Loan Analytics Platform** provides comprehensive loan data analysis: **Core Features:** @@ -99,7 +115,8 @@ def render_login_page(): - Single Family Loan performance data (56.8M+ loans) - Real-time data sync from Cloudflare R2 storage - Comprehensive data dictionary with domain expertise - """) + """ + ) # Footer st.markdown("---") @@ -107,9 +124,10 @@ def render_login_page(): "
" "Powered by Streamlit and Google OAuth" "
", - unsafe_allow_html=True + unsafe_allow_html=True, ) + def render_user_menu(): """Render user menu in sidebar.""" auth = get_auth_service() @@ -122,30 +140,41 @@ def render_user_menu(): st.markdown("---") # User profile section - st.markdown(""" + st.markdown( + """

πŸ‘€ Profile

- """, unsafe_allow_html=True) + """, + unsafe_allow_html=True, + ) # User info - st.markdown(f"
πŸ‘€ {user.get('name', 'User')}
", unsafe_allow_html=True) - st.markdown(f"
πŸ“§ {user.get('email', '')}
", unsafe_allow_html=True) + st.markdown( + f"
πŸ‘€ {user.get('name', 'User')}
", + unsafe_allow_html=True, + ) + st.markdown( + f"
πŸ“§ {user.get('email', '')}
", + unsafe_allow_html=True, + ) # Show user picture if available - if user.get('picture'): - st.image(user['picture'], width=50) + if user.get("picture"): + st.image(user["picture"], width=50) # Sign out button if st.button("πŸšͺ Sign Out", type="primary", width="stretch"): auth.sign_out() st.rerun() + def simple_auth_wrapper(main_app_function): """ Simple auth wrapper for the entire Streamlit app. Usage: simple_auth_wrapper(main)() """ + def wrapper(): auth = get_auth_service() @@ -154,7 +183,7 @@ def wrapper(): # Skip auth if disabled if not auth.is_enabled(): - DEMO_MODE = os.getenv('DEMO_MODE', 'false').lower() == 'true' + DEMO_MODE = os.getenv("DEMO_MODE", "false").lower() == "true" if DEMO_MODE: st.info("πŸ”“ Authentication disabled. Running in open access mode.") return main_app_function() @@ -168,4 +197,4 @@ def wrapper(): render_user_menu() return main_app_function() - return wrapper \ No newline at end of file + return wrapper diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..88a9292 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +""" +Tests for converSQL +""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..de03f28 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,90 @@ +""" +Pytest fixtures and configuration for tests +""" + +from unittest.mock import MagicMock + +import pytest + + +@pytest.fixture +def mock_env_vars(monkeypatch): + """Mock environment variables for testing.""" + env_vars = { + "AI_PROVIDER": "claude", + "CLAUDE_API_KEY": "test-key", + "CLAUDE_MODEL": "claude-3-5-sonnet-20241022", + "AWS_DEFAULT_REGION": "us-west-2", + "BEDROCK_MODEL_ID": "anthropic.claude-3-5-haiku-20241022-v1:0", + "GOOGLE_API_KEY": "test-google-key", + "GEMINI_MODEL": "gemini-1.5-pro", + "PROCESSED_DATA_DIR": "data/processed/", + "ENABLE_PROMPT_CACHE": "false", + } + for key, value in env_vars.items(): + monkeypatch.setenv(key, value) + return env_vars + + +@pytest.fixture +def sample_schema(): + """Sample database schema for testing.""" + return """ + TABLE: data + Columns: + - LOAN_ID (VARCHAR): Unique loan identifier + - STATE (VARCHAR): State code + - CSCORE_B (INTEGER): Credit score + - OLTV (FLOAT): Original loan-to-value ratio + - DTI (FLOAT): Debt-to-income ratio + - ORIG_UPB (FLOAT): Original unpaid principal balance + - CURR_UPB (FLOAT): Current unpaid principal balance + """ + + +@pytest.fixture +def sample_question(): + """Sample user question for testing.""" + return "Show me loans in California with credit scores below 620" + + +@pytest.fixture +def sample_sql(): + """Sample SQL query for testing.""" + return """ + SELECT LOAN_ID, STATE, CSCORE_B, OLTV, DTI + FROM data + WHERE STATE = 'CA' + AND CSCORE_B < 620 + LIMIT 20 + """ + + +@pytest.fixture +def mock_boto3_client(): + """Mock boto3 client for Bedrock testing.""" + mock_client = MagicMock() + mock_response = {"body": MagicMock(), "ResponseMetadata": {"HTTPStatusCode": 200}} + mock_response["body"].read.return_value = b'{"content": [{"text": "SELECT * FROM data LIMIT 10"}]}' + mock_client.invoke_model.return_value = mock_response + return mock_client + + +@pytest.fixture +def mock_anthropic_client(): + """Mock Anthropic client for Claude testing.""" + mock_client = MagicMock() + mock_response = MagicMock() + mock_response.content = [MagicMock(text="SELECT * FROM data LIMIT 10")] + mock_client.messages.create.return_value = mock_response + return mock_client + + +@pytest.fixture +def mock_gemini_model(): + """Mock Gemini model for testing.""" 
+ mock_model = MagicMock() + mock_response = MagicMock() + mock_response.text = "SELECT * FROM data LIMIT 10" + mock_model.generate_content.return_value = mock_response + return mock_model diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..f449721 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1,3 @@ +""" +Integration test module initialization +""" diff --git a/tests/integration/test_adapters_integration.py b/tests/integration/test_adapters_integration.py new file mode 100644 index 0000000..a896dbe --- /dev/null +++ b/tests/integration/test_adapters_integration.py @@ -0,0 +1,101 @@ +""" +Integration tests for AI adapters +""" + +import pytest + +from src.ai_service import AIService + + +@pytest.mark.integration +class TestAdapterIntegration: + """Integration tests for AI adapters.""" + + @pytest.mark.skip(reason="Requires actual API credentials") + @pytest.mark.requires_api + def test_bedrock_real_api(self, sample_question, sample_schema): + """Test Bedrock adapter with real API (requires AWS credentials).""" + service = AIService() + adapter = service.adapters.get("bedrock") + + if adapter and adapter.is_available(): + prompt = f"{sample_schema}\n\n{sample_question}" + sql, error = adapter.generate_sql(prompt) + assert isinstance(sql, str) + assert isinstance(error, str) + if sql: + assert "SELECT" in sql.upper() + else: + pytest.skip("Bedrock adapter not available") + + @pytest.mark.skip(reason="Requires actual API credentials") + @pytest.mark.requires_api + def test_claude_real_api(self, sample_question, sample_schema): + """Test Claude adapter with real API (requires API key).""" + service = AIService() + adapter = service.adapters.get("claude") + + if adapter and adapter.is_available(): + prompt = f"{sample_schema}\n\n{sample_question}" + sql, error = adapter.generate_sql(prompt) + assert isinstance(sql, str) + assert isinstance(error, str) + if sql: + assert "SELECT" in sql.upper() + else: + pytest.skip("Claude adapter not available") + + @pytest.mark.skip(reason="Requires actual API credentials") + @pytest.mark.requires_api + def test_gemini_real_api(self, sample_question, sample_schema): + """Test Gemini adapter with real API (requires API key).""" + service = AIService() + adapter = service.adapters.get("gemini") + + if adapter and adapter.is_available(): + prompt = f"{sample_schema}\n\n{sample_question}" + sql, error = adapter.generate_sql(prompt) + assert isinstance(sql, str) + assert isinstance(error, str) + if sql: + assert "SELECT" in sql.upper() + else: + pytest.skip("Gemini adapter not available") + + def test_service_initialization(self): + """Test AIService initializes without errors.""" + service = AIService() + assert isinstance(service, AIService) + assert hasattr(service, "adapters") + assert len(service.adapters) > 0 + + def test_adapter_fallback_behavior(self): + """Test that service handles no available adapters gracefully.""" + service = AIService() + + # Get all available adapters + available_adapters = [name for name, adapter in service.adapters.items() if adapter.is_available()] + + # Test should pass even if no adapters are available + if available_adapters: + adapter = service.get_active_adapter() + assert adapter is not None + else: + adapter = service.get_active_adapter() + assert adapter is None + + def test_generate_sql_with_no_providers(self, sample_question, sample_schema): + """Test SQL generation when no providers are available.""" + service = AIService() + + # Generate SQL - 
should handle gracefully + sql, error, provider = service.generate_sql(sample_question, sample_schema) + + assert isinstance(sql, str) + assert isinstance(error, str) + assert isinstance(provider, str) + + # If no providers available, should have error message + if not service.is_available(): + assert len(error) > 0 + assert sql == "" diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..df7aede --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1,9 @@ +""" +Unit tests for __init__.py module +""" + + +def test_unit_init(): + """Test that unit test module can be imported.""" + # This is a simple test to ensure the module structure is correct + assert True diff --git a/tests/unit/test_adapters.py b/tests/unit/test_adapters.py new file mode 100644 index 0000000..d3365b0 --- /dev/null +++ b/tests/unit/test_adapters.py @@ -0,0 +1,172 @@ +""" +Unit tests for adapter implementations +Tests the actual behavior without complex mocking +""" + +import pytest + +from src.ai_engines.base import AIEngineAdapter +from src.ai_engines.bedrock_adapter import BedrockAdapter +from src.ai_engines.claude_adapter import ClaudeAdapter +from src.ai_engines.gemini_adapter import GeminiAdapter + + +class TestBaseAdapter: + """Test suite for AIEngineAdapter base class.""" + + def test_cannot_instantiate_abstract_class(self): + """Test that AIEngineAdapter cannot be instantiated directly.""" + with pytest.raises(TypeError): + AIEngineAdapter() + + def test_adapters_have_required_properties(self): + """Test that all adapters implement required properties.""" + adapters = [BedrockAdapter(), ClaudeAdapter(), GeminiAdapter()] + + for adapter in adapters: + assert hasattr(adapter, "name") + assert hasattr(adapter, "provider_id") + assert hasattr(adapter, "is_available") + assert hasattr(adapter, "generate_sql") + assert isinstance(adapter.name, str) + assert isinstance(adapter.provider_id, str) + assert isinstance(adapter.is_available(), bool) + + +class TestBedrockAdapter: + """Test suite for BedrockAdapter.""" + + def test_initialization(self): + """Test BedrockAdapter initialization.""" + adapter = BedrockAdapter() + assert adapter is not None + assert adapter.name == "Amazon Bedrock" + assert adapter.provider_id == "bedrock" + + def test_is_available_returns_bool(self): + """Test is_available returns boolean.""" + adapter = BedrockAdapter() + assert isinstance(adapter.is_available(), bool) + + def test_generate_sql_returns_tuple(self): + """Test generate_sql returns tuple when not available.""" + adapter = BedrockAdapter() + # If not configured, should return error tuple + if not adapter.is_available(): + sql, error = adapter.generate_sql("test prompt") + assert isinstance(sql, str) + assert isinstance(error, str) + assert sql == "" + assert len(error) > 0 + + def test_get_model_info(self): + """Test get_model_info returns dict.""" + adapter = BedrockAdapter() + info = adapter.get_model_info() + assert isinstance(info, dict) + assert "provider" in info + + +class TestClaudeAdapter: + """Test suite for ClaudeAdapter.""" + + def test_initialization(self): + """Test ClaudeAdapter initialization.""" + adapter = ClaudeAdapter() + assert adapter is not None + assert adapter.name == "Claude API" + assert adapter.provider_id == "claude" + + def test_is_available_returns_bool(self): + """Test is_available returns boolean.""" + adapter = ClaudeAdapter() + assert isinstance(adapter.is_available(), bool) + + def test_generate_sql_returns_tuple(self): + """Test generate_sql returns tuple 
when not available.""" + adapter = ClaudeAdapter() + # If not configured, should return error tuple + if not adapter.is_available(): + sql, error = adapter.generate_sql("test prompt") + assert isinstance(sql, str) + assert isinstance(error, str) + assert sql == "" + assert len(error) > 0 + + def test_get_model_info(self): + """Test get_model_info returns dict.""" + adapter = ClaudeAdapter() + info = adapter.get_model_info() + assert isinstance(info, dict) + assert "provider" in info + + +class TestGeminiAdapter: + """Test suite for GeminiAdapter.""" + + def test_initialization(self): + """Test GeminiAdapter initialization.""" + adapter = GeminiAdapter() + assert adapter is not None + assert adapter.name == "Google Gemini" + assert adapter.provider_id == "gemini" + + def test_is_available_returns_bool(self): + """Test is_available returns boolean.""" + adapter = GeminiAdapter() + assert isinstance(adapter.is_available(), bool) + + def test_generate_sql_returns_tuple(self): + """Test generate_sql returns tuple when not available.""" + adapter = GeminiAdapter() + # If not configured, should return error tuple + if not adapter.is_available(): + sql, error = adapter.generate_sql("test prompt") + assert isinstance(sql, str) + assert isinstance(error, str) + assert sql == "" + assert len(error) > 0 + + def test_get_model_info(self): + """Test get_model_info returns dict.""" + adapter = GeminiAdapter() + info = adapter.get_model_info() + assert isinstance(info, dict) + assert "provider" in info + + +class TestAdapterValidation: + """Test adapter validation methods.""" + + def test_validate_response(self): + """Test validate_response method.""" + adapter = BedrockAdapter() + + # Valid SQL + is_valid, msg = adapter.validate_response("SELECT * FROM table") + assert is_valid is True + + # Empty SQL + is_valid, msg = adapter.validate_response("") + assert is_valid is False + assert "empty" in msg.lower() + + # Not SQL + is_valid, msg = adapter.validate_response("This is not SQL") + assert is_valid is False + + def test_clean_sql_response(self): + """Test clean_sql_response method.""" + adapter = BedrockAdapter() + + # Clean SQL with markdown + sql = adapter.clean_sql_response("```sql\nSELECT * FROM table\n```") + assert sql == "SELECT * FROM table" + + # SQL with prefix + sql = adapter.clean_sql_response("Here's the SQL query: SELECT * FROM table") + assert sql == "SELECT * FROM table" + + # Plain SQL + sql = adapter.clean_sql_response("SELECT * FROM table") + assert sql == "SELECT * FROM table" diff --git a/tests/unit/test_ai_service_simple.py b/tests/unit/test_ai_service_simple.py new file mode 100644 index 0000000..de138d1 --- /dev/null +++ b/tests/unit/test_ai_service_simple.py @@ -0,0 +1,89 @@ +""" +Unit tests for AIService +""" + +from src.ai_service import AIService, generate_sql_with_ai, get_ai_service, initialize_ai_client + + +class TestAIService: + """Test suite for AIService class.""" + + def test_initialization(self): + """Test AIService initialization creates all adapters.""" + service = AIService() + + assert "bedrock" in service.adapters + assert "claude" in service.adapters + assert "gemini" in service.adapters + + def test_is_available(self): + """Test is_available returns boolean.""" + service = AIService() + assert isinstance(service.is_available(), bool) + + def test_get_active_provider(self): + """Test get_active_provider returns string or None.""" + service = AIService() + provider = service.get_active_provider() + assert provider is None or isinstance(provider, str) + + def 
test_get_provider_status(self): + """Test get_provider_status returns dict.""" + service = AIService() + status = service.get_provider_status() + + assert isinstance(status, dict) + assert "active" in status + assert "bedrock" in status + assert "claude" in status + assert "gemini" in status + + def test_generate_sql_returns_tuple(self, sample_question, sample_schema): + """Test generate_sql returns tuple.""" + service = AIService() + result = service.generate_sql(sample_question, sample_schema) + + assert isinstance(result, tuple) + assert len(result) == 3 + sql, error, provider = result + assert isinstance(sql, str) + assert isinstance(error, str) + assert isinstance(provider, str) + + +class TestGlobalFunctions: + """Test suite for global convenience functions.""" + + def test_get_ai_service_returns_service(self): + """Test that get_ai_service returns AIService instance.""" + service = get_ai_service() + assert isinstance(service, AIService) + + def test_get_ai_service_cached(self): + """Test that get_ai_service returns same instance.""" + service1 = get_ai_service() + service2 = get_ai_service() + # Due to Streamlit caching, should be same instance + assert isinstance(service1, AIService) + assert isinstance(service2, AIService) + + def test_initialize_ai_client(self): + """Test initialize_ai_client returns tuple.""" + result = initialize_ai_client() + assert isinstance(result, tuple) + assert len(result) == 2 + + service, provider = result + # Service can be None or AIService + assert service is None or isinstance(service, AIService) + assert isinstance(provider, str) + + def test_generate_sql_with_ai(self, sample_question, sample_schema): + """Test generate_sql_with_ai convenience function.""" + result = generate_sql_with_ai(sample_question, sample_schema) + + assert isinstance(result, tuple) + assert len(result) == 2 + sql, error = result + assert isinstance(sql, str) + assert isinstance(error, str) From d58780ffa945556aa073be2ec76dddf034f24b94 Mon Sep 17 00:00:00 2001 From: Ravishankar Sivasubramaniam Date: Wed, 1 Oct 2025 18:51:11 -0500 Subject: [PATCH 3/4] Update README.md Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fba2b4b..9164a40 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ ORDER BY CSCORE_B ASC, OLTV DESC LIMIT 20 ``` -πŸ“Š **Instant Results** β€” with context-aware risk metrics and portfolio insights.L +πŸ“Š **Instant Results** β€” with context-aware risk metrics and portfolio insights. 
# converSQL From e18e8c2195161600ceba35b241ede0aef042e1c4 Mon Sep 17 00:00:00 2001 From: Ravishankar Sivasubramaniam Date: Wed, 1 Oct 2025 18:51:57 -0500 Subject: [PATCH 4/4] fix: fixed github actions --- .github/workflows/ci.yml | 32 +++++++++++----- .github/workflows/format-code.yml | 61 +++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/format-code.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5b8e3ac..17340c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,10 @@ on: pull_request: branches: [ main, enhance-pipeline ] +permissions: + contents: write + pull-requests: write + jobs: lint: name: Code Quality Checks @@ -105,12 +109,15 @@ jobs: format: name: Auto-format Code runs-on: ubuntu-latest - if: github.event_name == 'push' + if: github.event_name == 'push' && github.ref != 'refs/heads/main' + permissions: + contents: write steps: - uses: actions/checkout@v4 with: token: ${{ secrets.GITHUB_TOKEN }} + fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v5 @@ -128,15 +135,20 @@ jobs: - name: Sort imports with isort run: isort --profile black src/ tests/ - - name: Commit changes + - name: Check for changes + id: verify-changed-files + run: | + if [ -n "$(git status --porcelain)" ]; then + echo "changed=true" >> $GITHUB_OUTPUT + else + echo "changed=false" >> $GITHUB_OUTPUT + fi + + - name: Commit and push changes + if: steps.verify-changed-files.outputs.changed == 'true' run: | git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" - git diff --quiet && git diff --staged --quiet || (git add -A && git commit -m "style: auto-format code with black and isort [skip ci]") - - - name: Push changes - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - branch: ${{ github.ref }} - if: success() + git add -A + git commit -m "style: auto-format code with black and isort [skip ci]" + git push diff --git a/.github/workflows/format-code.yml b/.github/workflows/format-code.yml new file mode 100644 index 0000000..9aba171 --- /dev/null +++ b/.github/workflows/format-code.yml @@ -0,0 +1,61 @@ +name: Auto-format Code + +on: + push: + branches: [ enhance-pipeline ] + paths: + - 'src/**/*.py' + - 'tests/**/*.py' + +permissions: + contents: write + +jobs: + format: + name: Format Python Code + runs-on: ubuntu-latest + if: github.event_name == 'push' && !contains(github.event.head_commit.message, '[skip ci]') + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install formatting tools + run: | + python -m pip install --upgrade pip + pip install black isort + + - name: Run Black formatter + run: | + black --line-length 120 src/ tests/ || true + + - name: Run isort + run: | + isort --profile black src/ tests/ || true + + - name: Check for changes + id: changes + run: | + if git diff --quiet; then + echo "changed=false" >> $GITHUB_OUTPUT + else + echo "changed=true" >> $GITHUB_OUTPUT + fi + + - name: Commit formatted code + if: steps.changes.outputs.changed == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add -A + git commit -m "style: auto-format code with black and isort 
[skip ci]" + git push \ No newline at end of file