From fc13f47f22df0f3b88a588adb7c5be255a1cdf6a Mon Sep 17 00:00:00 2001
From: Andrea Aceto <59835522+andreaceto@users.noreply.github.com>
Date: Mon, 14 Jul 2025 16:37:33 +0200
Subject: [PATCH] feat(stack): docker implementation | knn intent classifier |
 duckling for time slot extraction | basic conversation pipeline

---
 .gitignore                                    |   3 +
 Makefile                                      |  45 +
 docker-compose.yml                            |  24 +
 docker/app/Dockerfile                         |  23 +
 .../02b_train_knn_intent_classifier.ipynb     | 833 +++++++++++++++++-
 src/schedulebot/core/conversation_manager.py  |  47 +
 src/schedulebot/main.py                       |  30 +
 src/schedulebot/nlp/intent_classifier.py      |  53 +-
 src/schedulebot/nlp/slot_filler.py            |  65 ++
 9 files changed, 1077 insertions(+), 46 deletions(-)
 create mode 100644 Makefile
 create mode 100644 docker-compose.yml
 create mode 100644 docker/app/Dockerfile
 create mode 100644 src/schedulebot/core/conversation_manager.py
 create mode 100644 src/schedulebot/main.py
 create mode 100644 src/schedulebot/nlp/slot_filler.py

diff --git a/.gitignore b/.gitignore
index b814464..87aeaa4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,6 @@ __marimo__/
 
 # Streamlit
 .streamlit/secrets.toml
+
+# Models
+models/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f6c73a9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,45 @@
+# Makefile
+
+# Use bash for better scripting
+SHELL := /bin/bash
+
+# Default command is 'help'
+.DEFAULT_GOAL := help
+
+## --------------------------------------
+## Docker Commands
+## --------------------------------------
+
+.PHONY: build
+build: ## 🛠️ Build or rebuild the Docker services
+	@echo ">> Building services..."
+	@docker-compose build
+
+.PHONY: up
+up: ## 🚀 Start all services in detached mode
+	@echo ">> Starting services in the background..."
+	@docker-compose up -d
+
+.PHONY: down
+down: ## 🛑 Stop and remove all services
+	@echo ">> Stopping and removing containers..."
+	@docker-compose down
+
+.PHONY: logs
+logs: ## 📜 View real-time logs for all services
+	@echo ">> Tailing logs (press Ctrl+C to stop)..."
+	@docker-compose logs -f
+
+.PHONY: test
+test: ## 🧪 Run pytest inside the app container
+	@echo ">> Running tests..."
+	@docker-compose run --rm app pytest
+
+## --------------------------------------
+## Help
+## --------------------------------------
+
+.PHONY: help
+help: ## 🙋 Show this help message
+	@echo "Available commands:"
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}'
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..c8c5420
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,24 @@
+services:
+  # ScheduleBOT+ Application
+  app:
+    build:
+      context: .
+      dockerfile: docker/app/Dockerfile
+    container_name: schedulebot_app
+    ports:
+      - "7860:7860" # Expose Gradio's port
+    volumes:
+      - ./src:/app/src
+    env_file:
+      - .env
+    depends_on:
+      - duckling
+    restart: unless-stopped
+
+  # Duckling Service for Time/Date Extraction
+  duckling:
+    image: rasa/duckling:latest
+    container_name: duckling_service
+    ports:
+      - "8000:8000"
+    restart: unless-stopped
diff --git a/docker/app/Dockerfile b/docker/app/Dockerfile
new file mode 100644
index 0000000..4645879
--- /dev/null
+++ b/docker/app/Dockerfile
@@ -0,0 +1,23 @@
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements file into the container
+COPY requirements.txt .
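+# (Editor's note, assumption about intent: copying requirements.txt on its own
+# first lets Docker cache the dependency-install layer, so rebuilds that only
+# touch src/ can skip the pip install step.)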
+
+# Install any needed packages specified in requirements.txt
+# --no-cache-dir: Disables the cache to keep the image size smaller
+# --trusted-host pypi.python.org: Can help avoid SSL issues in some networks
+RUN pip install --no-cache-dir --trusted-host pypi.python.org -r requirements.txt
+
+# Copy the rest of the application's source code from your host to your image filesystem.
+COPY ./src /app/src
+COPY ./models /app/models
+COPY .env /app/.env
+
+# Command to run the application when the container launches
+# This will be the main entry point for your Gradio app in the next milestone.
+# For now, we can use the command-line chat.
+CMD ["python", "-m", "src.schedulebot.main"]
diff --git a/notebooks/02b_train_knn_intent_classifier.ipynb b/notebooks/02b_train_knn_intent_classifier.ipynb
index 05a2ff1..f21a412 100644
--- a/notebooks/02b_train_knn_intent_classifier.ipynb
+++ b/notebooks/02b_train_knn_intent_classifier.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "b7c54a44",
    "metadata": {},
    "outputs": [],
@@ -11,22 +11,31 @@
     "import matplotlib.pyplot as plt\n",
     "import seaborn as sns\n",
     "from dotenv import load_dotenv\n",
-    "from huggingface_hub import login, upload_file\n",
+    "from huggingface_hub import login, upload_file, HfApi\n",
     "from datasets import load_dataset\n",
     "from pathlib import Path\n",
     "import numpy as np\n",
     "from sentence_transformers import SentenceTransformer\n",
     "from sklearn.neighbors import KNeighborsClassifier\n",
     "from sklearn.metrics import classification_report, confusion_matrix\n",
+    "import json\n",
     "import joblib"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "840039c4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n"
+     ]
+    }
+   ],
    "source": [
     "# Setup and Login\n",
     "load_dotenv()\n",
@@ -37,10 +46,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "2fafd205",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['intent', 'user_utterance', 'origin', 'new_intent', 'label'],\n",
+       "        num_rows: 68562\n",
+       "    })\n",
+       "    val: Dataset({\n",
+       "        features: ['intent', 'user_utterance', 'origin', 'new_intent', 'label'],\n",
+       "        num_rows: 14645\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['intent', 'user_utterance', 'origin', 'new_intent', 'label'],\n",
+       "        num_rows: 14684\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# Load dataset\n",
     "project_root = Path().cwd().resolve().parent\n",
@@ -51,10 +84,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "053f571d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5abb365abaee47cbaface3c4d858aa3e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/2143 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "d2f96ae5",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "from sklearn.manifold import TSNE\n",
@@ -123,29 +198,660 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "8496d05a",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "KNeighborsClassifier(metric='cosine') (interactive scikit-learn HTML repr; markup truncated, see text/plain below)"
" + ], + "text/plain": [ + "KNeighborsClassifier(metric='cosine')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Addestramento del classificatore KNN\n", - "print(\"Addestramento del classificatore KNN...\")\n", - "# Inizializziamo il classificatore\n", - "# n_neighbors=5 รจ un valore di default comune\n", + "# KNN Classifier Training\n", "knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')\n", - "\n", - "# Addestriamo il KNN usando gli embeddings e le etichette\n", - "knn.fit(train_embeddings, train_labels)\n", - "\n", - "print(\"Addestramento completato.\")" + "knn.fit(train_embeddings, train_labels)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "8eb69d81", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d57db525502b4c2988ee562ce3631547", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/459 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Confusion Matrix\n", "cm = confusion_matrix(test_labels, test_predictions)\n", @@ -207,10 +946,44 @@ "outputs": [], "source": [ "# Model Saving\n", - "model_dir = project_root / \"models\"\n", - "knn_model_path = model_dir / \"intent_classifier\" / \"knn_model.joblib\"\n", + "model_dir = project_root / \"models\" / \"intent_classifier\"\n", + "knn_model_path = model_dir / \"knn_model.joblib\"\n", "joblib.dump(knn, knn_model_path)" ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3e276fc4", + "metadata": {}, + "outputs": [], + "source": [ + "# id2label and label2id mapping saving\n", + "id2label = {i: label for i, label in enumerate(sorted(set(dataset['train']['new_intent'])))}\n", + "label2id = {label: i for i, label in id2label.items()}\n", + "with open(os.path.join(model_dir, \"id2label.json\"), 'w') as f:\n", + " json.dump(id2label, f)\n", + "with open(os.path.join(model_dir, \"label2id.json\"), 'w') as g:\n", + " json.dump(label2id, g)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e176f7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Push to Hugging Face Hub\n", + "api = HfApi()\n", + "api.upload_folder(\n", + " folder_path=str(model_dir),\n", + "\tpath_in_repo=\"\",\n", + "\trepo_id=hub_model_id,\n", + "\tcommit_message=\"Upload KNN Intent Classifier Model and Mappings\",\n", + " token=hf_token \n", + ")" + ] } ], "metadata": { diff --git a/src/schedulebot/core/conversation_manager.py b/src/schedulebot/core/conversation_manager.py new file mode 100644 index 0000000..dc164c2 --- /dev/null +++ b/src/schedulebot/core/conversation_manager.py @@ -0,0 +1,47 @@ +from src.schedulebot.nlp.intent_classifier import IntentClassifier +from src.schedulebot.nlp.slot_filler import SlotFiller + + +class ConversationManager: + def __init__(self, nlu_model_repo: str): + """ + Initializes the manager with the NLU models. + """ + self.intent_classifier = IntentClassifier(repo_id=nlu_model_repo) + self.slot_filler = SlotFiller() + + def get_response(self, user_text: str) -> str: + """ + Processes the user's input and returns a text response. + """ + # 1. Classify the intent + intent = self.intent_classifier.predict(user_text) + print(f"[DEBUG: Classified intent: {intent}]") + print(f"[DEBUG: time_slot: {self.slot_filler.parse_time(user_text)}]") + + # 2. Logic based on the intent + if intent == "greet": + return "Hello! 
+        if intent == "greet":
+            return "Hello! How can I help you with your appointments today?"
+
+        if intent == "bye":
+            return "Goodbye! Have a great day."
+
+        if intent in ["book", "resched"]:
+            # 3. Extract date and time if required by the intent
+            time_slot = self.slot_filler.parse_time(user_text)
+
+            action = "book" if intent == "book" else "reschedule"
+
+            if time_slot:
+                return f"Okay, I see you want to {action} an appointment for {time_slot['value']}. Is that correct?"
+            else:
+                return "Sure, but I didn't understand the date and time. Could you please specify when?"
+
+        if intent == "cancel":
+            return "Okay, I understand you want to cancel an appointment. Can you specify which one?"
+
+        if intent == "avail":
+            return "I'm checking your availability now. One moment..."
+
+        # Fallback for unhandled intents
+        return "I'm not sure I understood your request."
diff --git a/src/schedulebot/main.py b/src/schedulebot/main.py
new file mode 100644
index 0000000..46ee214
--- /dev/null
+++ b/src/schedulebot/main.py
@@ -0,0 +1,30 @@
+from src.schedulebot.core.conversation_manager import ConversationManager
+import os
+from dotenv import load_dotenv
+
+
+def main():
+    """
+    Main loop to interact with the chatbot from the command line.
+    """
+    # Load environment variables
+    load_dotenv()
+    repo_id = os.getenv("HUB_MODEL_ID")
+
+    print("Initializing ConversationManager...")
+    manager = ConversationManager(nlu_model_repo=repo_id)
+
+    print("\nScheduleBOT+ is active! Type 'exit' to quit.")
+    print("--------------------------------------------------")
+
+    while True:
+        user_input = input("You: ")
+        if user_input.lower() == "exit":
+            break
+
+        bot_response = manager.get_response(user_input)
+        print(f"Bot: {bot_response}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/schedulebot/nlp/intent_classifier.py b/src/schedulebot/nlp/intent_classifier.py
index 705822e..f50693b 100644
--- a/src/schedulebot/nlp/intent_classifier.py
+++ b/src/schedulebot/nlp/intent_classifier.py
@@ -1,29 +1,50 @@
 import os
+import json
+import joblib
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import hf_hub_download
 from dotenv import load_dotenv
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-import torch
 
 
 class IntentClassifier:
-    def __init__(self, model_repo_id: str):
+    def __init__(self, repo_id: str):
         """
-        Loads the fine-tuned model and tokenizer from the Hugging Face Hub.
+        Loads the KNN classifier and its artifacts from a Hugging Face Hub repository.
+
+        Args:
+            repo_id (str): The ID of the repository on the Hub (e.g., 'username/repo-name').
         """
-        self.tokenizer = AutoTokenizer.from_pretrained(model_repo_id)
-        self.model = AutoModelForSequenceClassification.from_pretrained(model_repo_id)
-        self.id_to_label = self.model.config.id2label
+        # Ensure the HF_TOKEN is available if the repo is private
+        # For public repos, this is not strictly necessary but good practice
+        hf_token = os.getenv("HF_TOKEN")
+
+        # 1. Download the artifacts from the Hub.
+        # hf_hub_download returns the local path to the cached file.
+        knn_model_path = hf_hub_download(
+            repo_id=repo_id, filename="knn_model.joblib", token=hf_token
+        )
+        id2label_path = hf_hub_download(
+            repo_id=repo_id, filename="id2label.json", token=hf_token
+        )
+
+        # 2. Load the Sentence Transformer model
+        self.embedding_model = SentenceTransformer(
+            "sentence-transformers/all-MiniLM-L6-v2"
+        )
+
+        # 3. Load the downloaded KNN model and label mapping
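+        # (Editor's note: joblib.load unpickles the estimator, so the
+        # scikit-learn version installed here should match the one used
+        # when the model was dumped.)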
+        self.knn = joblib.load(knn_model_path)
+        with open(id2label_path, "r") as f:
+            self.id_to_label = {int(k): v for k, v in json.load(f).items()}
 
     def predict(self, text: str) -> str:
         """
-        Predicts the intent for a given text string.
+        Predicts the intent for a given text string using embeddings and KNN.
         """
-        inputs = self.tokenizer(
-            text, return_tensors="pt", padding=True, truncation=True
-        )
-        with torch.no_grad():
-            logits = self.model(**inputs).logits
-
-        predicted_class_id = torch.argmax(logits, dim=1).item()
+        text_embedding = self.embedding_model.encode(
+            text, convert_to_tensor=False
+        ).reshape(1, -1)
+        predicted_class_id = self.knn.predict(text_embedding)[0]
         return self.id_to_label[predicted_class_id]
 
 
@@ -31,7 +52,7 @@ def predict(self, text: str) -> str:
     # Load the model from the Hub
     load_dotenv()
     model_repo_id = os.getenv("HUB_MODEL_ID")
-    classifier = IntentClassifier(model_repo_id=model_repo_id)
+    classifier = IntentClassifier(repo_id=model_repo_id)
 
     # Tests
     text1 = "I want to schedule a meeting with John for next Tuesday"
diff --git a/src/schedulebot/nlp/slot_filler.py b/src/schedulebot/nlp/slot_filler.py
new file mode 100644
index 0000000..4b19570
--- /dev/null
+++ b/src/schedulebot/nlp/slot_filler.py
@@ -0,0 +1,65 @@
+import requests
+from datetime import datetime
+
+
+class SlotFiller:
+    def __init__(self, duckling_url: str = "http://localhost:8000/parse"):
+        """
+        Initializes the SlotFiller with the Duckling service URL.
+        """
+        self.duckling_url = duckling_url
+
+    def parse_time(self, text: str) -> dict | None:
+        """
+        Sends text to Duckling to extract time-related information.
+        Returns the first valid time entity found.
+        """
+        try:
+            # Data to send to Duckling
+            data = {"text": text, "locale": "en_US", "dims": '["time"]'}
+
+            response = requests.post(self.duckling_url, data=data)
+            response.raise_for_status()  # Raises an exception for HTTP errors
+
+            parsed_data = response.json()
+
+            if not parsed_data:
+                return None
+
+            # Extract the most relevant value
+            # Duckling can return multiple values (e.g., "tomorrow at 5" -> tomorrow's date, hour 5)
+            # We look for the 'value' type that contains complete date and time.
+            for entity in parsed_data:
+                if entity.get("dim") == "time" and entity["value"]["type"] == "value":
+                    raw_time = entity["value"]["value"]
+                    # Convert to a standard format (ISO 8601)
+                    dt_object = datetime.fromisoformat(raw_time)
+                    return {"text": entity["body"], "value": dt_object.isoformat()}
+
+            return None
+
+        except requests.exceptions.RequestException as e:
+            print(
+                f"ERROR: Unable to communicate with Duckling. Make sure it is running. Details: {e}"
+            )
+            return None
+
+
+# Block to test the script directly
+if __name__ == "__main__":
+    filler = SlotFiller()
+
+    test_text_1 = "I would like to book a meeting for tomorrow at 5 PM"
+    time_info_1 = filler.parse_time(test_text_1)
+    print(f"Text: '{test_text_1}'")
+    print(f"Extracted info: {time_info_1}\n")
+
+    test_text_2 = "Can we meet next Friday?"
+    time_info_2 = filler.parse_time(test_text_2)
+    print(f"Text: '{test_text_2}'")
+    print(f"Extracted info: {time_info_2}\n")
+
+    test_text_3 = "Hi, how are you?"
+    time_info_3 = filler.parse_time(test_text_3)
+    print(f"Text: '{test_text_3}'")
+    print(f"Extracted info: {time_info_3}\n")
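+
+    # (Editor's note: the default duckling_url targets a Duckling instance on
+    # localhost, which suits this host-side test block. Inside the
+    # docker-compose network defined in this patch, the service is reachable
+    # by its service name instead, e.g.:
+    #   filler = SlotFiller(duckling_url="http://duckling:8000/parse")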