Skip to content

Commit d6a4002

Browse files
committed
[chores] refactoring API
1 parent 7190c23 commit d6a4002

File tree

12 files changed

+196
-230
lines changed

12 files changed

+196
-230
lines changed

src/api/main.py

Lines changed: 9 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,17 @@
66
import os
77
from contextlib import asynccontextmanager
88
from pathlib import Path
9-
from typing import Annotated, List
9+
from typing import Annotated
1010

1111
import mlflow
1212
import yaml
1313
from fastapi import Depends, FastAPI
1414
from fastapi.middleware.cors import CORSMiddleware
1515
from fastapi.security import HTTPBasicCredentials
16-
from pydantic import BaseModel
1716

18-
from api.models.forms import BatchForms, SingleForm
17+
from api.routes import predict_batch, predict_single
1918
from utils.logging import configure_logging
20-
from utils.preprocessing import preprocess_inputs
21-
from utils.utils import (
22-
get_credentials,
23-
get_model,
24-
process_response,
25-
)
19+
from utils.security import get_credentials
2620

2721

2822
@asynccontextmanager
@@ -40,7 +34,9 @@ async def lifespan(app: FastAPI):
4034
logger = logging.getLogger(__name__)
4135
logger.info("🚀 Starting API lifespan")
4236

43-
app.state.model = get_model(os.environ["MLFLOW_MODEL_NAME"], os.environ["MLFLOW_MODEL_VERSION"])
37+
app.state.model = mlflow.pyfunc.load_model(
38+
model_uri=f"models:/{os.environ['MLFLOW_MODEL_NAME']}/{os.environ['MLFLOW_MODEL_VERSION']}"
39+
)
4440
run_data = mlflow.get_run(app.state.model.metadata.run_id).data.params
4541
app.state.training_names = [
4642
run_data["text_feature"],
@@ -54,58 +50,16 @@ async def lifespan(app: FastAPI):
5450
logger.info("🛑 Shutting down API lifespan")
5551

5652

57-
class BatchPredictionRequest(BaseModel):
58-
"""
59-
Pydantic BaseModel for representing the input data for the API.
60-
61-
This BaseModel defines the structure of the input data required
62-
for the API's "/predict-batch" endpoint.
63-
64-
Attributes:
65-
description_activity (List[str]): The text description.
66-
other_nature_activity (List[str]): Other nature of the activity.
67-
precision_act_sec_agricole (List[str]): Precision of the activity in the agricultural sector.
68-
type_form (List[str]): The type of the form CERFA.
69-
nature (List[str]): The nature of the activity.
70-
surface (List[str]): The surface of activity.
71-
cj (List[str]): The legal category code.
72-
activity_permanence_status (List[str]): The activity permanence status (permanent or seasonal).
73-
74-
"""
75-
76-
description_activity: List[str]
77-
other_nature_activity: List[str]
78-
precision_act_sec_agricole: List[str]
79-
type_form: List[str]
80-
nature: List[str]
81-
surface: List[str]
82-
cj: List[str]
83-
activity_permanence_status: List[str]
84-
85-
class Config:
86-
json_schema_extra = {
87-
"example": {
88-
"description_activity": [
89-
("LOUEUR MEUBLE NON PROFESSIONNEL EN RESIDENCE DE SERVICES (CODE APE 6820A Location de logements)")
90-
],
91-
"other_nature_activity": [""],
92-
"precision_act_sec_agricole": [""],
93-
"type_form": ["I"],
94-
"nature": [""],
95-
"surface": [""],
96-
"cj": [""],
97-
"activity_permanence_status": [""],
98-
}
99-
}
100-
101-
10253
app = FastAPI(
10354
lifespan=lifespan,
10455
title="Prédiction code APE",
10556
description="Application de prédiction pour l'activité principale de l'entreprise (APE)",
10657
version="0.0.1",
10758
)
10859

60+
app.include_router(predict_single.router)
61+
app.include_router(predict_batch.router)
62+
10963

11064
app.add_middleware(
11165
CORSMiddleware,
@@ -128,71 +82,3 @@ def show_welcome_page(
12882
"Model_name": f"{os.environ['MLFLOW_MODEL_NAME']}",
12983
"Model_version": f"{os.environ['MLFLOW_MODEL_VERSION']}",
13084
}
131-
132-
133-
@app.post("/predict", tags=["Predict"])
134-
async def predict(
135-
credentials: Annotated[HTTPBasicCredentials, Depends(get_credentials)],
136-
form: SingleForm,
137-
nb_echos_max: int = 5,
138-
prob_min: float = 0.01,
139-
):
140-
"""
141-
Predict code APE.
142-
143-
This endpoint accepts input data as query parameters and uses the loaded
144-
ML model to predict the code APE based on the input data.
145-
146-
Args:
147-
nb_echos_max (int): Maximum number of echoes to consider. Default is 5.
148-
prob_min (float): Minimum probability threshold. Default is 0.01.
149-
150-
Returns:
151-
dict: Response containing APE codes.
152-
"""
153-
154-
query = preprocess_inputs(app.state.training_names, [form])
155-
156-
predictions = app.state.model.predict(query, params={"k": nb_echos_max})
157-
158-
response = process_response(predictions, 0, nb_echos_max, prob_min, app.state.libs)
159-
160-
# Logging
161-
query_to_log = {key: value[0] for key, value in query.items()}
162-
logging.info(f"{{'Query': {query_to_log}, 'Response': {response}}}")
163-
164-
return response
165-
166-
167-
@app.post("/predict-batch", tags=["Predict"])
168-
async def predict_batch(
169-
credentials: Annotated[HTTPBasicCredentials, Depends(get_credentials)],
170-
forms: BatchForms,
171-
nb_echos_max: int = 5,
172-
prob_min: float = 0.01,
173-
):
174-
"""
175-
Endpoint for predicting batches of data.
176-
177-
Args:
178-
credentials (HTTPBasicCredentials): The credentials for authentication.
179-
forms (Forms): The input data in the form of Forms object.
180-
nb_echos_max (int, optional): The maximum number of predictions to return. Defaults to 5.
181-
prob_min (float, optional): The minimum probability threshold for predictions. Defaults to 0.01.
182-
183-
Returns:
184-
list: The list of predicted responses.
185-
"""
186-
query = preprocess_inputs(app.state.training_names, forms.forms)
187-
188-
predictions = app.state.model.predict(query, params={"k": nb_echos_max})
189-
190-
response = [process_response(predictions, i, nb_echos_max, prob_min, app.state.libs) for i in range(len(predictions[0]))]
191-
192-
# Logging
193-
for line in range(len(query[app.state.training_names[0]])):
194-
query_line = {key: value[line] for key, value in query.items()}
195-
response_line = response[line]
196-
logging.info(f"{{'Query': {query_line}, 'Response': {response_line}}}")
197-
198-
return response

src/api/models/forms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class BatchForms(BaseModel):
3939

4040
@model_validator(mode="after")
4141
def check_description_not_empty(cls, values):
42-
forms = values.get("forms", [])
42+
forms = values.forms
4343
missing_indexes = [
4444
idx for idx, form in enumerate(forms) if not form.description_activity or form.description_activity.strip() == ""
4545
]

src/api/models/responses.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from typing import Dict, Union

from pydantic import BaseModel, RootModel


class Prediction(BaseModel):
    """A single APE-code prediction returned by the model.

    Attributes:
        code: The predicted APE code.
        probabilite: The model's probability for this code.
        libelle: The human-readable label associated with the code.
    """

    code: str
    probabilite: float
    libelle: str


class PredictionResponse(RootModel[Dict[str, Union[Prediction, float]]]):
    """Response body mapping rank keys ("1", "2", ...) to predictions.

    The mapping may also carry float entries (e.g. an "IC" confidence value),
    hence the ``Union[Prediction, float]`` value type.
    """

src/api/routes/batch.py

Whitespace-only changes.

src/api/routes/predict.py

Whitespace-only changes.

src/api/routes/predict_batch.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from typing import Annotated

from fastapi import APIRouter, Depends, Request
from fastapi.security import HTTPBasicCredentials

from api.models.forms import BatchForms
from api.models.responses import PredictionResponse
from utils.logging import log_prediction
from utils.prediction import process_response
from utils.preprocessing import preprocess_inputs
from utils.security import get_credentials

router = APIRouter(prefix="/batch", tags=["Predict a batch of activity"])


# The endpoint returns one response per form, so the declared model is a list.
@router.post("/predict", response_model=list[PredictionResponse])
async def predict(
    credentials: Annotated[HTTPBasicCredentials, Depends(get_credentials)],
    request: Request,
    forms: BatchForms,
    nb_echos_max: int = 5,
    prob_min: float = 0.01,
):
    """
    Endpoint for predicting batches of data.

    Args:
        credentials (HTTPBasicCredentials): The credentials for authentication.
        request (Request): The incoming request, used to reach app state
            (model, training feature names, label library).
        forms (BatchForms): The input data in the form of a Forms object.
        nb_echos_max (int, optional): The maximum number of predictions to return. Defaults to 5.
        prob_min (float, optional): The minimum probability threshold for predictions. Defaults to 0.01.

    Returns:
        list: The list of predicted responses, one per input form.
    """
    query = preprocess_inputs(request.app.state.training_names, forms.forms)

    predictions = request.app.state.model.predict(query, params={"k": nb_echos_max})

    # Process each batch entry exactly once, logging it as we go.
    # (A previous version also built the whole list in a comprehension first,
    # doing every process_response call twice.)
    responses = []
    for i in range(len(predictions[0])):
        response = process_response(predictions, i, nb_echos_max, prob_min, request.app.state.libs)
        log_prediction(query, response, i)
        responses.append(response)

    return responses

src/api/routes/predict_single.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from typing import Annotated

from fastapi import APIRouter, Depends, Request
from fastapi.security import HTTPBasicCredentials

from api.models.forms import SingleForm
from api.models.responses import PredictionResponse
from utils.logging import log_prediction
from utils.prediction import process_response
from utils.preprocessing import preprocess_inputs
from utils.security import get_credentials

router = APIRouter(prefix="/single", tags=["Predict an activity"])


@router.post("/predict", response_model=PredictionResponse)
async def predict(
    credentials: Annotated[HTTPBasicCredentials, Depends(get_credentials)],
    request: Request,
    form: SingleForm,
    nb_echos_max: int = 5,
    prob_min: float = 0.01,
):
    """
    Predict code APE.

    This endpoint accepts input data as query parameters and uses the loaded
    ML model to predict the code APE based on the input data.

    Args:
        credentials (HTTPBasicCredentials): The credentials for authentication.
        request (Request): The incoming request, used to reach app state.
        form (SingleForm): The single input form to classify.
        nb_echos_max (int): Maximum number of echoes to consider. Default is 5.
        prob_min (float): Minimum probability threshold. Default is 0.01.

    Returns:
        dict: Response containing APE codes.
    """
    state = request.app.state

    # Wrap the single form in a list so the shared preprocessing applies.
    features = preprocess_inputs(state.training_names, [form])
    raw_predictions = state.model.predict(features, params={"k": nb_echos_max})
    result = process_response(raw_predictions, 0, nb_echos_max, prob_min, state.libs)

    log_prediction(features, result, 0)

    return result

src/utils/logging.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import logging
22

3+
from api.models.responses import PredictionResponse
4+
35

46
def configure_logging():
57
logging.basicConfig(
@@ -10,3 +12,9 @@ def configure_logging():
1012
logging.StreamHandler(),
1113
],
1214
)
15+
16+
17+
def log_prediction(query: dict, response: "PredictionResponse", index: int = 0) -> None:
    """Log one query/response pair at INFO level.

    Args:
        query: Mapping of feature name to a list of per-entry values; only the
            value at position ``index`` of each list is logged.
        response: The prediction produced for that entry.
        index: Position of the entry within the batch. Defaults to 0.
    """
    query_line = {key: value[index] for key, value in query.items()}
    # Lazy %-style arguments: the message is only formatted if INFO is enabled
    # (same rendered text as the previous f-string).
    logging.info("{'Query': %s, 'Response': %s}", query_line, response)
    # TODO : response.model_dump() ?

src/utils/prediction.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from fastapi import HTTPException

from api.models.responses import Prediction, PredictionResponse


def process_response(
    predictions: tuple,
    liasse_nb: int,
    nb_echos_max: int,
    prob_min: float,
    libs: dict,
) -> PredictionResponse:
    """
    Process model predictions into a structured response.

    Args:
        predictions: ``(labels, probs)`` pair returned by the model, each
            indexed first by batch entry, then by prediction rank.
        liasse_nb: Index of the entry within the batch.
        nb_echos_max: Maximum number of predictions to keep.
        prob_min: Minimum probability a prediction must reach to be kept.
        libs: Mapping from APE code to its human-readable label.

    Returns:
        PredictionResponse: Rank keys ("1", "2", ...) mapped to predictions,
        plus an "IC" float entry (gap between the two best probabilities).

    Raises:
        HTTPException: 400 when no prediction reaches ``prob_min``.
    """
    labels, probs = predictions
    pred_labels = labels[liasse_nb]
    pred_probs = probs[liasse_nb]

    # NOTE(review): taking the first k entries assumes pred_probs is sorted in
    # descending order — confirm this holds for the model's output.
    valid_indices = [i for i, p in enumerate(pred_probs) if p >= prob_min]
    k = min(nb_echos_max, len(valid_indices)) if valid_indices else 0

    if k == 0:
        raise HTTPException(
            status_code=400,
            detail="No prediction exceeds the given minimum probability threshold.",
        )

    response_data = {}
    for i in range(k):
        code = pred_labels[i].replace("__label__", "")
        response_data[str(i + 1)] = Prediction(
            code=code,
            probabilite=float(pred_probs[i]),
            libelle=libs[code],
        )

    # Confidence gap between the top-1 and top-2 probabilities (0.0 when only
    # a single prediction is kept).
    ic = response_data["1"].probabilite - float(pred_probs[1]) if k > 1 else 0.0
    response_data["IC"] = ic

    # pydantic v2 RootModel takes the root value positionally; the v1-style
    # ``__root__=`` keyword raises at runtime under v2 (where RootModel lives).
    return PredictionResponse(response_data)

src/utils/preprocessing.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,12 @@ def preprocess_inputs(training_names: list, inputs: list[SingleForm]) -> dict:
88
Preprocess both single and batch inputs using shared logic.
99
"""
1010
df = pd.DataFrame([form.model_dump() for form in inputs])
11-
df.fillna("NaN", inplace=True)
11+
12+
for feature in training_names[:2]: # textual features
13+
df[feature] = df[feature].fillna(value="")
14+
for feature in training_names[2:]: # categorical features
15+
df[feature] = df[feature].fillna(value="NaN")
16+
1217
df = df.astype(str)
1318

1419
mapping = {

0 commit comments

Comments
 (0)