Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions DSL/CronManager/DSL/data_resync.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CronManager job: periodic agency data resync trigger.
# The commented-out trigger below is the intended every-minute schedule,
# kept for easy re-enabling; "off" disables scheduled runs (manual /execute only).
# NOTE(review): command points at ../app/scripts/... but this change adds the
# script under DSL/CronManager/script/ — presumably the container mount maps
# it; confirm the runtime path.
agency_data_resync:
  # trigger: "0 0/1 * * * ?"
  trigger: off
  type: exec
  command: "../app/scripts/agency_data_resync.sh -s 10"
5 changes: 5 additions & 0 deletions DSL/CronManager/DSL/initiate_vector_indexer.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CronManager job: runs the vector indexer pipeline. Scheduled trigger is
# disabled; the job is started via the /execute endpoint (see
# DSL/Ruuter.public/rag-search/POST/data/update.yml).
# allowedEnvs whitelists the query parameters that CronManager exposes to the
# script as environment variables.
vector_indexer:
  trigger: off
  type: exec
  command: "../app/scripts/vector_indexer_pipeline.sh"
  allowedEnvs: ['signedUrl', 'clientDataHash']
19 changes: 19 additions & 0 deletions DSL/CronManager/script/agency_data_resync.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
#
# agency_data_resync.sh — triggers the rag-search data/update flow on the
# public Ruuter, which checks whether new agency data is available and, if so,
# kicks off vector re-indexing. Invoked by CronManager
# (DSL/CronManager/DSL/data_resync.yml).
#
# Exit codes: 0 on HTTP success; 1 when the endpoint is unreachable, times
# out, or returns an HTTP error status.

set -u  # fail loudly on unset variables

# DEFINING ENDPOINTS
CHECK_RESYNC_DATA_AVAILABILITY_ENDPOINT="http://ruuter-public:8086/rag-search/data/update"

# The endpoint expects a JSON body; an empty object is sufficient.
payload='{}'

echo "SENDING REQUEST TO CHECK_RESYNC_DATA_AVAILABILITY_ENDPOINT"

# --fail makes curl exit non-zero on HTTP >= 400, so failures are no longer
# silently reported as success; --max-time guards against a hung endpoint.
if ! response=$(curl -s --fail --max-time 60 -X POST "$CHECK_RESYNC_DATA_AVAILABILITY_ENDPOINT" \
  -H "Content-Type: application/json" \
  -d "$payload"); then
  echo "[ERROR] Data resync request failed (endpoint unreachable or HTTP error)" >&2
  exit 1
fi

echo "DATA RESYNC SUMMARY:"
echo "$response"
84 changes: 84 additions & 0 deletions DSL/CronManager/script/vector_indexer_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash
#
# vector_indexer_pipeline.sh — prepares the Python environment and runs the
# vector indexer against a signed dataset URL.
#
# Required environment variables (whitelisted by CronManager via allowedEnvs
# in DSL/CronManager/DSL/initiate_vector_indexer.yml):
#   signedUrl      — pre-signed S3 URL of the dataset to index
#   clientDataHash — hash identifying the client dataset revision
#
# Exit codes: propagates the indexer's exit code (0 success, 2 partial
# failures, 130 interrupted, other = failure); exits 1 on setup failures.

echo "Starting vector indexer pipeline..."

if [ -z "$signedUrl" ] || [ -z "$clientDataHash" ]; then
  echo "Please set the signedUrl and clientDataHash environment variables."
  exit 1
fi

PYTHON_SCRIPT="/app/src/vector_indexer/main_indexer.py"

# SECURITY: do not echo the pre-signed URL — it grants time-limited access to
# the dataset and would otherwise leak into retained logs.
echo "Using signedUrl: (set, value redacted from logs)"
echo "Using clientDataHash: $clientDataHash"

# Install uv if not found.
# NOTE(review): piping a remote installer into sh trusts astral.sh at run
# time; consider baking uv into the container image instead.
UV_BIN="/root/.local/bin/uv"
if [ ! -f "$UV_BIN" ]; then
  echo "[UV] Installing uv..."
  curl -LsSf https://astral.sh/uv/install.sh | sh || {
    echo "[ERROR] Failed to install uv"
    exit 1
  }
fi

# Activate Python virtual environment
VENV_PATH="/app/python_virtual_env"
echo "[VENV] Activating virtual environment at: $VENV_PATH"
source "$VENV_PATH/bin/activate" || {
  echo "[ERROR] Failed to activate virtual environment"
  exit 1
}

# Install all required packages in a single resolver run: one invocation is
# faster than eight and guarantees a mutually consistent dependency set.
echo "[PACKAGES] Installing required packages..."
"$UV_BIN" pip install --python "$VENV_PATH/bin/python3" \
  "numpy>=1.21.0,<2.0" \
  "requests>=2.32.5" \
  "pydantic>=2.11.7" \
  "qdrant-client>=1.15.1" \
  "rank-bm25>=0.2.2" \
  "tiktoken>=0.11.0" \
  "dvc[s3]>=3.55.2" \
  "loguru>=0.7.3" || {
  echo "[ERROR] Failed to install required packages"
  exit 1
}
echo "[PACKAGES] All packages installed successfully"

export PYTHONPATH="/app:/app/src:/app/src/vector_indexer:$PYTHONPATH"

[ ! -f "$PYTHON_SCRIPT" ] && { echo "[ERROR] Python script not found"; exit 1; }

echo "[FOUND] Python script at: $PYTHON_SCRIPT"

# Run the indexer. signedUrl is guaranteed non-empty by the guard at the top
# of the script, so the previous "run without signed URL" branch was dead
# code and has been removed.
echo "[STARTING] Vector indexer processing..."
echo "[COMMAND] python3 -u $PYTHON_SCRIPT --signed-url <redacted>"
python3 -u "$PYTHON_SCRIPT" --signed-url "$signedUrl" 2>&1
PYTHON_EXIT_CODE=$?

echo "[DEBUG] Python execution completed with exit code: $PYTHON_EXIT_CODE"

# Map the indexer's exit code to a log line, then propagate it unchanged so
# CronManager sees the real outcome.
case $PYTHON_EXIT_CODE in
  0)   echo "[SUCCESS] Vector indexer completed successfully" ;;
  2)   echo "[WARNING] Vector indexer completed with some failures" ;;
  130) echo "[INTERRUPTED] Vector indexer was interrupted by user" ;;
  *)   echo "[ERROR] Vector indexer failed with exit code: $PYTHON_EXIT_CODE" ;;
esac
exit $PYTHON_EXIT_CODE
17 changes: 17 additions & 0 deletions DSL/Liquibase/changelog/rag-search-script-v1-llm-connections.sql
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,20 @@ INSERT INTO embedding_models (platform_id, model_key, model_name) VALUES
CREATE INDEX idx_llm_models_platform_id ON llm_models(platform_id);
CREATE INDEX idx_embedding_models_platform_id ON embedding_models(platform_id);

-- Per-agency sync bookkeeping: records the last-known data hash per agency so
-- the resync cron can detect when upstream (CKB) data has changed.
CREATE TABLE public.agency_sync (
    agency_id VARCHAR(50) PRIMARY KEY,
    -- NULL until the first successful sync records a hash.
    agency_data_hash VARCHAR(255),
    data_url TEXT,
    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
    -- NOTE(review): nothing here auto-maintains updated_at; callers must set
    -- it on UPDATE (or add a trigger in a later changeset).
    updated_at TIMESTAMP NOT NULL DEFAULT NOW()
);

-- Seed row for the single known agency. ON CONFLICT makes the statement safe
-- to re-run (e.g. if the changeset is replayed against an existing database).
INSERT INTO public.agency_sync (agency_id, created_at) VALUES
    ('AGENCY001', NOW())
ON CONFLICT (agency_id) DO NOTHING;

-- Mock stand-in for the central knowledge base (CKB), read by the
-- mock-get-data-from-kb Resql query during development/testing.
CREATE TABLE public.mock_ckb (
    client_id VARCHAR(50) PRIMARY KEY,
    client_data_hash VARCHAR(255) NOT NULL,
    signed_s3_url TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL DEFAULT NOW()
);
4 changes: 4 additions & 0 deletions DSL/Resql/rag-search/POST/get-agency-id.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Returns the id and last-synced data hash of every agency in agency_sync.
-- NOTE(review): despite the singular name this selects ALL rows; the caller
-- (data/update.yml) reads only index [0] — confirm the single-agency
-- assumption or add filtering/ordering.
SELECT
    agency_id,
    agency_data_hash
FROM public.agency_sync;
5 changes: 5 additions & 0 deletions DSL/Resql/rag-search/POST/mock-get-data-from-kb.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Mock CKB lookup: returns the current data hash and pre-signed S3 URL for
-- every client in mock_ckb.
-- NOTE(review): the Ruuter caller POSTs an agencyIds array, but this query
-- does not filter on it — all rows are returned. Presumably acceptable for
-- the mock; a real implementation would need a WHERE client_id IN (:agencyIds).
SELECT
    client_id,
    client_data_hash,
    signed_s3_url
FROM public.mock_ckb;
33 changes: 33 additions & 0 deletions DSL/Ruuter.private/rag-search/POST/ckb/agency_data_import.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Ruuter.private flow: fetch agency data information by agency IDs from the
# (mock) knowledge base.
declaration:
  call: declare
  version: 0.1
  description: "Get agency data information by agency IDs"
  method: post
  accepts: json
  returns: json
  namespace: rag-search
  allowlist:
    body:
      - field: agencyIds
        type: array
        description: "Array of unique institution IDs"

# Pull the requested agency ids from the request body (empty-array fallback).
extractRequestData:
  assign:
    agencyIds: ${incoming.body.agencyIds || []}
  log: "Received request for agency data: ${agencyIds}"

# FIX: the mock-get-data-from-kb query lives under DSL/Resql/rag-search/, so
# it is served by the rag-search Resql instance — the public twin of this flow
# already uses [#RAG_SEARCH_RESQL]; [#GLOBAL_CLASSIFIER_RESQL] was a
# copy-paste leftover.
get_agency_data:
  call: http.post
  args:
    url: "[#RAG_SEARCH_RESQL]/mock-get-data-from-kb"
    headers:
      type: json
    body:
      agencyIds: ${agencyIds}
  result: agency_data_info
  next: return_result

return_result:
  return: ${agency_data_info.response.body}
  next: end
4 changes: 2 additions & 2 deletions DSL/Ruuter.private/rag-search/POST/inference/production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ assign_disconnected_response:
disconnected_response:
{
chatId: "${chatId}",
content: "The LLM connection is currently unavailable. Your request couldnt be processed. Please retry shortly.",
content: "The LLM connection is currently unavailable. Your request couldn't be processed. Please retry shortly.",
status: 400
}
next: return_connection_disconnected
Expand All @@ -118,4 +118,4 @@ return_budget_check_error:
return_no_production_connection:
status: 404
return: "No production connection found"
next: end
next: end
33 changes: 33 additions & 0 deletions DSL/Ruuter.public/rag-search/POST/ckb/agency-data-import.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Ruuter.public flow: fetch agency data information by agency IDs from the
# (mock) knowledge base. Called internally by data/update.yml during resync.
declaration:
  call: declare
  version: 0.1
  description: "Get agency data information by agency IDs"
  method: post
  accepts: json
  returns: json
  namespace: rag-search
  allowlist:
    body:
      - field: agencyIds
        type: array
        description: "Array of unique institution IDs"

# Pull the requested agency ids from the request body (empty-array fallback).
extractRequestData:
  assign:
    agencyIds: ${incoming.body.agencyIds || []}
  log: "Received request for agency data: ${agencyIds}"

# Query the mock CKB table via the rag-search Resql service.
# NOTE(review): the underlying query currently ignores agencyIds and returns
# all rows — see mock-get-data-from-kb.sql.
get_agency_data:
  call: http.post
  args:
    url: "[#RAG_SEARCH_RESQL]/mock-get-data-from-kb"
    headers:
      type: json
    body:
      agencyIds: ${agencyIds}
  result: agency_data_info
  next: return_result

return_result:
  return: ${agency_data_info.response.body}
  next: end
88 changes: 88 additions & 0 deletions DSL/Ruuter.public/rag-search/POST/data/update.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Ruuter.public flow: detect whether the knowledge base holds new data for the
# registered agency and, if so, trigger the vector indexer via CronManager.
declaration:
  call: declare
  version: 0.1
  description: "Resync new data from KB"
  method: post
  accepts: json
  returns: json
  namespace: rag-search

# Read the registered agency and its last-synced data hash from agency_sync.
get_agency_id:
  call: http.post
  args:
    url: "[#RAG_SEARCH_RESQL]/get-agency-id"
  result: get_agency_id_result
  next: log_result

log_result:
  log: ${get_agency_id_result.response.body[0].agencyId}
  next: extract_params

# NOTE(review): only row [0] is considered — multi-agency support would
# require iterating over the result set.
extract_params:
  assign:
    single_agency_id: ${get_agency_id_result.response.body[0].agencyId}
    agency_ids:
      - ${single_agency_id}
    agency_data_hash: ${get_agency_id_result.response.body[0].agencyDataHash}
  next: logs_params

logs_params:
  log: "Agency ID: ${agency_ids}, Agency Data Hash: ${agency_data_hash}"
  next: import_agency_data

# Ask the (mock) KB for the current data hash and signed URL.
import_agency_data:
  call: http.post
  args:
    url: "[#RAG_SEARCH_RUUTER_PUBLIC]/ckb/agency-data-import"
    body:
      agencyIds: ${agency_ids}
  result: importResult
  next: log_import_agency_data_response

log_import_agency_data_response:
  log: ${JSON.stringify(importResult.response)}
  next: assign_import_agency_data

assign_import_agency_data:
  assign:
    client_data_hash: ${importResult.response.body.response[0].clientDataHash}
    signed_s3_url: ${importResult.response.body.response[0].signedS3Url}
  next: check_has_match

# Hashes equal -> data unchanged -> no sync needed; otherwise start indexing.
# FIX: compare against the already-assigned client_data_hash instead of
# re-deriving it from the raw response.
check_has_match:
  switch:
    - condition: ${agency_data_hash === client_data_hash}
      next: noAgencyData
    - condition: true
      next: execute_cron_manager

# Fire the vector_indexer cron job; signedUrl/clientDataHash are passed as
# query params and exposed to the script via CronManager's allowedEnvs.
execute_cron_manager:
  call: http.post
  args:
    url: "[#RAG_SEARCH_CRON_MANAGER]/execute/initiate_vector_indexer/vector_indexer"
    query:
      signedUrl: ${signed_s3_url}
      clientDataHash: ${client_data_hash}
  result: res
  next: log_new_data_present

log_new_data_present:
  log: "New data present - synchronization required"
  next: return_sync_triggered

# FIX: this path previously reached end without returning a body; callers now
# get an explicit success response when a sync was triggered.
return_sync_triggered:
  assign:
    sync_response:
      success: true
      message: "New data present - vector indexing triggered"
  status: 200
  return: ${sync_response}
  next: end

# FIX: removed the unreachable assignNoAgencyResponse step — no transition
# targeted it, and the value it assigned was never returned.
noAgencyData:
  assign:
    response_data:
      success: true
      message: "No sync required - data is up to date"
  status: 200
  return: ${response_data}
  next: end

Loading
Loading