diff --git a/.github/workflows/end_to_end_test_ci.disabled b/.github/workflows/end_to_end_test_ci.disabled index 9b9389f3f..86a4018a7 100644 --- a/.github/workflows/end_to_end_test_ci.disabled +++ b/.github/workflows/end_to_end_test_ci.disabled @@ -97,7 +97,7 @@ jobs: cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Create wallet files run: | diff --git a/.github/workflows/executor_ci.yml b/.github/workflows/executor_ci.yml index 53a1e97f6..35d0c4717 100644 --- a/.github/workflows/executor_ci.yml +++ b/.github/workflows/executor_ci.yml @@ -25,7 +25,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Create dotenv file run: cp ./envs/dev/.env.template .env - name: Run linters @@ -45,7 +45,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Create dotenv file run: cp ./envs/dev/.env.template .env - name: Run mypy @@ -66,7 +66,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Setup common virtualenv # In order not to exhaust disk on GitHub runner, we use one single # virtualenv for all pdm projects: miner, executor, validator. 
diff --git a/.github/workflows/integration_ci.yml b/.github/workflows/integration_ci.yml index 9cddc0e5d..e2b3f4a59 100644 --- a/.github/workflows/integration_ci.yml +++ b/.github/workflows/integration_ci.yml @@ -23,7 +23,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Setup common virtualenv # In order not to exhaust disk on GitHub runner, we use one single # virtualenv for all pdm projects: miner, executor, validator. diff --git a/.github/workflows/library_cd.yml b/.github/workflows/library_cd.yml index 518f42b82..bd6bc3e65 100644 --- a/.github/workflows/library_cd.yml +++ b/.github/workflows/library_cd.yml @@ -27,7 +27,7 @@ jobs: with: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Get version from tag id: get-version run: echo "version=${GITHUB_REF#refs/tags/library-v}" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/library_ci.yml b/.github/workflows/library_ci.yml index d8056b3a7..cef7f09c6 100644 --- a/.github/workflows/library_ci.yml +++ b/.github/workflows/library_ci.yml @@ -25,7 +25,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Run linters run: nox -vs lint - name: Check for missing migrations @@ -45,7 +45,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Run mypy run: nox -vs type_check test: @@ -64,6 +64,6 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: 
"pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Run unit tests run: nox -vs test diff --git a/.github/workflows/miner_ci.yml b/.github/workflows/miner_ci.yml index 98d1365b3..bbf4c44ed 100644 --- a/.github/workflows/miner_ci.yml +++ b/.github/workflows/miner_ci.yml @@ -25,7 +25,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Create dotenv file run: cp ./envs/dev/.env.template .env - name: Run linters check @@ -45,7 +45,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Create dotenv file run: cp ./envs/dev/.env.template .env - name: Run mypy @@ -66,7 +66,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Setup common virtualenv # In order not to exhaust disk on GitHub runner, we use one single # virtualenv for all pdm projects: miner, executor, validator. 
diff --git a/.github/workflows/validator_ci.yml b/.github/workflows/validator_ci.yml index b3244bfd7..3b30d24f7 100644 --- a/.github/workflows/validator_ci.yml +++ b/.github/workflows/validator_ci.yml @@ -25,7 +25,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Create dotenv file run: cp ./envs/dev/.env.template .env - name: Run linters @@ -45,7 +45,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Create dotenv file run: cp ./envs/dev/.env.template .env - name: Run mypy @@ -66,7 +66,7 @@ jobs: python-version: ${{ env.PYTHON_DEFAULT_VERSION }} cache: "pip" - name: Install dependencies - run: python -m pip install --upgrade nox 'pdm>=2.12,<3' + run: python -m pip install --upgrade nox 'pdm==2.19.3' - name: Setup common virtualenv # In order not to exhaust disk on GitHub runner, we use one single # virtualenv for all pdm projects: miner, executor, validator. 
diff --git a/compute_horde/compute_horde/executor_class.py b/compute_horde/compute_horde/executor_class.py index ae2d83d20..2dfcb5a38 100644 --- a/compute_horde/compute_horde/executor_class.py +++ b/compute_horde/compute_horde/executor_class.py @@ -44,7 +44,7 @@ class ExecutorClassSpec: description="always on, NVIDIA RTX A6000 GPU machine for LLM prompts solving", has_gpu=True, gpu_vram_gb=48, - spin_up_time=int(timedelta(minutes=1).total_seconds()), + spin_up_time=int(timedelta(minutes=4).total_seconds()), ), # ExecutorClass.always_on__cpu_16c__ram_64gb: ExecutorClassSpec( # cpu_cores=16, diff --git a/validator/app/envs/prod/Dockerfile b/validator/app/envs/prod/Dockerfile index cf1b703b3..a22a40ad4 100644 --- a/validator/app/envs/prod/Dockerfile +++ b/validator/app/envs/prod/Dockerfile @@ -6,7 +6,7 @@ LABEL builder=true WORKDIR /root/src/ -RUN pip3 install --no-cache-dir 'pdm>=2.12,<3' +RUN pip3 install --no-cache-dir 'pdm==2.19.3' RUN apt-get update && apt-get install -y git COPY pyproject.toml pdm.lock ./ diff --git a/validator/app/src/compute_horde_validator/celery.py b/validator/app/src/compute_horde_validator/celery.py index 83f066994..aef75881e 100644 --- a/validator/app/src/compute_horde_validator/celery.py +++ b/validator/app/src/compute_horde_validator/celery.py @@ -39,6 +39,8 @@ "compute_horde_validator.validator.tasks.fetch_dynamic_config": DEFAULT_QUEUE, } +CELERY_TASK_QUEUES = list(set(TASK_QUEUE_MAP.values())) + def route_task(name, args, kwargs, options, task=None, **kw): if name not in TASK_QUEUE_MAP: @@ -60,3 +62,8 @@ def apply_startup_hook(*args, **kwargs): importlib.import_module(hook_script_file) else: print("Not loading any startup hook") + + +def get_num_tasks_in_queue(queue_name: str) -> int: + with app.pool.acquire(block=True) as conn: + return conn.default_channel.client.llen(queue_name) diff --git a/validator/app/src/compute_horde_validator/settings.py b/validator/app/src/compute_horde_validator/settings.py index 845426982..73c1bc105 
100644 --- a/validator/app/src/compute_horde_validator/settings.py +++ b/validator/app/src/compute_horde_validator/settings.py @@ -464,6 +464,8 @@ def wrapped(*args, **kwargs): CELERY_TASK_ROUTES = ["compute_horde_validator.celery.route_task"] CELERY_TASK_TIME_LIMIT = int(timedelta(hours=2, minutes=5).total_seconds()) CELERY_TASK_ALWAYS_EAGER = env.bool("CELERY_TASK_ALWAYS_EAGER", default=False) +CELERY_WORKER_SEND_TASK_EVENTS = True +CELERY_TASK_SEND_SENT_EVENT = True CELERY_ACCEPT_CONTENT = ["json"] CELERY_TASK_SERIALIZER = "json" CELERY_RESULT_SERIALIZER = "json" diff --git a/validator/app/src/compute_horde_validator/validator/metrics.py b/validator/app/src/compute_horde_validator/validator/metrics.py index db89bfee1..397467159 100644 --- a/validator/app/src/compute_horde_validator/validator/metrics.py +++ b/validator/app/src/compute_horde_validator/validator/metrics.py @@ -1,10 +1,15 @@ import glob import os +from collections.abc import Iterator import prometheus_client from django.http import HttpResponse from django_prometheus.exports import ExportToDjangoView from prometheus_client import multiprocess +from prometheus_client.core import REGISTRY, GaugeMetricFamily, Metric +from prometheus_client.registry import Collector + +from ..celery import get_num_tasks_in_queue, CELERY_TASK_QUEUES class RecursiveMultiProcessCollector(multiprocess.MultiProcessCollector): @@ -23,9 +28,25 @@ def metrics_view(request): if os.environ.get(ENV_VAR_NAME): registry = prometheus_client.CollectorRegistry() RecursiveMultiProcessCollector(registry) + registry.register(CustomCeleryCollector()) return HttpResponse( prometheus_client.generate_latest(registry), content_type=prometheus_client.CONTENT_TYPE_LATEST, ) else: return ExportToDjangoView(request) + + +class CustomCeleryCollector(Collector): + def collect(self) -> Iterator[Metric]: + num_tasks_in_queue = GaugeMetricFamily( + "celery_queue_len", + "How many tasks are there in a queue", + labels=("queue",), + ) + for queue in 
CELERY_TASK_QUEUES: + num_tasks_in_queue.add_metric([queue], get_num_tasks_in_queue(queue)) + yield num_tasks_in_queue + + +REGISTRY.register(CustomCeleryCollector()) diff --git a/validator/app/src/compute_horde_validator/validator/synthetic_jobs/batch_run.py b/validator/app/src/compute_horde_validator/validator/synthetic_jobs/batch_run.py index 991db7063..7750c9f0c 100644 --- a/validator/app/src/compute_horde_validator/validator/synthetic_jobs/batch_run.py +++ b/validator/app/src/compute_horde_validator/validator/synthetic_jobs/batch_run.py @@ -1466,7 +1466,7 @@ def _db_persist_system_events(ctx: BatchContext) -> None: # sync_to_async is needed since we use the sync Django ORM @sync_to_async -def _db_persist(ctx: BatchContext) -> None: +def _db_persist_critical(ctx: BatchContext) -> None: start_time = time.time() # persist the batch and the jobs in the same transaction, to @@ -1505,6 +1505,19 @@ def _db_persist(ctx: BatchContext) -> None: ) synthetic_jobs.append(synthetic_job) synthetic_jobs = SyntheticJob.objects.bulk_create(synthetic_jobs) + duration = time.time() - start_time + logger.info("Persisted to database in %.2f seconds", duration) + + +# sync_to_async is needed since we use the sync Django ORM +@sync_to_async +def _db_persist(ctx: BatchContext) -> None: + start_time = time.time() + + if ctx.batch_id is not None: + batch = SyntheticJobBatch.objects.get(id=ctx.batch_id) + else: + batch = SyntheticJobBatch.objects.get(started_at=ctx.stage_start_time["BATCH_BEGIN"]) miner_manifests: list[MinerManifest] = [] for miner in ctx.miners.values(): @@ -1523,7 +1536,7 @@ def _db_persist(ctx: BatchContext) -> None: # TODO: refactor into nicer abstraction synthetic_jobs_map: dict[str, SyntheticJob] = { - str(synthetic_job.job_uuid): synthetic_job for synthetic_job in synthetic_jobs + str(synthetic_job.job_uuid): synthetic_job for synthetic_job in batch.synthetic_jobs.all() } prompt_samples: list[PromptSample] = [] @@ -1700,6 +1713,9 @@ async def 
execute_synthetic_batch_run( func="_multi_close_client", ) + await ctx.checkpoint_system_event("_db_persist_critical") + await _db_persist_critical(ctx) + await ctx.checkpoint_system_event("_emit_telemetry_events") try: _emit_telemetry_events(ctx) diff --git a/validator/envs/prod/.env.template b/validator/envs/prod/.env.template deleted file mode 100644 index dda8128c9..000000000 --- a/validator/envs/prod/.env.template +++ /dev/null @@ -1,87 +0,0 @@ -ENV=backend-prod -DEBUG=off -DEBUG_TOOLBAR=off -SECRET_KEY= - -POSTGRES_DB=compute_horde_validator -POSTGRES_HOST=db -POSTGRES_PORT=5432 -POSTGRES_USER=postgres -POSTGRES_PASSWORD= -DATABASE_POOL_URL= -# using transaction-based db connection pool as DATABASE_URL instead of DATABASE_POOL_URL will break production -DATABASE_URL=postgres://postgres:@db:5432/compute_horde_validator - -NGINX_HOST= -BITTENSOR_NETUID= -BITTENSOR_NETWORK=finney -BITTENSOR_WALLET_NAME=compute_horde_validator -BITTENSOR_WALLET_HOTKEY_NAME=default -FACILITATOR_URI=wss://facilitator.computehorde.io/ws/v0/ -STATS_COLLECTOR_URL=https://facilitator.computehorde.io/stats_collector/v0/ - -CORS_ENABLED=on -CORS_ALLOWED_ORIGINS= -CORS_ALLOWED_ORIGIN_REGEXES= -CORS_ALLOW_ALL_ORIGINS=0 - -REDIS_HOST=redis -REDIS_PORT=6379 - - -CELERY_BROKER_URL=redis://redis:6379/0 -CELERY_TASK_ALWAYS_EAGER=0 -CELERY_CONCURRENCY=2 - -CELERY_FLOWER_USER=flower -CELERY_FLOWER_PASSWORD= - - - - -LOKI_URL=https://loki.reef.pl -LOKI_REFRESH_INTERVAL=5s -LOKI_USER= -LOKI_PASSWORD= -LOKI_CLIENT= -LOKI_CLIENT_SERVER_GROUP= - - -EMAIL_BACKEND=django.core.mail.backends.smtp.EmailBackend -EMAIL_FILE_PATH=/tmp/email -EMAIL_HOST=smtp.sendgrid.net -EMAIL_PORT=587 -EMAIL_USE_TLS=1 -EMAIL_HOST_USER=apikey -EMAIL_HOST_PASSWORD= -DEFAULT_FROM_EMAIL= - -SENTRY_DSN= - -CSP_ENABLED=n -CSP_REPORT_ONLY=y -CSP_REPORT_URL="" -CSP_DEFAULT_SRC="'none'" -CSP_SCRIPT_SRC="'self'" -CSP_STYLE_SRC="'self'" -CSP_FONT_SRC="'self'" -CSP_IMG_SRC="'self'" -CSP_MEDIA_SRC="'self'" -CSP_OBJECT_SRC="'self'" 
-CSP_FRAME_SRC="'self'" -CSP_CONNECT_SRC="'self'" -CSP_CHILD_SRC="'self'" -CSP_MANIFEST_SRC="'self'" -CSP_WORKER_SRC="'self'" -CSP_BLOCK_ALL_MIXED_CONTENT=y -CSP_EXCLUDE_URL_PREFIXES= - -BACKUP_B2_BUCKET= -BACKUP_B2_KEY_ID= -BACKUP_B2_KEY_SECRET= -BACKUP_LOCAL_ROTATE_KEEP_LAST= - -AWS_ACCESS_KEY_ID= -AWS_SECRET_ACCESS_KEY= -S3_BUCKET_NAME_PROMPTS= -S3_BUCKET_NAME_ANSWERS= diff --git a/validator/envs/prod/docker-compose.yml b/validator/envs/prod/docker-compose.yml deleted file mode 100644 index e0ea783d8..000000000 --- a/validator/envs/prod/docker-compose.yml +++ /dev/null @@ -1,291 +0,0 @@ -version: '3.7' - -services: - redis: - image: redis:6-alpine - command: redis-server --appendonly yes - healthcheck: - test: redis-cli ping - restart: unless-stopped - volumes: - - ./redis/data:/data - logging: &logging - driver: journald - options: - tag: '{{.Name}}' - - db: - image: postgres:14.0-alpine - healthcheck: - test: pg_isready -U ${POSTGRES_USER} || exit 1 - restart: unless-stopped - env_file: ./.env - environment: - - POSTGRES_DB=${POSTGRES_DB} - - POSTGRES_USER=${POSTGRES_USER} - - POSTGRES_PASSWORD=${POSTGRES_PASSWORD} - volumes: - - ./db/data:/var/lib/postgresql/data - logging: - <<: *logging - - app: - build: - context: . - dockerfile: app/Dockerfile - additional_contexts: - compute-horde: ../compute_horde - image: compute_horde_validator/app - healthcheck: - test: wget -q --spider 127.0.0.1:8000/admin/login/ || exit 1 - init: true - restart: unless-stopped - env_file: ./.env - environment: - # Add this variable to all containers that should dump Prometheus metrics. Each container besides this one - # should use a different subdirectory of /prometheus-multiproc-dir, e.g. - # - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/other-container - # Don't forget to also mount the prometheus-metrics volume in other containers too. 
- - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir - volumes: - - backend-static:/root/src/static - - ./media:/root/src/media - # Add this mount to each container that should dump Prometheus metrics. - - ./prometheus-metrics:/prometheus-multiproc-dir - depends_on: - - redis - - db - logging: - <<: *logging - - celery-worker-default: - image: compute_horde_validator/app - init: true - healthcheck: - test: celery -A compute_horde_validator status > /dev/null || exit 1 - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-default - command: ./celery-entrypoint.sh -Q default -E -l INFO -c ${CELERY_CONCURRENCY} - volumes: - - ./prometheus-metrics:/prometheus-multiproc-dir - tmpfs: /run - depends_on: - - redis - logging: - <<: *logging - - celery-worker-weights: - image: compute_horde_validator/app - init: true - healthcheck: - test: celery -A compute_horde_validator status > /dev/null || exit 1 - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-weights - command: ./celery-entrypoint.sh -Q weights -E -l INFO -c ${CELERY_CONCURRENCY} - volumes: - - ./prometheus-metrics:/prometheus-multiproc-dir - tmpfs: /run - depends_on: - - redis - logging: - <<: *logging - - celery-worker-jobs: - image: compute_horde_validator/app - init: true - healthcheck: - test: celery -A compute_horde_validator status > /dev/null || exit 1 - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-jobs - command: ./celery-entrypoint.sh -Q jobs -E -l INFO -c ${CELERY_CONCURRENCY} - volumes: - - ./prometheus-metrics:/prometheus-multiproc-dir - tmpfs: /run - depends_on: - - redis - logging: - <<: *logging - - celery-worker-llm: - image: compute_horde_validator/app - init: true - healthcheck: - test: celery -A compute_horde_validator 
status > /dev/null || exit 1 - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-llm - command: ./celery-entrypoint.sh -Q llm -E -l INFO -c ${CELERY_CONCURRENCY} - volumes: - - ./prometheus-metrics:/prometheus-multiproc-dir - tmpfs: /run - depends_on: - - redis - logging: - <<: *logging - - celery-worker-receipts: - image: compute_horde_validator/app - init: true - healthcheck: - test: celery -A compute_horde_validator status > /dev/null || exit 1 - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-receipts - command: ./celery-entrypoint.sh -Q receipts -E -l INFO -c ${CELERY_CONCURRENCY} - volumes: - - ./prometheus-metrics:/prometheus-multiproc-dir - tmpfs: /run - depends_on: - - redis - logging: - <<: *logging - - celery-beat: - image: compute_horde_validator/app - init: true - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - command: nice celery -A compute_horde_validator beat -l INFO --schedule /tmp/celerybeat-schedule -f /tmp/logs/celery-beat.log - volumes: - - ./logs:/tmp/logs - depends_on: - - redis - logging: - <<: *logging - - celery-flower: - image: compute_horde_validator/app - healthcheck: - test: wget --user "${CELERY_FLOWER_USER}" --password "${CELERY_FLOWER_PASSWORD}" -qO- 127.0.0.1:5555 > /dev/null || exit 1 - init: true - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - command: celery --app=compute_horde_validator --broker="${CELERY_BROKER_URL}" flower --basic_auth="${CELERY_FLOWER_USER}:${CELERY_FLOWER_PASSWORD}" - ports: - - 5555:5555 - logging: - <<: *logging - - connect-facilitator: - image: compute_horde_validator/app - init: true - restart: unless-stopped - env_file: ./.env - environment: - - DEBUG=off - command: python manage.py connect_facilitator - volumes: - - ./logs:/tmp/logs - depends_on: - - redis - logging: 
- <<: *logging - - nginx: - image: 'ghcr.io/reef-technologies/nginx-rt:v1.2.1' - restart: unless-stopped - healthcheck: - test: wget -q --spider 0.0.0.0:80 || exit 1 - environment: - - NGINX_HOST=${NGINX_HOST} - volumes: - - ./nginx/templates:/etc/nginx/templates - - ./nginx/config_helpers:/etc/nginx/config_helpers - - backend-static:/srv/static:ro - - ./media:/srv/media:ro - - ./letsencrypt/etc:/etc/letsencrypt - - ./nginx/monitoring_certs:/etc/monitoring_certs - depends_on: - - app - - - cadvisor - - node-exporter - - command: nginx -g 'daemon off;' - ports: - - 127.0.0.1:80:80 - - 127.0.0.1:443:443 - - - 10443:10443 - - logging: - <<: *logging - - extra_hosts: - - "host.docker.internal:host-gateway" - - - - node-exporter: - image: prom/node-exporter:latest - container_name: node-exporter - restart: unless-stopped - network_mode: host - pid: host - volumes: - - /:/host:ro,rslave - command: - - '--path.rootfs=/host' - - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|run|boot|var/.+)($$|/)' - - - '--collector.tcpstat' - - logging: - <<: *logging - - cadvisor: - image: gcr.io/cadvisor/cadvisor:v0.40.0 - container_name: cadvisor - devices: - - /dev/kmsg:/dev/kmsg - volumes: - - /:/rootfs:ro - - /var/run:/var/run:ro - - /sys:/sys:ro - - /var/lib/docker:/var/lib/docker:ro - - /cgroup:/cgroup:ro - restart: unless-stopped - logging: - <<: *logging - - - - promtail: - image: grafana/promtail:2.9.2 - restart: unless-stopped - environment: - - LOKI_URL=${LOKI_URL} - - LOKI_REFRESH_INTERVAL=${LOKI_REFRESH_INTERVAL} - - LOKI_USER=${LOKI_USER} - - LOKI_PASSWORD=${LOKI_PASSWORD} - - LOKI_CLIENT=${LOKI_CLIENT} - - LOKI_CLIENT_SERVER_GROUP=${LOKI_CLIENT_SERVER_GROUP} - volumes: - - ./promtail:/etc/promtail - - /var/run/docker.sock:/var/run/docker.sock:ro - command: - - -config.file=/etc/promtail/config.yml - - -config.expand-env=true - logging: - <<: *logging - - -volumes: - backend-static: diff --git a/validator/envs/runner/data/docker-compose.yml 
b/validator/envs/runner/data/docker-compose.yml index e8a7c8eea..e3aef12b6 100644 --- a/validator/envs/runner/data/docker-compose.yml +++ b/validator/envs/runner/data/docker-compose.yml @@ -27,7 +27,7 @@ services: app: image: backenddevelopersltd/${VALIDATOR_IMAGE_REPO}:v0-latest - pull_policy: always + pull_policy: ${PULL_POLICY:-always} healthcheck: test: wget -q --spider 127.0.0.1:8000/admin/login/ || exit 1 init: true @@ -48,9 +48,9 @@ services: labels: - "com.centurylinklabs.watchtower.enable=true" - celery-worker: + celery-worker-default: &celery image: backenddevelopersltd/${VALIDATOR_IMAGE_REPO}:v0-latest - pull_policy: always + pull_policy: ${PULL_POLICY:-always} init: true healthcheck: test: celery -A compute_horde_validator status > /dev/null || exit 1 @@ -58,10 +58,11 @@ services: env_file: ./.env environment: - CELERY_CONCURRENCY=4 - - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir + - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-default volumes: - ${HOST_WALLET_DIR}:/root/.bittensor/wallets - command: ./celery-entrypoint.sh + - ${HOST_PROMETHEUS_METRICS_DIR:-./prometheus-metrics}:/prometheus-multiproc-dir + command: ./celery-entrypoint.sh -Q default -E -l INFO -c 4 tmpfs: /run depends_on: - redis @@ -71,14 +72,120 @@ services: labels: - "com.centurylinklabs.watchtower.enable=true" - celery-beat: + celery-worker-weights: + <<: *celery + command: ./celery-entrypoint.sh -Q weights -E -l INFO -c 4 + environment: + - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-weights + + celery-worker-jobs: + <<: *celery + command: ./celery-entrypoint.sh -Q jobs -E -l INFO -c 4 + environment: + - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-jobs + + celery-worker-llm: + <<: *celery + command: ./celery-entrypoint.sh -Q llm -E -l INFO -c 4 + environment: + - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-llm + + celery-worker-receipts: + <<: *celery + command: ./celery-entrypoint.sh -Q 
receipts -E -l INFO -c 4 + environment: + - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir/celery-worker-receipts + + cadvisor: + image: gcr.io/cadvisor/cadvisor:v0.40.0 + devices: + - /dev/kmsg:/dev/kmsg + volumes: + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker:/var/lib/docker:ro + - /cgroup:/cgroup:ro + restart: unless-stopped + logging: + <<: *logging + + node-exporter: + image: prom/node-exporter:latest + restart: unless-stopped + network_mode: host + pid: host + volumes: + - /:/host:ro,rslave + command: + - '--path.rootfs=/host' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|run|boot|var/.+)($$|/)' + - '--collector.tcpstat' + logging: + <<: *logging + + prometheus: + image: backenddevelopersltd/bittensor_prometheus:latest + restart: unless-stopped + links: + - cadvisor + - app + - prometheus-proxy + - celery-flower + env_file: ./.env + volumes: + - ${HOST_WALLET_DIR}:/wallets + entrypoint: | + /bin/sh -c 'cat <<EOF > /etc/prometheus/prometheus.yml.template + global: + scrape_interval: 30s + scrape_configs: + - job_name: 'validator' + static_configs: + - targets: ['cadvisor:8080', 'host.docker.internal:9100', 'app:8000', 'celery-flower:5555'] + labels: + hotkey: '{hotkey}' + remote_write: + - url: "http://prometheus-proxy:8000/prometheus_outbound_proxy/" + EOF + exec /entrypoint.sh --log.level=debug --storage.tsdb.retention.size=1GB' + + extra_hosts: + - "host.docker.internal:host-gateway" + + prometheus-proxy: + image: backenddevelopersltd/bittensor-prometheus-proxy:latest + restart: unless-stopped + environment: + - CENTRAL_PROMETHEUS_PROXY_URL=https://prometheus.bactensor.io + - ENV=prometheus-proxy-prod + - SECRET_KEY=${SECRET_KEY} + - BITTENSOR_WALLET_NAME=${BITTENSOR_WALLET_NAME} + - BITTENSOR_WALLET_HOTKEY_NAME=${BITTENSOR_WALLET_HOTKEY_NAME} + - SENTRY_DSN=${SENTRY_DSN} + volumes: + - ${HOST_WALLET_DIR}:/root/.bittensor/wallets + + celery-flower: image: 
backenddevelopersltd/${VALIDATOR_IMAGE_REPO}:v0-latest - pull_policy: always + healthcheck: + test: wget -qO- 127.0.0.1:5555 > /dev/null || exit 1 init: true restart: unless-stopped env_file: ./.env environment: - - PROMETHEUS_MULTIPROC_DIR=/prometheus-multiproc-dir + - DEBUG=off + - FLOWER_TASK_RUNTIME_METRIC_BUCKETS=1,2,3,5,10,20,30,45,60,120,180,240,300,600,inf + command: celery --app=compute_horde_validator --broker="redis://redis:6379/0" flower + logging: + <<: *logging + + celery-beat: + image: backenddevelopersltd/${VALIDATOR_IMAGE_REPO}:v0-latest + pull_policy: ${PULL_POLICY:-always} + init: true + restart: unless-stopped + env_file: ./.env volumes: - ${HOST_WALLET_DIR}:/root/.bittensor/wallets command: ./celery-beat-entrypoint.sh @@ -92,7 +199,7 @@ services: connect-facilitator: image: backenddevelopersltd/${VALIDATOR_IMAGE_REPO}:v0-latest - pull_policy: always + pull_policy: ${PULL_POLICY:-always} init: true restart: unless-stopped env_file: ./.env