diff --git a/github-metrics/.drone.yml b/github-metrics/.drone.yml new file mode 100644 index 0000000..2fe65c2 --- /dev/null +++ b/github-metrics/.drone.yml @@ -0,0 +1,77 @@ +--- +kind: pipeline +type: kubernetes +name: github-metrics + +trigger: + branch: + - main + event: + - push + +steps: +- name: check-changes + image: alpine/git + commands: + - | + # Check if any files in github-metrics/ directory changed + git diff --name-only $DRONE_COMMIT_BEFORE $DRONE_COMMIT_AFTER | grep -q "^github-metrics/" && echo "Changes detected" || (echo "No changes in github-metrics/, skipping" && exit 78) + +- name: test + image: node:20-alpine + commands: + - cd github-metrics + - npm ci + - node --version + - npm --version + - echo "Validating package.json and dependencies..." + +- name: publish + image: plugins/kaniko-ecr + settings: + create_repository: true + registry: 795250896452.dkr.ecr.us-east-1.amazonaws.com + repo: docs/github-metrics + tags: + - git-${DRONE_COMMIT_SHA:0:7} + - latest + access_key: + from_secret: ecr_access_key + secret_key: + from_secret: ecr_secret_key + context: github-metrics + dockerfile: github-metrics/Dockerfile + +- name: deploy + image: quay.io/mongodb/drone-helm:v3 + settings: + chart: mongodb/cronjobs + chart_version: 1.21.2 + add_repos: [ mongodb=https://10gen.github.io/helm-charts ] + namespace: docs + release: github-metrics + values: image.tag=git-${DRONE_COMMIT_SHA:0:7},image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/github-metrics + values_files: [ 'github-metrics/cronjobs.yml' ] + api_server: https://api.prod.corp.mongodb.com + kubernetes_token: + from_secret: kubernetes_token + +- name: notify-slack + image: alpine/curl + environment: + SLACK_WEBHOOK: + from_secret: slack_webhook_url + commands: + - | + if [ "$DRONE_BUILD_STATUS" = "success" ]; then + STATUS_MSG="✅ *GitHub Metrics CronJob Deploy Succeeded*" + else + STATUS_MSG="❌ *GitHub Metrics CronJob Deploy Failed*" + fi + curl -X POST -H 'Content-type: 
application/json' \ + --data "{\"text\": \"$STATUS_MSG\n*Repo:* $DRONE_REPO_NAME\n*Branch:* $DRONE_BRANCH\n*Commit:* ${DRONE_COMMIT_SHA:0:7}\n*Author:* $DRONE_COMMIT_AUTHOR\n*Build:* <$DRONE_BUILD_LINK|#$DRONE_BUILD_NUMBER>\"}" \ + "$SLACK_WEBHOOK" + when: + status: + - success + - failure diff --git a/github-metrics/Dockerfile b/github-metrics/Dockerfile new file mode 100644 index 0000000..e4bd3f0 --- /dev/null +++ b/github-metrics/Dockerfile @@ -0,0 +1,29 @@ +FROM node:20-alpine + +# Set working directory +WORKDIR /app + +# Copy package files first (for better Docker layer caching) +COPY package.json package-lock.json ./ + +# Install production dependencies only (npm ci for reproducible builds; --omit=dev replaces the deprecated --only=production) +RUN npm ci --omit=dev + +# Copy the rest of the application files +COPY . . + +# Create a non-root user for security best practices +RUN addgroup -g 1001 -S nodejs && \ + adduser -S nodejs -u 1001 && \ + chown -R nodejs:nodejs /app + +# Switch to non-root user +USER nodejs + +# Set NODE_ENV to production +ENV NODE_ENV=production + +# Command to run the application +# This will be executed by the Kubernetes CronJob +CMD ["node", "index.js"] + diff --git a/github-metrics/README.md b/github-metrics/README.md index 3dd81eb..ef65d82 100644 --- a/github-metrics/README.md +++ b/github-metrics/README.md @@ -2,9 +2,9 @@ This directory contains tooling to enable us to track various GitHub project metrics programmatically. -Currently, it contains a PoC for a simple pipeline to pull metrics from GitHub into MongoDB Atlas. +This tool runs as a Kubernetes CronJob on Kanopy, automatically collecting metrics from GitHub approximately every 13-14 days and storing them in MongoDB Atlas. 
-Planned future work: +Planned future work: - Add logic to work with pulled maintenance metrics once available in the test repo - Set up Atlas Charts to visualize the data @@ -14,7 +14,7 @@ Planned future work: ### Get metrics from GitHub This is a simple PoC that uses [octokit](https://github.com/octokit/octokit.js) to get the following data out of GitHub -for a given repository over a trailing 14 day period: +for a given repository over a trailing 14-day period: - Views - Unique Views @@ -24,7 +24,7 @@ for a given repository over a trailing 14 day period: - Top 10 referral sources - Top 10 paths/destinations in the repo -The intent is to also get the following maintenance-related stats for a given repository over a trailing 14 day period: +The intent is to also get the following maintenance-related stats for a given repository over a trailing 14-day period: - Code frequency - Commit count @@ -119,7 +119,7 @@ For this project, as a MongoDB org member, you must also auth your PAT with SSO. npm install ``` -3. **Run the utility** +3. **Manually run the utility** From the root of the directory, run the following command to run the utility: @@ -132,3 +132,107 @@ For this project, as a MongoDB org member, you must also auth your PAT with SSO. ``` A document was inserted into mongodb_docs-notebooks with the _id: 678197a0ffe1539ff213bd86 ``` + +## Automated Deployment (Kanopy CronJob) + +This tool is deployed as a Kubernetes CronJob on Kanopy that runs automatically approximately every 13-14 days. + +### Deployment Architecture + +The deployment consists of three main components: + +1. **Dockerfile**: Containerizes the Node.js application +2. **cronjobs.yml**: Helm values file that configures the CronJob schedule and resources +3. 
**.drone.yml**: CI/CD pipeline that builds, publishes, and deploys the application + +### CronJob Schedule + +The cronjob is **scheduled to run weekly on Mondays at 8:00 AM UTC** (`0 8 * * 1`), but the application includes smart logic to prevent running too frequently: + +- The cronjob triggers every Monday +- The application checks if 13 days have passed since the last successful run +- If less than 13 days have passed, the job exits early without collecting metrics +- If 13 days or more have passed, it collects metrics and updates the timestamp + +The last run timestamp is stored in a persistent volume (`/data/last-run.json`) that survives between cronjob executions. + +#### Environment Variables + +The following environment variables can be configured: + +- **`ATLAS_CONNECTION_STRING`** (required): MongoDB Atlas connection string for storing metrics +- **`GITHUB_TOKEN`** (required): GitHub Personal Access Token with `repo` permissions +- **`STATE_FILE_PATH`** (optional): Path to the state file for tracking last run timestamp. Default: `/data/last-run.json` +- **`MIN_DAYS_BETWEEN_RUNS`** (optional): Minimum number of days between metric collection runs. Default: `13` + +The required secrets (`ATLAS_CONNECTION_STRING` and `GITHUB_TOKEN`) are configured in `cronjobs.yml` as Kubernetes secrets. + +### Deployment Process + +The deployment is fully automated via Drone CI/CD with the following steps: + +1. **Check Changes**: Verifies if files in `github-metrics/` directory changed +2. **Test**: Validates dependencies with `npm ci` +3. **Build**: Builds Docker image using Kaniko and publishes to ECR +4. **Deploy**: Deploys to production Kanopy cluster using Helm +5. **Notify**: Sends Slack notification on success or failure + +The pipeline only runs on pushes to the `main` branch and skips if no github-metrics files changed. + +### Manual Deployment + +To manually trigger a deployment: + +1. Push changes to the `main` branch +2. 
Drone will automatically run the test, build, and deploy pipelines + +### Manually Triggering the CronJob + +To manually run the cronjob outside of its schedule: + +```bash +# Find the cronjob +kubectl get cronjobs -n docs + +# Create a one-time job from the cronjob +kubectl create job --from=cronjob/github-metrics-collection \ + github-metrics-manual-$(date +%s) -n docs + +# Check the job status +kubectl get jobs -n docs + +# View logs +kubectl logs -n docs job/github-metrics-manual-<timestamp> +``` + +### Monitoring + +To check the status of the cronjob: + +```bash +# View cronjob details +kubectl get cronjob github-metrics-collection -n docs + +# View recent job runs +kubectl get jobs -n docs | grep github-metrics + +# View logs from the most recent run +kubectl logs -n docs -l job-name=<job-name> + +# Check the last run timestamp (requires exec into a pod) +kubectl exec -n docs <pod-name> -- cat /data/last-run.json +``` + +The logs will show whether the job ran or was skipped: +- `Skipping run - only X days since last run (need 13)` - Job skipped, not enough time passed +- `Proceeding with run - X days since last run` - Job is collecting metrics + +### Configuration Changes + +To modify the cronjob configuration: + +1. **Change schedule**: Edit `cronjobs.yml` and update the `schedule` field +2. **Change resources**: Edit `cronjobs.yml` and update the `resources` section +3. **Change repositories tracked**: Edit `repo-details.json` + +After making changes, commit and push to the `main` branch. Drone will automatically deploy the updates. 
diff --git a/github-metrics/check-last-run.js b/github-metrics/check-last-run.js new file mode 100644 index 0000000..104e157 --- /dev/null +++ b/github-metrics/check-last-run.js @@ -0,0 +1,80 @@ +import fs from 'fs'; +import { writeFile, mkdir } from 'fs/promises'; +import path from 'path'; + +// Path to the state file (mounted from persistent volume) +// Can be overridden via STATE_FILE_PATH environment variable +const STATE_FILE_PATH = process.env.STATE_FILE_PATH || '/data/last-run.json'; + +// Minimum days between runs (13 days to account for timing variations with weekly Monday runs) +// Can be overridden via MIN_DAYS_BETWEEN_RUNS environment variable +const MIN_DAYS_BETWEEN_RUNS = parseInt(process.env.MIN_DAYS_BETWEEN_RUNS || '13', 10); + +/** + * Check if enough time has passed since the last run + * @returns {boolean} true if should run, false if should skip + */ +export function shouldRun() { + try { + // Check if state file exists + if (!fs.existsSync(STATE_FILE_PATH)) { + console.log('No previous run found. 
Running for the first time.'); + return true; + } + + // Read the last run timestamp + const stateData = JSON.parse(fs.readFileSync(STATE_FILE_PATH, 'utf8')); + const lastRunTime = new Date(stateData.lastRun); + const now = new Date(); + + // Calculate days since last run + const daysSinceLastRun = (now - lastRunTime) / (1000 * 60 * 60 * 24); + + console.log(`Last run: ${lastRunTime.toISOString()}`); + console.log(`Days since last run: ${daysSinceLastRun.toFixed(2)}`); + console.log(`Minimum days required: ${MIN_DAYS_BETWEEN_RUNS}`); + + if (daysSinceLastRun < MIN_DAYS_BETWEEN_RUNS) { + console.log(`⏭️ Skipping run - only ${daysSinceLastRun.toFixed(2)} days since last run (need ${MIN_DAYS_BETWEEN_RUNS})`); + return false; + } + + console.log(`✅ Proceeding with run - ${daysSinceLastRun.toFixed(2)} days since last run`); + return true; + + } catch (error) { + console.error('Error checking last run time:', error.message); + console.log('Proceeding with run due to error reading state file'); + return true; // Run if we can't read the state file + } +} + +/** + * Update the state file with the current timestamp + */ +export async function updateLastRun() { + try { + const now = new Date(); + const stateData = { + lastRun: now.toISOString(), + timestamp: now.getTime() + }; + + // Ensure the directory exists + const dir = path.dirname(STATE_FILE_PATH); + if (!fs.existsSync(dir)) { + await mkdir(dir, { recursive: true }); + } + + // Write the state file + await writeFile(STATE_FILE_PATH, JSON.stringify(stateData, null, 2), 'utf8'); + console.log(`✅ Updated last run timestamp: ${now.toISOString()}`); + + } catch (error) { + console.error('Error updating last run time:', error.message); + // Don't throw - we don't want to fail the job just because we can't write the state file + } +} + +export { MIN_DAYS_BETWEEN_RUNS }; + diff --git a/github-metrics/cronjobs.yml b/github-metrics/cronjobs.yml new file mode 100644 index 0000000..b0e757a --- /dev/null +++ 
b/github-metrics/cronjobs.yml @@ -0,0 +1,35 @@ +--- +# `image` can be skipped if the values are being set in your .drone.yml file +image: + repository: 795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/github-metrics + tag: latest + +# global secrets are references to k8s Secrets +globalEnvSecrets: + GITHUB_TOKEN: github-token + ATLAS_CONNECTION_STRING: atlas-connection-string + +cronJobs: +- name: github-metrics-collection + # Run weekly on Mondays at 8am UTC + # The application skips the run if fewer than 13 days (the MIN_DAYS_BETWEEN_RUNS default) have passed since the last recorded run + # Cron format: minute hour day-of-month month day-of-week + # 0 = Sunday, 1 = Monday, etc. + schedule: "0 8 * * 1" + command: + - node + - index.js + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + # Persistent volume to store last run timestamp + persistence: + enabled: true + storageClass: "standard" + accessMode: ReadWriteOnce + size: 1Gi + mountPath: /data diff --git a/github-metrics/index.js b/github-metrics/index.js index 0616e92..0f6df2b 100644 --- a/github-metrics/index.js +++ b/github-metrics/index.js @@ -2,6 +2,7 @@ import { readFile } from 'fs/promises'; import { getGitHubMetrics } from "./get-github-metrics.js"; import { addMetricsToAtlas } from "./write-to-db.js"; import { RepoDetails } from './RepoDetails.js'; // Import the RepoDetails class +import { shouldRun, updateLastRun } from './check-last-run.js'; /* To change which repos to track metrics for, update the `repo-details.json` file. To track metrics for a new repo, add a new entry with the owner and repo name. 
@@ -36,13 +37,33 @@ async function processRepos() { } await addMetricsToAtlas(metricsDocs); + + // Update the last run timestamp after successful completion + await updateLastRun(); } catch (error) { console.error('Error processing repos:', error); + throw error; // Re-throw to be caught by main handler } } -// Call the function -processRepos().catch(error => { - console.error('Fatal error:', error); +// Main execution +async function main() { + console.log('🚀 GitHub Metrics Collection Starting...'); + + // Check if enough time has passed since last run + if (!shouldRun()) { + console.log('Exiting - not enough time has passed since last run'); + process.exit(0); + } + + // Process repos and collect metrics + await processRepos(); + + console.log('✅ GitHub Metrics Collection Complete'); +} + +// Call the main function +main().catch(error => { + console.error('❌ Fatal error:', error); process.exit(1); }); diff --git a/github-metrics/package-lock.json b/github-metrics/package-lock.json index f52d4b0..fc1bfbe 100644 --- a/github-metrics/package-lock.json +++ b/github-metrics/package-lock.json @@ -134,6 +134,7 @@ "resolved": "https://registry.npmjs.org/@octokit/core/-/core-7.0.5.tgz", "integrity": "sha512-t54CUOsFMappY1Jbzb7fetWeO0n6K0k/4+/ZpkS+3Joz8I4VcvY9OiEBFRYISqaI2fq5sCiPtAjRDOzVYG8m+Q==", "license": "MIT", + "peer": true, "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.2",