eea · mckeea · Jun 11, 2025 · Jun 4, 2025 · Jun 4, 2025 · Jun 4, 2025
@@ -1,42 +1,15 @@
 #!/bin/bash
 set -e
 
-# echo "🐍 Setting up Python environment..."
-# apt-get update
-# apt-get install -y python3 python3-venv python3-pip
-
-# echo "📦 Creating virtual environment..."
-# python3 -m venv venv
-# source venv/bin/activate
-
-# echo "⬆️ Upgrading pip inside virtual environment..."
-# pip install --upgrade pip
-
-# echo "📦 Installing Python dependencies..."
-# pip install \
-#     keybert \
-#     ruamel.yaml \
-#     pyyaml \
-#     transformers==4.37.2 \
-#     accelerate==0.27.2
-
-# source venv/bin/activate
-
-# echo "🛠 Setting up default Quarto configuration..."
-# mv _quarto_not_used.yaml _quarto.yaml
-
-# echo "🏷 Generating keywords..."
-# python scripts/render/generate_keywords.py
-
-#echo "🧹 Cleaning up cached _site directory..."
-#rm -rf _site
-
-
 echo "🖼 Render all documents into to HTML/DOCX"
 sudo cp /usr/bin/chromium /usr/bin/chromium-browser
-QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html
-QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx --no-clean
+QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx 
 find _site -type f -name 'index.docx' -delete
+QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html --no-clean
+
+# Back up the correct sitemap, as it may be overwritten by the operations that follow
+sleep 5
+mv _site/sitemap.xml _site/sitemap.xml.bkp
 
 # Announce the step, then run the Node script that generates the index.qmd files.
 echo "🛠 Generate index.qmd files for all DOCS/* folders"
 node .github/scripts/generate_index_all.mjs
@@ -63,6 +36,9 @@ echo '<!DOCTYPE html>
   </body>
 </html>' > _site/index.html
 
+# Revert the correct sitemap
+cp _site/sitemap.xml.bkp _site/sitemap.xml
+rm -f _site/sitemap.xml.bkp
 
 echo "📄 Converting .docx files to .pdf..."
 #chmod +x ./convert_docx_to_pdf.sh

@@ -19,7 +19,7 @@ listing:
   type: table
   contents: .
   sort: title
-  fields: [title]
+  fields: [title, date, version]
 ---
 `;
 

@@ -0,0 +1,132 @@
+from pathlib import Path
+import json
+import time
+import re
+import google.generativeai as genai
+import tiktoken
+import yaml
+from io import StringIO
+import os
+from pathlib import Path
+
+# Configuration
+API_KEY = os.getenv("GEMINI_API_KEY")
+if not API_KEY:
+    raise EnvironmentError("GEMINI_API_KEY environment variable not set")
+MODEL_NAME = "gemini-2.0-flash"
+TOKEN_LIMIT_PER_MINUTE = 950_000  # Keep a safe margin below 1M
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+INPUT_DIR = (SCRIPT_DIR / "../../DOCS").resolve()
+
+PROMPT = """You are an AI assistant helping to enrich a Quarto Markdown (.qmd) technical document prepared for the European Environment Agency (EEA).
+
+Your tasks:
+1. Read and understand the entire attached document.
+2. Generate a professional, engaging **Introduction** (max 1 paragraph) that clearly explains the document’s purpose, scope, and technical focus.
+3. Extract exactly 10 **precise and conceptually meaningful keywords or key phrases** that reflect the core scientific or technical content of the document.
+
+Keyword guidance:
+- Do **not** use general terms like \"Urban Atlas\", \"metadata\", \"documentation\", \"nomenclature\", or \"report\".
+- Focus on **specific concepts, methods, environmental indicators, technical systems, data processing strategies**, or **analytical results** that are central to the document.
+- Use **multi-word phrases** when needed for clarity and specificity.
+- Think like an expert indexing the document for scientific search or semantic web use.
+
+Return only the result as a raw JSON object (no code block, no explanation):
+
+{
+  \"introduction\": \"...\",
+  \"keywords\": [\"keyword1\", \"keyword2\", ..., \"keyword10\"]
+}
+"""
+
+# Setup Gemini
+genai.configure(api_key=API_KEY)
+model = genai.GenerativeModel(MODEL_NAME)
+encoding = tiktoken.get_encoding("cl100k_base")
+total_tokens_sent = 0
+
+
+# Function to update YAML frontmatter using PyYAML
+def update_yaml_header(content: str, description: str, keywords_list: list):
+    lines = content.splitlines()
+    if lines[0].strip() != "---":
+        return content
+
+    try:
+        end_idx = lines[1:].index("---") + 1
+    except ValueError:
+        return content
+
+    yaml_block = "\n".join(lines[1:end_idx])
+    yaml_data = yaml.safe_load(yaml_block) or {}
+    yaml_data["description"] = description.replace("\n", " ").strip()
+    yaml_data["keywords"] = keywords_list
+
+    new_yaml_block = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True).strip()
+    new_lines = ["---"] + new_yaml_block.splitlines() + ["---"] + lines[end_idx + 1 :]
+    return "\n".join(new_lines)
+
+
+# Function to process one document with Gemini
+def process_document_with_llm(doc_path: Path):
+    print("Processing ", doc_path)
+    global total_tokens_sent
+
+    file_contents = doc_path.read_text(encoding="utf-8")
+    input_tokens = len(encoding.encode(file_contents))
+    if total_tokens_sent + input_tokens > TOKEN_LIMIT_PER_MINUTE:
+        print(
+            f"[SKIPPED] {doc_path} would exceed token budget. Estimated at {input_tokens} tokens."
+        )
+        return
+
+    response = model.generate_content(
+        contents=[
+            {
+                "role": "user",
+                "parts": [
+                    {"text": PROMPT},
+                    {
+                        "inline_data": {
+                            "mime_type": "text/plain",
+                            "data": file_contents.encode("utf-8"),
+                        }
+                    },
+                ],
+            }
+        ]
+    )
+
+    total_tokens_sent += input_tokens
+
+    raw_text = response.text.strip()
+    if raw_text.startswith("```"):
+        raw_text = re.sub(r"^```(?:json)?\s*", "", raw_text)
+        raw_text = re.sub(r"\s*```$", "", raw_text)
+
+    try:
+        parsed_output = json.loads(raw_text)
+        introduction = parsed_output["introduction"]
+        keywords_list = parsed_output["keywords"]
+        keywords = ", ".join(keywords_list)
+    except (json.JSONDecodeError, KeyError) as e:
+        print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
+        return
+
+    updated_content = update_yaml_header(file_contents, introduction, keywords_list)
+    output_file = doc_path.with_name(doc_path.stem + ".qmd")
+    output_file.write_text(updated_content, encoding="utf-8")
+
+    print("Estimated input tokens:", input_tokens)
+
+
+# Process all .qmd files found recursively under INPUT_DIR, skipping paths that contain a blacklisted directory name
+BLACKLISTED_DIRS = {"templates", "includes", "theme"}  # compared against every component of each path
+
+for doc_path in INPUT_DIR.rglob("*.qmd"):
+    if any(part in BLACKLISTED_DIRS for part in doc_path.parts):  # INPUT_DIR is resolved, so ancestor directories are compared too
+        continue
+    process_document_with_llm(doc_path)
+
+print("Total tokens sent:", total_tokens_sent)  # running total accumulated by process_document_with_llm
@@ -6,10 +6,6 @@ on:
       - develop
       - test
       - main
-  workflow_run:
-    workflows: ["Auto Merge Approved PRs"]
-    types:
-      - completed
 
 jobs:
   deploy_docs:
@@ -25,6 +21,13 @@ jobs:
         with:
           fetch-depth: 0
 
+      - name: Generate intros and keywords
+        uses: addnab/docker-run-action@v3
+        with:
+          image: mckeea/llm-doc-annotator:latest
+          options: -e GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} -v ${{ github.workspace }}:/app
+          run: python .github/scripts/generate_intros_and_keywords.py
+
       - name: Build Docs
         run: .github/scripts/build-docs.sh
 

@@ -2,7 +2,7 @@
 title: "Guide for Writing Technical Documentation"
 subtitle: "Copernicus Land Monitoring Service"
 author: "European Environment Agency (EEA)"
-version: 0.5
+version: 0.6
 description: "A comprehensive guide for creating technical documentation for the Copernicus
   Land Monitoring Service using Quarto. It covers Markdown basics, document rendering,
   and the review process, ensuring consistency and clarity in documentation."
@@ -93,6 +93,72 @@ Markdown is a simple way to format text using plain characters --- no need for c
 
 This section shows the most useful Markdown elements you'll need when writing documentation. If you want to explore more, visit the official [Quarto Markdown guide](https://quarto.org/docs/authoring/markdown-basics.html).
 
+## Line Breaks and New Lines
+
+In Markdown, how you break a line can affect how your text is displayed in the final document. Quarto follows standard Markdown behavior, so it's important to understand the difference between soft and hard line breaks.
+
+### Soft Line Break (Just Pressing Enter)
+
+When you press `Enter` once and start a new line in your text editor, Markdown **does not** create a visible line break in the output. Instead, it treats the two lines as part of the same paragraph. 
+
+Example (input):
+
+```markdown
+This is the first line
+and this is the second line.
+```
+
+Rendered output:
+
+&nbsp;&nbsp;&nbsp;&nbsp;This is the first line
+and this is the second line.
+
+This keeps your Markdown source tidy, but it won’t create a visible line break unless you explicitly add one.
+
+### Hard Line Break (Using `\` at End of Line)
+
+To force a visible line break in Markdown, you must add two spaces at the end of a line or use a backslash `\`. Quarto supports both, but using `\` is clearer and more explicit.
+
+```markdown
+This is the first line.\
+and this is the second line.
+```
+
+Rendered output:
+
+&nbsp;&nbsp;&nbsp;&nbsp;This is the first line.\
+&nbsp;&nbsp;&nbsp;&nbsp;and this is the second line.
+
+
+### Paragraph Break (Double Enter)
+
+
+If you press Enter twice (i.e., leave a blank line between two lines), Markdown will treat the content as two separate paragraphs. This results in a larger vertical space between the lines in the rendered output.
+
+Example (input):
+
+```markdown
+This is the first paragraph.
+
+This is the second paragraph.
+```
+
+Rendered output:
+
+&nbsp;&nbsp;&nbsp;&nbsp;This is the first paragraph.
+
+&nbsp;&nbsp;&nbsp;&nbsp;This is the second paragraph.
+
+
+This behavior is especially important when structuring readable documentation, separating ideas, or organizing content clearly.
+
+### Summary
+
+- Use `Enter` for a new line in your editor, but **don’t expect a visible line break**.
+- Use `\` at the end of a line when you want to **force a line break**.
+- Use **double Enter** (i.e., an empty line between paragraphs) to start a **new paragraph with extra spacing**.
+
+
 ## Headings
 
 Use the `#` symbol to create headings and organize your content. More `#` means a smaller heading level:

@@ -2,4 +2,4 @@
 
 This repository contains technical documents for the CLMS, such as ATBD's, PUM's, or nomenclature guidelines.
 
-The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/)
+The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/main/DOCS/)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,4 +2,4 @@

		This repository contains technical documents for the CLMS, such as ATBD's, PUM's, or nomenclature guidelines.

		The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/)
		The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/main/DOCS/)