From 8c00a7481f0b01b61d42bff7bee38a79c8cf7008 Mon Sep 17 00:00:00 2001 From: mckeea Date: Wed, 4 Jun 2025 09:40:30 +0000 Subject: [PATCH 1/7] update: editor's manual --- DOCS/guidelines/editor-manual.qmd | 66 +++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/DOCS/guidelines/editor-manual.qmd b/DOCS/guidelines/editor-manual.qmd index c2b2a6e7..a6167422 100644 --- a/DOCS/guidelines/editor-manual.qmd +++ b/DOCS/guidelines/editor-manual.qmd @@ -93,6 +93,72 @@ Markdown is a simple way to format text using plain characters --- no need for c This section shows the most useful Markdown elements you'll need when writing documentation. If you want to explore more, visit the official [Quarto Markdown guide](https://quarto.org/docs/authoring/markdown-basics.html). +## Line Breaks and New Lines + +In Markdown, how you break a line can affect how your text is displayed in the final document. Quarto follows standard Markdown behavior, so it's important to understand the difference between soft and hard line breaks. + +### Soft Line Break (Just Pressing Enter) + +When you press `Enter` once and start a new line in your text editor, Markdown **does not** create a visible line break in the output. Instead, it treats the two lines as part of the same paragraph. + +Example (input): + +```markdown +This is the first line +and this is the second line. +``` + +Rendered output: + +    This is the first line +and this is the second line. + +This keeps your Markdown source tidy, but it won’t create new lines unless explicitly instructed + +### Hard Line Break (Using \ at End of Line) + +To force a visible line break in Markdown, you must add two spaces at the end of a line or use a backslash `\`. Quarto supports both, but using `\` is clearer and more explicit. + +```markdown +This is the first line.\ +and this is the second line. +``` + +Rendered output: + +    This is the first line.\ +    and this is the second line. 
+ + +### Paragraph Break (Double Enter) + + +If you press Enter twice (i.e., leave a blank line between two lines), Markdown will treat the content as two separate paragraphs. This results in a larger vertical space between the lines in the rendered output. + +Example (input): + +```markdown +This is the first paragraph. + +This is the second paragraph. +``` + +Rendered output: + +    This is the first paragraph. + +    This is the second paragraph. + + +This behavior is especially important when structuring readable documentation, separating ideas, or organizing content clearly. + +### Summary + +- Use `Enter` for a new line in your editor, but **don’t expect a visible line break**. +- Use `\` at the end of a line when you want to **force a line break**. +- Use **double Enter** (i.e., an empty line between paragraphs) to start a **new paragraph with extra spacing**. + + ## Headings Use the `#` symbol to create headings and organize your content. More `#` means a smaller heading level: From fa28df5eb03348abb3a7cdd9e6a2a0c7cd09eee1 Mon Sep 17 00:00:00 2001 From: mckeea <148862448+mckeea@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:13:45 +0200 Subject: [PATCH 2/7] Update deploy-docs.yml --- .github/workflows/deploy-docs.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index a30fb12e..4da55977 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -6,10 +6,6 @@ on: - develop - test - main - workflow_run: - workflows: ["Auto Merge Approved PRs"] - types: - - completed jobs: deploy_docs: From a8eb0d990983e6c06762fcbf7377fcca336a96c2 Mon Sep 17 00:00:00 2001 From: mckeea Date: Wed, 4 Jun 2025 12:25:41 +0000 Subject: [PATCH 3/7] Small update to editor's manual to check git workflow --- DOCS/guidelines/editor-manual.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DOCS/guidelines/editor-manual.qmd b/DOCS/guidelines/editor-manual.qmd 
index a6167422..3b8bb42a 100644 --- a/DOCS/guidelines/editor-manual.qmd +++ b/DOCS/guidelines/editor-manual.qmd @@ -2,7 +2,7 @@ title: "Guide for Writing Techncial Documentation" subtitle: "Copernicus Land Monitoring Service" author: "European Environment Agency (EEA)" -version: 0.5 +version: 0.6 description: "A comprehensive guide for creating technical documentation for the Copernicus Land Monitoring Service using Quarto. It covers Markdown basics, document rendering, and the review process, ensuring consistency and clarity in documentation." From 5db2676f1575ccc13a66b2651ac2c5e8a4e8628a Mon Sep 17 00:00:00 2001 From: mckeea Date: Mon, 9 Jun 2025 12:40:44 +0000 Subject: [PATCH 4/7] LLM annotator for intros and keywords --- .../scripts/generate_intros_and_keywords.py | 132 ++++++++++++++++++ .github/workflows/deploy-docs.yml | 7 + 2 files changed, 139 insertions(+) create mode 100644 .github/scripts/generate_intros_and_keywords.py diff --git a/.github/scripts/generate_intros_and_keywords.py b/.github/scripts/generate_intros_and_keywords.py new file mode 100644 index 00000000..c7546af1 --- /dev/null +++ b/.github/scripts/generate_intros_and_keywords.py @@ -0,0 +1,132 @@ +from pathlib import Path +import json +import time +import re +import google.generativeai as genai +import tiktoken +import yaml +from io import StringIO +import os +from pathlib import Path + +# Configuration +API_KEY = os.getenv("GEMINI_API_KEY") +if not API_KEY: + raise EnvironmentError("GEMINI_API_KEY environment variable not set") +MODEL_NAME = "gemini-2.0-flash" +TOKEN_LIMIT_PER_MINUTE = 950_000 # Keep a safe margin below 1M + +SCRIPT_DIR = Path(__file__).resolve().parent +INPUT_DIR = (SCRIPT_DIR / "../../DOCS").resolve() + +PROMPT = """You are an AI assistant helping to enrich a Quarto Markdown (.qmd) technical document prepared for the European Environment Agency (EEA). + +Your tasks: +1. Read and understand the entire attached document. +2. 
Generate a professional, engaging **Introduction** (max 1 paragraph) that clearly explains the document’s purpose, scope, and technical focus. +3. Extract exactly 10 **precise and conceptually meaningful keywords or key phrases** that reflect the core scientific or technical content of the document. + +Keyword guidance: +- Do **not** use general terms like \"Urban Atlas\", \"metadata\", \"documentation\", \"nomenclature\", or \"report\". +- Focus on **specific concepts, methods, environmental indicators, technical systems, data processing strategies**, or **analytical results** that are central to the document. +- Use **multi-word phrases** when needed for clarity and specificity. +- Think like an expert indexing the document for scientific search or semantic web use. + +Return only the result as a raw JSON object (no code block, no explanation): + +{ + \"introduction\": \"...\", + \"keywords\": [\"keyword1\", \"keyword2\", ..., \"keyword10\"] +} +""" + +# Setup Gemini +genai.configure(api_key=API_KEY) +model = genai.GenerativeModel(MODEL_NAME) +encoding = tiktoken.get_encoding("cl100k_base") +total_tokens_sent = 0 + + +# Function to update YAML frontmatter using PyYAML +def update_yaml_header(content: str, description: str, keywords_list: list): + lines = content.splitlines() + if lines[0].strip() != "---": + return content + + try: + end_idx = lines[1:].index("---") + 1 + except ValueError: + return content + + yaml_block = "\n".join(lines[1:end_idx]) + yaml_data = yaml.safe_load(yaml_block) or {} + yaml_data["description"] = description.replace("\n", " ").strip() + yaml_data["keywords"] = keywords_list + + new_yaml_block = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True).strip() + new_lines = ["---"] + new_yaml_block.splitlines() + ["---"] + lines[end_idx + 1 :] + return "\n".join(new_lines) + + +# Function to process one document with Gemini +def process_document_with_llm(doc_path: Path): + print("Processing ", doc_path) + global total_tokens_sent + + 
file_contents = doc_path.read_text(encoding="utf-8") + input_tokens = len(encoding.encode(file_contents)) + if total_tokens_sent + input_tokens > TOKEN_LIMIT_PER_MINUTE: + print( + f"[SKIPPED] {doc_path} would exceed token budget. Estimated at {input_tokens} tokens." + ) + return + + response = model.generate_content( + contents=[ + { + "role": "user", + "parts": [ + {"text": PROMPT}, + { + "inline_data": { + "mime_type": "text/plain", + "data": file_contents.encode("utf-8"), + } + }, + ], + } + ] + ) + + total_tokens_sent += input_tokens + + raw_text = response.text.strip() + if raw_text.startswith("```"): + raw_text = re.sub(r"^```(?:json)?\s*", "", raw_text) + raw_text = re.sub(r"\s*```$", "", raw_text) + + try: + parsed_output = json.loads(raw_text) + introduction = parsed_output["introduction"] + keywords_list = parsed_output["keywords"] + keywords = ", ".join(keywords_list) + except (json.JSONDecodeError, KeyError) as e: + print(f"[ERROR] Invalid response for {doc_path}:", raw_text) + return + + updated_content = update_yaml_header(file_contents, introduction, keywords_list) + output_file = doc_path.with_name(doc_path.stem + ".qmd") + output_file.write_text(updated_content, encoding="utf-8") + + print("Estimated input tokens:", input_tokens) + + +# Process all .qmd files +BLACKLISTED_DIRS = {"templates", "includes", "theme"} + +for doc_path in INPUT_DIR.rglob("*.qmd"): + if any(part in BLACKLISTED_DIRS for part in doc_path.parts): + continue + process_document_with_llm(doc_path) + +print("Total tokens sent:", total_tokens_sent) diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml index 4da55977..bc65abce 100644 --- a/.github/workflows/deploy-docs.yml +++ b/.github/workflows/deploy-docs.yml @@ -21,6 +21,13 @@ jobs: with: fetch-depth: 0 + - name: Generate intros and keywords + uses: addnab/docker-run-action@v3 + with: + image: mckeea/llm-doc-annotator:latest + options: -e GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} -v ${{ 
github.workspace }}:/app + run: python .github/scripts/generate_intros_and_keywords.py + - name: Build Docs run: .github/scripts/build-docs.sh From e5090f29d9ec3604f59ee6b285000f17ee27df74 Mon Sep 17 00:00:00 2001 From: Matteo Mattiuzzi Date: Tue, 10 Jun 2025 10:10:26 +0200 Subject: [PATCH 5/7] corrected to the new url --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f5d285b..87879bc1 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,4 @@ This repository contains technical documents for the CLMS, such as ATBD's, PUM's, or nomenclature guidelines. -The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/) +The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/main/DOCS/) From a11e80a2804ffd1bfe351d4dc400fd1aebee352f Mon Sep 17 00:00:00 2001 From: mckeea Date: Tue, 10 Jun 2025 08:55:16 +0000 Subject: [PATCH 6/7] Add date,version fields to listings --- .github/scripts/generate_index_all.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/generate_index_all.mjs b/.github/scripts/generate_index_all.mjs index 44b31b00..cfc87624 100644 --- a/.github/scripts/generate_index_all.mjs +++ b/.github/scripts/generate_index_all.mjs @@ -19,7 +19,7 @@ listing: type: table contents: . sort: title - fields: [title] + fields: [title, date, version] --- `; From 51ea2bbc0e6c9c0215a280a1287adf0f65f49d79 Mon Sep 17 00:00:00 2001 From: mckeea Date: Wed, 11 Jun 2025 14:41:44 +0000 Subject: [PATCH 7/7] Fix: generate a correct sitemap.xml --- .github/scripts/build-docs.sh | 42 ++++++++--------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/.github/scripts/build-docs.sh b/.github/scripts/build-docs.sh index 0f2dbc58..f7f852fe 100755 --- a/.github/scripts/build-docs.sh +++ b/.github/scripts/build-docs.sh @@ -1,42 +1,15 @@ #!/bin/bash set -e -# echo "🐍 Setting up Python environment..." 
-# apt-get update -# apt-get install -y python3 python3-venv python3-pip - -# echo "📦 Creating virtual environment..." -# python3 -m venv venv -# source venv/bin/activate - -# echo "⬆️ Upgrading pip inside virtual environment..." -# pip install --upgrade pip - -# echo "📦 Installing Python dependencies..." -# pip install \ -# keybert \ -# ruamel.yaml \ -# pyyaml \ -# transformers==4.37.2 \ -# accelerate==0.27.2 - -# source venv/bin/activate - -# echo "🛠 Setting up default Quarto configuration..." -# mv _quarto_not_used.yaml _quarto.yaml - -# echo "🏷 Generating keywords..." -# python scripts/render/generate_keywords.py - -#echo "🧹 Cleaning up cached _site directory..." -#rm -rf _site - - echo "🖼 Render all documents into to HTML/DOCX" sudo cp /usr/bin/chromium /usr/bin/chromium-browser -QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html -QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx --no-clean +QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx find _site -type f -name 'index.docx' -delete +QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html --no-clean + +# Back up the correct sitemap, as it may be overwritten by the operations that follow +sleep 5 +mv _site/sitemap.xml _site/sitemap.xml.bkp echo "🛠 Generate index.qmd files for all DOCS/* folders"e node .github/scripts/generate_index_all.mjs @@ -63,6 +36,9 @@ echo ' ' > _site/index.html +# Restore the correct sitemap from the backup +cp _site/sitemap.xml.bkp _site/sitemap.xml +rm -f _site/sitemap.xml.bkp echo "📄 Converting .docx files to .pdf..." #chmod +x ./convert_docx_to_pdf.sh