diff --git a/.github/scripts/build-docs.sh b/.github/scripts/build-docs.sh index 0f2dbc58..f7f852fe 100755 --- a/.github/scripts/build-docs.sh +++ b/.github/scripts/build-docs.sh @@ -1,42 +1,15 @@ #!/bin/bash set -e -# echo "🐍 Setting up Python environment..." -# apt-get update -# apt-get install -y python3 python3-venv python3-pip - -# echo "📦 Creating virtual environment..." -# python3 -m venv venv -# source venv/bin/activate - -# echo "⬆️ Upgrading pip inside virtual environment..." -# pip install --upgrade pip - -# echo "📦 Installing Python dependencies..." -# pip install \ -# keybert \ -# ruamel.yaml \ -# pyyaml \ -# transformers==4.37.2 \ -# accelerate==0.27.2 - -# source venv/bin/activate - -# echo "🛠 Setting up default Quarto configuration..." -# mv _quarto_not_used.yaml _quarto.yaml - -# echo "🏷 Generating keywords..." -# python scripts/render/generate_keywords.py - -#echo "🧹 Cleaning up cached _site directory..." -#rm -rf _site - - echo "🖼 Render all documents into to HTML/DOCX" sudo cp /usr/bin/chromium /usr/bin/chromium-browser -QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html -QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx --no-clean +QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx find _site -type f -name 'index.docx' -delete +QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html --no-clean + +# Backup the correct sitemap as it may be overwritten by next operations +sleep 5 +mv _site/sitemap.xml _site/sitemap.xml.bkp echo "🛠 Generate index.qmd files for all DOCS/* folders"e node .github/scripts/generate_index_all.mjs @@ -63,6 +36,9 @@ echo ' ' > _site/index.html +# Revert the correct sitemap +cp _site/sitemap.xml.bkp _site/sitemap.xml +rm -f _site/sitemap.xml.bkp echo "📄 Converting .docx files to .pdf..." 
#chmod +x ./convert_docx_to_pdf.sh
diff --git a/.github/scripts/generate_index_all.mjs b/.github/scripts/generate_index_all.mjs
index 44b31b00..cfc87624 100644
--- a/.github/scripts/generate_index_all.mjs
+++ b/.github/scripts/generate_index_all.mjs
@@ -19,7 +19,7 @@ listing:
   type: table
   contents: .
   sort: title
-  fields: [title]
+  fields: [title, date, version]
 ---
 `;
diff --git a/.github/scripts/generate_intros_and_keywords.py b/.github/scripts/generate_intros_and_keywords.py
new file mode 100644
index 00000000..c7546af1
--- /dev/null
+++ b/.github/scripts/generate_intros_and_keywords.py
@@ -0,0 +1,208 @@
+"""Enrich Quarto documents with an LLM-generated description and keywords.
+
+Every .qmd file under INPUT_DIR (blacklisted directories excepted) is sent to
+Gemini together with PROMPT; the returned introduction and keywords are
+written into the file's YAML front matter.
+"""
+
+from pathlib import Path
+import json
+import time
+import re
+import google.generativeai as genai
+import tiktoken
+import yaml
+from io import StringIO
+import os
+
+# Configuration
+API_KEY = os.getenv("GEMINI_API_KEY")
+if not API_KEY:
+    raise EnvironmentError("GEMINI_API_KEY environment variable not set")
+MODEL_NAME = "gemini-2.0-flash"
+TOKEN_LIMIT_PER_MINUTE = 950_000  # Keep a safe margin below 1M
+RATE_WINDOW_SECONDS = 60  # Length of the window TOKEN_LIMIT_PER_MINUTE refers to
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+INPUT_DIR = (SCRIPT_DIR / "../../DOCS").resolve()
+
+PROMPT = """You are an AI assistant helping to enrich a Quarto Markdown (.qmd) technical document prepared for the European Environment Agency (EEA).
+
+Your tasks:
+1. Read and understand the entire attached document.
+2. Generate a professional, engaging **Introduction** (max 1 paragraph) that clearly explains the document’s purpose, scope, and technical focus.
+3. Extract exactly 10 **precise and conceptually meaningful keywords or key phrases** that reflect the core scientific or technical content of the document.
+
+Keyword guidance:
+- Do **not** use general terms like \"Urban Atlas\", \"metadata\", \"documentation\", \"nomenclature\", or \"report\".
+- Focus on **specific concepts, methods, environmental indicators, technical systems, data processing strategies**, or **analytical results** that are central to the document.
+- Use **multi-word phrases** when needed for clarity and specificity.
+- Think like an expert indexing the document for scientific search or semantic web use.
+
+Return only the result as a raw JSON object (no code block, no explanation):
+
+{
+  \"introduction\": \"...\",
+  \"keywords\": [\"keyword1\", \"keyword2\", ..., \"keyword10\"]
+}
+"""
+
+# Setup Gemini
+genai.configure(api_key=API_KEY)
+model = genai.GenerativeModel(MODEL_NAME)
+# NOTE: cl100k_base is an OpenAI encoding, so token counts are only estimates here.
+encoding = tiktoken.get_encoding("cl100k_base")
+total_tokens_sent = 0  # Whole-run total, reported at the end
+window_tokens_sent = 0  # Tokens counted in the current rate-limit window
+window_started_at = time.monotonic()
+
+
+# Function to update YAML frontmatter using PyYAML
+def update_yaml_header(content: str, description: str, keywords_list: list):
+    """Return content with description and keywords set in its YAML front matter.
+
+    The content is returned unchanged when it is empty, does not start with a
+    "---" line, has no closing "---" line, or the front matter is not a mapping.
+    The header is re-serialised by PyYAML, so comments inside it are not kept.
+    """
+    lines = content.splitlines()
+    # "".splitlines() is empty, so guard before looking at the first line.
+    if not lines or lines[0].strip() != "---":
+        return content
+
+    # Accept a closing delimiter with trailing whitespace, like the opening one.
+    end_idx = next(
+        (i for i, line in enumerate(lines[1:], start=1) if line.rstrip() == "---"),
+        None,
+    )
+    if end_idx is None:
+        return content
+
+    yaml_block = "\n".join(lines[1:end_idx])
+    yaml_data = yaml.safe_load(yaml_block) or {}
+    if not isinstance(yaml_data, dict):
+        return content
+    yaml_data["description"] = description.replace("\n", " ").strip()
+    yaml_data["keywords"] = keywords_list
+
+    new_yaml_block = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True).strip()
+    new_lines = ["---"] + new_yaml_block.splitlines() + ["---"] + lines[end_idx + 1 :]
+    updated = "\n".join(new_lines)
+    # splitlines() discards the final newline; put it back if the input had one.
+    if content.endswith("\n"):
+        updated += "\n"
+    return updated
+
+
+# Function to keep requests within TOKEN_LIMIT_PER_MINUTE
+def _reserve_token_budget(input_tokens: int) -> bool:
+    """Account for input_tokens in the current window, waiting for a new one if needed.
+
+    Returns False when the document alone exceeds TOKEN_LIMIT_PER_MINUTE and
+    therefore can never be sent; True once the tokens have been reserved.
+    """
+    global window_tokens_sent, window_started_at
+
+    if input_tokens > TOKEN_LIMIT_PER_MINUTE:
+        return False
+
+    elapsed = time.monotonic() - window_started_at
+    if elapsed < RATE_WINDOW_SECONDS and (
+        window_tokens_sent + input_tokens > TOKEN_LIMIT_PER_MINUTE
+    ):
+        # This minute's budget is used up: wait for the window to roll over.
+        time.sleep(RATE_WINDOW_SECONDS - elapsed)
+        elapsed = RATE_WINDOW_SECONDS
+    if elapsed >= RATE_WINDOW_SECONDS:
+        window_tokens_sent = 0
+        window_started_at = time.monotonic()
+
+    window_tokens_sent += input_tokens
+    return True
+
+
+# Function to process one document with Gemini
+def process_document_with_llm(doc_path: Path):
+    """Annotate one document: query Gemini and rewrite the file's front matter.
+
+    Documents that do not fit the token budget, or whose response is missing
+    or malformed, are reported on stdout and left untouched.
+    """
+    print("Processing ", doc_path)
+    global total_tokens_sent
+
+    file_contents = doc_path.read_text(encoding="utf-8")
+    input_tokens = len(encoding.encode(file_contents))
+    if not _reserve_token_budget(input_tokens):
+        print(
+            f"[SKIPPED] {doc_path} would exceed token budget. Estimated at {input_tokens} tokens."
+        )
+        return
+
+    response = model.generate_content(
+        contents=[
+            {
+                "role": "user",
+                "parts": [
+                    {"text": PROMPT},
+                    {
+                        "inline_data": {
+                            "mime_type": "text/plain",
+                            "data": file_contents.encode("utf-8"),
+                        }
+                    },
+                ],
+            }
+        ]
+    )
+
+    total_tokens_sent += input_tokens
+
+    try:
+        raw_text = response.text.strip()
+    except ValueError as e:
+        # response.text raises ValueError when the reply holds no text part.
+        print(f"[ERROR] No usable response for {doc_path}:", e)
+        return
+    if raw_text.startswith("```"):
+        raw_text = re.sub(r"^```(?:json)?\s*", "", raw_text)
+        raw_text = re.sub(r"\s*```$", "", raw_text)
+
+    try:
+        parsed_output = json.loads(raw_text)
+        introduction = parsed_output["introduction"]
+        keywords_list = parsed_output["keywords"]
+    except (json.JSONDecodeError, KeyError, TypeError):
+        # TypeError: valid JSON that is not an object (e.g. a list).
+        print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
+        return
+    if not isinstance(introduction, str) or not isinstance(keywords_list, list):
+        print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
+        return
+
+    updated_content = update_yaml_header(file_contents, introduction, keywords_list)
+    # Only touch the file when the front matter actually changed.
+    if updated_content != file_contents:
+        doc_path.write_text(updated_content, encoding="utf-8")
+
+    print("Estimated input tokens:", input_tokens)
+
+
+# Process all .qmd files
+BLACKLISTED_DIRS = {"templates", "includes", "theme"}
+
+
+def main():
+    """Process every .qmd file under INPUT_DIR outside the blacklisted directories."""
+    for doc_path in INPUT_DIR.rglob("*.qmd"):
+        # Match against the path below INPUT_DIR only, not the checkout location.
+        rel_parts = doc_path.relative_to(INPUT_DIR).parts
+        if any(part in BLACKLISTED_DIRS for part in rel_parts):
+            continue
+        process_document_with_llm(doc_path)
+
+    print("Total tokens sent:", total_tokens_sent)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
index a30fb12e..bc65abce 100644
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -6,10 +6,6 @@ on:
       - develop
       - test
       - main
-  workflow_run:
-    workflows: ["Auto Merge Approved PRs"]
-    types:
-      - completed
 jobs:
   deploy_docs:
@@ -25,6 +21,13 @@ jobs:
         with:
           fetch-depth: 0
+      - name: Generate intros and keywords
+        uses: addnab/docker-run-action@v3
+        with:
+          image: mckeea/llm-doc-annotator:latest
+          options: -e GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} -v ${{ github.workspace }}:/app
+          run: python .github/scripts/generate_intros_and_keywords.py
+
       - name: Build Docs
         run:
.github/scripts/build-docs.sh diff --git a/DOCS/guidelines/editor-manual.qmd b/DOCS/guidelines/editor-manual.qmd index c2b2a6e7..3b8bb42a 100644 --- a/DOCS/guidelines/editor-manual.qmd +++ b/DOCS/guidelines/editor-manual.qmd @@ -2,7 +2,7 @@ title: "Guide for Writing Techncial Documentation" subtitle: "Copernicus Land Monitoring Service" author: "European Environment Agency (EEA)" -version: 0.5 +version: 0.6 description: "A comprehensive guide for creating technical documentation for the Copernicus Land Monitoring Service using Quarto. It covers Markdown basics, document rendering, and the review process, ensuring consistency and clarity in documentation." @@ -93,6 +93,72 @@ Markdown is a simple way to format text using plain characters --- no need for c This section shows the most useful Markdown elements you'll need when writing documentation. If you want to explore more, visit the official [Quarto Markdown guide](https://quarto.org/docs/authoring/markdown-basics.html). +## Line Breaks and New Lines + +In Markdown, how you break a line can affect how your text is displayed in the final document. Quarto follows standard Markdown behavior, so it's important to understand the difference between soft and hard line breaks. + +### Soft Line Break (Just Pressing Enter) + +When you press `Enter` once and start a new line in your text editor, Markdown **does not** create a visible line break in the output. Instead, it treats the two lines as part of the same paragraph. + +Example (input): + +```markdown +This is the first line +and this is the second line. +``` + +Rendered output: + +    This is the first line +and this is the second line. + +This keeps your Markdown source tidy, but it won’t create new lines unless explicitly instructed + +### Hard Line Break (Using \ at End of Line) + +To force a visible line break in Markdown, you must add two spaces at the end of a line or use a backslash `\`. Quarto supports both, but using `\` is clearer and more explicit. 
+ +```markdown +This is the first line.\ +and this is the second line. +``` + +Rendered output: + +    This is the first line.\ +    and this is the second line. + + +### Paragraph Break (Double Enter) + + +If you press Enter twice (i.e., leave a blank line between two lines), Markdown will treat the content as two separate paragraphs. This results in a larger vertical space between the lines in the rendered output. + +Example (input): + +```markdown +This is the first paragraph. + +This is the second paragraph. +``` + +Rendered output: + +    This is the first paragraph. + +    This is the second paragraph. + + +This behavior is especially important when structuring readable documentation, separating ideas, or organizing content clearly. + +### Summary + +- Use `Enter` for a new line in your editor, but **don’t expect a visible line break**. +- Use `\` at the end of a line when you want to **force a line break**. +- Use **double Enter** (i.e., an empty line between paragraphs) to start a **new paragraph with extra spacing**. + + ## Headings Use the `#` symbol to create headings and organize your content. More `#` means a smaller heading level: diff --git a/README.md b/README.md index 3f5d285b..87879bc1 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,4 @@ This repository contains technical documents for the CLMS, such as ATBD's, PUM's, or nomenclature guidelines. -The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/) +The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/main/DOCS/)