Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 9 additions & 33 deletions .github/scripts/build-docs.sh
Original file line number Diff line number Diff line change
@@ -1,42 +1,15 @@
#!/bin/bash
set -e

# echo "🐍 Setting up Python environment..."
# apt-get update
# apt-get install -y python3 python3-venv python3-pip

# echo "📦 Creating virtual environment..."
# python3 -m venv venv
# source venv/bin/activate

# echo "⬆️ Upgrading pip inside virtual environment..."
# pip install --upgrade pip

# echo "📦 Installing Python dependencies..."
# pip install \
# keybert \
# ruamel.yaml \
# pyyaml \
# transformers==4.37.2 \
# accelerate==0.27.2

# source venv/bin/activate

# echo "🛠 Setting up default Quarto configuration..."
# mv _quarto_not_used.yaml _quarto.yaml

# echo "🏷 Generating keywords..."
# python scripts/render/generate_keywords.py

#echo "🧹 Cleaning up cached _site directory..."
#rm -rf _site


echo "🖼 Render all documents into to HTML/DOCX"
sudo cp /usr/bin/chromium /usr/bin/chromium-browser
QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html
QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx --no-clean
QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to docx
find _site -type f -name 'index.docx' -delete
QUARTO_CHROMIUM_HEADLESS_MODE=new quarto render --to html --no-clean

# Backup the correct sitemap as it may be overwritten by next operations
sleep 5
mv _site/sitemap.xml _site/sitemap.xml.bkp

# Regenerate the index.qmd files for every DOCS/* folder.
# (A stray "e" after the closing quote used to be appended to the message.)
echo "🛠 Generate index.qmd files for all DOCS/* folders"
node .github/scripts/generate_index_all.mjs
Expand All @@ -63,6 +36,9 @@ echo '<!DOCTYPE html>
</body>
</html>' > _site/index.html

# Revert the correct sitemap
cp _site/sitemap.xml.bkp _site/sitemap.xml
rm -f _site/sitemap.xml.bkp

echo "📄 Converting .docx files to .pdf..."
#chmod +x ./convert_docx_to_pdf.sh
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/generate_index_all.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ listing:
type: table
contents: .
sort: title
fields: [title]
fields: [title, date, version]
---
`;

Expand Down
132 changes: 132 additions & 0 deletions .github/scripts/generate_intros_and_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from pathlib import Path
import json
import time
import re
import google.generativeai as genai
import tiktoken
import yaml
from io import StringIO
import os
from pathlib import Path

# Settings
# The Gemini key comes from the environment; abort right away when it is
# missing or empty, before any document is touched.
API_KEY = os.environ.get("GEMINI_API_KEY")
if API_KEY is None or API_KEY == "":
    raise EnvironmentError("GEMINI_API_KEY environment variable not set")

MODEL_NAME = "gemini-2.0-flash"
TOKEN_LIMIT_PER_MINUTE = 950_000  # Keep a safe margin below 1M

# The documents live in the DOCS folder two levels above this script's directory.
SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = SCRIPT_DIR.joinpath("../../DOCS").resolve()

# Instruction text sent to the model as the first part of every request,
# ahead of the document itself. It asks for a one-paragraph introduction and
# exactly 10 keywords, returned as a raw JSON object with the keys
# "introduction" and "keywords" -- the response parser reads those two keys,
# so keep them in sync when editing this text.
PROMPT = """You are an AI assistant helping to enrich a Quarto Markdown (.qmd) technical document prepared for the European Environment Agency (EEA).

Your tasks:
1. Read and understand the entire attached document.
2. Generate a professional, engaging **Introduction** (max 1 paragraph) that clearly explains the document’s purpose, scope, and technical focus.
3. Extract exactly 10 **precise and conceptually meaningful keywords or key phrases** that reflect the core scientific or technical content of the document.

Keyword guidance:
- Do **not** use general terms like \"Urban Atlas\", \"metadata\", \"documentation\", \"nomenclature\", or \"report\".
- Focus on **specific concepts, methods, environmental indicators, technical systems, data processing strategies**, or **analytical results** that are central to the document.
- Use **multi-word phrases** when needed for clarity and specificity.
- Think like an expert indexing the document for scientific search or semantic web use.

Return only the result as a raw JSON object (no code block, no explanation):

{
\"introduction\": \"...\",
\"keywords\": [\"keyword1\", \"keyword2\", ..., \"keyword10\"]
}
"""

# Shared runtime state for the Gemini calls

# Running total of estimated input tokens sent so far; incremented by
# process_document_with_llm and reported at the end of the run.
total_tokens_sent = 0

# Tokenizer used to estimate the size of each document before sending it.
# NOTE(review): cl100k_base is presumably only an approximation of Gemini's
# own token counting -- confirm the safety margin is sufficient.
encoding = tiktoken.get_encoding("cl100k_base")

# Authenticate the SDK and create the model handle used for every request.
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel(MODEL_NAME)


# Function to update YAML frontmatter using PyYAML
def update_yaml_header(content: str, description: str, keywords_list: list):
    """Set ``description`` and ``keywords`` in a document's YAML front matter.

    The front matter is the block between a leading ``---`` line and the next
    ``---`` line. It is parsed with PyYAML, the two keys are set (replacing
    any existing values), and the block is written back with the original key
    order. Because the block is re-serialised, YAML comments and the original
    quoting/wrapping of values are not preserved.

    Args:
        content: Full text of the document.
        description: Text for the ``description`` key; newlines are replaced
            by spaces and surrounding whitespace is stripped.
        keywords_list: Value stored under the ``keywords`` key.

    Returns:
        The updated document text. ``content`` is returned unchanged when it
        is empty, does not start with ``---``, has no closing ``---``, or
        when the front matter is not a YAML mapping.

    Raises:
        yaml.YAMLError: If the front matter is not valid YAML.
    """
    lines = content.splitlines()
    # Guard against empty input before looking at the first line.
    if not lines or lines[0].strip() != "---":
        return content

    # Find the closing delimiter, tolerating trailing whitespace just like the
    # check on the opening delimiter does. Leading whitespace is not stripped
    # so an indented "---" inside a block scalar is not mistaken for it.
    end_idx = next(
        (i for i, line in enumerate(lines[1:], start=1) if line.rstrip() == "---"),
        None,
    )
    if end_idx is None:
        return content

    yaml_block = "\n".join(lines[1:end_idx])
    yaml_data = yaml.safe_load(yaml_block) or {}
    if not isinstance(yaml_data, dict):
        # A scalar or list header cannot take keys; leave the file alone.
        return content

    yaml_data["description"] = description.replace("\n", " ").strip()
    yaml_data["keywords"] = keywords_list

    new_yaml_block = yaml.dump(yaml_data, sort_keys=False, allow_unicode=True).strip()
    new_lines = ["---"] + new_yaml_block.splitlines() + ["---"] + lines[end_idx + 1 :]
    updated = "\n".join(new_lines)
    # splitlines() drops the final newline; put it back so rewriting a file
    # does not strip its end-of-file newline.
    if content.endswith("\n"):
        updated += "\n"
    return updated


# Function to process one document with Gemini

# (monotonic timestamp, estimated tokens) of the requests sent during the last
# 60 seconds. Kept separate from total_tokens_sent, which is cumulative.
_recent_requests = []


def _wait_for_token_budget(needed_tokens: int) -> None:
    """Block until ``needed_tokens`` fit into the rolling one-minute budget."""
    while True:
        cutoff = time.monotonic() - 60
        # Forget requests that are more than a minute old.
        _recent_requests[:] = [(ts, n) for ts, n in _recent_requests if ts > cutoff]
        used = sum(n for _, n in _recent_requests)
        if used + needed_tokens <= TOKEN_LIMIT_PER_MINUTE:
            return
        # Sleep until the oldest request leaves the window, then re-check.
        wait_seconds = max(0.0, _recent_requests[0][0] + 60 - time.monotonic())
        print(f"[WAIT] Token budget for this minute used up; pausing {wait_seconds:.0f}s.")
        time.sleep(wait_seconds)


def process_document_with_llm(doc_path: Path):
    """Ask Gemini for an introduction and keywords and store them in the file.

    The document is sent together with PROMPT. The JSON reply must contain an
    ``introduction`` string and a ``keywords`` list; these are written into
    the document's YAML front matter and the file is saved.

    The token count is an estimate from the module-level tokenizer. A document
    larger than TOKEN_LIMIT_PER_MINUTE on its own is skipped. Otherwise the
    call waits until the tokens sent in the last minute leave enough room;
    previously the budget never reset, so every document after the first
    950k tokens was skipped.

    Args:
        doc_path: Path of the document to annotate.

    Returns:
        None. A skipped document or an unusable reply is reported on stdout
        and leaves the file untouched.
    """
    global total_tokens_sent
    print("Processing ", doc_path)

    file_contents = doc_path.read_text(encoding="utf-8")
    input_tokens = len(encoding.encode(file_contents))
    if input_tokens > TOKEN_LIMIT_PER_MINUTE:
        # Waiting cannot help: this document alone is over the budget.
        print(
            f"[SKIPPED] {doc_path} would exceed token budget. Estimated at {input_tokens} tokens."
        )
        return

    _wait_for_token_budget(input_tokens)

    response = model.generate_content(
        contents=[
            {
                "role": "user",
                "parts": [
                    {"text": PROMPT},
                    {
                        "inline_data": {
                            "mime_type": "text/plain",
                            "data": file_contents.encode("utf-8"),
                        }
                    },
                ],
            }
        ]
    )

    total_tokens_sent += input_tokens
    _recent_requests.append((time.monotonic(), input_tokens))

    # response.text raises ValueError when the reply has no usable text part
    # (for example a blocked response).
    try:
        raw_text = response.text.strip()
    except ValueError as err:
        print(f"[ERROR] No usable response for {doc_path}:", err)
        return

    # The prompt asks for raw JSON, but strip a Markdown code fence if the
    # model added one anyway.
    if raw_text.startswith("```"):
        raw_text = re.sub(r"^```(?:json)?\s*", "", raw_text)
        raw_text = re.sub(r"\s*```$", "", raw_text)

    try:
        parsed_output = json.loads(raw_text)
        introduction = parsed_output["introduction"]
        keywords_list = parsed_output["keywords"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers a reply that is valid JSON but not an object.
        print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
        return

    if not isinstance(introduction, str) or not isinstance(keywords_list, list):
        print(f"[ERROR] Invalid response for {doc_path}:", raw_text)
        return

    updated_content = update_yaml_header(file_contents, introduction, keywords_list)
    # For a .qmd input this is the input file itself, i.e. it is updated in place.
    output_file = doc_path.with_name(doc_path.stem + ".qmd")
    output_file.write_text(updated_content, encoding="utf-8")

    print("Estimated input tokens:", input_tokens)


# Process all .qmd files
# Directory names below INPUT_DIR whose documents must not be annotated.
BLACKLISTED_DIRS = {"templates", "includes", "theme"}

# sorted() gives a stable processing order from run to run.
for doc_path in sorted(INPUT_DIR.rglob("*.qmd")):
    # Only look at directories below INPUT_DIR. Checking the absolute path
    # would skip every document if the repository itself were checked out
    # under a directory that happens to have a blacklisted name.
    relative_dirs = doc_path.relative_to(INPUT_DIR).parts[:-1]
    if any(part in BLACKLISTED_DIRS for part in relative_dirs):
        continue
    process_document_with_llm(doc_path)

print("Total tokens sent:", total_tokens_sent)
11 changes: 7 additions & 4 deletions .github/workflows/deploy-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ on:
- develop
- test
- main
workflow_run:
workflows: ["Auto Merge Approved PRs"]
types:
- completed

jobs:
deploy_docs:
Expand All @@ -25,6 +21,13 @@ jobs:
with:
fetch-depth: 0

- name: Generate intros and keywords
uses: addnab/docker-run-action@v3
with:
image: mckeea/llm-doc-annotator:latest
options: -e GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} -v ${{ github.workspace }}:/app
run: python .github/scripts/generate_intros_and_keywords.py

- name: Build Docs
run: .github/scripts/build-docs.sh

Expand Down
68 changes: 67 additions & 1 deletion DOCS/guidelines/editor-manual.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
title: "Guide for Writing Technical Documentation"
subtitle: "Copernicus Land Monitoring Service"
author: "European Environment Agency (EEA)"
version: 0.5
version: 0.6
description: "A comprehensive guide for creating technical documentation for the Copernicus
Land Monitoring Service using Quarto. It covers Markdown basics, document rendering,
and the review process, ensuring consistency and clarity in documentation."
Expand Down Expand Up @@ -93,6 +93,72 @@ Markdown is a simple way to format text using plain characters --- no need for c

This section shows the most useful Markdown elements you'll need when writing documentation. If you want to explore more, visit the official [Quarto Markdown guide](https://quarto.org/docs/authoring/markdown-basics.html).

## Line Breaks and New Lines

In Markdown, how you break a line can affect how your text is displayed in the final document. Quarto follows standard Markdown behavior, so it's important to understand the difference between soft and hard line breaks.

### Soft Line Break (Just Pressing Enter)

When you press `Enter` once and start a new line in your text editor, Markdown **does not** create a visible line break in the output. Instead, it treats the two lines as part of the same paragraph.

Example (input):

```markdown
This is the first line
and this is the second line.
```

Rendered output:

&nbsp;&nbsp;&nbsp;&nbsp;This is the first line
and this is the second line.

This keeps your Markdown source tidy, but it won’t create new lines unless explicitly instructed.

### Hard Line Break (Using `\` at End of Line)

To force a visible line break in Markdown, you must add two spaces at the end of a line or use a backslash `\`. Quarto supports both, but using `\` is clearer and more explicit.

```markdown
This is the first line.\
and this is the second line.
```

Rendered output:

&nbsp;&nbsp;&nbsp;&nbsp;This is the first line.\
&nbsp;&nbsp;&nbsp;&nbsp;and this is the second line.


### Paragraph Break (Double Enter)


If you press Enter twice (i.e., leave a blank line between two lines), Markdown will treat the content as two separate paragraphs. This results in a larger vertical space between the lines in the rendered output.

Example (input):

```markdown
This is the first paragraph.

This is the second paragraph.
```

Rendered output:

&nbsp;&nbsp;&nbsp;&nbsp;This is the first paragraph.

&nbsp;&nbsp;&nbsp;&nbsp;This is the second paragraph.


This behavior is especially important when structuring readable documentation, separating ideas, or organizing content clearly.

### Summary

- Use `Enter` for a new line in your editor, but **don’t expect a visible line break**.
- Use `\` at the end of a line when you want to **force a line break**.
- Use **double Enter** (i.e., an empty line between paragraphs) to start a **new paragraph with extra spacing**.


## Headings

Use the `#` symbol to create headings and organize your content. More `#` means a smaller heading level:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

This repository contains technical documents for the CLMS, such as ATBDs, PUMs, or nomenclature guidelines.

The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/)
The CLMS documents library is deployed [here](https://eea.github.io/CLMS_documents/main/DOCS/)