From 2eaf3974493cdae2c40a40be49c63bd92c0b1984 Mon Sep 17 00:00:00 2001
From: Hassene A <hasseneafif@gmail.com>
Date: Sun, 8 Feb 2026 18:27:14 +0100
Subject: [PATCH] HTML sanitization to Algolia search results

---
 .../src/theme/TOCItems/Tree.tsx               |   2 +-
 .../client/__tests__/htmlSanitizer.test.ts    | 115 ++++++++++++
 .../src/client/htmlSanitizer.ts               | 167 ++++++++++++++++++
 .../src/client/index.ts                       |   1 +
 .../src/theme/SearchPage/index.tsx            |  15 +-
 5 files changed, 289 insertions(+), 11 deletions(-)
 create mode 100644 packages/docusaurus-theme-search-algolia/src/client/__tests__/htmlSanitizer.test.ts
 create mode 100644 packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts
diff --git a/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx b/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx
index 94a18454e92f..9463026b2a42 100644
--- a/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx
+++ b/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx
@@ -26,7 +26,7 @@ function TOCItemTree({
           <Link
             to={`#${heading.id}`}
             className={linkClassName ?? undefined}
-            // Developer provided the HTML, so assume it's safe.
+            // HTML is generated at build time from markdown and properly escaped
             dangerouslySetInnerHTML={{__html: heading.value}}
           />
           <TOCItemTree
diff --git a/packages/docusaurus-theme-search-algolia/src/client/__tests__/htmlSanitizer.test.ts b/packages/docusaurus-theme-search-algolia/src/client/__tests__/htmlSanitizer.test.ts
new file mode 100644
index 000000000000..00310b492cb5
--- /dev/null
+++ b/packages/docusaurus-theme-search-algolia/src/client/__tests__/htmlSanitizer.test.ts
@@ -0,0 +1,115 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import {sanitizeHtml, sanitizeAlgoliaHtml} from '../htmlSanitizer';
+
+describe('sanitizeHtml', () => {
+  it('preserves safe formatting tags', () => {
+    expect(sanitizeHtml('<em>highlighted</em>')).toBe('<em>highlighted</em>');
+    expect(sanitizeHtml('<mark>marked</mark>')).toBe('<mark>marked</mark>');
+    expect(sanitizeHtml('<strong>strong</strong>')).toBe(
+      '<strong>strong</strong>',
+    );
+    expect(sanitizeHtml('<b>bold</b>')).toBe('<b>bold</b>');
+    expect(sanitizeHtml('<i>italic</i>')).toBe('<i>italic</i>');
+  });
+
+  it('preserves allowed class attributes', () => {
+    expect(sanitizeHtml('<em class="highlight">text</em>')).toBe(
+      '<em class="highlight">text</em>',
+    );
+  });
+
+  it('strips dangerous tags', () => {
+    expect(sanitizeHtml('<script>alert("xss")</script>')).toBe('');
+    expect(sanitizeHtml('<img src="x" onerror="alert(1)">')).toBe('');
+    expect(sanitizeHtml('<iframe src="evil.com"></iframe>')).toBe('');
+    expect(sanitizeHtml('<object data="evil.swf"></object>')).toBe('');
+    expect(sanitizeHtml('<embed src="evil.swf">')).toBe('');
+  });
+
+  it('strips dangerous attributes', () => {
+    expect(sanitizeHtml('<em onclick="alert(1)">text</em>')).toBe(
+      '<em>text</em>',
+    );
+    expect(sanitizeHtml('<em onload="alert(1)">text</em>')).toBe(
+      '<em>text</em>',
+    );
+    expect(sanitizeHtml('<em style="color:red">text</em>')).toBe(
+      '<em>text</em>',
+    );
+  });
+
+  it('escapes HTML entities in text content', () => {
+    // Already-escaped entities must come out unchanged (no &amp;lt;)
+    expect(sanitizeHtml('&lt;script&gt;')).toBe('&lt;script&gt;');
+    expect(sanitizeHtml('<em>&lt;test&gt;</em>')).toBe('<em>&lt;test&gt;</em>');
+  });
+
+  it('handles nested tags correctly', () => {
+    expect(sanitizeHtml('<em><strong>nested</strong></em>')).toBe(
+      '<em><strong>nested</strong></em>',
+    );
+    expect(sanitizeHtml('<em><script>evil</script>safe</em>')).toBe(
+      '<em>safe</em>',
+    );
+  });
+
+  it('handles mixed content', () => {
+    expect(sanitizeHtml('text <em>highlighted</em> more text')).toBe(
+      'text <em>highlighted</em> more text',
+    );
+  });
+
+  it('returns empty string for empty input', () => {
+    expect(sanitizeHtml('')).toBe('');
+  });
+
+  it('handles malformed HTML gracefully', () => {
+    // Expectation matches the regex fallback, which doesn't auto-close tags
+    expect(sanitizeHtml('<em>unclosed')).toBe('<em>unclosed');
+    // A stray closing tag of an allowed element is kept (harmless in HTML)
+    expect(sanitizeHtml('unopened</em>')).toBe('unopened</em>');
+  });
+
+  it('prevents javascript: protocol in attributes', () => {
+    // <a> is not an allowed tag: the tag and its href go, the text stays
+    expect(sanitizeHtml('<a href="javascript:alert(1)">link</a>')).toBe('link');
+  });
+
+  it('prevents data: protocol attacks', () => {
+    expect(
+      sanitizeHtml('<img src="data:text/html,<script>alert(1)</script>">'),
+    ).toBe('');
+  });
+});
+
+describe('sanitizeAlgoliaHtml', () => {
+  // Inputs use the <em class="..."> highlight markup found in Algolia hits
+  it('replaces Algolia CSS classes', () => {
+    const input =
+      '<em class="algolia-docsearch-suggestion--highlight">text</em>';
+    const output = sanitizeAlgoliaHtml(input);
+    expect(output).toBe('<em class="search-result-match">text</em>');
+  });
+
+  it('sanitizes and replaces classes together', () => {
+    const input =
+      '<em class="algolia-docsearch-suggestion--highlight">safe</em><script>evil</script>';
+    const output = sanitizeAlgoliaHtml(input);
+    expect(output).toBe('<em class="search-result-match">safe</em>');
+  });
+
+  it('handles typical Algolia search result format', () => {
+    const input =
+      'This is a <em class="algolia-docsearch-suggestion--highlight">search</em> result';
+    const output = sanitizeAlgoliaHtml(input);
+    const expected =
+      'This is a <em class="search-result-match">search</em> result';
+    expect(output).toBe(expected);
+  });
+});
diff --git a/packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts b/packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts
new file mode 100644
index 000000000000..e3112c6d84be
--- /dev/null
+++ b/packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts
@@ -0,0 +1,167 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import ExecutionEnvironment from '@docusaurus/ExecutionEnvironment';
+
+/**
+ * Sanitizes HTML content from Algolia search results.
+ * Algolia returns HTML with highlight tags (<em>) that we need to preserve,
+ * but we must strip any potentially malicious content.
+ *
+ * This is a lightweight sanitizer that:
+ * - Allows only safe tags: <em>, <mark>, <strong>, <b>, <i>
+ * - Strips all attributes except 'class' on allowed tags
+ * - Removes all other HTML tags (the text inside most of them is kept)
+ * - Keeps already-escaped entities (e.g. &lt;) escaped in the output
+ */
+
+const ALLOWED_TAGS = ['em', 'mark', 'strong', 'b', 'i']; // lower-case names
+const ALLOWED_ATTRIBUTES = ['class']; // matched case-insensitively
+
+/**
+ * Sanitizes an HTML string, keeping only the allowed formatting tags.
+ * Other elements are unwrapped (their children are kept); script-like
+ * elements are dropped together with their content.
+ * @param html Untrusted HTML, e.g. an Algolia highlight or snippet value.
+ * @returns Sanitized HTML, or '' for empty input.
+ */
+export function sanitizeHtml(html: string): string {
+  if (!html) {
+    return '';
+  }
+
+  // Without a DOM (SSR, Node-based tests) fall back to regex sanitization
+  if (!ExecutionEnvironment.canUseDOM) {
+    return sanitizeHtmlRegex(html);
+  }
+
+  // Elements removed with their content, mirroring the regex fallback
+  const droppedTags = ['script', 'style', 'iframe', 'object', 'embed', 'link'];
+  // DOMParser documents are detached: parsed scripts are never executed
+  const doc = new DOMParser().parseFromString(html, 'text/html');
+
+  // Serializes one node (and, recursively, its subtree) to sanitized HTML
+  function sanitizeNode(node: Node): string {
+    if (node.nodeType === Node.TEXT_NODE) {
+      // The parser decoded entities, so re-escape before re-emitting
+      return escapeHtml(node.textContent ?? '');
+    }
+    if (node.nodeType !== Node.ELEMENT_NODE) {
+      return '';
+    }
+
+    const element = node as Element;
+    const tagName = element.tagName.toLowerCase();
+    if (droppedTags.includes(tagName)) {
+      return '';
+    }
+    const children = Array.from(element.childNodes)
+      .map((child) => sanitizeNode(child))
+      .join('');
+    if (!ALLOWED_TAGS.includes(tagName)) {
+      // Unwrap: the tag is dropped, its sanitized descendants are kept
+      return children;
+    }
+
+    // Rebuild the tag from scratch with allowed attributes only
+    const attributes = Array.from(element.attributes)
+      .filter((attr) => ALLOWED_ATTRIBUTES.includes(attr.name.toLowerCase()))
+      .map((attr) => ` ${attr.name}="${escapeHtml(attr.value)}"`)
+      .join('');
+    return `<${tagName}${attributes}>${children}</${tagName}>`;
+  }
+
+  return Array.from(doc.body.childNodes)
+    .map((node) => sanitizeNode(node))
+    .join('');
+}
+
+/**
+ * Regex-based sanitization for SSR/test environments.
+ * Less robust than DOM-based but works without browser APIs.
+ *
+ * Works in two passes:
+ * 1. Elements whose content must never be shown (script, style, ...) are
+ *    removed together with that content.
+ * 2. Every remaining tag is either rebuilt from scratch (allowed tags, with
+ *    at most a class attribute) or deleted, leaving its inner text in place.
+ *
+ * Attribute patterns are only applied inside a matched tag, never to the
+ * whole string, so visible text such as "json_only=true" is left intact.
+ *
+ * @param html Untrusted HTML string.
+ * @returns The input with all markup except the allowed tags removed.
+ */
+function sanitizeHtmlRegex(html: string): string {
+  // Pass 1. [\s\S] rather than "." so that content spanning several lines
+  // is matched as well
+  const withoutDroppedElements = html.replace(
+    /<(script|style|iframe|object|embed|link)\b[^>]*>[\s\S]*?<\/\1\b[^>]*>/gi,
+    '',
+  );
+
+  return withoutDroppedElements.replace(
+    // Opening and closing tags are handled together. A tag left unterminated
+    // at the end of the input also matches ("$"), so that it cannot turn
+    // markup appended after the sanitized string into its attributes.
+    // eslint-disable-next-line regexp/no-super-linear-backtracking
+    /<(\/?)(\w+)([^>]*)(>|$)/g,
+    (match, slash: string, tag: string, attrs: string, end: string) => {
+      const tagLower = tag.toLowerCase();
+
+      // Disallowed or unterminated tag: drop the markup itself
+      if (!end || !ALLOWED_TAGS.includes(tagLower)) {
+        return '';
+      }
+
+      // Closing tags never carry attributes
+      if (slash) {
+        return `</${tagLower}>`;
+      }
+
+      // Nothing is copied from the original tag except the class value,
+      // which drops event handlers, style, href, etc. in one go
+      const classMatch = attrs.match(/\s*class\s*=\s*["']([^"']*)["']/i);
+      if (classMatch) {
+        return `<${tagLower} class="${escapeHtml(classMatch[1]!)}">`;
+      }
+      return `<${tagLower}>`;
+    },
+  );
+}
+
+/**
+ * Replaces the characters that are significant in HTML text and attribute
+ * values (& < > " ' /) with entity references, so the result can be embedded
+ * in markup without being interpreted as tags or attribute delimiters.
+ */
+function escapeHtml(text: string): string {
+  // "&" goes first so that the entities produced below are not re-escaped
+  return text
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/"/g, '&quot;')
+    .replace(/'/g, '&#x27;')
+    .replace(/\//g, '&#x2F;');
+}
+
+/**
+ * Prepares an Algolia highlight/snippet value for rendering.
+ *
+ * Algolia's highlight class name is first swapped for the theme's own
+ * `search-result-match` class, then the markup goes through sanitizeHtml.
+ *
+ * @param value Raw HTML value returned by Algolia for a hit.
+ * @returns Sanitized HTML using the theme's highlight class.
+ */
+export function sanitizeAlgoliaHtml(value: string): string {
+  const algoliaHighlightClass = /algolia-docsearch-suggestion--highlight/g;
+  const renamed = value.replace(algoliaHighlightClass, 'search-result-match');
+
+  return sanitizeHtml(renamed);
+}
diff --git a/packages/docusaurus-theme-search-algolia/src/client/index.ts b/packages/docusaurus-theme-search-algolia/src/client/index.ts
index c2d61a668de6..cbcd5e9444fa 100644
--- a/packages/docusaurus-theme-search-algolia/src/client/index.ts
+++ b/packages/docusaurus-theme-search-algolia/src/client/index.ts
@@ -13,3 +13,4 @@ export {
 export {useSearchResultUrlProcessor} from './useSearchResultUrlProcessor';
 export {useAlgoliaAskAi} from './useAlgoliaAskAi';
 export {mergeFacetFilters} from './utils';
+export {sanitizeHtml, sanitizeAlgoliaHtml} from './htmlSanitizer';
diff --git a/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx b/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx
index fb275f506b69..b107cabcd52d 100644
--- a/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx
+++ b/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx
@@ -35,6 +35,7 @@ import useDocusaurusContext from '@docusaurus/useDocusaurusContext';
 import {
   useAlgoliaThemeConfig,
   useSearchResultUrlProcessor,
+  sanitizeAlgoliaHtml,
 } from '@docusaurus/theme-search-algolia/client';
 import Layout from '@theme/Layout';
 import Heading from '@theme/Heading';
@@ -321,12 +322,6 @@ function SearchPageContent(): ReactNode {
         return;
       }
 
-      const sanitizeValue = (value: string) =>
-        value.replace(
-          /algolia-docsearch-suggestion--highlight/g,
-          'search-result-match',
-        );
-
       const items = hits.map(
         ({
           url,
@@ -338,13 +333,13 @@ function SearchPageContent(): ReactNode {
           _snippetResult: {content?: {value: string}};
         }) => {
           const titles = Object.keys(hierarchy).map((key) =>
-            sanitizeValue(hierarchy[key]!.value),
+            sanitizeAlgoliaHtml(hierarchy[key]!.value),
           );
           return {
             title: titles.pop()!,
             url: processSearchResultUrl(url),
             summary: snippet.content
-              ? `${sanitizeValue(snippet.content.value)}...`
+              ? `${sanitizeAlgoliaHtml(snippet.content.value)}...`
               : '',
             breadcrumbs: titles,
           };
@@ -535,7 +530,7 @@ function SearchPageContent(): ReactNode {
                           <li
                             key={index}
                             className="breadcrumbs__item"
-                            // Developer provided the HTML, so assume it's safe.
+                            // HTML is sanitized to prevent XSS attacks
                             // eslint-disable-next-line react/no-danger
                             dangerouslySetInnerHTML={{__html: html}}
                           />
@@ -547,7 +542,7 @@ function SearchPageContent(): ReactNode {
                   {summary && (
                     <p
                       className={styles.searchResultItemSummary}
-                      // Developer provided the HTML, so assume it's safe.
+                      // HTML is sanitized to prevent XSS attacks
                       // eslint-disable-next-line react/no-danger
                       dangerouslySetInnerHTML={{__html: summary}}
                     />