From 2eaf3974493cdae2c40a40be49c63bd92c0b1984 Mon Sep 17 00:00:00 2001 From: Hassene A Date: Sun, 8 Feb 2026 18:27:14 +0100 Subject: [PATCH] HTML sanitization to Algolia search results --- .../src/theme/TOCItems/Tree.tsx | 2 +- .../client/__tests__/htmlSanitizer.test.ts | 115 ++++++++++++ .../src/client/htmlSanitizer.ts | 167 ++++++++++++++++++ .../src/client/index.ts | 1 + .../src/theme/SearchPage/index.tsx | 15 +- 5 files changed, 289 insertions(+), 11 deletions(-) create mode 100644 packages/docusaurus-theme-search-algolia/src/client/__tests__/htmlSanitizer.test.ts create mode 100644 packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts diff --git a/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx b/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx index 94a18454e92f..9463026b2a42 100644 --- a/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx +++ b/packages/docusaurus-theme-classic/src/theme/TOCItems/Tree.tsx @@ -26,7 +26,7 @@ function TOCItemTree({ { + it('preserves safe formatting tags', () => { + expect(sanitizeHtml('highlighted')).toBe('highlighted'); + expect(sanitizeHtml('marked')).toBe('marked'); + expect(sanitizeHtml('strong')).toBe( + 'strong', + ); + expect(sanitizeHtml('bold')).toBe('bold'); + expect(sanitizeHtml('italic')).toBe('italic'); + }); + + it('preserves allowed class attributes', () => { + expect(sanitizeHtml('text')).toBe( + 'text', + ); + }); + + it('strips dangerous tags', () => { + expect(sanitizeHtml('')).toBe(''); + expect(sanitizeHtml('')).toBe(''); + expect(sanitizeHtml('')).toBe(''); + expect(sanitizeHtml('')).toBe(''); + expect(sanitizeHtml('')).toBe(''); + }); + + it('strips dangerous attributes', () => { + expect(sanitizeHtml('text')).toBe( + 'text', + ); + expect(sanitizeHtml('text')).toBe( + 'text', + ); + expect(sanitizeHtml('text')).toBe( + 'text', + ); + }); + + it('escapes HTML entities in text content', () => { + // Already-escaped entities are preserved as-is (not double-escaped) + expect(sanitizeHtml('<script>')).toBe('<script>'); + expect(sanitizeHtml('<test>')).toBe('<test>'); + }); + + it('handles nested tags correctly', () => { + expect(sanitizeHtml('nested')).toBe( + 'nested', + ); + expect(sanitizeHtml('safe')).toBe( + 'safe', + ); + }); + + it('handles mixed content', () => { + expect(sanitizeHtml('text highlighted more text')).toBe( + 'text highlighted more text', + ); + }); + + it('returns empty string for empty input', () => { + expect(sanitizeHtml('')).toBe(''); + }); + + it('handles malformed HTML gracefully', () => { + // Regex-based sanitizer doesn't auto-close tags (acceptable trade-off) + expect(sanitizeHtml('unclosed')).toBe('unclosed'); + // Unopened closing tags are kept (harmless in HTML) + expect(sanitizeHtml('unopened')).toBe('unopened'); + }); + + it('prevents javascript: protocol in attributes', () => { + // Even though href is not allowed, test that it would be stripped + expect(sanitizeHtml('link')).toBe('link'); + }); + + it('prevents data: protocol attacks', () => { + expect( + sanitizeHtml(''), + ).toBe(''); + }); +}); + +describe('sanitizeAlgoliaHtml', () => { + it('replaces Algolia CSS classes', () => { + expect( + sanitizeAlgoliaHtml( + 'text', + ), + ).toBe('text'); + }); + + it('sanitizes and replaces classes together', () => { + expect( + sanitizeAlgoliaHtml( + 'safe', + ), + ).toBe('safe'); + }); + + it('handles typical Algolia search result format', () => { + const algoliaResult = + 'This is a search result'; + expect(sanitizeAlgoliaHtml(algoliaResult)).toBe( + 'This is a search result', + ); + }); +}); diff --git a/packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts b/packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts new file mode 100644 index 000000000000..e3112c6d84be --- /dev/null +++ b/packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts @@ -0,0 +1,167 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +import ExecutionEnvironment from '@docusaurus/ExecutionEnvironment'; + +/** + * Sanitizes HTML content from Algolia search results. + * Algolia returns HTML with highlight tags () that we need to preserve, + * but we must strip any potentially malicious content. + * + * This is a lightweight sanitizer that: + * - Allows only safe tags: , , , , + * - Strips all attributes except 'class' on allowed tags + * - Removes all other HTML tags and their content + * - Escapes any remaining HTML entities + */ + +const ALLOWED_TAGS = ['em', 'mark', 'strong', 'b', 'i']; +const ALLOWED_ATTRIBUTES = ['class']; + +/** + * Sanitizes HTML string by allowing only safe formatting tags. + * This prevents XSS attacks while preserving search result highlighting. + */ +export function sanitizeHtml(html: string): string { + if (!html) { + return ''; + } + + // Only run in browser environment + if (!ExecutionEnvironment.canUseDOM) { + // In SSR/test environment, use regex-based sanitization + return sanitizeHtmlRegex(html); + } + + // Create a temporary DOM element to parse HTML + const doc = new DOMParser().parseFromString(html, 'text/html'); + + // Recursive function to sanitize nodes + function sanitizeNode(node: Node): string { + if (node.nodeType === Node.TEXT_NODE) { + // Text nodes are safe, just escape HTML entities + return escapeHtml(node.textContent || ''); + } + + if (node.nodeType === Node.ELEMENT_NODE) { + const element = node as Element; + const tagName = element.tagName.toLowerCase(); + + // If tag is not allowed, return only its text content (stripped) + if (!ALLOWED_TAGS.includes(tagName)) { + return escapeHtml(element.textContent || ''); + } + + // Process allowed tags + const children = Array.from(element.childNodes) + .map((child) => sanitizeNode(child)) + .join(''); + + // Build the sanitized tag with allowed attributes only + const attributes = Array.from(element.attributes) + .filter((attr) => ALLOWED_ATTRIBUTES.includes(attr.name.toLowerCase())) + .map((attr) => `${attr.name}="${escapeHtml(attr.value)}"`) + .join(' '); + + const attrString = attributes ? ` ${attributes}` : ''; + return `<${tagName}${attrString}>${children}`; + } + + return ''; + } + + // Process all nodes in the body + return Array.from(doc.body.childNodes) + .map((node) => sanitizeNode(node)) + .join(''); +} + +/** + * Regex-based sanitization for SSR/test environments. + * Less robust than DOM-based but works without browser APIs. + */ +function sanitizeHtmlRegex(html: string): string { + // Remove script tags and their content + let sanitized = html.replace( + /)<[^<]*)*<\/script>/gi, + '', + ); + + // Remove dangerous tags + sanitized = sanitized.replace( + /<(iframe|object|embed|link|style)[^>]*>.*?<\/\1>/gi, + '', + ); + sanitized = sanitized.replace(/<(img|input|form)[^>]*>/gi, ''); + + // Remove event handlers and dangerous attributes + sanitized = sanitized.replace(/\s*on\w+\s*=\s*["'][^"']*["']/gi, ''); + sanitized = sanitized.replace(/\s*on\w+\s*=\s*[^\s>]*/gi, ''); + sanitized = sanitized.replace(/\s*style\s*=\s*["'][^"']*["']/gi, ''); + sanitized = sanitized.replace( + /\s*href\s*=\s*["']javascript:[^"']*["']/gi, + '', + ); + + // Keep only allowed tags with class attribute + sanitized = sanitized.replace( + // eslint-disable-next-line regexp/no-super-linear-backtracking + /<(\w+)([^>]*)>/g, + (match, tag, attrs) => { + const tagLower = tag.toLowerCase(); + if (!ALLOWED_TAGS.includes(tagLower)) { + return ''; + } + // Extract class attribute if present + const classMatch = attrs.match(/\s*class\s*=\s*["']([^"']*)["']/i); + if (classMatch) { + return `<${tagLower} class="${escapeHtml(classMatch[1])}">`; + } + return `<${tagLower}>`; + }, + ); + + // Remove closing tags for non-allowed tags + sanitized = sanitized.replace(/<\/(\w+)>/g, (match, tag) => { + const tagLower = tag.toLowerCase(); + return ALLOWED_TAGS.includes(tagLower) ? match : ''; + }); + + return sanitized; +} + +/** + * Escapes HTML special characters to prevent XSS. + */ +function escapeHtml(text: string): string { + const htmlEscapeMap: Record = { + '&': '&', + '<': '<', + '>': '>', + '"': '"', + "'": ''', + '/': '/', + }; + + return text.replace(/[&<>"'/]/g, (char) => htmlEscapeMap[char]!); +} + +/** + * Sanitizes Algolia search result value by: + * 1. Replacing Algolia-specific CSS classes with our own + * 2. Sanitizing HTML to prevent XSS attacks + */ +export function sanitizeAlgoliaHtml(value: string): string { + // First replace Algolia CSS classes + const withReplacedClasses = value.replace( + /algolia-docsearch-suggestion--highlight/g, + 'search-result-match', + ); + + // Then sanitize the HTML + return sanitizeHtml(withReplacedClasses); +} diff --git a/packages/docusaurus-theme-search-algolia/src/client/index.ts b/packages/docusaurus-theme-search-algolia/src/client/index.ts index c2d61a668de6..cbcd5e9444fa 100644 --- a/packages/docusaurus-theme-search-algolia/src/client/index.ts +++ b/packages/docusaurus-theme-search-algolia/src/client/index.ts @@ -13,3 +13,4 @@ export { export {useSearchResultUrlProcessor} from './useSearchResultUrlProcessor'; export {useAlgoliaAskAi} from './useAlgoliaAskAi'; export {mergeFacetFilters} from './utils'; +export {sanitizeHtml, sanitizeAlgoliaHtml} from './htmlSanitizer'; diff --git a/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx b/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx index fb275f506b69..b107cabcd52d 100644 --- a/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx +++ b/packages/docusaurus-theme-search-algolia/src/theme/SearchPage/index.tsx @@ -35,6 +35,7 @@ import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import { useAlgoliaThemeConfig, useSearchResultUrlProcessor, + sanitizeAlgoliaHtml, } from '@docusaurus/theme-search-algolia/client'; import Layout from '@theme/Layout'; import Heading from '@theme/Heading'; @@ -321,12 +322,6 @@ function SearchPageContent(): ReactNode { return; } - const sanitizeValue = (value: string) => - value.replace( - /algolia-docsearch-suggestion--highlight/g, - 'search-result-match', - ); - const items = hits.map( ({ url, @@ -338,13 +333,13 @@ function SearchPageContent(): ReactNode { _snippetResult: {content?: {value: string}}; }) => { const titles = Object.keys(hierarchy).map((key) => - sanitizeValue(hierarchy[key]!.value), + sanitizeAlgoliaHtml(hierarchy[key]!.value), ); return { title: titles.pop()!, url: processSearchResultUrl(url), summary: snippet.content - ? `${sanitizeValue(snippet.content.value)}...` + ? `${sanitizeAlgoliaHtml(snippet.content.value)}...` : '', breadcrumbs: titles, }; @@ -535,7 +530,7 @@ function SearchPageContent(): ReactNode {
  • @@ -547,7 +542,7 @@ function SearchPageContent(): ReactNode { {summary && (