Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ function TOCItemTree({
<Link
to={`#${heading.id}`}
className={linkClassName ?? undefined}
// Developer provided the HTML, so assume it's safe.
// HTML is generated at build time from markdown and properly escaped
dangerouslySetInnerHTML={{__html: heading.value}}
/>
<TOCItemTree
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

import {sanitizeHtml, sanitizeAlgoliaHtml} from '../htmlSanitizer';

describe('sanitizeHtml', () => {
  it('preserves safe formatting tags', () => {
    // Allowlisted tags without attributes must round-trip unchanged.
    const untouched = [
      '<em>highlighted</em>',
      '<mark>marked</mark>',
      '<strong>strong</strong>',
      '<b>bold</b>',
      '<i>italic</i>',
    ];
    untouched.forEach((markup) => {
      expect(sanitizeHtml(markup)).toBe(markup);
    });
  });

  it('preserves allowed class attributes', () => {
    const highlighted = '<em class="highlight">text</em>';
    expect(sanitizeHtml(highlighted)).toBe(highlighted);
  });

  it('strips dangerous tags', () => {
    // Each of these must vanish completely, leaving no residue behind.
    const removedEntirely = [
      '<script>alert("xss")</script>',
      '<img src="x" onerror="alert(1)">',
      '<iframe src="evil.com"></iframe>',
      '<object data="evil.swf"></object>',
      '<embed src="evil.swf">',
    ];
    removedEntirely.forEach((markup) => {
      expect(sanitizeHtml(markup)).toBe('');
    });
  });

  it('strips dangerous attributes', () => {
    // The tag itself is allowed; only the attribute has to go.
    const withForbiddenAttribute = [
      '<em onclick="alert(1)">text</em>',
      '<em onload="alert(1)">text</em>',
      '<em style="color:red">text</em>',
    ];
    withForbiddenAttribute.forEach((markup) => {
      expect(sanitizeHtml(markup)).toBe('<em>text</em>');
    });
  });

  it('escapes HTML entities in text content', () => {
    // Already-escaped entities are preserved as-is (not double-escaped)
    const alreadyEscaped = ['&lt;script&gt;', '<em>&lt;test&gt;</em>'];
    alreadyEscaped.forEach((markup) => {
      expect(sanitizeHtml(markup)).toBe(markup);
    });
  });

  it('handles nested tags correctly', () => {
    const cases: [string, string][] = [
      ['<em><strong>nested</strong></em>', '<em><strong>nested</strong></em>'],
      ['<em><script>evil</script>safe</em>', '<em>safe</em>'],
    ];
    cases.forEach(([input, expected]) => {
      expect(sanitizeHtml(input)).toBe(expected);
    });
  });

  it('handles mixed content', () => {
    const mixed = 'text <em>highlighted</em> more text';
    expect(sanitizeHtml(mixed)).toBe(mixed);
  });

  it('returns empty string for empty input', () => {
    const result = sanitizeHtml('');
    expect(result).toBe('');
  });

  it('handles malformed HTML gracefully', () => {
    const cases: [string, string][] = [
      // Regex-based sanitizer doesn't auto-close tags (acceptable trade-off)
      ['<em>unclosed', '<em>unclosed'],
      // Unopened closing tags are kept (harmless in HTML)
      ['unopened</em>', 'unopened</em>'],
    ];
    cases.forEach(([input, expected]) => {
      expect(sanitizeHtml(input)).toBe(expected);
    });
  });

  it('prevents javascript: protocol in attributes', () => {
    // <a> is not an allowed tag, so the whole tag (href included) is removed
    // and only the link text survives.
    const result = sanitizeHtml('<a href="javascript:alert(1)">link</a>');
    expect(result).toBe('link');
  });

  it('prevents data: protocol attacks', () => {
    const payload = '<img src="data:text/html,<script>alert(1)</script>">';
    expect(sanitizeHtml(payload)).toBe('');
  });
});

describe('sanitizeAlgoliaHtml', () => {
  it('replaces Algolia CSS classes', () => {
    const input =
      '<em class="algolia-docsearch-suggestion--highlight">text</em>';
    expect(sanitizeAlgoliaHtml(input)).toBe(
      '<em class="search-result-match">text</em>',
    );
  });

  it('sanitizes and replaces classes together', () => {
    // The highlight must be renamed while the trailing script is removed.
    const input =
      '<em class="algolia-docsearch-suggestion--highlight">safe</em><script>evil</script>';
    expect(sanitizeAlgoliaHtml(input)).toBe(
      '<em class="search-result-match">safe</em>',
    );
  });

  it('handles typical Algolia search result format', () => {
    const input =
      'This is a <em class="algolia-docsearch-suggestion--highlight">search</em> result';
    const output = sanitizeAlgoliaHtml(input);
    expect(output).toBe(
      'This is a <em class="search-result-match">search</em> result',
    );
  });
});
167 changes: 167 additions & 0 deletions packages/docusaurus-theme-search-algolia/src/client/htmlSanitizer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

import ExecutionEnvironment from '@docusaurus/ExecutionEnvironment';

/**
 * Lightweight, allowlist-based sanitizer for Algolia search result HTML.
 *
 * Algolia wraps matched terms in highlight tags (typically <em>) that must
 * survive sanitization so results can be styled. Any markup outside the
 * allowlists declared below is stripped before the HTML is rendered.
 */

/** Tag names (lowercase) that may appear in sanitized output. */
const ALLOWED_TAGS: readonly string[] = ['em', 'mark', 'strong', 'b', 'i'];

/** Attribute names (lowercase) that are kept on allowed tags. */
const ALLOWED_ATTRIBUTES: readonly string[] = ['class'];

/**
 * Sanitizes an HTML string so that only safe formatting tags remain.
 * This prevents XSS attacks while preserving search result highlighting.
 *
 * Behavior:
 * - Tags listed in `ALLOWED_TAGS` are kept, carrying only the attributes
 *   listed in `ALLOWED_ATTRIBUTES`.
 * - `script`, `style`, `iframe`, `object`, `embed` and `link` elements are
 *   removed together with their content.
 * - Any other element is unwrapped: the tag is dropped and its children are
 *   sanitized in place, so highlights nested in a disallowed wrapper survive.
 * - Text is HTML-escaped; comments and other node types are discarded.
 *
 * When no DOM is available (SSR), the work is delegated to the regex-based
 * fallback.
 *
 * @param html - Untrusted HTML fragment.
 * @returns The sanitized fragment, or an empty string for empty input.
 */
export function sanitizeHtml(html: string): string {
  if (!html) {
    return '';
  }

  // Without a DOM (SSR) there is no parser to rely on.
  if (!ExecutionEnvironment.canUseDOM) {
    return sanitizeHtmlRegex(html);
  }

  // Elements whose text content is code or markup rather than readable text.
  // Surfacing it (even escaped) would leak script/style source into the
  // visible result, so the whole subtree is discarded.
  const droppedWithContent = [
    'script',
    'style',
    'iframe',
    'object',
    'embed',
    'link',
  ];

  /** Serializes every child of `parent` through the sanitizer. */
  function serializeChildren(parent: Node): string {
    return Array.from(parent.childNodes)
      .map((child) => serializeNode(child))
      .join('');
  }

  /** Returns the sanitized HTML for a single node. */
  function serializeNode(node: Node): string {
    if (node.nodeType === Node.TEXT_NODE) {
      // textContent is already entity-decoded, so it has to be re-escaped.
      return escapeHtml(node.textContent ?? '');
    }

    if (node.nodeType !== Node.ELEMENT_NODE) {
      // Comments, processing instructions, etc. are never emitted.
      return '';
    }

    const element = node as Element;
    const tagName = element.tagName.toLowerCase();

    if (droppedWithContent.includes(tagName)) {
      return '';
    }

    const children = serializeChildren(element);

    if (!ALLOWED_TAGS.includes(tagName)) {
      // Unwrap: lose the tag, keep its (sanitized) children.
      return children;
    }

    // Rebuild the tag from scratch so that only allowlisted attributes with
    // escaped values can ever reach the output.
    const attributes = Array.from(element.attributes)
      .filter((attr) => ALLOWED_ATTRIBUTES.includes(attr.name.toLowerCase()))
      .map((attr) => `${attr.name}="${escapeHtml(attr.value)}"`)
      .join(' ');

    const attrString = attributes ? ` ${attributes}` : '';
    return `<${tagName}${attrString}>${children}</${tagName}>`;
  }

  // Only <body> is serialized; anything the parser hoists into <head> is
  // dropped.
  const doc = new DOMParser().parseFromString(html, 'text/html');
  return serializeChildren(doc.body);
}

/**
 * Regex-based sanitization for SSR/test environments.
 * Less robust than DOM-based parsing but works without browser APIs.
 *
 * The input is processed in two steps:
 * 1. `script`, `style`, `iframe`, `object`, `embed` and `link` elements that
 *    have a closing tag are removed together with their content.
 * 2. A single tokenizing pass walks the remaining string. Allowed tags are
 *    rebuilt from scratch (keeping only a quoted `class` attribute), all
 *    other tags are dropped, and every angle bracket that is not part of a
 *    recognized tag is escaped.
 *
 * Because step 2 is the only place where `<` can be written to the output,
 * removing one tag can never splice the surrounding fragments into a new
 * live tag. Text outside of tags is left untouched apart from the escaping
 * of stray `<` / `>`, so existing entities are not double-escaped.
 *
 * Tags are not balanced: unclosed or unopened allowed tags pass through.
 *
 * @param html - Untrusted HTML fragment.
 * @returns The sanitized fragment.
 */
function sanitizeHtmlRegex(html: string): string {
  // Step 1: discard elements whose content is code/markup, not readable text.
  const withoutRawContent = html.replace(
    /<(script|style|iframe|object|embed|link)\b[^>]*>[\s\S]*?<\/\1\s*>/gi,
    '',
  );

  // Step 2: one pass over closing tags, opening tags and stray brackets.
  // The attribute part must start with whitespace or "/", which keeps the
  // name/attribute split unambiguous and the regex linear.
  return withoutRawContent.replace(
    /<\/([a-z][^\s/>]*)(?:[\s/][^>]*)?>|<([a-z][^\s/>]*)((?:[\s/][^>]*)?)>|[<>]/gi,
    (
      token: string,
      closingName: string | undefined,
      openingName: string | undefined,
      rawAttributes: string | undefined,
    ): string => {
      if (closingName !== undefined) {
        const name = closingName.toLowerCase();
        return ALLOWED_TAGS.includes(name) ? `</${name}>` : '';
      }

      if (openingName !== undefined) {
        const name = openingName.toLowerCase();
        if (!ALLOWED_TAGS.includes(name)) {
          return '';
        }
        // Only a properly quoted class attribute is carried over; every
        // other attribute (event handlers, style, href, ...) is lost because
        // the tag is re-created rather than edited.
        const classMatch = /(?:^|[\s/])class\s*=\s*(?:"([^"]*)"|'([^']*)')/i.exec(
          rawAttributes ?? '',
        );
        const className = classMatch?.[1] ?? classMatch?.[2];
        return className === undefined
          ? `<${name}>`
          : `<${name} class="${escapeHtml(className)}">`;
      }

      // A bracket that is not part of any tag: neutralize it.
      return token === '<' ? '&lt;' : '&gt;';
    },
  );
}

/**
 * Replaces the characters that are significant in HTML text and attribute
 * values (`&`, `<`, `>`, `"`, `'`, `/`) with their entity equivalents.
 *
 * @param text - Raw text to embed in HTML.
 * @returns The text with every special character entity-encoded.
 */
function escapeHtml(text: string): string {
  return text.replace(/[&<>"'/]/g, (special) => {
    switch (special) {
      case '&':
        return '&amp;';
      case '<':
        return '&lt;';
      case '>':
        return '&gt;';
      case '"':
        return '&quot;';
      case "'":
        return '&#x27;';
      case '/':
        return '&#x2F;';
      default:
        // Unreachable: the pattern only matches the six characters above.
        return special;
    }
  });
}

/**
 * Prepares an Algolia search result value for rendering.
 *
 * Every occurrence of Algolia's highlight class name is first swapped for the
 * theme's own `search-result-match` class, then the result is passed through
 * `sanitizeHtml` to prevent XSS attacks.
 *
 * @param value - Raw HTML value from an Algolia hit.
 * @returns The renamed and sanitized HTML.
 */
export function sanitizeAlgoliaHtml(value: string): string {
  const rebranded = value
    .split('algolia-docsearch-suggestion--highlight')
    .join('search-result-match');

  return sanitizeHtml(rebranded);
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ export {
export {useSearchResultUrlProcessor} from './useSearchResultUrlProcessor';
export {useAlgoliaAskAi} from './useAlgoliaAskAi';
export {mergeFacetFilters} from './utils';
export {sanitizeHtml, sanitizeAlgoliaHtml} from './htmlSanitizer';
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import useDocusaurusContext from '@docusaurus/useDocusaurusContext';
import {
useAlgoliaThemeConfig,
useSearchResultUrlProcessor,
sanitizeAlgoliaHtml,
} from '@docusaurus/theme-search-algolia/client';
import Layout from '@theme/Layout';
import Heading from '@theme/Heading';
Expand Down Expand Up @@ -321,12 +322,6 @@ function SearchPageContent(): ReactNode {
return;
}

const sanitizeValue = (value: string) =>
value.replace(
/algolia-docsearch-suggestion--highlight/g,
'search-result-match',
);

const items = hits.map(
({
url,
Expand All @@ -338,13 +333,13 @@ function SearchPageContent(): ReactNode {
_snippetResult: {content?: {value: string}};
}) => {
const titles = Object.keys(hierarchy).map((key) =>
sanitizeValue(hierarchy[key]!.value),
sanitizeAlgoliaHtml(hierarchy[key]!.value),
);
return {
title: titles.pop()!,
url: processSearchResultUrl(url),
summary: snippet.content
? `${sanitizeValue(snippet.content.value)}...`
? `${sanitizeAlgoliaHtml(snippet.content.value)}...`
: '',
breadcrumbs: titles,
};
Expand Down Expand Up @@ -535,7 +530,7 @@ function SearchPageContent(): ReactNode {
<li
key={index}
className="breadcrumbs__item"
// Developer provided the HTML, so assume it's safe.
// HTML is sanitized to prevent XSS attacks
// eslint-disable-next-line react/no-danger
dangerouslySetInnerHTML={{__html: html}}
/>
Expand All @@ -547,7 +542,7 @@ function SearchPageContent(): ReactNode {
{summary && (
<p
className={styles.searchResultItemSummary}
// Developer provided the HTML, so assume it's safe.
// HTML is sanitized to prevent XSS attacks
// eslint-disable-next-line react/no-danger
dangerouslySetInnerHTML={{__html: summary}}
/>
Expand Down