From 0b6b2c153d6d64bf5a79f8d4455762a0cc721913 Mon Sep 17 00:00:00 2001 From: David Raphi Date: Tue, 20 Jan 2026 19:34:04 +0530 Subject: [PATCH 1/2] feat: [DRO-65] add comprehensive SEO meta tag rewriting and configuration --- .gitignore | 1 + src/cli/templates/config.js | 177 ++++++++++++++++---- src/handlers/handle-sitemap.ts | 21 ++- src/helpers/index.ts | 4 +- src/proxy.ts | 4 +- src/rewriters/data-rewriter.ts | 42 ++++- src/rewriters/header-rewriter.ts | 27 ++- src/rewriters/meta-rewriter.ts | 277 ++++++++++++++++++++++--------- src/types.ts | 34 ++++ 9 files changed, 455 insertions(+), 132 deletions(-) diff --git a/.gitignore b/.gitignore index 43e7680..01f257a 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,4 @@ dist .wrangler/ .vscode .git +test-results/ \ No newline at end of file diff --git a/src/cli/templates/config.js b/src/cli/templates/config.js index 4fdf149..096d768 100644 --- a/src/cli/templates/config.js +++ b/src/cli/templates/config.js @@ -5,60 +5,167 @@ import { HEADER_HTML_STRING } from './generated/_header-html-string.js'; /** @type {import('nooxy').NooxySiteConfig} */ export const SITE_CONFIG = { - // Site domain, example.com + // ============================================================================ + // REQUIRED: Basic Site Configuration + // ============================================================================ + + // Your custom domain (without https://) + // Example: 'example.com' or 'docs.example.com' domain: 'your-domain.com', - // Map slugs (short page names) to Notion page IDs - // '/' slug is your root page + + // Map URL slugs to Notion page IDs + // The '/' slug is your homepage + // Get page ID from Notion: Share > Copy link > extract the 32-char ID + // Example: https://notion.so/My-Page-abc123... -> 'abc123...' (32 chars) slugToPage: { '/': 'NOTION_HOME_PAGE_ID', - // '/contact': 'NOTION_PAGE_ID', // '/about': 'NOTION_PAGE_ID', - // Hint: you can use '/' in slug name to create subpages - // '/about/people': 'NOTION_PAGE_ID', + // '/contact': 'NOTION_PAGE_ID', + // Use '/' in slug for nested pages: + // '/docs/getting-started': 'NOTION_PAGE_ID', }, - // SEO metadata - // For main page link preview - // Rewrite meta tags for specific pages - // Use the Notion page ID as the key + + // Site name displayed in browser tabs and social previews + // Used for og:site_name meta tag + siteName: 'Your Site Name', + + // Your Notion workspace domain (without https://) + // Find it in your Notion page URL: https://YOUR-WORKSPACE.notion.site/... + // This prevents serving unintended Notion content through your domain + notionDomain: 'your-workspace.notion.site', + + // ============================================================================ + // OPTIONAL: Social Media & Branding + // ============================================================================ + + // Twitter/X handle for twitter:site meta tag (include @) + // Shows "via @handle" when your page is shared on Twitter + // twitterHandle: '@yourhandle', + + // Custom favicon URL (must be .ico format) + // If not set, uses Notion's default favicon + // siteIcon: 'https://example.com/favicon.ico', + + // ============================================================================ + // OPTIONAL: Page-Specific Metadata + // ============================================================================ + + // Override meta tags for specific pages + // Key: Notion page ID (32 chars), Value: metadata object + // Useful for custom titles/descriptions on important pages // pageMetadata: { // 'NOTION_PAGE_ID': { - // title: 'My Custom Page Title', - // description: 'My custom page description', - // image: 'https://imagehosting.com/images/page_preview.jpg', - // author: 'My Name', + // title: 'Custom Page Title', // and og:title + // description: 'Custom description', // meta description and og:description + // image: 'https://example.com/img.jpg', // og:image and twitter:image + // author: 'Author Name', // article:author meta tag // }, // }, - // og:site_name - siteName: 'Your Site Name', - // Social media links, optional - // twitter:site - // twitterHandle: '@mytwitter', - // URL to custom favicon.ico - // siteIcon: '', - // Additional safety: avoid serving extraneous Notion content from your website - // Use the value from your Notion like example.notion.site - notionDomain: 'example.notion.site', - // The 404 (not found) page is optional - // If you don't have one, the default 404 page will be used + + // ============================================================================ + // OPTIONAL: 404 Page + // ============================================================================ + + // Custom 404 page from Notion + // If not set, visitors see a generic 404 page // fof: { - // page: "NOTION_PAGE_ID", - // slug: "/404", // default + // page: 'NOTION_404_PAGE_ID', // Your custom 404 page ID + // slug: '/404', // URL path (default: '/404') // }, - // Subdomain redirects are optional - // But it is recommended to have one for www + + // ============================================================================ + // OPTIONAL: Subdomain Redirects + // ============================================================================ + + // Redirect subdomains to your main domain + // Common use: redirect www to non-www (or vice versa) // subDomains: { // www: { - // redirect: 'https://your-domain.com', + // redirect: 'https://example.com', // Redirects www.example.com -> example.com // }, // }, - // Google Font name, you can choose from https://fonts.google.com + + // ============================================================================ + // OPTIONAL: Typography & Analytics + // ============================================================================ + + // Google Font name from https://fonts.google.com + // Applies to all text on your site + // googleFont: 'Inter', // googleFont: 'Roboto', - // Set this to your Google Tag ID from Google Analytics - // googleTagID: 'GOOGLE_TAG_ID', + // googleFont: 'Open Sans', + + // Google Analytics 4 Measurement ID + // Find it: GA4 > Admin > Data Streams > your stream > Measurement ID + // Format: 'G-XXXXXXXXXX' + // googleTagID: 'G-XXXXXXXXXX', + + // ============================================================================ + // OPTIONAL: SEO Configuration + // ============================================================================ - // Custom JS, CSS, HTML for head and body of a Notion page + // seo: { + // // Enable/disable search engine indexing + // // true (default): Removes Notion's noindex, adds canonical URLs, allows crawling + // // false: Keeps noindex, hides site from search engines + // indexing: true, + // + // // Canonical domain for SEO + // // Use when nooxy runs on a subdomain but SEO should point to main domain + // // Example: Site on 'docs.example.com' but canonical URLs point to 'example.com' + // // All og:url, twitter:url, canonical tags, and sitemap URLs use this domain + // // canonicalDomain: 'example.com', + // + // // Path mapping for canonical domain + // // Maps paths from your nooxy domain to the canonical domain + // // Example: 'docs.example.com/' -> 'example.com/docs' + // // canonicalPathMap: { + // // '/': '/docs', + // // '/guide': '/docs/guide', + // // }, + // + // // Meta keywords for SEO (comma-separated) + // // Adds <meta name="keywords"> tag + // // keywords: 'keyword1, keyword2, keyword3', + // + // // Default author for all pages + // // Used when page-specific author isn't set in pageMetadata + // // Adds <meta name="author"> and article:author tags + // // defaultAuthor: 'Your Name', + // + // // Replace "Notion" branding in meta tags + // // Notion adds "Notion" to various meta tags by default + // // This replaces all occurrences with your brand name + // // If not set, uses siteName value + // // brandReplacement: 'Your Brand', + // + // // AI attribution meta tags + // // Helps AI systems (ChatGPT, Claude, Perplexity) properly credit your content + // // Adds: <meta name="ai:source_url"> and <meta name="ai:source_attribution"> + // // aiAttribution: 'Your Name - example.com', + // }, + + // ============================================================================ + // ADVANCED: Custom Code Injection + // ============================================================================ + + // Custom CSS injected into <head> + // Use for styling overrides, hiding Notion elements, custom themes + // Edit: ./generated/_head-css-string.js customHeadCSS: HEAD_CSS_STRING, + + // Custom JavaScript injected into <head> + // Runs before page content loads + // Edit: ./generated/_head-js-string.js customHeadJS: HEAD_JS_STRING, + + // Custom JavaScript injected before </body> + // Runs after page content loads + // Edit: ./generated/_body-js-string.js customBodyJS: BODY_JS_STRING, + + // Custom HTML injected into header area + // Use for navigation bars, announcement banners, etc. + // Edit: ./generated/_header-html-string.js customHeader: HEADER_HTML_STRING, }; diff --git a/src/handlers/handle-sitemap.ts b/src/handlers/handle-sitemap.ts index f18c708..075a272 100644 --- a/src/handlers/handle-sitemap.ts +++ b/src/handlers/handle-sitemap.ts @@ -1,17 +1,30 @@ import { NooxySiteConfigFull } from '../types'; +function escapeXml(str: string): string { + return str + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); +} + export function handleSitemap(siteConfig: NooxySiteConfigFull, protocol: string) { - const { domain, slugs } = siteConfig; - let sitemap = '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'; + const { domain, slugs, seo } = siteConfig; + const sitemapDomain = seo?.canonicalDomain || domain; + let sitemap = '<?xml version="1.0" encoding="UTF-8"?>\n'; + sitemap += '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'; slugs.forEach((slug) => { - sitemap += `<url><loc>${protocol}//${domain}${slug}</loc></url>`; + const mappedSlug = seo?.canonicalPathMap?.[slug] ?? slug; + const safeSlug = escapeXml(mappedSlug); + sitemap += `<url><loc>${protocol}//${sitemapDomain}${safeSlug}</loc></url>`; }); sitemap += '</urlset>'; const response = new Response(sitemap); - response.headers.set('content-type', 'application/xml'); + response.headers.set('content-type', 'application/xml; charset=utf-8'); return response; } diff --git a/src/helpers/index.ts b/src/helpers/index.ts index 07d8b8d..03429d6 100644 --- a/src/helpers/index.ts +++ b/src/helpers/index.ts @@ -105,6 +105,6 @@ export function isNotion404(pathname: string, slugToPage: Record<string, string> // Helper function to extract page ID from pathname export function extractPageId(input: string) { const path = input.split('?')[0]; - const match = path?.match(/([a-fA-F0-9]{32})(?=\/?$)/); - return match?.[1] ? match[1] : ''; + const match = path?.match(/([a-f0-9]{32})(?=\/?$)/i); + return match?.[1] ? match[1].toLowerCase() : ''; } diff --git a/src/proxy.ts b/src/proxy.ts index fec01b7..7655aa0 100644 --- a/src/proxy.ts +++ b/src/proxy.ts @@ -113,7 +113,7 @@ async function reverseProxy(request: Request, siteConfig: NooxySiteConfigFull): // For 304 Not Modified responses, return with null body if (response.status === 304) { - const modifiedResponseHeaders = modifyResponseHeaders(response.headers, hostname); + const modifiedResponseHeaders = modifyResponseHeaders(response.headers, hostname, siteConfig); return new Response(null, { status: response.status, statusText: response.statusText, @@ -128,7 +128,7 @@ async function reverseProxy(request: Request, siteConfig: NooxySiteConfigFull): const modifiedData = modifyResponseData(data, targetUrl.pathname, siteConfig, urlOrgState.protocol); // Modify response headers - const modifiedResponseHeaders = modifyResponseHeaders(response.headers, hostname); + const modifiedResponseHeaders = modifyResponseHeaders(response.headers, hostname, siteConfig); return new Response(modifiedData, { status: response.status, diff --git a/src/rewriters/data-rewriter.ts b/src/rewriters/data-rewriter.ts index e4b3c31..ec380d7 100644 --- a/src/rewriters/data-rewriter.ts +++ b/src/rewriters/data-rewriter.ts @@ -4,6 +4,18 @@ import { HEAD_JS_STRING } from './custom/generated/_head-js-string'; import { HEAD_CSS_STRING } from './custom/generated/_head-css-string'; import { rewriteMetaTags } from './meta-rewriter'; +function escapeForJS(str: string): string { + return str + .replace(/\\/g, '\\\\') // Escape backslashes first + .replace(/'/g, "\\'") // Escape single quotes + .replace(/"/g, '\\"') // Escape double quotes + .replace(/\n/g, '\\n') // Escape newlines + .replace(/\r/g, '\\r') // Escape carriage returns + .replace(/\t/g, '\\t') // Escape tabs + .replace(/</g, '\\x3c') // Escape < to prevent </script> injection + .replace(/>/g, '\\x3e'); // Escape > for safety +} + // Helper function to modify response data export function modifyResponseData( responseData: string, @@ -12,6 +24,7 @@ export function modifyResponseData( protocol: string, ): string { const { + domain, notionDomain, slugToPage, pageToSlug, @@ -21,13 +34,17 @@ export function modifyResponseData( customHeadJS, customBodyJS, googleFont, + seo, } = siteConfig; let data = responseData; const notionDomainUrl = new URL(ensureHttpsUrl(notionDomain)).origin; - const customJSCode = `var notionDomain='${notionDomainUrl}',slugToPage=${JSON.stringify(slugToPage)},pageToSlug=${JSON.stringify(pageToSlug)},customHeader='${customHeader}';${HEAD_JS_STRING}`; + + const targetDomain = seo?.canonicalDomain || domain; + const safeCustomHeader = escapeForJS(customHeader || ''); + const customJSCode = `var notionDomain='${notionDomainUrl}',slugToPage=${JSON.stringify(slugToPage)},pageToSlug=${JSON.stringify(pageToSlug)},customHeader='${safeCustomHeader}';${HEAD_JS_STRING}`; const googleFontInject = googleFont ? `<link href='https://fonts.googleapis.com/css?family=${googleFont.replace( - ' ', + / /g, '+', )}:Regular,Bold,Italic&display=swap' rel='stylesheet'> <style>* { font-family: "${googleFont}" !important; }</style>` @@ -65,17 +82,26 @@ export function modifyResponseData( // // console.log('[DEBUG]', pathname, found); // } + // IMPORTANT: This must happen BEFORE script injection to avoid replacing the notionDomain variable + const escapedNotionDomain = notionDomain.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const notionDomainPattern = new RegExp(`https?://${escapedNotionDomain}(?=[/?"'>#\\s]|$)`, 'gi'); + if (/^\/_assets\/[^/]*\.js$/.test(pathname)) { data = data.replace(/window\.location\.href(?=[^=]|={2,})/g, 'window.nooxy.href()'); // Exclude 'window.location.href=' but not 'window.location.href==' - } else if (data.includes('<html') || data.includes('<!DOCTYPE')) { - // Assume HTML - // Apply meta tag rewriting - data = rewriteMetaTags(data, pathname, siteConfig, protocol) + } else if (/<html/i.test(data) || /<!DOCTYPE/i.test(data)) { + // Assume HTML (case-insensitive check for <html> and <!DOCTYPE>) + // Apply meta tag rewriting first + data = rewriteMetaTags(data, pathname, siteConfig, protocol); + + // Replace notion domain URLs in the original HTML content (before script injection) + data = data.replace(notionDomainPattern, `${protocol}//${targetDomain}`); + + data = data .replace( - '</head>', + /<\/head>/i, `${googleFontInject}<script>${customHeadJS}</script><script>${customJSCode}</script><style>${customHeadCSS}</style><style>${HEAD_CSS_STRING}</style></head>`, ) - .replace('</body>', `<script>${customBodyJS}</script>${ga}</body>`); + .replace(/<\/body>/i, `<script>${customBodyJS}</script>${ga}</body>`); } return ( diff --git a/src/rewriters/header-rewriter.ts b/src/rewriters/header-rewriter.ts index 0437647..2b0a6fc 100644 --- a/src/rewriters/header-rewriter.ts +++ b/src/rewriters/header-rewriter.ts @@ -1,3 +1,5 @@ +import { NooxySiteConfigFull } from '../types'; + // Helper function to modify request headers export function modifyRequestHeaders(headers: Headers): Headers { const newHeaders = new Headers(headers); @@ -6,25 +8,34 @@ export function modifyRequestHeaders(headers: Headers): Headers { } // Helper function to modify response headers -export function modifyResponseHeaders(headers: Headers, hostname: string): Headers { +export function modifyResponseHeaders(headers: Headers, hostname: string, siteConfig?: NooxySiteConfigFull): Headers { const newHeaders = new Headers(headers); - // Handle cookies + // Handle cookies - rewrite Notion domain to custom domain const cookies = headers.get('set-cookie'); if (cookies) { // Note: set-cookie can have multiple values, but Headers.get() returns them comma-separated - // For proper cookie handling, we'd need to parse them individually - const modifiedCookies = cookies.replace(/((?:^|; )Domain=)(?:[^.]+\.)?notion\.site(;|$)/gi, `$1${hostname}$2`); + // Handle various Notion domain patterns: + // - notion.site, *.notion.site + // - notion.so, *.notion.so + // - www.notion.so + const modifiedCookies = cookies.replace( + /((?:^|; )Domain=)(?:[^.]+\.)?notion\.(?:site|so)(;|$)/gi, + `$1${hostname}$2`, + ); newHeaders.set('set-cookie', modifiedCookies); } - // Handle CSP + // This allows embedding the proxied site in iframes + newHeaders.delete('x-frame-options'); + + // Handle CSP - add common domains that might be used by proxied sites const csp = headers.get('content-security-policy'); if (csp) { let modifiedCsp = csp .replace( /(?=(script-src|connect-src) )[^;]*/g, - '$& https://www.googletagmanager.com https://www.google-analytics.com', + '$& https://www.googletagmanager.com https://www.google-analytics.com https://static.cloudflareinsights.com https://cloudflareinsights.com', ) .replace(/(?=(style-src) )[^;]*/g, '$& https://fonts.googleapis.com') .replace(/(?=(font-src) )[^;]*/g, '$& https://fonts.gstatic.com') @@ -36,5 +47,9 @@ export function modifyResponseHeaders(headers: Headers, hostname: string): Heade newHeaders.set('content-security-policy', modifiedCsp); } + if (siteConfig?.seo?.indexing !== false) { + newHeaders.set('X-Robots-Tag', 'index, follow'); + } + return newHeaders; } diff --git a/src/rewriters/meta-rewriter.ts b/src/rewriters/meta-rewriter.ts index 0bb2f94..bd4aea2 100644 --- a/src/rewriters/meta-rewriter.ts +++ b/src/rewriters/meta-rewriter.ts @@ -1,6 +1,47 @@ import { extractPageId } from '../helpers'; import { NooxySiteConfigFull } from '../types'; +function getCanonicalUrl( + protocol: string, + domain: string, + slug: string, + pageId: string, + siteConfig: NooxySiteConfigFull, +): string { + const { seo } = siteConfig; + const canonicalDomain = seo?.canonicalDomain; + const canonicalPathMap = seo?.canonicalPathMap; + + const path = slug === '/' ? '/' : slug || `/${pageId}`; + const targetDomain = canonicalDomain || domain; + const finalPath = canonicalDomain && canonicalPathMap?.[path] ? canonicalPathMap[path] : path; + + return `${protocol}//${targetDomain}${finalPath}`; +} + +function replaceBranding(text: string, replacement: string): string { + return text.replace(/\bNotion\b/g, replacement); +} + +function escapeHtml(text: string): string { + return text + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); +} + +// Helper to create flexible meta tag pattern that handles: +// - Single or double quotes +// - Any attribute order +// - Extra whitespace (but NOT newlines - prevents cross-tag matching) +// - Self-closing or not +function createMetaPattern(nameOrProperty: 'name' | 'property', value: string): RegExp { + const escapedValue = value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + return new RegExp(`<meta[^>]*${nameOrProperty}=["']${escapedValue}["'][^>]*\\/?>`, 'gi'); +} + // Rewrite meta tags and page metadata export function rewriteMetaTags( responseData: string, @@ -8,126 +49,212 @@ export function rewriteMetaTags( siteConfig: NooxySiteConfigFull, protocol: string, ): string { - const { siteName, domain, pageToSlug, pageMetadata, twitterHandle } = siteConfig; + const { siteName, domain, pageToSlug, pageMetadata, twitterHandle, seo } = siteConfig; const pageId = extractPageId(pathname); - const isRootPage = pageToSlug[pageId] === '/'; + const slug = pageToSlug[pageId] || ''; const pageMeta = pageMetadata?.[pageId]; const pageTitle = pageMeta?.title; const pageDescription = pageMeta?.description; const pageImage = pageMeta?.image; - const pageAuthor = pageMeta?.author; + const pageAuthor = pageMeta?.author || seo?.defaultAuthor; + + // SEO settings with defaults + const indexingEnabled = seo?.indexing !== false; // Default: true + const brandReplacement = seo?.brandReplacement || siteName; + const keywords = seo?.keywords; + const aiAttribution = seo?.aiAttribution; + + const canonicalUrl = getCanonicalUrl(protocol, domain, slug, pageId, siteConfig); - const finalUrl = isRootPage - ? `${protocol}//${domain}/` - : pageToSlug[pageId] - ? `${protocol}//${domain}${pageToSlug[pageId]}` - : `${protocol}//${domain}/${pageId}`; + let result = responseData; - let result = responseData + // === SEO === + if (indexingEnabled) { + result = result + // Pattern 1: name before content + .replace(/<meta[^>]*name=["']robots["'][^>]*content=["'][^"']*(?:noindex|none)[^"']*["'][^>]*\/?>/gi, '') + // Pattern 2: content before name + .replace(/<meta[^>]*content=["'][^"']*(?:noindex|none)[^"']*["'][^>]*name=["']robots["'][^>]*\/?>/gi, ''); + + // Remove existing canonical tags (we'll add our own) + result = result.replace(/<link[^>]*rel=["']canonical["'][^>]*\/?>/gi, ''); + } + + // === BRANDING REPLACEMENT === + result = result.replace(/<title[^>]*>([^<]*)<\/title>/gi, (_match, content) => { + return `<title>${replaceBranding(content, brandReplacement)}`; + }); + + // Replace "Notion" in meta content attributes + result = result.replace( + /(]*(?:name|property)=["'](?:description|og:title|og:description|og:site_name|twitter:title|twitter:description)["'][^>]*content=["'])([^"']*)(["'])/gi, + (_match, prefix, content, quote) => { + return `${prefix}${replaceBranding(content, brandReplacement)}${quote}`; + }, + ); + + // handle reversed attribute order (content before name/property) + result = result.replace( + /(]*content=["'])([^"']*)(["'][^>]*(?:name|property)=["'](?:description|og:title|og:description|og:site_name|twitter:title|twitter:description)["'])/gi, + (_match, prefix, content, suffix) => { + return `${prefix}${replaceBranding(content, brandReplacement)}${suffix}`; + }, + ); + + // Catch-all: Replace "Notion" in any remaining meta tags not caught above + result = result.replace(/(]*>)/gi, (match) => { + return match.replace(/\bNotion\b/g, brandReplacement); + }); + + // === STANDARD META TAG REWRITES === + result = result + // twitter:site - flexible pattern .replace( - //gi, + createMetaPattern('name', 'twitter:site'), twitterHandle ? `` : '', ) + // twitter:url - flexible pattern + .replace(createMetaPattern('name', 'twitter:url'), ``) + // og:site_name - flexible pattern + .replace(createMetaPattern('property', 'og:site_name'), ``) + // og:url - flexible pattern + .replace(createMetaPattern('property', 'og:url'), ``) + // Remove apple-itunes-app (handles both name and property) + .replace(/]*(?:name|property)=["']apple-itunes-app["'][^>]*\/?>/gi, '') + // noscript refresh redirect .replace( - //gi, - ``, - ) - .replace( - //gi, - ``, - ) - .replace(//gi, ``) - .replace(/]*\/?>/gi, '') - .replace( - /