diff --git a/common/pagetypeclassifier/pagetypeclassifier.go b/common/pagetypeclassifier/pagetypeclassifier.go
index 6e62ea2b..13fb60b2 100644
--- a/common/pagetypeclassifier/pagetypeclassifier.go
+++ b/common/pagetypeclassifier/pagetypeclassifier.go
@@ -2,8 +2,11 @@ package pagetypeclassifier
 
 import (
 	_ "embed"
+	"fmt"
+	"sync"
 
 	htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
+	"github.com/microcosm-cc/bluemonday"
 	"github.com/projectdiscovery/utils/ml/naive_bayes"
 )
 
@@ -30,7 +33,65 @@ func (n *PageTypeClassifier) Classify(html string) string {
 	return n.classifier.Classify(text)
 }
 
+var (
+	// sanitizerPolicy is the bluemonday policy shared by every htmlToText call.
+	// It is assigned exactly once, inside sanitizerPolicyOnce.
+	sanitizerPolicy     *bluemonday.Policy
+	sanitizerPolicyOnce sync.Once
+)
+
+// getSanitizerPolicy returns the shared sanitizer policy, building it on first
+// use. The policy keeps a small allowlist of text-oriented elements so that
+// other markup is dropped before the HTML is handed to the markdown converter.
+// Allowlisted elements (div, span, lists, ...) are kept as-is, so deeply nested
+// input can still reach the parser; htmlToText guards that case with recover.
+func getSanitizerPolicy() *bluemonday.Policy {
+	sanitizerPolicyOnce.Do(func() {
+		p := bluemonday.NewPolicy()
+		// Allowlist of block-level and inline text elements.
+		p.AllowElements("p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6")
+		p.AllowElements("strong", "em", "b", "i", "u")
+		p.AllowElements("ul", "ol", "li")
+		p.AllowElements("blockquote", "pre", "code")
+		// Only bluemonday's standard attribute set is enabled; style is not added.
+		p.AllowStandardAttributes()
+		sanitizerPolicy = p
+	})
+	return sanitizerPolicy
+}
+
 // htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
-func htmlToText(html string) (string, error) {
-	return htmltomarkdown.ConvertString(html)
+// The 512 node limit in golang.org/x/net/html is hardcoded and cannot be increased.
+// Strategy:
+//
+//  1. Sanitize the HTML with bluemonday to drop markup outside the allowlist.
+//  2. Convert the sanitized HTML to markdown.
+//  3. If the conversion panics, recover and return an empty string with an error.
+//
+// Input that sanitizes to an empty string yields ("", nil).
+func htmlToText(html string) (text string, err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			// Named results let this deferred func replace the returned values.
+			text = ""
+			if cause, ok := r.(error); ok {
+				// Wrap error-valued panics so callers can use errors.Is/As.
+				err = fmt.Errorf("html parser panic: %w", cause)
+				return
+			}
+			err = fmt.Errorf("html parser panic: %v", r)
+		}
+	}()
+
+	sanitizedHTML := getSanitizerPolicy().Sanitize(html)
+	if sanitizedHTML == "" {
+		// Nothing survived sanitization, so there is nothing to convert.
+		return "", nil
+	}
+
+	text, err = htmltomarkdown.ConvertString(sanitizedHTML)
+	if err != nil {
+		return "", err
+	}
+	return text, nil
 }
diff --git a/common/pagetypeclassifier/pagetypeclassifier_test.go b/common/pagetypeclassifier/pagetypeclassifier_test.go
index 4d0e9a78..944b12ce 100644
--- a/common/pagetypeclassifier/pagetypeclassifier_test.go
+++ b/common/pagetypeclassifier/pagetypeclassifier_test.go
@@ -7,7 +7,6 @@ import (
 )
 
 func TestPageTypeClassifier(t *testing.T) {
-
 	t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
 		epc, err := New()
 		require.NoError(t, err)
@@ -57,3 +56,50 @@ func TestPageTypeClassifier(t *testing.T) {
 		`))
 	})
+
+	t.Run("test panic recovery with deeply nested HTML", func(t *testing.T) {
+		epc, err := New()
+		require.NoError(t, err)
+		require.NotNil(t, epc)
+
+		// Generate deeply nested HTML that exceeds the 512 node stack limit
+		// This should trigger a panic in the HTML parser, which we recover from
+		deeplyNestedHTML := "<div>"
+		for i := 0; i < 600; i++ {
+			deeplyNestedHTML += "<div>"
+		}
+		deeplyNestedHTML += "Some text content"
+		for i := 0; i < 600; i++ {
+			deeplyNestedHTML += "</div>"
+		}
+		deeplyNestedHTML += "</div>"
+
+		// Should not panic and should return "other" when htmlToText returns empty string
+		result := epc.Classify(deeplyNestedHTML)
+		require.Equal(t, "other", result)
+	})
+
+	t.Run("test htmlToText with deeply nested HTML", func(t *testing.T) {
+		// Generate deeply nested HTML that exceeds the 512 node stack limit
+		deeplyNestedHTML := "<div>"
+		for i := 0; i < 600; i++ {
+			deeplyNestedHTML += "<div>"
+		}
+		deeplyNestedHTML += "Some text content"
+		for i := 0; i < 600; i++ {
+			deeplyNestedHTML += "</div>"
+		}
+		deeplyNestedHTML += "</div>"
+
+		// Should not panic and should return empty string with error on panic
+		result, err := htmlToText(deeplyNestedHTML)
+		require.Error(t, err)
+		require.Equal(t, "", result)
+	})
+
+	t.Run("test htmlToText with normal HTML", func(t *testing.T) {
+		normalHTML := `<html><body><h1>Title</h1><p>Some content here</p></body></html>`
+		result, err := htmlToText(normalHTML)
+		require.NoError(t, err)
+		require.NotEmpty(t, result)
+	})
 }