Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 57 additions & 2 deletions common/pagetypeclassifier/pagetypeclassifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@ package pagetypeclassifier

import (
_ "embed"
"sync"

"fmt"

htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/microcosm-cc/bluemonday"
"github.com/projectdiscovery/utils/ml/naive_bayes"
)

Expand All @@ -30,7 +34,58 @@ func (n *PageTypeClassifier) Classify(html string) string {
return n.classifier.Classify(text)
}

var (
// sanitizerPolicy is an aggressive bluemonday policy that strips most HTML
// to reduce nesting depth and prevent parser stack overflow.
// It starts out nil and is assigned inside getSanitizerPolicy, so callers
// should obtain it through that function rather than reading it directly.
sanitizerPolicy *bluemonday.Policy
// sanitizerPolicyOnce guards the one-time construction of sanitizerPolicy.
sanitizerPolicyOnce sync.Once
)

// getSanitizerPolicy builds the shared bluemonday policy on its first call and
// hands back that same instance on every call after that.
//
// The policy is intentionally strict: only a short allowlist of text-oriented
// elements is kept, so the sanitized markup is shallower and the downstream
// HTML parser is less likely to overflow its element stack.
func getSanitizerPolicy() *bluemonday.Policy {
	sanitizerPolicyOnce.Do(func() {
		// Allowlisted elements, grouped by role: blocks and headings,
		// inline emphasis, lists, and quoted/preformatted text.
		allowed := [][]string{
			{"p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6"},
			{"strong", "em", "b", "i", "u"},
			{"ul", "ol", "li"},
			{"blockquote", "pre", "code"},
		}

		policy := bluemonday.NewPolicy()
		for _, group := range allowed {
			policy.AllowElements(group...)
		}

		// Only bluemonday's standard attribute set is enabled; style is
		// deliberately left out because it can cause nesting issues.
		policy.AllowStandardAttributes()

		sanitizerPolicy = policy
	})

	return sanitizerPolicy
}

// htmlToText safely converts HTML to text and protects against panics from Go's HTML parser.
func htmlToText(html string) (string, error) {
return htmltomarkdown.ConvertString(html)
// The 512 node limit in golang.org/x/net/html is hardcoded and cannot be
// raised, so this function works around it instead:
//   - the input is always run through the bluemonday policy first, which drops
//     non-allowlisted elements and reduces nesting;
//   - the sanitized markup is then converted to markdown;
//   - a panic raised anywhere along the way is recovered and reported to the
//     caller as an error together with an empty string.
//
// An input that sanitizes to nothing, or converts to nothing, yields ("", nil).
func htmlToText(html string) (text string, err error) {
	// Named results are required here: the deferred handler overwrites them
	// when it intercepts a panic.
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		text, err = "", fmt.Errorf("html parser panic: %v", r)
	}()

	// Strip everything outside the allowlist before the converter sees it.
	cleaned := getSanitizerPolicy().Sanitize(html)
	if cleaned == "" {
		return "", nil
	}

	converted, convErr := htmltomarkdown.ConvertString(cleaned)
	switch {
	case convErr != nil:
		// Never hand back partial output alongside an error.
		return "", convErr
	case converted == "":
		return "", nil
	}

	return converted, nil
}
48 changes: 47 additions & 1 deletion common/pagetypeclassifier/pagetypeclassifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
)

func TestPageTypeClassifier(t *testing.T) {

t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
epc, err := New()
require.NoError(t, err)
Expand Down Expand Up @@ -56,4 +55,51 @@ func TestPageTypeClassifier(t *testing.T) {
</html>
`))
})

// Sub-test: Classify must not panic on pathologically deep markup and is
// expected to report the "other" label for it.
t.Run("test panic recovery with deeply nested HTML", func(t *testing.T) {
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)

// Generate deeply nested HTML that exceeds the 512 node stack limit
// This should trigger a panic in the HTML parser, which we recover from
// One outer <div> plus 600 <div><span> pairs leaves 1201 elements open
// at the point where the text is written.
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
}
deeplyNestedHTML += "Some text content"
// Close every pair opened above, then the outer <div>.
for i := 0; i < 600; i++ {
deeplyNestedHTML += "</span></div>"
}
deeplyNestedHTML += "</div>"

// Should not panic and should return "other" when htmlToText returns empty string
// NOTE(review): the fallback to "other" lives in Classify, whose body is
// not fully visible here - confirm against that method.
result := epc.Classify(deeplyNestedHTML)
require.Equal(t, "other", result)
})

// Sub-test: htmlToText itself must turn the deep-nesting failure into an
// error and an empty string rather than letting a panic escape.
t.Run("test htmlToText with deeply nested HTML", func(t *testing.T) {
// Generate deeply nested HTML that exceeds the 512 node stack limit
// Same shape as the Classify sub-test: one outer <div> wrapping 600
// <div><span> pairs around a single text node.
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
}
deeplyNestedHTML += "Some text content"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "</span></div>"
}
deeplyNestedHTML += "</div>"

// Should not panic and should return empty string with error on panic
// NOTE(review): this presumes the nesting survives sanitization (div and
// span are both on the sanitizer allowlist) so the conversion step still
// fails - confirm if the allowlist changes.
result, err := htmlToText(deeplyNestedHTML)
require.Error(t, err)
require.Equal(t, "", result)
})

// Sub-test: ordinary, shallow markup must convert without error and produce
// non-empty text.
t.Run("test htmlToText with normal HTML", func(t *testing.T) {
// A heading and a paragraph; both h1 and p are on the sanitizer allowlist.
normalHTML := `<html><body><h1>Title</h1><p>Some content here</p></body></html>`
result, err := htmlToText(normalHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
})
}
Loading