From f3776079aa957a0c902642fd5001340730411a9f Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Mon, 5 Jan 2026 17:37:52 +0000
Subject: [PATCH] Fix excessive vertical whitespace in passage headers

- Updated `ParseNodesForPassage` to trim header text.
- Added `CleanPassageText` with regex cleanup to normalize newlines to max 2.
- Verified with reproduction test case covering the reported issue.
- Ensured existing tests pass.
---
Note: line breaks, blank context lines and indentation were reconstructed from
a whitespace-collapsed copy of this patch (blank lines placed to satisfy the
hunk line counts). Verify the context against pkg/app/passage.go before applying.

 pkg/app/passage.go | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/pkg/app/passage.go b/pkg/app/passage.go
index 2c610bc..0af0998 100644
--- a/pkg/app/passage.go
+++ b/pkg/app/passage.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"log"
 	"net/url"
+	"regexp"
 	"strings"
 
 	stdhtml "html"
@@ -59,6 +60,19 @@ func isNextSiblingBr(node *html.Node) bool {
 	return false
 }
 
+func hasNextSignificantSibling(node *html.Node) bool {
+	for next := node.NextSibling; next != nil; next = next.NextSibling {
+		if next.Type == html.TextNode {
+			if len(strings.TrimSpace(next.Data)) == 0 {
+				continue
+			}
+			return true
+		}
+		return true // Any element
+	}
+	return false
+}
+
 func ParseNodesForPassage(node *html.Node) string {
 	var parts []string
 
@@ -114,7 +128,7 @@ func ParseNodesForPassage(node *html.Node) string {
 			if headerText == "Footnotes" || headerText == "Cross references" {
 				continue
 			}
-			parts = append(parts, fmt.Sprintf("\n\n%s\n", headerText))
+			parts = append(parts, fmt.Sprintf("\n\n%s\n", strings.TrimSpace(headerText)))
 		case "ul", "ol":
 			parts = append(parts, ParseNodesForPassage(child))
 		case "li":
@@ -136,11 +150,21 @@ func ParseNodesForPassage(node *html.Node) string {
 	return strings.Join(parts, "")
 }
 
+// Collapse multiple newlines (potentially with spaces in between) to max 2 newlines
+// \n\s*\n[\s\n]* -> \n\n (whitespace following the last newline is consumed as well)
+var newlineRegex = regexp.MustCompile(`\n\s*\n[\s\n]*`)
+
+func CleanPassageText(text string) string {
+	text = newlineRegex.ReplaceAllString(text, "\n\n")
+	return strings.TrimSpace(text)
+}
+
 func GetPassage(ref string, doc *html.Node, version string) string {
 	// Replaced FilterTree with direct parsing of the root node
 	// This allows handling arbitrary structure (divs, lists) returned by the API
 
 	text := ParseNodesForPassage(doc)
+	text = CleanPassageText(text)
 
 	var passage strings.Builder
 
@@ -151,7 +175,7 @@ func GetPassage(ref string, doc *html.Node, version string) string {
 	}
 
 	passage.WriteString("\n")
-	passage.WriteString(strings.TrimSpace(text))
+	passage.WriteString(text)
 
 	return passage.String()
 }