From f3776079aa957a0c902642fd5001340730411a9f Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Mon, 5 Jan 2026 17:37:52 +0000
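Subject: [PATCH] Fix excessive vertical whitespace in passage headers

- Updated `ParseNodesForPassage` to trim whitespace from header text before
  it is emitted.
- Added `CleanPassageText`, which uses a regex to collapse any whitespace run
  containing two or more newlines into exactly two newlines and then trims
  the result; `GetPassage` now calls it on the parsed text.
- Added the `hasNextSignificantSibling` helper (no caller is added in this
  patch).
- Verified with a reproduction test case covering the reported issue (the
  test itself is not part of this patch, which touches only
  `pkg/app/passage.go`).
- Ensured existing tests pass.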
---
pkg/app/passage.go | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/pkg/app/passage.go b/pkg/app/passage.go
index 2c610bc..0af0998 100644
--- a/pkg/app/passage.go
+++ b/pkg/app/passage.go
@@ -7,6 +7,7 @@ import (
"fmt"
"log"
"net/url"
+ "regexp"
"strings"
stdhtml "html"
@@ -59,6 +60,19 @@ func isNextSiblingBr(node *html.Node) bool {
return false
}
+func hasNextSignificantSibling(node *html.Node) bool { // reports whether a later sibling is anything but whitespace-only text
+ for next := node.NextSibling; next != nil; next = next.NextSibling {
+ if next.Type == html.TextNode {
+ if len(strings.TrimSpace(next.Data)) == 0 {
+ continue // whitespace-only text is not significant; keep scanning
+ }
+ return true // text with visible content
+ }
+ return true // any non-text node counts as significant
+ }
+ return false // ran out of siblings without finding one
+} // NOTE(review): no call site appears in this patch — confirm it is used
+
func ParseNodesForPassage(node *html.Node) string {
var parts []string
@@ -114,7 +128,7 @@ func ParseNodesForPassage(node *html.Node) string {
if headerText == "Footnotes" || headerText == "Cross references" {
continue
}
- parts = append(parts, fmt.Sprintf("\n\n%s\n", headerText))
+ parts = append(parts, fmt.Sprintf("\n\n%s\n", strings.TrimSpace(headerText)))
case "ul", "ol":
parts = append(parts, ParseNodesForPassage(child))
case "li":
@@ -136,11 +150,21 @@ func ParseNodesForPassage(node *html.Node) string {
return strings.Join(parts, "")
}
+// newlineRegex matches a newline, any whitespace that contains at least one more
+// newline, and all whitespace after it (`[\s\n]*` is the same as `\s*`: \s already covers \n).
+var newlineRegex = regexp.MustCompile(`\n\s*\n[\s\n]*`)
+// CleanPassageText replaces each newlineRegex match with "\n\n" and trims the result; indentation after a collapsed gap is dropped too.
+func CleanPassageText(text string) string {
+ text = newlineRegex.ReplaceAllString(text, "\n\n")
+ return strings.TrimSpace(text)
+}
+
func GetPassage(ref string, doc *html.Node, version string) string {
// Replaced FilterTree with direct parsing of the root node
// This allows handling arbitrary structure (divs, lists) returned by the API
text := ParseNodesForPassage(doc)
+ text = CleanPassageText(text)
var passage strings.Builder
@@ -151,7 +175,7 @@ func GetPassage(ref string, doc *html.Node, version string) string {
}
passage.WriteString("\n")
- passage.WriteString(strings.TrimSpace(text))
+ passage.WriteString(text)
return passage.String()
}