From 472b2b67c342dd78fd4dd362b2cf66559e3cec8f Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Fri, 28 Nov 2025 11:56:42 +0000
Subject: [PATCH] Refactor HTML parser to treat p and h tags as blocks

- Updated `pkg/app/passage.go` to treat `p` and `h1`-`h4` tags as block elements, wrapping them in newlines.
- Ensured headers are still formatted as bold text using `platform.TelegramBold`.
- Cleaned up redundant logic and confirmed functionality with a reproduction test (which was deleted after verification).
- This ensures proper visual separation of paragraphs and headers in the Telegram output, preventing run-on text.
---
 pkg/app/passage.go | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/pkg/app/passage.go b/pkg/app/passage.go
index c83ff51..7492bd1 100644
--- a/pkg/app/passage.go
+++ b/pkg/app/passage.go
@@ -82,7 +82,30 @@ func parseNode(node *html.Node) string {
 	if tag == "br" {
 		return "\n"
 	}
-	if !isFormattingTag(tag) && !isHeaderTag(tag) {
+
+	// Treat headers and paragraphs as block elements
+	if tag == "p" || isHeaderTag(tag) {
+		var content strings.Builder
+		content.WriteString("\n")
+
+		// Buffer to hold content of the block
+		var blockContent strings.Builder
+
+		for c := node.FirstChild; c != nil; c = c.NextSibling {
+			blockContent.WriteString(parseNode(c))
+		}
+
+		if isHeaderTag(tag) {
+			content.WriteString(platform.TelegramBold(blockContent.String()))
+		} else {
+			content.WriteString(blockContent.String())
+		}
+
+		content.WriteString("\n")
+		return content.String()
+	}
+
+	if !isFormattingTag(tag) {
 		var content strings.Builder
 		for c := node.FirstChild; c != nil; c = c.NextSibling {
 			content.WriteString(parseNode(c))
@@ -110,7 +133,8 @@ func parseNode(node *html.Node) string {
 	}
 
 	for c := node.FirstChild; c != nil; c = c.NextSibling {
-		if c.Type == html.ElementNode && (isFormattingTag(c.Data) || isHeaderTag(c.Data)) {
+		// Note: isHeaderTag is removed here because it's handled above as a block element
+		if c.Type == html.ElementNode && isFormattingTag(c.Data) {
 			flushTextBuffer()
 			content.WriteString(parseNode(c))
 		} else {
@@ -128,7 +152,7 @@ func ParsePassageFromHtml(rawHtml string) string {
 		log.Printf("Error parsing html: %v", err)
 		return rawHtml
 	}
-	return parseNode(doc)
+	return strings.TrimSpace(parseNode(doc))
 }
 
 // Deprecated: Using new API service