Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
chapter_commentary_parts,
collect_unique_book_codes,
collect_unique_lang_codes,
demote_headings_by_one,
demote_headings_by_two,
filter_books_by_book_code,
filter_books_by_lang_code,
get_book_intros,
Expand All @@ -26,6 +24,7 @@
tq_verses_parts,
rg_verses_parts,
)
from doc.utils.text_utils import demote_headings_by_one
from doc.domain.bible_books import BOOK_CHAPTERS, BOOK_ID_MAP, BOOK_NAMES
from doc.domain.model import (
AssemblyLayoutEnum,
Expand Down
32 changes: 3 additions & 29 deletions backend/doc/domain/assembly_strategies/assembly_strategy_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
Utility functions used by assembly_strategies.
"""

from re import compile, IGNORECASE, Match, search
from typing import Optional, Sequence, TypeVar
from re import search
from typing import Optional, Sequence

from doc.config import settings
from doc.domain.bible_books import BOOK_ID_MAP
Expand All @@ -19,6 +19,7 @@
from doc.reviewers_guide.model import RGBook
from doc.reviewers_guide.render_to_html import render_chapter
from doc.utils.tw_utils import translation_words_content
from doc.utils.text_utils import demote_headings_by_one
from docx.document import Document as DocxDocument
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_BREAK
Expand All @@ -29,11 +30,6 @@
logger = settings.logger(__name__)


# Matches the start of an opening or closing HTML heading tag ("<h1" .. "<h6"
# or "</h1" .. "</h6"), case-insensitively. Group 1 captures the level digit.
HEADING_RE = compile(r"</?h([1-6])\b", IGNORECASE)

T = TypeVar("T")


OXML_LANGUAGE_LIST: list[str] = [
"ar-SA",
"bg-BG",
Expand Down Expand Up @@ -137,7 +133,6 @@ def tn_chapter_verses(
tn_book: Optional[TNBook],
chapter_num: int,
use_two_column_layout_for_tn_notes: bool,
# fmt_str: str = TN_VERSE_NOTES_ENCLOSING_DIV_FMT_STR,
) -> str:
"""
Return the HTML for verses that are in the chapter with
Expand Down Expand Up @@ -970,27 +965,6 @@ def get_non_usfm_resources_verse(
return document_parts


def _demote_heading(match: Match[str], levels: int) -> str:
    """
    Return the matched heading tag prefix with its level increased by levels.

    match is expected to come from HEADING_RE: group 0 is the tag prefix
    (e.g. "<h2" or "</h2") and group 1 is the heading level digit. The
    resulting level is capped at 6.

    Only the level digit is rewritten, located via the match offsets
    rather than by searching for a lowercase "h<level>". This keeps the
    case of the tag name intact and ensures uppercase tags such as "<H2",
    which the case-insensitive pattern also matches, are demoted too.
    """
    tag = match.group(0)
    new_level = min(int(match.group(1)) + levels, 6)
    # Offsets of the digit relative to the start of the matched tag prefix.
    tag_start = match.start(0)
    digit_from = match.start(1) - tag_start
    digit_to = match.end(1) - tag_start
    return f"{tag[:digit_from]}{new_level}{tag[digit_to:]}"


def demote_headings_by_one(content: str) -> str:
    """
    Return content with every heading tag matched by HEADING_RE moved
    down one level (the new tag text is produced by _demote_heading).
    """

    def shift_down_one(heading_match):
        return _demote_heading(heading_match, levels=1)

    return HEADING_RE.sub(shift_down_one, content)


def demote_headings_by_two(content: str) -> str:
    """
    Return content with every heading tag matched by HEADING_RE moved
    down two levels (the new tag text is produced by _demote_heading).
    """

    def shift_down_two(heading_match):
        return _demote_heading(heading_match, levels=2)

    return HEADING_RE.sub(shift_down_two, content)


def tnc_chapter_intro(
tnc_book: Optional[TNCBook],
chapter_num: int,
Expand Down
2 changes: 1 addition & 1 deletion backend/doc/domain/document_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ def compose_docx_document(
add_full_width_hr(doc)
if part.add_page_break:
add_page_break(doc)
style_superscripts(doc, lift_half_points=4, color=RGBColor(0x99, 0x99, 0x99))
style_superscripts(doc, lift_half_points=6, color=RGBColor(0x99, 0x99, 0x99))
t1 = time.time()
logger.info("Time for converting HTML to Docx: %.2f seconds", t1 - t0)
return doc
Expand Down
67 changes: 39 additions & 28 deletions backend/doc/domain/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
import requests
from bs4 import BeautifulSoup
from doc.config import settings
from doc.domain.assembly_strategies.assembly_strategy_utils import (
demote_headings_by_one,
demote_headings_by_two,
)
from doc.utils.text_utils import demote_headings_by_one
from doc.domain.bible_books import BOOK_ID_MAP, BOOK_NAMES
from doc.domain.model import (
BC_RESOURCE_TYPE,
Expand Down Expand Up @@ -80,6 +77,11 @@

H1, H2, H3, H4, H5 = "h1", "h2", "h3", "h4", "h5"

# Matches a <div class="sectionhead-5"> element whose body is empty or
# contains only whitespace (including newlines, via \s*).
# NOTE(review): re.MULTILINE only changes how ^ and $ match; this pattern
# uses neither anchor, so the flag appears to have no effect here.
_SECTIONHEAD5_RE = re.compile(
r'<div\s+class="sectionhead-5">\s*</div>',
re.MULTILINE,
)


BC_ARTICLE_URL_FMT_STR: str = (
"https://content.bibletranslationtools.org/WycliffeAssociates/en_bc/src/branch/master/{}"
Expand Down Expand Up @@ -447,6 +449,10 @@ def ensure_chapter_marker(
return f"\\c {chapter_num}\n" + chapter_usfm_text


def remove_sectionhead5_elements(content: str) -> str:
    """
    Return content with each element matched by _SECTIONHEAD5_RE
    replaced by a single space.
    """
    return re.sub(_SECTIONHEAD5_RE, " ", content)


def usfm_book_content(
resource_lookup_dto: ResourceLookupDto,
resource_dir: str,
Expand Down Expand Up @@ -511,9 +517,12 @@ def usfm_book_content(
cleaned_chapter_html_content = remove_null_bytes_and_control_characters(
chapter_html_content
)
chapter_html_content_sans_s5 = remove_sectionhead5_elements(
cleaned_chapter_html_content
)
usfm_chapters[chapter_num] = USFMChapter(
content=(
cleaned_chapter_html_content if cleaned_chapter_html_content else ""
chapter_html_content_sans_s5 if chapter_html_content_sans_s5 else ""
),
verses=None,
)
Expand Down Expand Up @@ -1278,7 +1287,7 @@ def assemble_chapter_usfm(
verse_content = read_verse_file(usfm_file)
cleaned_verse_content = clean_verse_content(verse_content)
verse_content = ensure_paragraph_before_verses(usfm_file, cleaned_verse_content)
chapter_usfm_content.append(cleaned_verse_content)
chapter_usfm_content.append(verse_content)
chapter_usfm_content.append(
" \n"
) # Make sure a space before next chunk, e.g., auh, mat, ch 9, v 14
Expand Down Expand Up @@ -1456,6 +1465,8 @@ def handle_split_chapter_into_verses(

def split_chapter_into_verses_with_formatting(
chapter: USFMChapter,
empty_paragraph: str = "<p></p>",
sectionhead5_element: str = '<div class="sectionhead-5"></div>',
) -> dict[VerseRef, str]:
"""
Given a USFMChapter instance, return the same instance with its
Expand All @@ -1465,29 +1476,27 @@ def split_chapter_into_verses_with_formatting(
Sample HTML content with multiple verse elements:

>>> html_content = '''
>>> <span class="verse">
>>> <sup class="versemarker">19</sup>
>>> For through the law I died to the law, so that I might live for God. I have been crucified with Christ.
>>> <sup id="footnote-caller-1" class="caller"><a href="#footnote-target-1">1</a></sup>
>>> <div class="sectionhead-5"></div>
>>> </span>
>>> <span class="verse">
>>> <sup class="versemarker">20</sup>
>>> I have been crucified with Christ and I no longer live, but Christ lives in me. The life I now live in the body, I live by faith in the Son of God, who loved me and gave himself for me.
>>> <sup id="footnote-caller-2" class="caller"><a href="#footnote-target-2">2</a></sup>
>>> <div class="sectionhead-5"></div>
>>> </span>
>>> '''
... <span class="verse">
... <sup class="versemarker">19</sup>
... For through the law I died to the law, so that I might live for God. I have been crucified with Christ.
... <sup id="footnote-caller-1" class="caller"><a href="#footnote-target-1">1</a></sup>
... <div class="sectionhead-5"></div>
... </span>
... <span class="verse">
... <sup class="versemarker">20</sup>
... I have been crucified with Christ and I no longer live, but Christ lives in me. The life I now live in the body, I live by faith in the Son of God, who loved me and gave himself for me.
... <sup id="footnote-caller-2" class="caller"><a href="#footnote-target-2">2</a></sup>
... <div class="sectionhead-5"></div>
... </span>
... '''
>>> from doc.domain.parsing import split_chapter_into_verses_with_formatting
>>> chapter = USFMChapter(content=html_content)
>>> chapter = USFMChapter(content=html_content, verses=None)
>>> chapter.verses = split_chapter_into_verses_with_formatting(chapter)
>>> chapter.verses["19"]
<span class="verse">
>>> print(chapter.verses["19"])
<sup class="versemarker">19</sup>
For through the law I died to the law, so that I might live for God. I have been crucified with Christ.
<sup id="footnote-caller-1" class="caller"><a href="#footnote-target-1">1</a></sup>
<div class="sectionhead-5"></div>
</span>
<BLANKLINE>
"""
# TODO What to do about footnote targets? Perhaps have the value be a
# tuple with first element of the verse HTML (which includes the
Expand All @@ -1505,14 +1514,16 @@ def split_chapter_into_verses_with_formatting(
# Add to the dictionary with verse number as the key and verse text as the value
verse_dict[verse_number_] = (
verse_span.strip()
.replace("<p></p>", "")
.replace('<div class="sectionhead-5"></div>', "")
.replace(empty_paragraph, "")
.replace(sectionhead5_element, "")
)
return verse_dict


def split_chapter_into_verses_with_formatting_for_f10(
chapter: USFMChapter,
empty_paragraph: str = "<p></p>",
sectionhead5_element: str = '<div class="sectionhead-5"></div>',
) -> dict[str, str]:
"""
Parse chapter.content as HTML, extract each <span class="verse">,
Expand Down Expand Up @@ -1555,8 +1566,8 @@ def split_chapter_into_verses_with_formatting_for_f10(
# store cleaned HTML fragment (still contains <sup> etc.)
verse_dict[verse_number] = (
cleaned_html.strip()
.replace("<p></p>", "")
.replace('<div class="sectionhead-5"></div>', "")
.replace(empty_paragraph, "")
.replace(sectionhead5_element, "")
)
return verse_dict

Expand Down
52 changes: 33 additions & 19 deletions backend/doc/domain/resource_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import requests
from cachetools import TTLCache, cached
from doc.config import settings
from doc.domain import worker, parsing
from doc.domain import worker
from doc.domain.bible_books import BOOK_CHAPTERS, BOOK_ID_MAP, BOOK_NAMES
from doc.domain.model import (
NON_USFM_RESOURCE_TYPES,
Expand All @@ -36,7 +36,6 @@
from doc.utils.file_utils import (
delete_tree,
file_needs_update,
read_file,
)
from doc.utils.list_utils import unique_tuples, unique_book_codes
from doc.utils.text_utils import maybe_correct_book_name, normalize_localized_book_name
Expand Down Expand Up @@ -79,7 +78,7 @@ def fetch_source_data(
>>> ();result = resource_lookup.fetch_source_data();() # doctest: +ELLIPSIS
(...)
>>> result.git_repo[0]
RepoEntry(repo_url=HttpUrl('https://content.bibletranslationtools.org/klero/ach-SS-acholi_rev_text_reg'), content=Content(resource_type='reg', language=Language(english_name='Acholi', ietf_code='ach-SS-acholi', national_name='Acholi', direction=<LangDirEnum.LTR: 'ltr'>)))
RepoEntry(repo_url=HttpUrl('https://content.bibletranslationtools.org/0success/cli_1jn_text_reg'), content=Content(resource_type='reg', language=Language(english_name='Chakali', ietf_code='cli', national_name='Chakali', direction=<LangDirEnum.LTR: 'ltr'>)))
"""
graphql_query = """
query MyQuery {
Expand Down Expand Up @@ -111,6 +110,8 @@ def fetch_source_data(
for repo in data_payload["git_repo"]
if repo.get("content", {}).get("resource_type") is not None
]
# Sort for test stability - ensures consistent ordering
valid_repos.sort(key=lambda repo: repo["repo_url"])
return SourceData.model_validate({"git_repo": valid_repos})
else:
logger.info("Invalid payload structure, no data.")
Expand Down Expand Up @@ -263,6 +264,8 @@ def get_resource_types(
str, str
] = settings.RESOURCE_TYPE_CODES_AND_NAMES,
) -> list[tuple[str, str]]:
from doc.domain.parsing import find_usfm_files

resource_types = []
for url, resource_filepath, resource_type in repo_clone_list:
if resource_type:
Expand All @@ -285,7 +288,7 @@ def get_resource_types(
and file.name.split("-")[1].lower() in book_codes
]
elif resource_type in usfm_resource_types:
book_assets = parsing.find_usfm_files(resource_filepath)
book_assets = find_usfm_files(resource_filepath)
elif resource_type == "rg":
between_texts, bible_reference_strs = find_bible_references(
join(en_rg, docx_file_path)
Expand Down Expand Up @@ -574,6 +577,8 @@ def usfm_resource_types_and_book_tuples(
>>> sorted(tuples, key=lambda value: value[1])
[('reg', '1co'), ('reg', '1jn'), ('reg', '1pe'), ('reg', '1th'), ('reg', '1ti'), ('reg', '2co'), ('reg', '2jn'), ('reg', '2pe'), ('reg', '2th'), ('reg', '2ti'), ('reg', '3jn'), ('reg', 'act'), ('reg', 'col'), ('reg', 'eph'), ('reg', 'gal'), ('reg', 'heb'), ('reg', 'jas'), ('reg', 'jhn'), ('reg', 'jud'), ('reg', 'luk'), ('reg', 'mat'), ('reg', 'mrk'), ('reg', 'phm'), ('reg', 'php'), ('reg', 'rev'), ('reg', 'rom'), ('reg', 'tit')]
"""
from doc.domain.parsing import usfm_asset_file

book_codes = book_codes_str.split(",")
data: SourceData | None = fetch_source_data()
resource_type_and_book_tuples = set()
Expand Down Expand Up @@ -602,9 +607,7 @@ def usfm_resource_types_and_book_tuples(
resource_filepath = prepare_resource_filepath(dto)
if file_needs_update(resource_filepath):
provision_asset_files(dto.url, resource_filepath)
content_file = parsing.usfm_asset_file(
dto, resource_filepath, False
)
content_file = usfm_asset_file(dto, resource_filepath, False)
if content_file:
resource_type_and_book_tuples.add((resource_type, book_code))
return sorted(resource_type_and_book_tuples, key=lambda value: value[0])
Expand Down Expand Up @@ -942,18 +945,24 @@ def get_book_names_from_usfm_metadata(
be localized, it depends on the translation work done for language
lang_code.
"""
from doc.domain.parsing import (
find_usfm_files,
split_usfm_by_chapters,
maybe_localized_book_name,
)

book_codes_and_names_localized: dict[str, str] = {}
usfm_files = parsing.find_usfm_files(resource_filepath)
usfm_files = find_usfm_files(resource_filepath)
for usfm_file in usfm_files:
usfm = ""
usfm_file_components = Path(usfm_file).stem.lower().split("-")
book_code = usfm_file_components[1]
with open(usfm_file, "r") as f:
usfm = f.read()
frontmatter, _, _ = parsing.split_usfm_by_chapters(
frontmatter, _, _ = split_usfm_by_chapters(
lang_code, resource_type, book_code, usfm
)
localized_book_name = parsing.maybe_localized_book_name(frontmatter)
localized_book_name = maybe_localized_book_name(frontmatter)
# localized_book_name = maybe_correct_book_name(lang_code, localized_book_name)
book_codes_and_names_localized[book_code] = localized_book_name
logger.debug("book_codes_and_names_localized: %s", book_codes_and_names_localized)
Expand Down Expand Up @@ -1204,7 +1213,8 @@ def nt_survey_rg_passages(
) -> list[BibleReference]:
"""
>>> from doc.domain import resource_lookup
>>> rg_books = resource_lookup.nt_survey_rg_passages()
>>> ();rg_books = resource_lookup.nt_survey_rg_passages() ;() # doctest: +ELLIPSIS
(...)
>>> rg_books[0]
BibleReference(book_code='mat', book_name='Matthew', start_chapter=2, start_chapter_verse_ref='1-12', end_chapter=None, end_chapter_verse_ref=None)
"""
Expand Down Expand Up @@ -1245,9 +1255,10 @@ def ot_survey_rg1_passages(
) -> list[BibleReference]:
"""
>>> from doc.domain import resource_lookup
>>> rg_books = resource_lookup.ot_survey_rg1_passages()
>>> ();rg_books = resource_lookup.ot_survey_rg1_passages();() # doctest: +ELLIPSIS
(...)
>>> rg_books[0]
BibleReference(book_code='gen', book_name='Genesis', start_chapter=2, start_chapter_verse_ref='1-12', end_chapter=None, end_chapter_verse_ref=None)
BibleReference(book_code='gen', book_name='Genesis', start_chapter=1, start_chapter_verse_ref='1', end_chapter=2, end_chapter_verse_ref='3')
"""
path = join(resource_dir, docx_file_path)
rg_books = get_rg_books(
Expand Down Expand Up @@ -1286,9 +1297,10 @@ def ot_survey_rg2_passages(
) -> list[BibleReference]:
"""
>>> from doc.domain import resource_lookup
>>> rg_books = resource_lookup.ot_survey_rg2_passages()
>>> ();rg_books = resource_lookup.ot_survey_rg2_passages();() # doctest: +ELLIPSIS
(...)
>>> rg_books[0]
BibleReference(book_code='jos', book_name='Joshua', start_chapter=2, start_chapter_verse_ref='1-12', end_chapter=None, end_chapter_verse_ref=None)
BibleReference(book_code='jos', book_name='Joshua', start_chapter=1, start_chapter_verse_ref='1-9', end_chapter=None, end_chapter_verse_ref=None)
"""
path = join(resource_dir, docx_file_path)
rg_books = get_rg_books(
Expand Down Expand Up @@ -1327,9 +1339,10 @@ def ot_survey_rg3_passages(
) -> list[BibleReference]:
"""
>>> from doc.domain import resource_lookup
>>> rg_books = resource_lookup.ot_survey_rg3_passages()
>>> ();rg_books = resource_lookup.ot_survey_rg3_passages();() # doctest: +ELLIPSIS
(...)
>>> rg_books[0]
BibleReference(book_code='job', book_name='Job', start_chapter=2, start_chapter_verse_ref='1-12', end_chapter=None, end_chapter_verse_ref=None)
BibleReference(book_code='job', book_name='Job', start_chapter=1, start_chapter_verse_ref='6-22', end_chapter=None, end_chapter_verse_ref=None)
"""
path = join(resource_dir, docx_file_path)
rg_books = get_rg_books(
Expand Down Expand Up @@ -1368,9 +1381,10 @@ def ot_survey_rg4_passages(
) -> list[BibleReference]:
"""
>>> from doc.domain import resource_lookup
>>> rg_books = resource_lookup.ot_survey_rg4_passages()
>>> ();rg_books = resource_lookup.ot_survey_rg4_passages();() # doctest: +ELLIPSIS
(...)
>>> rg_books[0]
BibleReference(book_code='isa', book_name='Isaiah', start_chapter=2, start_chapter_verse_ref='1-12', end_chapter=None, end_chapter_verse_ref=None)
BibleReference(book_code='isa', book_name='Isaiah', start_chapter=1, start_chapter_verse_ref='1-9', end_chapter=None, end_chapter_verse_ref=None)
"""
path = join(resource_dir, docx_file_path)
rg_books = get_rg_books(
Expand Down
Loading