Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion machine/corpora/usfm_file_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ def _get_id(filename: StrPath, encoding: str) -> Optional[str]:
if line.startswith("\\id "):
id = line[4:]
index = id.find(" ")
# If the id is longer than 3 characters, truncate it to 3 characters.
if (index == -1 or index > 3) and len(id) >= 3:
index = 3
if index != -1:
id = id[:index]
id = id[:index].upper()
return id.strip().upper()
return None
2 changes: 2 additions & 0 deletions machine/corpora/usfm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,8 @@ def process_token(self) -> bool:
# Code is always upper case
assert token.data is not None
code = token.data.upper()
if len(code) > 3:
code = code[:3]

# Update verse ref. Leave book alone if not empty to prevent parsing errors on books with bad id lines.
verse_ref = self.state.verse_ref
Expand Down
3 changes: 3 additions & 0 deletions machine/corpora/usfm_text_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def rows(self) -> Iterable[TextRow]:

def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
super().start_book(state, marker, code)
if state.verse_ref.book != "" and state.verse_ref.book != code:
# Ignore \id markers that don't match the book code in the verse ref, if it was set
return
if code not in ALL_BOOK_IDS:
raise ValueError(f"The book {code} is not a valid book id.")
if code != self._text.id:
Expand Down
65 changes: 65 additions & 0 deletions tests/corpora/test_usfm_memory_text.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import List

import pytest
from testutils.corpora_test_helpers import scripture_ref

from machine.corpora import ScriptureRef, TextRow, UsfmMemoryText
Expand Down Expand Up @@ -465,6 +466,70 @@ def test_get_rows_incomplete_verse_range():
assert rows[3].text == "verse 1 text"


def test_get_rows_book_code_different_to_filename() -> None:
    """Expect a RuntimeError when the id line carries the book code LUK.

    NOTE(review): per the test name, LUK differs from the id of the text that
    get_rows builds; that setup is not visible here -- confirm in get_rows.
    """
    usfm = r"""\id LUK - Test
\c 1
\v 1 Verse 1 Text
"""
    with pytest.raises(RuntimeError):
        get_rows(usfm, include_all_text=True)


def test_get_rows_book_code_invalid() -> None:
    """Expect a RuntimeError when the id line carries the code ZZZ."""
    usfm = r"""\id ZZZ - Test
\c 1
\v 1 Verse 1 Text
"""
    with pytest.raises(RuntimeError):
        get_rows(usfm, include_all_text=True)


def test_get_rows_book_code_truncated() -> None:
    """Expect a RuntimeError when the id line holds only the two letters MA."""
    usfm = r"""\id MA
\c 1
\v 1 Verse 1 Text
"""
    with pytest.raises(RuntimeError):
        get_rows(usfm, include_all_text=True)


def test_get_rows_book_code_multiple() -> None:
    """Check that two id lines (MAT, then LUK) still yield one MAT 1:1 row."""
    usfm = r"""\id MAT
\id LUK
\c 1
\v 1 Verse 1 Text
"""
    text_rows: List[TextRow] = get_rows(usfm, include_all_text=True)

    assert len(text_rows) == 1

    # On failure, the messages list every returned ref/text for diagnosis.
    only_row = text_rows[0]
    assert only_row.ref == ScriptureRef.parse("MAT 1:1"), ",".join(str(row.ref) for row in text_rows)
    assert only_row.text == "Verse 1 Text", ",".join(row.text for row in text_rows)


def test_get_rows_book_code_no_space() -> None:
    """Check that an id line reading "Matthew" still yields one MAT 1:1 row."""
    usfm = r"""\id Matthew
\c 1
\v 1 Verse 1 Text
"""
    text_rows: List[TextRow] = get_rows(usfm, include_all_text=True)

    assert len(text_rows) == 1

    # On failure, the messages list every returned ref/text for diagnosis.
    only_row = text_rows[0]
    assert only_row.ref == ScriptureRef.parse("MAT 1:1"), ",".join(str(row.ref) for row in text_rows)
    assert only_row.text == "Verse 1 Text", ",".join(row.text for row in text_rows)


def get_rows(usfm: str, include_markers: bool = False, include_all_text: bool = False) -> List[TextRow]:
text = UsfmMemoryText(
UsfmStylesheet("usfm.sty"),
Expand Down
2 changes: 1 addition & 1 deletion tests/testutils/data/usfm/Tes/03LEVTes.SFM
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
\id lev - Test
\id Leviticus
\h Leviticus
\mt Leviticus
\c 14
Expand Down
2 changes: 1 addition & 1 deletion tests/testutils/data/usfm/Tes/131CHTes.SFM
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
\id 1CH - Test
\id 1CH
\h 1 Chronicles
\mt 1 Chronicles
\c 12
Expand Down
Loading