diff --git a/machine/corpora/usfm_file_text_corpus.py b/machine/corpora/usfm_file_text_corpus.py index dbc5f675..6ef8b8b2 100644 --- a/machine/corpora/usfm_file_text_corpus.py +++ b/machine/corpora/usfm_file_text_corpus.py @@ -42,7 +42,10 @@ def _get_id(filename: StrPath, encoding: str) -> Optional[str]: if line.startswith("\\id "): id = line[4:] index = id.find(" ") + # If the id is longer than 3 characters, truncate it to 3 characters. + if (index == -1 or index > 3) and len(id) >= 3: + index = 3 if index != -1: - id = id[:index] + id = id[:index].upper() return id.strip().upper() return None diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py index a37d5396..220ade28 100644 --- a/machine/corpora/usfm_parser.py +++ b/machine/corpora/usfm_parser.py @@ -176,6 +176,8 @@ def process_token(self) -> bool: # Code is always upper case assert token.data is not None code = token.data.upper() + if len(code) > 3: + code = code[:3] # Update verse ref. Leave book alone if not empty to prevent parsing errors on books with bad id lines. verse_ref = self.state.verse_ref diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index ee400909..d50372c6 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -91,6 +91,9 @@ def rows(self) -> Iterable[TextRow]: def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: super().start_book(state, marker, code) + if state.verse_ref.book != "" and state.verse_ref.book != code: + # Ignore \id markers that don't match the book code in the verse ref, if it was set + return if code not in ALL_BOOK_IDS: raise ValueError(f"The book {code} is not a valid book id.") if code != self._text.id: diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index aa89412d..b4f5f67e 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -1,5 +1,6 @@ from typing import List +import pytest from testutils.corpora_test_helpers import scripture_ref from machine.corpora import ScriptureRef, TextRow, UsfmMemoryText @@ -465,6 +466,70 @@ def test_get_rows_incomplete_verse_range(): assert rows[3].text == "verse 1 text" +def test_get_rows_book_code_different_to_filename() -> None: + with pytest.raises(RuntimeError): + get_rows( + r"""\id LUK - Test +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + +def test_get_rows_book_code_invalid() -> None: + with pytest.raises(RuntimeError): + get_rows( + r"""\id ZZZ - Test +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + +def test_get_rows_book_code_truncated() -> None: + with pytest.raises(RuntimeError): + get_rows( + r"""\id MA +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + +def test_get_rows_book_code_multiple() -> None: + rows: List[TextRow] = get_rows( + r"""\id MAT +\id LUK +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + assert len(rows) == 1 + + assert rows[0].ref == ScriptureRef.parse("MAT 1:1"), str.join(",", [str(tr.ref) for tr in rows]) + assert rows[0].text == "Verse 1 Text", str.join(",", [tr.text for tr in rows]) + + +def test_get_rows_book_code_no_space() -> None: + rows: List[TextRow] = get_rows( + r"""\id Matthew +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + assert len(rows) == 1 + + assert rows[0].ref == ScriptureRef.parse("MAT 1:1"), str.join(",", [str(tr.ref) for tr in rows]) + assert rows[0].text == "Verse 1 Text", str.join(",", [tr.text for tr in rows]) + + def get_rows(usfm: str, include_markers: bool = False, include_all_text: bool = False) -> List[TextRow]: text = UsfmMemoryText( UsfmStylesheet("usfm.sty"), diff --git a/tests/testutils/data/usfm/Tes/03LEVTes.SFM b/tests/testutils/data/usfm/Tes/03LEVTes.SFM index 6fc8cd9f..2ced0843 100644 --- a/tests/testutils/data/usfm/Tes/03LEVTes.SFM +++ b/tests/testutils/data/usfm/Tes/03LEVTes.SFM @@ -1,4 +1,4 @@ -\id lev - Test +\id Leviticus \h Leviticus \mt Leviticus \c 14 diff --git a/tests/testutils/data/usfm/Tes/131CHTes.SFM b/tests/testutils/data/usfm/Tes/131CHTes.SFM index 328b513a..f080c42b 100644 --- a/tests/testutils/data/usfm/Tes/131CHTes.SFM +++ b/tests/testutils/data/usfm/Tes/131CHTes.SFM @@ -1,4 +1,4 @@ -\id 1CH - Test +\id 1CH \h 1 Chronicles \mt 1 Chronicles \c 12