🐛 Relax to aggressive mid-line block starts (#515)

MiWeiss · web-flow · commit fcfb0cb43635 · 2026-01-17T22:03:02.000+01:00
diff --git a/bibtexparser/splitter.py b/bibtexparser/splitter.py
@@ -57,6 +57,24 @@ def _reset_block_status(self, current_char_index: int) -> None:
         self._implicit_comment_start_line = self._current_line
         self._implicit_comment_start: Optional[int] = current_char_index
 
+    def _is_at_line_start(self, pos: int) -> bool:
+        """Check if position is at the start of a line (after optional whitespace).
+
+        This is used to determine whether an @ sign should be treated as a new
+        block start (for error recovery) or as content within a field value.
+        We only want to abort parsing and start a new block if the @ is at the
+        beginning of a line, to avoid false positives with @ signs in content.
+        """
+        # Scan backwards from pos to find either newline or non-whitespace
+        for i in range(pos - 1, -1, -1):
+            char = self.bibstr[i]
+            if char == "\n":
+                return True
+            elif not char.isspace():
+                return False
+        # Start of string counts as line start
+        return True
+
     def _end_implicit_comment(self, end_char_index) -> Optional[ImplicitComment]:
         if self._implicit_comment_start is None:
             return  # No implicit comment started
@@ -122,7 +140,11 @@ def _move_to_closed_bracket(self) -> int:
                     return m.start()
                 else:
                     num_additional_brackets -= 1
-            elif m.group(0).startswith("@"):
+            elif m.group(0).startswith("@") and self._is_at_line_start(m.start()):
+                # Only abort if the @ is at the start of a line.
+                # This allows @ signs in field values (e.g., "LeQua @ {CLEF}")
+                # while still providing error recovery when a new block starts
+                # on a new line within an unclosed block.
                 self._unaccepted_mark = m
                 raise BlockAbortedException(
                     abort_reason=f"Unexpected block start: `{m.group(0)}`. "
@@ -169,8 +191,11 @@ def _is_escaped():
                 self._unaccepted_mark = next_mark
                 return next_mark.start()
 
-            # Sanity-check: If new block is starting, we abort
-            elif next_mark.group(0).startswith("@"):
+            # Sanity-check: If new block is starting at line start, we abort.
+            # We only abort if the @ is at the start of a line to allow @ signs
+            # in field values (e.g., "LeQua @ {CLEF}") while still providing
+            # error recovery when a new block starts on a new line.
+            elif next_mark.group(0).startswith("@") and self._is_at_line_start(next_mark.start()):
                 self._unaccepted_mark = next_mark
 
                 if currently_quote_escaped:
diff --git a/tests/splitter_tests/test_splitter_block_start_detection.py b/tests/splitter_tests/test_splitter_block_start_detection.py
@@ -0,0 +1,293 @@
+"""Tests for block start detection behavior.
+
+These tests verify the fix for issue #488 and the tradeoffs discussed in PR #416:
+- @ signs in field values should not be treated as new block starts
+- Multiple blocks on the same line should be parsed correctly
+- Error recovery should still work when a new block starts at line start
+"""
+
+from textwrap import dedent
+
+import pytest
+
+from bibtexparser.splitter import Splitter
+
+# =============================================================================
+# Test: @ signs in field values (issue #488)
+# =============================================================================
+
+
+@pytest.mark.parametrize(
+    "bibtex_str,expected_key,expected_field,expected_substring",
+    [
+        pytest.param(
+            dedent(
+                """\
+                @inproceedings{DBLP:conf/cikm/EsuliM021,
+                  author = {Andrea Esuli},
+                  title = {LeQua @ {CLEF} 2022: {A} Shared Task},
+                  year = {2021}
+                }"""
+            ),
+            "DBLP:conf/cikm/EsuliM021",
+            "title",
+            "@ {CLEF}",
+            id="at_sign_space_brace_in_title",
+        ),
+        pytest.param(
+            "@article{test, email = {john.doe@example.com}}",
+            "test",
+            "email",
+            "john.doe@example.com",
+            id="email_address_in_braces",
+        ),
+        pytest.param(
+            '@article{test, email = "john.doe@example.com"}',
+            "test",
+            "email",
+            "john.doe@example.com",
+            id="email_address_in_quotes",
+        ),
+        pytest.param(
+            "@article{test, note = {Contact alice@a.com or bob@b.com}}",
+            "test",
+            "note",
+            "alice@a.com",
+            id="multiple_at_signs",
+        ),
+        pytest.param(
+            "@article{test, title = {Workshop @ {ICML} 2023}}",
+            "test",
+            "title",
+            "@ {ICML}",
+            id="at_sign_followed_by_brace",
+        ),
+        pytest.param(
+            '@article{test, title = "BibTeX entries start with @article{"}',
+            "test",
+            "title",
+            "@article{",
+            id="literal_at_entry_in_quotes",
+        ),
+        pytest.param(
+            # Note: 3 closing braces - inner {}, title field, entry
+            "@article{test, title = {BibTeX entries start with @article{}}}",
+            "test",
+            "title",
+            "@article{",
+            id="literal_at_entry_in_braces",
+        ),
+    ],
+)
+def test_at_sign_in_field_value(
+    bibtex_str: str, expected_key: str, expected_field: str, expected_substring: str
+):
+    """@ signs in field values should be parsed as content, not block starts."""
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 0
+    assert len(library.entries) == 1
+    assert library.entries[0].key == expected_key
+    assert expected_substring in library.entries[0][expected_field]
+
+
+# =============================================================================
+# Test: Multiple blocks on the same line
+# =============================================================================
+
+
+@pytest.mark.parametrize(
+    "bibtex_str,expected_entry_keys",
+    [
+        pytest.param(
+            "@article{key1, title={A}} @book{key2, title={B}}",
+            ["key1", "key2"],
+            id="two_entries_with_space",
+        ),
+        pytest.param(
+            "@article{key1,title={A}}@book{key2,title={B}}",
+            ["key1", "key2"],
+            id="two_entries_no_space",
+        ),
+        pytest.param(
+            "@article{a, x={1}} @book{b, y={2}} @misc{c, z={3}}",
+            ["a", "b", "c"],
+            id="three_entries",
+        ),
+    ],
+)
+def test_multiple_entries_same_line(bibtex_str: str, expected_entry_keys: list):
+    """Multiple well-formed entries on the same line should all be parsed."""
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 0
+    assert len(library.entries) == len(expected_entry_keys)
+    assert [e.key for e in library.entries] == expected_entry_keys
+
+
+@pytest.mark.parametrize(
+    "bibtex_str,expected_entries,expected_strings,expected_comments",
+    [
+        pytest.param(
+            '@article{key1, title={A}} @string{mystr = "value"}',
+            1,
+            1,
+            0,
+            id="entry_and_string",
+        ),
+        pytest.param(
+            "@article{key1, title={A}} @comment{A comment}",
+            1,
+            0,
+            1,
+            id="entry_and_comment",
+        ),
+    ],
+)
+def test_mixed_blocks_same_line(
+    bibtex_str: str, expected_entries: int, expected_strings: int, expected_comments: int
+):
+    """Different block types on the same line should all be parsed."""
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 0
+    assert len(library.entries) == expected_entries
+    assert len(library.strings) == expected_strings
+    assert len(library.comments) == expected_comments
+
+
+# =============================================================================
+# Test: Error recovery when new block starts at line start
+# =============================================================================
+
+
+@pytest.mark.parametrize(
+    "bibtex_str,expected_valid_key",
+    [
+        pytest.param(
+            dedent(
+                """\
+                @article{broken, title={Unclosed
+                @article{valid, title={Valid Entry}}"""
+            ),
+            "valid",
+            id="unclosed_entry_field",
+        ),
+        pytest.param(
+            dedent(
+                """\
+                @string{broken = {unclosed value
+                @article{valid, title={Valid Entry}}"""
+            ),
+            "valid",
+            id="unclosed_string",
+        ),
+        pytest.param(
+            dedent(
+                """\
+                @article{broken, title={Unclosed
+                    @article{valid, title={Valid Entry}}"""
+            ),
+            "valid",
+            id="indented_new_block",
+        ),
+    ],
+)
+def test_error_recovery_at_line_start(bibtex_str: str, expected_valid_key: str):
+    """New block at line start should trigger recovery from malformed block."""
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 1
+    assert len(library.entries) == 1
+    assert library.entries[0].key == expected_valid_key
+
+
+def test_error_recovery_preserves_failed_block_raw():
+    """The failed block should contain raw text up to where recovery started."""
+    bibtex_str = dedent(
+        """\
+        @article{broken, title={This is unclosed
+        @article{valid, title={OK}}"""
+    )
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 1
+    failed = library.failed_blocks[0]
+    assert "broken" in failed.raw
+    assert "This is unclosed" in failed.raw
+
+
+# =============================================================================
+# Test: No false recovery for @ mid-line
+# =============================================================================
+
+
+@pytest.mark.parametrize(
+    "bibtex_str",
+    [
+        pytest.param(
+            "@article{test, title={unclosed @misc{fake}",
+            id="at_entry_mid_line",
+        ),
+        pytest.param(
+            "@article{test, title={text @ {more} unclosed",
+            id="at_brace_mid_line",
+        ),
+    ],
+)
+def test_no_false_recovery_mid_line(bibtex_str: str):
+    """@ mid-line should not trigger false error recovery."""
+    library = Splitter(bibtex_str).split()
+
+    # Should fail as one block, no recovery
+    assert len(library.failed_blocks) == 1
+    assert len(library.entries) == 0
+
+
+# =============================================================================
+# Test: Edge cases
+# =============================================================================
+
+
+@pytest.mark.parametrize(
+    "bibtex_str",
+    [
+        pytest.param(
+            "@article{test, title={Hello}}",
+            id="block_at_file_start",
+        ),
+        pytest.param(
+            "   \t  @article{test, title={Hello}}",
+            id="block_after_whitespace_only",
+        ),
+        pytest.param(
+            "@article{test, title={L1 {L2 {user@email.com} back} done}}",
+            id="nested_braces_with_at",
+        ),
+    ],
+)
+def test_edge_cases_entries(bibtex_str: str):
+    """Various edge cases should parse without failure."""
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 0
+    assert len(library.entries) == 1
+
+
+def test_preamble_with_at_sign():
+    """@ sign inside a preamble block."""
+    bibtex_str = '@preamble{"Contact: admin@site.org"}'
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 0
+    assert len(library.preambles) == 1
+
+
+def test_explicit_comment_with_at_sign():
+    """@ sign inside an explicit comment block."""
+    bibtex_str = "@comment{Email: test@example.com}"
+    library = Splitter(bibtex_str).split()
+
+    assert len(library.failed_blocks) == 0
+    assert len(library.comments) == 1
+    assert "test@example.com" in library.comments[0].comment