Skip to content

Commit fcfb0cb

Browse files
authored
🐛 Relax too aggressive mid-line block starts (#515)
1 parent c015d35 commit fcfb0cb

File tree

2 files changed

+321
-3
lines changed

2 files changed

+321
-3
lines changed

bibtexparser/splitter.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,24 @@ def _reset_block_status(self, current_char_index: int) -> None:
5757
self._implicit_comment_start_line = self._current_line
5858
self._implicit_comment_start: Optional[int] = current_char_index
5959

60+
def _is_at_line_start(self, pos: int) -> bool:
    """Report whether only whitespace lies between ``pos`` and its line start.

    An ``@`` encountered while a block is still open is only treated as the
    start of a new block (aborting the current one for error recovery) when
    it opens a line. An ``@`` that follows other content on the same line is
    kept as part of the field value, which avoids false positives.

    Args:
        pos: Index into ``self.bibstr`` of the character being examined.

    Returns:
        ``True`` if walking left from ``pos`` reaches a newline, or the
        beginning of the string, before any non-whitespace character;
        ``False`` otherwise.
    """
    cursor = pos
    while cursor > 0:
        cursor -= 1
        preceding = self.bibstr[cursor]
        if preceding == "\n":
            # Nothing but whitespace since the previous line break.
            return True
        if not preceding.isspace():
            # Real content sits before ``pos`` on the same line.
            return False
    # No characters left to inspect: the start of the input is a line start.
    return True
77+
6078
def _end_implicit_comment(self, end_char_index) -> Optional[ImplicitComment]:
6179
if self._implicit_comment_start is None:
6280
return # No implicit comment started
@@ -122,7 +140,11 @@ def _move_to_closed_bracket(self) -> int:
122140
return m.start()
123141
else:
124142
num_additional_brackets -= 1
125-
elif m.group(0).startswith("@"):
143+
elif m.group(0).startswith("@") and self._is_at_line_start(m.start()):
144+
# Only abort if the @ is at the start of a line.
145+
# This allows @ signs in field values (e.g., "LeQua @ {CLEF}")
146+
# while still providing error recovery when a new block starts
147+
# on a new line within an unclosed block.
126148
self._unaccepted_mark = m
127149
raise BlockAbortedException(
128150
abort_reason=f"Unexpected block start: `{m.group(0)}`. "
@@ -169,8 +191,11 @@ def _is_escaped():
169191
self._unaccepted_mark = next_mark
170192
return next_mark.start()
171193

172-
# Sanity-check: If new block is starting, we abort
173-
elif next_mark.group(0).startswith("@"):
194+
# Sanity-check: If new block is starting at line start, we abort.
195+
# We only abort if the @ is at the start of a line to allow @ signs
196+
# in field values (e.g., "LeQua @ {CLEF}") while still providing
197+
# error recovery when a new block starts on a new line.
198+
elif next_mark.group(0).startswith("@") and self._is_at_line_start(next_mark.start()):
174199
self._unaccepted_mark = next_mark
175200

176201
if currently_quote_escaped:
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
"""Tests for block start detection behavior.
2+
3+
These tests verify the fix for issue #488 and the tradeoffs discussed in PR #416:
4+
- @ signs in field values should not be treated as new block starts
5+
- Multiple blocks on the same line should be parsed correctly
6+
- Error recovery should still work when a new block starts at line start
7+
"""
8+
9+
from textwrap import dedent
10+
11+
import pytest
12+
13+
from bibtexparser.splitter import Splitter
14+
15+
# =============================================================================
16+
# Test: @ signs in field values (issue #488)
17+
# =============================================================================
18+
19+
20+
# Each value is (bibtex_str, expected_key, expected_field, expected_substring);
# the mapping key is the pytest id of the case.
_AT_SIGN_IN_VALUE_CASES = {
    "at_sign_space_brace_in_title": (
        dedent(
            """\
            @inproceedings{DBLP:conf/cikm/EsuliM021,
            author = {Andrea Esuli},
            title = {LeQua @ {CLEF} 2022: {A} Shared Task},
            year = {2021}
            }"""
        ),
        "DBLP:conf/cikm/EsuliM021",
        "title",
        "@ {CLEF}",
    ),
    "email_address_in_braces": (
        "@article{test, email = {john.doe@example.com}}",
        "test",
        "email",
        "john.doe@example.com",
    ),
    "email_address_in_quotes": (
        '@article{test, email = "john.doe@example.com"}',
        "test",
        "email",
        "john.doe@example.com",
    ),
    "multiple_at_signs": (
        "@article{test, note = {Contact alice@a.com or bob@b.com}}",
        "test",
        "note",
        "alice@a.com",
    ),
    "at_sign_followed_by_brace": (
        "@article{test, title = {Workshop @ {ICML} 2023}}",
        "test",
        "title",
        "@ {ICML}",
    ),
    "literal_at_entry_in_quotes": (
        '@article{test, title = "BibTeX entries start with @article{"}',
        "test",
        "title",
        "@article{",
    ),
    # Note: 3 closing braces - inner {}, title field, entry
    "literal_at_entry_in_braces": (
        "@article{test, title = {BibTeX entries start with @article{}}}",
        "test",
        "title",
        "@article{",
    ),
}


@pytest.mark.parametrize(
    "bibtex_str,expected_key,expected_field,expected_substring",
    list(_AT_SIGN_IN_VALUE_CASES.values()),
    ids=list(_AT_SIGN_IN_VALUE_CASES),
)
def test_at_sign_in_field_value(
    bibtex_str: str, expected_key: str, expected_field: str, expected_substring: str
):
    """An ``@`` inside a field value is content and must not open a new block."""
    library = Splitter(bibtex_str).split()

    # Nothing may fail, and exactly one entry must come out of the split.
    assert len(library.failed_blocks) == 0
    assert len(library.entries) == 1

    entry = library.entries[0]
    assert entry.key == expected_key
    assert expected_substring in entry[expected_field]
92+
93+
94+
# =============================================================================
95+
# Test: Multiple blocks on the same line
96+
# =============================================================================
97+
98+
99+
# Maps pytest id -> (bibtex_str, expected_entry_keys).
_SAME_LINE_ENTRY_CASES = {
    "two_entries_with_space": (
        "@article{key1, title={A}} @book{key2, title={B}}",
        ["key1", "key2"],
    ),
    "two_entries_no_space": (
        "@article{key1,title={A}}@book{key2,title={B}}",
        ["key1", "key2"],
    ),
    "three_entries": (
        "@article{a, x={1}} @book{b, y={2}} @misc{c, z={3}}",
        ["a", "b", "c"],
    ),
}


@pytest.mark.parametrize(
    "bibtex_str,expected_entry_keys",
    list(_SAME_LINE_ENTRY_CASES.values()),
    ids=list(_SAME_LINE_ENTRY_CASES),
)
def test_multiple_entries_same_line(bibtex_str: str, expected_entry_keys: list):
    """Well-formed entries sharing one line are all parsed, in source order."""
    library = Splitter(bibtex_str).split()

    assert len(library.failed_blocks) == 0
    assert len(library.entries) == len(expected_entry_keys)

    parsed_keys = [entry.key for entry in library.entries]
    assert parsed_keys == expected_entry_keys
126+
127+
128+
# Maps pytest id -> (bibtex_str, expected_entries, expected_strings, expected_comments).
_MIXED_BLOCK_CASES = {
    "entry_and_string": (
        '@article{key1, title={A}} @string{mystr = "value"}',
        1,
        1,
        0,
    ),
    "entry_and_comment": (
        "@article{key1, title={A}} @comment{A comment}",
        1,
        0,
        1,
    ),
}


@pytest.mark.parametrize(
    "bibtex_str,expected_entries,expected_strings,expected_comments",
    list(_MIXED_BLOCK_CASES.values()),
    ids=list(_MIXED_BLOCK_CASES),
)
def test_mixed_blocks_same_line(
    bibtex_str: str, expected_entries: int, expected_strings: int, expected_comments: int
):
    """Blocks of different types on one line are each parsed into their own bucket."""
    library = Splitter(bibtex_str).split()

    assert len(library.failed_blocks) == 0

    # Check the per-type counts one at a time so a failure names the bucket.
    assert len(library.entries) == expected_entries
    assert len(library.strings) == expected_strings
    assert len(library.comments) == expected_comments
157+
158+
159+
# =============================================================================
160+
# Test: Error recovery when new block starts at line start
161+
# =============================================================================
162+
163+
164+
# Maps pytest id -> (bibtex_str, expected_valid_key).
_LINE_START_RECOVERY_CASES = {
    "unclosed_entry_field": (
        dedent(
            """\
            @article{broken, title={Unclosed
            @article{valid, title={Valid Entry}}"""
        ),
        "valid",
    ),
    "unclosed_string": (
        dedent(
            """\
            @string{broken = {unclosed value
            @article{valid, title={Valid Entry}}"""
        ),
        "valid",
    ),
    # NOTE(review): as captured here this input is identical to
    # "unclosed_entry_field"; the id suggests the second line was meant to
    # carry leading whitespace - confirm against the repository.
    "indented_new_block": (
        dedent(
            """\
            @article{broken, title={Unclosed
            @article{valid, title={Valid Entry}}"""
        ),
        "valid",
    ),
}


@pytest.mark.parametrize(
    "bibtex_str,expected_valid_key",
    list(_LINE_START_RECOVERY_CASES.values()),
    ids=list(_LINE_START_RECOVERY_CASES),
)
def test_error_recovery_at_line_start(bibtex_str: str, expected_valid_key: str):
    """A block opening a new line ends the malformed block before it."""
    library = Splitter(bibtex_str).split()

    # The broken block is reported as failed; the following block is kept.
    assert len(library.failed_blocks) == 1
    assert len(library.entries) == 1

    recovered = library.entries[0]
    assert recovered.key == expected_valid_key
203+
204+
205+
def test_error_recovery_preserves_failed_block_raw():
    """The failed block keeps the raw text of the block that was abandoned."""
    source = dedent(
        """\
        @article{broken, title={This is unclosed
        @article{valid, title={OK}}"""
    )
    library = Splitter(source).split()

    assert len(library.failed_blocks) == 1

    raw_text = library.failed_blocks[0].raw
    # Both the key and the unterminated title text must survive in the raw dump.
    for fragment in ("broken", "This is unclosed"):
        assert fragment in raw_text
218+
219+
220+
# =============================================================================
221+
# Test: No false recovery for @ mid-line
222+
# =============================================================================
223+
224+
225+
# Maps pytest id -> bibtex_str.
_MID_LINE_AT_CASES = {
    "at_entry_mid_line": "@article{test, title={unclosed @misc{fake}",
    "at_brace_mid_line": "@article{test, title={text @ {more} unclosed",
}


@pytest.mark.parametrize(
    "bibtex_str",
    list(_MID_LINE_AT_CASES.values()),
    ids=list(_MID_LINE_AT_CASES),
)
def test_no_false_recovery_mid_line(bibtex_str: str):
    """An ``@`` in the middle of a line must not start error recovery."""
    library = Splitter(bibtex_str).split()

    # Should fail as one block, no recovery
    assert len(library.failed_blocks) == 1
    assert len(library.entries) == 0
245+
246+
247+
# =============================================================================
248+
# Test: Edge cases
249+
# =============================================================================
250+
251+
252+
# Maps pytest id -> bibtex_str.
_EDGE_CASE_INPUTS = {
    "block_at_file_start": "@article{test, title={Hello}}",
    "block_after_whitespace_only": " \t @article{test, title={Hello}}",
    "nested_braces_with_at": "@article{test, title={L1 {L2 {user@email.com} back} done}}",
}


@pytest.mark.parametrize(
    "bibtex_str",
    list(_EDGE_CASE_INPUTS.values()),
    ids=list(_EDGE_CASE_INPUTS),
)
def test_edge_cases_entries(bibtex_str: str):
    """Each edge-case input parses into exactly one entry with no failures."""
    library = Splitter(bibtex_str).split()

    assert len(library.failed_blocks) == 0
    assert len(library.entries) == 1
275+
276+
277+
def test_preamble_with_at_sign():
    """A preamble block whose text contains an ``@`` parses as one preamble."""
    library = Splitter('@preamble{"Contact: admin@site.org"}').split()

    assert len(library.failed_blocks) == 0
    assert len(library.preambles) == 1
284+
285+
286+
def test_explicit_comment_with_at_sign():
    """An explicit comment block keeps an ``@`` that appears in its text."""
    library = Splitter("@comment{Email: test@example.com}").split()

    assert len(library.failed_blocks) == 0
    assert len(library.comments) == 1

    parsed_comment = library.comments[0]
    assert "test@example.com" in parsed_comment.comment

0 commit comments

Comments
 (0)