|
| 1 | +"""Tests for block start detection behavior. |
| 2 | +
|
| 3 | +These tests verify the fix for issue #488 and the tradeoffs discussed in PR #416: |
| 4 | +- @ signs in field values should not be treated as new block starts |
| 5 | +- Multiple blocks on the same line should be parsed correctly |
| 6 | +- Error recovery should still work when a new block starts at line start |
| 7 | +""" |
| 8 | + |
| 9 | +from textwrap import dedent |
| 10 | + |
| 11 | +import pytest |
| 12 | + |
| 13 | +from bibtexparser.splitter import Splitter |
| 14 | + |
| 15 | +# ============================================================================= |
| 16 | +# Test: @ signs in field values (issue #488) |
| 17 | +# ============================================================================= |
| 18 | + |
| 19 | + |
@pytest.mark.parametrize(
    "bibtex_str,expected_key,expected_field,expected_substring",
    [
        pytest.param(
            dedent(
                """\
                @inproceedings{DBLP:conf/cikm/EsuliM021,
                author = {Andrea Esuli},
                title = {LeQua @ {CLEF} 2022: {A} Shared Task},
                year = {2021}
                }"""
            ),
            "DBLP:conf/cikm/EsuliM021",
            "title",
            "@ {CLEF}",
            id="at_sign_space_brace_in_title",
        ),
        pytest.param(
            "@article{test, email = {john.doe@example.com}}",
            "test",
            "email",
            "john.doe@example.com",
            id="email_address_in_braces",
        ),
        pytest.param(
            '@article{test, email = "john.doe@example.com"}',
            "test",
            "email",
            "john.doe@example.com",
            id="email_address_in_quotes",
        ),
        pytest.param(
            "@article{test, note = {Contact alice@a.com or bob@b.com}}",
            "test",
            "note",
            # Span both addresses: checking only the first one would still
            # pass if the value were cut off at the second "@".
            "alice@a.com or bob@b.com",
            id="multiple_at_signs",
        ),
        pytest.param(
            "@article{test, title = {Workshop @ {ICML} 2023}}",
            "test",
            "title",
            "@ {ICML}",
            id="at_sign_followed_by_brace",
        ),
        pytest.param(
            '@article{test, title = "BibTeX entries start with @article{"}',
            "test",
            "title",
            "@article{",
            id="literal_at_entry_in_quotes",
        ),
        pytest.param(
            # Note: 3 closing braces - inner {}, title field, entry
            "@article{test, title = {BibTeX entries start with @article{}}}",
            "test",
            "title",
            "@article{",
            id="literal_at_entry_in_braces",
        ),
    ],
)
def test_at_sign_in_field_value(
    bibtex_str: str, expected_key: str, expected_field: str, expected_substring: str
):
    """@ signs in field values should be parsed as content, not block starts.

    Each case is a single entry whose field value contains one or more ``@``
    characters. The split must yield exactly one entry and no failed blocks,
    and ``expected_substring`` must appear in the value of ``expected_field``.
    """
    library = Splitter(bibtex_str).split()

    assert len(library.failed_blocks) == 0
    assert len(library.entries) == 1

    entry = library.entries[0]
    assert entry.key == expected_key
    assert expected_substring in entry[expected_field]
| 92 | + |
| 93 | + |
| 94 | +# ============================================================================= |
| 95 | +# Test: Multiple blocks on the same line |
| 96 | +# ============================================================================= |
| 97 | + |
| 98 | + |
@pytest.mark.parametrize(
    ("bibtex_str", "expected_entry_keys"),
    [
        ("@article{key1, title={A}} @book{key2, title={B}}", ["key1", "key2"]),
        ("@article{key1,title={A}}@book{key2,title={B}}", ["key1", "key2"]),
        ("@article{a, x={1}} @book{b, y={2}} @misc{c, z={3}}", ["a", "b", "c"]),
    ],
    ids=[
        "two_entries_with_space",
        "two_entries_no_space",
        "three_entries",
    ],
)
def test_multiple_entries_same_line(bibtex_str: str, expected_entry_keys: list):
    """Several well-formed entries sharing one line are each split out, in order."""
    parsed = Splitter(bibtex_str).split()

    # Nothing may be reported as failed, and no entry may be lost or merged.
    assert len(parsed.failed_blocks) == 0
    assert len(parsed.entries) == len(expected_entry_keys)

    found_keys = [entry.key for entry in parsed.entries]
    assert found_keys == expected_entry_keys
| 126 | + |
| 127 | + |
@pytest.mark.parametrize(
    ("bibtex_str", "expected_entries", "expected_strings", "expected_comments"),
    [
        ('@article{key1, title={A}} @string{mystr = "value"}', 1, 1, 0),
        ("@article{key1, title={A}} @comment{A comment}", 1, 0, 1),
    ],
    ids=["entry_and_string", "entry_and_comment"],
)
def test_mixed_blocks_same_line(
    bibtex_str: str, expected_entries: int, expected_strings: int, expected_comments: int
):
    """An entry followed on the same line by another block type yields both blocks."""
    parsed = Splitter(bibtex_str).split()

    assert len(parsed.failed_blocks) == 0
    # Per-type counts: entries, @string definitions, explicit comments.
    assert len(parsed.entries) == expected_entries
    assert len(parsed.strings) == expected_strings
    assert len(parsed.comments) == expected_comments
| 157 | + |
| 158 | + |
| 159 | +# ============================================================================= |
| 160 | +# Test: Error recovery when new block starts at line start |
| 161 | +# ============================================================================= |
| 162 | + |
| 163 | + |
@pytest.mark.parametrize(
    "bibtex_str,expected_valid_key",
    [
        pytest.param(
            dedent(
                """\
                @article{broken, title={Unclosed
                @article{valid, title={Valid Entry}}"""
            ),
            "valid",
            id="unclosed_entry_field",
        ),
        pytest.param(
            dedent(
                """\
                @string{broken = {unclosed value
                @article{valid, title={Valid Entry}}"""
            ),
            "valid",
            id="unclosed_string",
        ),
        pytest.param(
            # Written as an explicit string so the leading indentation of the
            # second line is unambiguous and cannot be lost to reformatting;
            # without it this case is identical to "unclosed_entry_field".
            "@article{broken, title={Unclosed\n"
            "    @article{valid, title={Valid Entry}}",
            "valid",
            id="indented_new_block",
        ),
    ],
)
def test_error_recovery_at_line_start(bibtex_str: str, expected_valid_key: str):
    """New block at line start should trigger recovery from malformed block.

    Each input is an unterminated block followed, on the next line, by a
    well-formed entry (in one case preceded only by whitespace). The split
    must report exactly one failed block and still yield the valid entry.
    """
    library = Splitter(bibtex_str).split()

    assert len(library.failed_blocks) == 1
    assert len(library.entries) == 1
    assert library.entries[0].key == expected_valid_key
| 203 | + |
| 204 | + |
def test_error_recovery_preserves_failed_block_raw():
    """After recovery, the failed block's ``raw`` still holds the broken block's text."""
    source = "@article{broken, title={This is unclosed\n@article{valid, title={OK}}"
    parsed = Splitter(source).split()

    assert len(parsed.failed_blocks) == 1

    # Both the key and the unterminated field text of the broken block must
    # be retrievable from the failed block.
    broken_block = parsed.failed_blocks[0]
    assert "broken" in broken_block.raw
    assert "This is unclosed" in broken_block.raw
| 218 | + |
| 219 | + |
| 220 | +# ============================================================================= |
| 221 | +# Test: No false recovery for @ mid-line |
| 222 | +# ============================================================================= |
| 223 | + |
| 224 | + |
@pytest.mark.parametrize(
    "bibtex_str",
    [
        "@article{test, title={unclosed @misc{fake}",
        "@article{test, title={text @ {more} unclosed",
    ],
    ids=["at_entry_mid_line", "at_brace_mid_line"],
)
def test_no_false_recovery_mid_line(bibtex_str: str):
    """An ``@`` in the middle of a line must not be taken as a recovery point."""
    parsed = Splitter(bibtex_str).split()

    # The whole unterminated input is one failed block; nothing is salvaged.
    assert len(parsed.failed_blocks) == 1
    assert len(parsed.entries) == 0
| 245 | + |
| 246 | + |
| 247 | +# ============================================================================= |
| 248 | +# Test: Edge cases |
| 249 | +# ============================================================================= |
| 250 | + |
| 251 | + |
@pytest.mark.parametrize(
    "bibtex_str",
    [
        "@article{test, title={Hello}}",
        " \t @article{test, title={Hello}}",
        "@article{test, title={L1 {L2 {user@email.com} back} done}}",
    ],
    ids=[
        "block_at_file_start",
        "block_after_whitespace_only",
        "nested_braces_with_at",
    ],
)
def test_edge_cases_entries(bibtex_str: str):
    """Boundary inputs each split into exactly one entry with no failures."""
    parsed = Splitter(bibtex_str).split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.entries) == 1
| 275 | + |
| 276 | + |
def test_preamble_with_at_sign():
    """@ sign inside a preamble block.

    The preamble must be split out as a single block with no failures, and
    the text containing the ``@`` must still be part of its value.
    """
    bibtex_str = '@preamble{"Contact: admin@site.org"}'
    library = Splitter(bibtex_str).split()

    assert len(library.failed_blocks) == 0
    assert len(library.preambles) == 1
    # Mirror the explicit-comment test: a count alone would not notice the
    # value being truncated at the "@".
    assert "admin@site.org" in library.preambles[0].value
| 284 | + |
| 285 | + |
def test_explicit_comment_with_at_sign():
    """An ``@`` inside ``@comment{...}`` stays part of that comment's text."""
    parsed = Splitter("@comment{Email: test@example.com}").split()

    assert len(parsed.failed_blocks) == 0
    assert len(parsed.comments) == 1

    comment_block = parsed.comments[0]
    assert "test@example.com" in comment_block.comment
0 commit comments