Skip to content

Commit 0fd342e

Browse files
authored
fix: granite33 citation spans wrong for duplicate sentences (#851) (#872)
str.find() without an offset always returned the first occurrence, so when the same sentence appeared more than once in the response, every citation after the first got response_begin/response_end pointing at the first occurrence. Introduce search_offset that advances past each consumed sentence. When find() misses after search_offset the sentence appears only once (multiple citations on one sentence); fall back to searching from the start so those citations share the correct span. Adds two regression tests to TestAddCitationResponseSpans: - test_duplicate_sentences_each_get_own_span - test_multiple_citations_on_same_sentence_share_span Assisted-by: Claude Code Signed-off-by: Nigel Jones <jonesn@uk.ibm.com>
1 parent 8501b3d commit 0fd342e

2 files changed

Lines changed: 73 additions & 7 deletions

File tree

mellea/formatters/granite/granite3/granite33/output.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -211,16 +211,28 @@ def _add_citation_response_spans(
211211
)
212212
continue
213213

214-
# For each citation bring the response sentence to which it refers and its
215-
# begin/end spans
214+
# For each citation, find its span in the clean response.
215+
# Citations are indexed in left-to-right response order (citation_idx above),
216+
# so search_offset can advance monotonically (fixes issue #851 — str.find()
217+
# without an offset always returned the first occurrence for duplicate sentences).
218+
# If find() misses after search_offset the sentence appears only once in the
219+
# response; fall back to the start so multi-citation sentences share their span.
220+
search_offset = 0
216221
for i, citation in enumerate(augmented_citation_info):
217222
response_text = response_sents_by_citation_id.get(str(i), "")
218-
index = response_text_without_citations.find(response_text)
223+
224+
index = response_text_without_citations.find(response_text, search_offset)
219225
if index < 0:
220-
logger.warning(
221-
"Error in extracting the response sentence of a citation: Unexpected error."
222-
)
223-
continue
226+
# Single occurrence: fall back so every citation on this sentence
227+
# gets the same span; do not advance search_offset.
228+
index = response_text_without_citations.find(response_text)
229+
if index < 0:
230+
logger.warning(
231+
"Error in extracting the response sentence of a citation: Unexpected error."
232+
)
233+
continue
234+
else:
235+
search_offset = index + len(response_text)
224236

225237
citation["response_text"] = response_text
226238
citation["response_begin"] = index

test/formatters/granite/test_granite33_output.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,60 @@ def test_single_sentence_response(self):
292292
end = citation["response_end"]
293293
assert response_without[begin:end] == citation["response_text"]
294294

295+
def test_duplicate_sentences_each_get_own_span(self):
296+
"""Regression (#851): duplicate sentence text must map to distinct occurrences.
297+
298+
Without this fix, str.find() always returns the first occurrence, so both
299+
citations end up with the same span pointing at the first sentence.
300+
"""
301+
sent = "The sky is blue."
302+
cite1 = f'{CITE_START}{{"document_id": "1"}}{CITE_END}'
303+
cite2 = f'{CITE_START}{{"document_id": "2"}}{CITE_END}'
304+
# Two identical sentences, each followed by a separate citation tag.
305+
response_with = f"{sent} {cite1} {sent} {cite2}"
306+
# Clean response has both sentences, separated by a space.
307+
response_without = f"{sent} {sent}"
308+
309+
result = _add_citation_response_spans(
310+
[self._make_citation(), self._make_citation()],
311+
response_with,
312+
response_without,
313+
)
314+
315+
assert len(result) == 2
316+
spans = [(c["response_begin"], c["response_end"]) for c in result]
317+
# Both spans must be valid slices of the clean response.
318+
for begin, end in spans:
319+
assert response_without[begin:end] == sent
320+
# The two spans must be different (pointing at the two different occurrences).
321+
assert spans[0] != spans[1]
322+
# They must not overlap.
323+
spans_sorted = sorted(spans)
324+
assert spans_sorted[0][1] <= spans_sorted[1][0]
325+
326+
def test_multiple_citations_on_same_sentence_share_span(self):
327+
"""Two citations on a single occurrence of a sentence must share the same span."""
328+
sent = "The sky is blue."
329+
cite1 = f'{CITE_START}{{"document_id": "1"}}{CITE_END}'
330+
cite2 = f'{CITE_START}{{"document_id": "2"}}{CITE_END}'
331+
# Single sentence with two citation tags; clean response has one occurrence.
332+
response_with = f"{sent} {cite1} {cite2}"
333+
response_without = sent
334+
335+
result = _add_citation_response_spans(
336+
[self._make_citation(), self._make_citation()],
337+
response_with,
338+
response_without,
339+
)
340+
341+
assert len(result) == 2
342+
# Both citations reference the one occurrence — spans must be identical.
343+
assert result[0]["response_begin"] == result[1]["response_begin"]
344+
assert result[0]["response_end"] == result[1]["response_end"]
345+
begin = result[0]["response_begin"]
346+
end = result[0]["response_end"]
347+
assert response_without[begin:end] == sent
348+
295349

296350
# ---------------------------------------------------------------------------
297351
# Granite33OutputProcessor.transform

0 commit comments

Comments
 (0)