Skip to content

Commit 54548b7

Browse files
committed
filter NOP stanzas
1 parent 1559397 commit 54548b7

5 files changed

Lines changed: 611 additions & 29 deletions

File tree

CMakeLists.txt

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ install(PROGRAMS tools/concat.sh DESTINATION bin)
1515

1616
# single patched file, directly taken from Gemini 2.5 copy button
1717

18-
add_test(NAME lws1
18+
add_test(NAME fixdiff1
1919
COMMAND ${CMAKE_COMMAND}
2020
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
2121
-DSRC=client-parser-ws.c
@@ -27,7 +27,7 @@ add_test(NAME lws1
2727

2828
# two concatenated single file patches
2929

30-
add_test(NAME lws2
30+
add_test(NAME fixdiff2
3131
COMMAND ${CMAKE_COMMAND}
3232
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
3333
-DSRC=deaddrop.js
@@ -42,7 +42,7 @@ add_test(NAME lws2
4242

4343
# two concatenated patches using git style diff and index lines (otherwise same as above)
4444

45-
add_test(NAME lws3
45+
add_test(NAME fixdiff3
4646
COMMAND ${CMAKE_COMMAND}
4747
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
4848
-DSRC=deaddrop.js
@@ -58,7 +58,7 @@ add_test(NAME lws2
5858
# two concatenated patches using git style diff and index lines with extra newline at end
5959
# (otherwise same as above)
6060

61-
add_test(NAME lws4
61+
add_test(NAME fixdiff4
6262
COMMAND ${CMAKE_COMMAND}
6363
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
6464
-DSRC=deaddrop.js
@@ -74,7 +74,7 @@ add_test(NAME lws2
7474
# Same as test1 but with 3 x examples of whitespace-only lines added that should be
7575
# reduced down to empty lines
7676

77-
add_test(NAME lws5
77+
add_test(NAME fixdiff5
7878
COMMAND ${CMAKE_COMMAND}
7979
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
8080
-DSRC=client-parser-ws.c
@@ -84,7 +84,7 @@ add_test(NAME lws5
8484
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
8585
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/5)
8686

87-
add_test(NAME lws6
87+
add_test(NAME fixdiff6
8888
COMMAND ${CMAKE_COMMAND}
8989
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
9090
-DSRC=b-comms.c
@@ -94,4 +94,14 @@ add_test(NAME lws6
9494
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
9595
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/6)
9696

97+
add_test(NAME fixdiff7
98+
COMMAND ${CMAKE_COMMAND}
99+
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
100+
-DSRC=deaddrop.js
101+
-DPATCH=gemini.patch
102+
-DEXPSHA=1fc2b5b927ee6f4b3ee43d6d5db02c8f00322ea7b6b2a6e346d08faae82b4d3a
103+
-DEXPSHA_WIN=2e6b9b12ae0128c9edfc109744b9c67848712b0521c322a45104895aa4cbc3b1
104+
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
105+
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/7)
106+
97107

README.md

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,21 @@ headers with accurate line counts on stdout.
3232

3333
It silently repairs:
3434

35-
1. new empty lines with only whitespace, by rewriting to blank lines
36-
2. original lines in diff that differ from real line in file only by
35+
1. New empty + lines with only whitespace are rewritten to be empty blank lines
36+
2. Diff stanzas that do not contain any +/- lines are removed
37+
3. Original lines in diff that differ from real line in file only by
3738
whitespace are rewritten to contain the correct whitespace
38-
3. wrong "before" line in original stanza header
39-
4. wrong "before" line count in original stanza header
40-
5. wrong "after" line in original stanza header
41-
6. wrong "after" line count in original stanza header
42-
7. extra lead-in context lines to stanza by removing until only 3
43-
8. diffs adding to end of file with missing or wrong context caused by
39+
4. All stanza header line offsets and counts are recomputed from the actual
40+
match in the original source and counting before and after lines in the diff,
41+
the incoming @@ line is completely ignored and rewritten with actual info
42+
5. Extra lead-in context lines to stanza by removing until only 3
43+
6. Excessive lead-out-context is removed, missing lead-out context is added.
44+
Diffs adding to EOF with missing or wrong context caused by
4445
LLM losing blank lines at the original EOF are rewritten by checking
45-
the original source file for extra lines and adding them to the stanza as context)
46+
the original source file for extra lines and adding them as needed.
47+
7. Unexpected blank lines in a stanza (without space, + or -) are either ignored
48+
if happening at the end of the stanza, or rewritten to be context by adding
49+
a space at the beginning, if the normal diff resumes.
4650

4751
It finds and scans the sources the patches apply to and uses the diff stanza to
4852
find the original line it applied to by itself, along with the original line

fixdiff.c

Lines changed: 72 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,13 @@ typedef struct {
114114

115115
int li_out;
116116

117+
int pending_empty_lines;
118+
117119
char ongoing;
118120
char skip_this_one;
119121
char lead_in_active;
120122
char cx_active;
123+
char have_seen_delta;
121124

122125
char osh[128];
123126
char temp[64];
@@ -286,15 +289,17 @@ _mkstemp(const char *base, char *tmp, size_t len)
286289
static int
287290
fixdiff_stanza_start(dp_t *pdp, char *sh, size_t len)
288291
{
289-
pdp->pre = 0;
290-
pdp->post = 0;
291-
pdp->lead_in = 0;
292-
pdp->lead_in_active = 1;
293-
pdp->lead_out = 0;
294-
pdp->cx_active = 1;
295-
pdp->lead_in_corrected = 0;
296-
pdp->d = DSS_PMSAD;
297-
pdp->ongoing = 1;
292+
pdp->pre = 0;
293+
pdp->post = 0;
294+
pdp->lead_in = 0;
295+
pdp->lead_in_active = 1;
296+
pdp->lead_out = 0;
297+
pdp->cx_active = 1;
298+
pdp->lead_in_corrected = 0;
299+
pdp->d = DSS_PMSAD;
300+
pdp->ongoing = 1;
301+
pdp->have_seen_delta = 0;
302+
pdp->pending_empty_lines = 0;
298303

299304
pdp->stanzas++;
300305

@@ -432,7 +437,7 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
432437
break;
433438

434439
if (!ls) {
435-
elog("failed to match, best chunk %d lines at %s:%d (tabs shown below as >)\n",
440+
elog("Failed to match, best chunk %d lines started at %s:%d (tabs shown below as >)\n",
436441
lmc, pdp->pf, lg_lis);
437442
elog("last match: patch = '%s"
438443
"', source = '%s'\n", b1, b2);
@@ -628,6 +633,17 @@ fixdiff_stanza_end(dp_t *pdp)
628633
if (!pdp->ongoing)
629634
return 0;
630635

636+
if (!pdp->have_seen_delta) {
637+
pdp->ongoing = 0;
638+
close(pdp->fd_temp);
639+
elog(" - stanza %d: (filtered out due to no delta inside)\n", pdp->stanzas);
640+
641+
return 0;
642+
}
643+
644+
if (dp.pending_empty_lines)
645+
elog(" Dropped %d unexpected empty lines\n", dp.pending_empty_lines);
646+
631647
if (fixdiff_find_original(pdp, &orig)) {
632648
elog("Unable to find original stanza in source\n");
633649
goto probs;
@@ -653,7 +669,7 @@ fixdiff_stanza_end(dp_t *pdp)
653669
/* is that what we already had? */
654670

655671
if (strcmp(buf, pdp->osh)) {
656-
elog(" - (lead_in %d, lead_out %d) %s", pdp->lead_in, pdp->lead_out, buf);
672+
elog(" - stanza %d: %s", pdp->stanzas, buf);
657673
pdp->bad++;
658674
}
659675

@@ -865,6 +881,32 @@ main(int argc, char *argv[])
865881
goto bail;
866882
}
867883

884+
if (dp.pending_empty_lines &&
885+
(in[0] == ' ' || in[0] == '-' || in[0] == '+')) {
886+
char ctx[3];
887+
888+
elog(" Treating %d unexpected newline(s) as context\n", dp.pending_empty_lines);
889+
890+
ctx[0] = ' ';
891+
ctx[1] = '\n';
892+
ctx[2] = '\0';
893+
894+
while (dp.pending_empty_lines > 0) {
895+
dp.pending_empty_lines--;
896+
dp.pre++;
897+
dp.post++;
898+
if (dp.lead_in_active)
899+
dp.lead_in++;
900+
dp.cx_active++;
901+
902+
w = write(dp.fd_temp != -1 ? dp.fd_temp : 1, ctx, TO_POSLEN(2));
903+
if (w < 0) {
904+
elog("write to stdout failed: %d\n", errno);
905+
goto bail;
906+
}
907+
}
908+
}
909+
868910
if (in[0] == ' ') { /* Space */
869911
dp.pre++;
870912
dp.post++;
@@ -888,6 +930,7 @@ main(int argc, char *argv[])
888930
dp.pre++;
889931
dp.lead_in_active = 0;
890932
dp.cx_active = 0;
933+
dp.have_seen_delta = 1;
891934
break;
892935
} else
893936
if (in[0] == '+') { /* Plus */
@@ -930,6 +973,7 @@ main(int argc, char *argv[])
930973
dp.post++;
931974
dp.lead_in_active = 0;
932975
dp.cx_active = 0;
976+
dp.have_seen_delta = 1;
933977
break;
934978
}
935979

@@ -957,15 +1001,29 @@ main(int argc, char *argv[])
9571001
}
9581002

9591003
if (in[0] == 0xa) {
960-
/* we can often find this from extra lines at EOT */
961-
elog(" Skipping unexpected newline\n");
1004+
/*
1005+
* We can find this blank diff line illegally generated by the LLM:
1006+
*
1007+
* 1) from extra lines at diff EOT, maybe the user
1008+
* picked them up from screenscraping too (tests/4)
1009+
* 2) because there was an empty line there,
1010+
* but the LLM did not prepend it with a
1011+
* character indicating what action to
1012+
* take with it (tests/7)
1013+
*
1014+
* We can distinguish what to (for these cases anyway) by waiting
1015+
* to see if there are any more lines in the stanza that have the
1016+
* +/-/space, if not, just drop the CR-only line
1017+
*/
1018+
1019+
dp.pending_empty_lines++;
9621020
continue;
9631021
}
9641022

9651023
elog("'%c' (0x%x)\n", in[0], in[0]);
9661024
dp.reason = "unexpected character in stanza";
9671025
goto bail;
968-
}
1026+
} /* switch */
9691027

9701028
if (dp.skip_this_one) {
9711029
dp.skip_this_one = 0;

0 commit comments

Comments
 (0)