Skip to content

Commit fd8e560

Browse files
committed
rewrite whitespace-only
1 parent 2e5ad07 commit fd8e560

5 files changed

Lines changed: 1006 additions & 28 deletions

File tree

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,14 @@ add_test(NAME lws5
8484
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
8585
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/5)
8686

87+
add_test(NAME lws6
88+
COMMAND ${CMAKE_COMMAND}
89+
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
90+
-DSRC=b-comms.c
91+
-DPATCH=gemini.patch
92+
-DEXPSHA=6ea83a67aba0358099752cfaf83a28d5d983b50855e93352ae9c04d656c7911e
93+
-DEXPSHA_WIN=2e6b9b12ae0128c9edfc109744b9c67848712b0521c322a45104895aa4cbc3b1
94+
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
95+
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/6)
96+
8797

README.md

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# fixdiff
22

3-
Andy Green <andy@warmcat.com> 2025
4-
See MIT license in LICENSE
3+
Copyright (C) 2025 Andy Green <andy@warmcat.com>
4+
Licensed under MIT license, see LICENSE
55

66
```
77
$ cat llm-patch.diff | fixdiff | patch -p1
@@ -24,22 +24,25 @@ $ cat llm-patch.diff | fixdiff /path/to/sources | patch -p1
2424

2525
LLMs find it hard to generate diff headers with correct line counts or even
2626
line offsets, although some LLMs are smart enough to produce otherwise
27-
legible diffs.
27+
legible diffs. Often the content or just the context lines around the
28+
changes are not quite right.
2829

2930
This utility adjusts the diff stanzas sent to it on stdin and produces new stanza
3031
headers with accurate line counts on stdout.
3132

3233
It silently repairs:
3334

34-
- added empty lines with only whitespace become blank lines
35-
- wrong "before" line in original stanza header
36-
- wrong "before" line count in original stanza header
37-
- wrong "after" line in original stanza header
38-
- wrong "after" line count in original stanza header
39-
- removes extra lead-in context lines in stanza
40-
- for diffs adding to end of file, corrects mismatching context caused by
41-
LLM losing blank lines at the original EOF (by checking the original
42-
source file for extra lines and adding them to the stanza as context)
35+
1. new empty lines with only whitespace, by rewriting to blank lines
36+
2. original lines in diff that differ from real line in file only by
37+
whitespace are rewritten to contain the correct whitespace
38+
3. wrong "before" line in original stanza header
39+
4. wrong "before" line count in original stanza header
40+
5. wrong "after" line in original stanza header
41+
6. wrong "after" line count in original stanza header
42+
7. extra lead-in context lines in a stanza, by removing them until only 3 remain
43+
8. diffs adding to end of file with missing or wrong context caused by
44+
LLM losing blank lines at the original EOF are rewritten by checking
45+
the original source file for extra lines and adding them to the stanza as context
4346

4447
It finds and scans the sources the patches apply to and uses the diff stanza to
4548
find the original line it applied to by itself, along with the original line

fixdiff.c

Lines changed: 168 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,21 @@ typedef struct {
8484
int li;
8585
} lbuf_t;
8686

87+
/*
 * One pending replacement for a stanza line, kept on a singly-linked list.
 *
 * The replacement bytes are not a separate allocation: the struct is
 * allocated with extra room after it, and text points at that trailing
 * storage, so freeing the struct also releases the text.
 */
typedef struct rewriter {
	struct rewriter *next;	/* next entry on the list, or NULL at the end */
	size_t len;		/* number of bytes at text to emit */
	int line;		/* stanza line number this entry replaces */
	char *text;		/* replacement bytes, overallocated after the struct */
} rewriter_t;
94+
8795
typedef struct {
8896
off_t flo;
8997

9098
const char *reason;
9199

100+
rewriter_t *rewriter_head;
101+
92102
dss_t d;
93103
int pre;
94104
int post;
@@ -102,6 +112,8 @@ typedef struct {
102112

103113
int fd_temp;
104114

115+
int li_out;
116+
105117
char ongoing;
106118
char skip_this_one;
107119
char lead_in_active;
@@ -313,10 +325,26 @@ fixdiff_stanza_start(dp_t *pdp, char *sh, size_t len)
313325
return 0;
314326
}
315327

328+
/*
 * Bounded string copy for diagnostics: copies the NUL-terminated string at
 * in into dest, replacing every tab with '>' so tabs are visible when the
 * text is logged.
 *
 * dest: destination buffer, at least len bytes long
 * in:   NUL-terminated source string
 * len:  size of dest in bytes, including room for the terminator
 *
 * At most len - 1 characters are copied and dest is always NUL-terminated,
 * except when len is 0, in which case dest is not touched at all.  Bytes in
 * dest after the terminator are left as they were.
 */
static void
stain_copy(char *dest, const char *in, size_t len)
{
	size_t n;

	/* len is unsigned: len - 1 would wrap to SIZE_MAX for an empty buffer */
	if (!len)
		return;

	for (n = 0; n < len - 1 && in[n]; n++)
		dest[n] = in[n] == '\t' ? '>' : in[n];

	dest[n] = '\0';
}
343+
316344
static int
317345
fixdiff_find_original(dp_t *pdp, int *line_start)
318346
{
319-
char in_src[4096], in_temp[4096], b1[256], b2[256], hit = 0;
347+
char in_src[4096], in_temp[4096], b1[256], b2[256], f1[256], f2[256], hit = 0;
320348
int ret = 1, mc = 0, lmc = 0, lis = 0, lg_lis = 0;
321349
lbuf_t lb_temp, lb_src, lb;
322350
size_t lt, ls;
@@ -329,6 +357,8 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
329357
lb_src.fd = lb.fd = -1;
330358
b1[0] = '\0';
331359
b2[0] = '\0';
360+
f1[0] = '\0';
361+
f2[0] = '\0';
332362

333363
init_lbuf(&lb_temp, "temp");
334364
lb_temp.fd = open(pdp->temp, OFLAGS(O_RDWR));
@@ -402,26 +432,103 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
402432
break;
403433

404434
if (!ls) {
405-
elog("failed to match, best chunk %d lines at %s:%d\n",
435+
elog("failed to match, best chunk %d lines at %s:%d (tabs shown below as >)\n",
406436
lmc, pdp->pf, lg_lis);
407-
elog("patch: '%s', source '%s'\n", b1, b2);
437+
elog("last match: patch = '%s"
438+
"', source = '%s'\n", b1, b2);
439+
elog("divergence: patch = '%s"
440+
"', source = '%s'\n", f1, f2);
408441
mc = 0;
409442
break;
410443
}
411444

412445
if (fixdiff_strcmp(in_temp + 1, lt - 1, &let, in_src, ls, &les)) {
413-
if (mc > pdp->pre + pdp->post)
414-
elog("match failed after %d: '%s' / '%s'", mc, in_temp + 1, in_src);
446+
/*
447+
* It's not a match.
448+
*
449+
* It's still possible we only differ by whitespace.
450+
* Does it match if we treat any whitespace as a single
451+
* whitespace match token?
452+
*/
453+
454+
char *p1 = in_temp + 1, *p1_end = p1 + lt - 1 - (int)let,
455+
*p2 = in_src, *p2_end = p2 + ls - (int)les;
456+
457+
while (p1 < p1_end && p2 < p2_end) {
458+
char wst1 = 0, wst2 = 0;
459+
460+
while (*p1 == ' ' || *p1 == '\t' && p1 < p1_end) {
461+
p1++;
462+
wst1 = 1;
463+
}
464+
while (*p2 == ' ' || *p2 == '\t' && p2 < p2_end) {
465+
p2++;
466+
wst2 = 1;
467+
}
468+
469+
if (wst1 != wst2)
470+
goto record_breakage;
471+
472+
if (*p1 != *p2)
473+
goto record_breakage;
474+
475+
p1++;
476+
p2++;
477+
}
478+
479+
if ((p1 < p1_end) != (p2 < p2_end))
480+
goto record_breakage;
481+
482+
elog("(fixable whitespace-only difference at stanza line %d)\n", lb_temp.li);
483+
484+
/*
485+
* We have to take care about picking up windows _TEXT
486+
* CRLF, eliminating that if present and only putting
487+
* the LF, so rewritten lines are indistinguishable
488+
*/
489+
490+
{
491+
rewriter_t *rwt = malloc(sizeof(*rwt) + ls + 1 - les + 1);
492+
if (!rwt) {
493+
elog("OOM\n");
494+
return -1;
495+
}
496+
rwt->next = pdp->rewriter_head;
497+
pdp->rewriter_head = rwt;
498+
rwt->line = lb_temp.li;
499+
rwt->text = (char *)&rwt[1];
500+
rwt->text[0] = *in_temp;
501+
rwt->len = ls + 1 - les + 1;
502+
rwt->text[rwt->len - 1] = '\n';
503+
memcpy(rwt->text + 1, in_src, ls);
504+
}
505+
goto allow_match_ws;
506+
507+
record_breakage:
508+
if (mc + 1 > lmc) {
509+
stain_copy(f1, in_temp + 1, sizeof(f1));
510+
stain_copy(f2, in_src, sizeof(f2));
511+
}
415512
mc = 0;
513+
{
514+
rewriter_t *rwt = pdp->rewriter_head, *rwt1;
515+
516+
while (rwt) {
517+
rwt1 = rwt->next;
518+
free(rwt);
519+
rwt = rwt1;
520+
}
521+
522+
pdp->rewriter_head = NULL;
523+
}
416524
break;
417525
}
418526

527+
allow_match_ws:
419528
mc++;
420529
if (mc > lmc) {
421-
strncpy(b1, in_temp + 1, sizeof(b1) - 1);
422-
b1[sizeof(b1) - 1] = '\0';
423-
strncpy(b2, in_src + 1, sizeof(b2) - 1);
424-
b2[sizeof(b2) - 1] = '\0';
530+
stain_copy(b1, in_temp + 1, sizeof(b1));
531+
stain_copy(b2, in_src, sizeof(b2));
425532
lmc++;
426533
lg_lis = lis;
427534
}
@@ -512,8 +619,9 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
512619
static int
513620
fixdiff_stanza_end(dp_t *pdp)
514621
{
622+
int orig, nope = 0;
623+
lbuf_t lb_temp;
515624
char buf[256];
516-
int orig;
517625

518626
if (!pdp->ongoing)
519627
return 0;
@@ -554,21 +662,64 @@ fixdiff_stanza_end(dp_t *pdp)
554662

555663
/* dump the temp side-buffer into stdout */
556664

557-
lseek(pdp->fd_temp, pdp->flo, SEEK_SET);
665+
init_lbuf(&lb_temp, "lb_temp");
666+
lb_temp.fd = open(pdp->temp, OFLAGS(O_RDONLY));
667+
lseek(lb_temp.fd, pdp->flo, SEEK_SET);
668+
558669
while (1) {
559-
ssize_t l = read(pdp->fd_temp, buf, sizeof(buf));
670+
char buf[4096];
671+
ssize_t l = fixdiff_get_line(&lb_temp, buf, sizeof(buf));
672+
rewriter_t *rwt = pdp->rewriter_head;
673+
560674
if (!l)
561675
break;
562676

563-
if (write(1, buf, TO_POSLEN(l)) != (ssize_t)l) {
564-
pdp->reason = "failed to write to stdout";
565-
return 1;
677+
// elog("dumping %d (len %d)\n", (int)pdp->li_out, (int)l);
678+
679+
while (rwt) {
680+
// elog("%d %d\n", rwt->line, pdp->li_out);
681+
if (rwt->line == lb_temp.li /*pdp->li_out*/) /* we need to rewrite this line */
682+
break;
683+
684+
rwt = rwt->next;
685+
}
686+
687+
if (rwt) {
688+
// elog("rewriting '%.*s' to '%.*s'\n", (int)l, buf, (int)rwt->len, rwt->text);
689+
if (write(1, rwt->text, TO_POSLEN(rwt->len)) != (ssize_t)rwt->len) {
690+
pdp->reason = "failed to write to stdout";
691+
nope = 1;
692+
break;
693+
}
694+
} else {
695+
if (write(1, buf, TO_POSLEN(l)) != (ssize_t)l) {
696+
pdp->reason = "failed to write to stdout";
697+
nope = 1;
698+
break;
699+
}
700+
}
701+
702+
pdp->li_out++;
703+
}
704+
705+
{
706+
rewriter_t *rwt = pdp->rewriter_head, *rwt1;
707+
708+
while (rwt) {
709+
rwt1 = rwt->next;
710+
free(rwt);
711+
rwt = rwt1;
566712
}
713+
714+
pdp->rewriter_head = NULL;
567715
}
568716

569-
close(pdp->fd_temp);
717+
close(lb_temp.fd);
570718
pdp->fd_temp = -1;
571719

720+
if (nope)
721+
return 1;
722+
572723
/* track the effect stanza changes are having on line offsets */
573724
pdp->delta += pdp->post - pdp->pre;
574725

@@ -611,6 +762,7 @@ main(int argc, char *argv[])
611762
dp.d = DSS_WAIT_MMM;
612763
dp.lb.fd = 0; /* stdin */
613764
dp.fd_temp = -1;
765+
dp.li_out = 1;
614766

615767
while (1) {
616768
size_t l = fixdiff_get_line(&dp.lb, in, sizeof(in));

0 commit comments

Comments
 (0)