diff --git a/internal/api/document_delete_test.go b/internal/api/document_delete_test.go new file mode 100644 index 0000000..c579c05 --- /dev/null +++ b/internal/api/document_delete_test.go @@ -0,0 +1,265 @@ +//go:build sqlite_fts5 + +package api + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + + "github.com/RandomCodeSpace/docsiq/internal/config" + "github.com/RandomCodeSpace/docsiq/internal/store" +) + +// seedDeleteFixture creates a router with two documents, two chunks +// each, claims and relationships rooted in each doc, and one entity +// shared across both docs plus one entity unique to doc-A. The shared +// entity must survive deletion of doc-A; the unique entity must not. +func seedDeleteFixture(t *testing.T) (http.Handler, *store.Store) { + t.Helper() + dir := t.TempDir() + st, err := store.OpenForProject(dir, "_default") + if err != nil { + t.Fatalf("OpenForProject: %v", err) + } + t.Cleanup(func() { _ = st.Close() }) + + ctx := context.Background() + for _, d := range []*store.Document{ + {ID: "docA", Path: "/tmp/a.md", DocType: "md", FileHash: "ah", IsLatest: true}, + {ID: "docB", Path: "/tmp/b.md", DocType: "md", FileHash: "bh", IsLatest: true}, + } { + if err := st.UpsertDocument(ctx, d); err != nil { + t.Fatalf("UpsertDocument %s: %v", d.ID, err) + } + } + + chunks := []*store.Chunk{ + {ID: "cA1", DocID: "docA", ChunkIndex: 0, Content: "alpha-1"}, + {ID: "cA2", DocID: "docA", ChunkIndex: 1, Content: "alpha-2"}, + {ID: "cB1", DocID: "docB", ChunkIndex: 0, Content: "beta-1"}, + } + if err := st.BatchInsertChunks(ctx, chunks); err != nil { + t.Fatalf("BatchInsertChunks: %v", err) + } + if err := st.BatchUpsertEmbeddings(ctx, "test-model", + []string{"cA1", "cA2", "cB1"}, + [][]float32{{0.1, 0.2}, {0.3, 0.4}, {0.5, 0.6}}); err != nil { + t.Fatalf("BatchUpsertEmbeddings: %v", err) + } + + // Entities: shared (referenced by both docs) and unique-to-A. 
+ for _, e := range []*store.Entity{ + {ID: "eShared", Name: "Shared"}, + {ID: "eOnlyA", Name: "OnlyA"}, + {ID: "eOnlyB", Name: "OnlyB"}, + } { + if err := st.UpsertEntity(ctx, e); err != nil { + t.Fatalf("UpsertEntity %s: %v", e.ID, err) + } + } + + rels := []*store.Relationship{ + {ID: "rA1", SourceID: "eShared", TargetID: "eOnlyA", Predicate: "mentions", DocID: "docA"}, + {ID: "rA2", SourceID: "eOnlyA", TargetID: "eShared", Predicate: "rel", DocID: "docA"}, + {ID: "rB1", SourceID: "eShared", TargetID: "eOnlyB", Predicate: "mentions", DocID: "docB"}, + } + if err := st.BatchInsertRelationships(ctx, rels); err != nil { + t.Fatalf("BatchInsertRelationships: %v", err) + } + + claims := []*store.Claim{ + {ID: "clA", EntityID: "eOnlyA", Claim: "claim-a", Status: "verified", DocID: "docA"}, + {ID: "clB", EntityID: "eOnlyB", Claim: "claim-b", Status: "verified", DocID: "docB"}, + } + if err := st.BatchInsertClaims(ctx, claims); err != nil { + t.Fatalf("BatchInsertClaims: %v", err) + } + + cfg := &config.Config{} + cfg.DataDir = dir + h := NewRouter(nil, nil, cfg, nil, + WithProjectStores(testSingleStore(dir, st, "_default"))) + return h, st +} + +func TestDeleteDocumentHandler(t *testing.T) { + t.Run("unknown_id_returns_404", func(t *testing.T) { + h, _ := seedDeleteFixture(t) + req := httptest.NewRequest(http.MethodDelete, "/api/documents/no-such-id", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusNotFound { + t.Fatalf("status = %d, want 404; body=%s", rec.Code, rec.Body.String()) + } + }) + + t.Run("happy_path_returns_204_and_cleans_graph", func(t *testing.T) { + h, st := seedDeleteFixture(t) + req := httptest.NewRequest(http.MethodDelete, "/api/documents/docA", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusNoContent { + t.Fatalf("status = %d, want 204; body=%s", rec.Code, rec.Body.String()) + } + if rec.Body.Len() != 0 { + t.Errorf("body = %q, want empty for 204", rec.Body.String()) 
+ } + + ctx := context.Background() + + // docA gone, docB preserved. + if d, err := st.GetDocument(ctx, "docA"); err != nil || d != nil { + t.Errorf("docA still present (err=%v, doc=%v)", err, d) + } + if d, err := st.GetDocument(ctx, "docB"); err != nil || d == nil { + t.Errorf("docB unexpectedly removed (err=%v, doc=%v)", err, d) + } + + // Chunks for docA gone, docB chunk preserved. + aChunks, _ := st.ListChunksByDoc(ctx, "docA") + if len(aChunks) != 0 { + t.Errorf("docA chunks remain: %d", len(aChunks)) + } + bChunks, _ := st.ListChunksByDoc(ctx, "docB") + if len(bChunks) != 1 { + t.Errorf("docB chunks count = %d, want 1", len(bChunks)) + } + + // Embeddings for docA chunks gone (cascade via FK). + var embCount int + row := st.DB().QueryRowContext(ctx, + `SELECT COUNT(*) FROM embeddings WHERE chunk_id IN ('cA1','cA2')`) + if err := row.Scan(&embCount); err != nil { + t.Fatalf("embeddings count: %v", err) + } + if embCount != 0 { + t.Errorf("docA embeddings remain: %d", embCount) + } + + // Claims/relationships for docA gone, docB preserved. + var relCount int + _ = st.DB().QueryRowContext(ctx, + `SELECT COUNT(*) FROM relationships WHERE doc_id='docA'`).Scan(&relCount) + if relCount != 0 { + t.Errorf("docA relationships remain: %d", relCount) + } + var bRelCount int + _ = st.DB().QueryRowContext(ctx, + `SELECT COUNT(*) FROM relationships WHERE doc_id='docB'`).Scan(&bRelCount) + if bRelCount != 1 { + t.Errorf("docB relationships count = %d, want 1", bRelCount) + } + var aClaimCount int + _ = st.DB().QueryRowContext(ctx, + `SELECT COUNT(*) FROM claims WHERE doc_id='docA'`).Scan(&aClaimCount) + if aClaimCount != 0 { + t.Errorf("docA claims remain: %d", aClaimCount) + } + + // Orphan entity (eOnlyA) removed; shared (eShared) and + // docB-only (eOnlyB) preserved. 
+ if e, err := st.GetEntity(ctx, "eOnlyA"); err != nil || e != nil { + t.Errorf("orphan entity eOnlyA still present (err=%v, ent=%v)", err, e) + } + if e, err := st.GetEntity(ctx, "eShared"); err != nil || e == nil { + t.Errorf("shared entity eShared was removed (err=%v, ent=%v)", err, e) + } + if e, err := st.GetEntity(ctx, "eOnlyB"); err != nil || e == nil { + t.Errorf("docB-only entity eOnlyB was removed (err=%v, ent=%v)", err, e) + } + }) + + t.Run("idempotent_after_delete", func(t *testing.T) { + h, _ := seedDeleteFixture(t) + req1 := httptest.NewRequest(http.MethodDelete, "/api/documents/docA", nil) + rec1 := httptest.NewRecorder() + h.ServeHTTP(rec1, req1) + if rec1.Code != http.StatusNoContent { + t.Fatalf("first delete status = %d, want 204", rec1.Code) + } + // Second delete of the same id must be 404, not 204. + req2 := httptest.NewRequest(http.MethodDelete, "/api/documents/docA", nil) + rec2 := httptest.NewRecorder() + h.ServeHTTP(rec2, req2) + if rec2.Code != http.StatusNotFound { + t.Fatalf("second delete status = %d, want 404", rec2.Code) + } + }) +} + +// TestStoreDeleteDocumentCascade is a thin store-layer fence around +// the cascade transaction so a future schema change that breaks the +// graph cleanup fails here, not just at the HTTP boundary. 
+func TestStoreDeleteDocumentCascade(t *testing.T) { + dir := t.TempDir() + st, err := store.OpenForProject(dir, "tx") + if err != nil { + t.Fatalf("OpenForProject: %v", err) + } + t.Cleanup(func() { _ = st.Close() }) + + ctx := context.Background() + if err := st.UpsertDocument(ctx, &store.Document{ + ID: "d1", Path: "/p.md", DocType: "md", FileHash: "h1", IsLatest: true, + }); err != nil { + t.Fatalf("UpsertDocument: %v", err) + } + if err := st.BatchInsertChunks(ctx, []*store.Chunk{ + {ID: "ch1", DocID: "d1", ChunkIndex: 0, Content: "x"}, + }); err != nil { + t.Fatalf("BatchInsertChunks: %v", err) + } + if err := st.UpsertEntity(ctx, &store.Entity{ID: "e1", Name: "E1"}); err != nil { + t.Fatalf("UpsertEntity: %v", err) + } + if err := st.BatchInsertRelationships(ctx, []*store.Relationship{ + {ID: "r1", SourceID: "e1", TargetID: "e1", Predicate: "self", DocID: "d1"}, + }); err != nil { + t.Fatalf("BatchInsertRelationships: %v", err) + } + if err := st.BatchInsertClaims(ctx, []*store.Claim{ + {ID: "cl1", EntityID: "e1", Claim: "c", Status: "v", DocID: "d1"}, + }); err != nil { + t.Fatalf("BatchInsertClaims: %v", err) + } + + affected, err := st.DeleteDocument(ctx, "d1") + if err != nil { + t.Fatalf("DeleteDocument: %v", err) + } + if affected != 1 { + t.Errorf("affected = %d, want 1", affected) + } + + // Everything tied to d1 must be gone, including the now-orphan + // entity e1. 
+ for _, q := range []struct { + name string + sql string + }{ + {"chunks", `SELECT COUNT(*) FROM chunks WHERE doc_id='d1'`}, + {"relationships", `SELECT COUNT(*) FROM relationships WHERE doc_id='d1'`}, + {"claims", `SELECT COUNT(*) FROM claims WHERE doc_id='d1'`}, + {"entities", `SELECT COUNT(*) FROM entities WHERE id='e1'`}, + {"documents", `SELECT COUNT(*) FROM documents WHERE id='d1'`}, + } { + var n int + if err := st.DB().QueryRowContext(ctx, q.sql).Scan(&n); err != nil { + t.Fatalf("count %s: %v", q.name, err) + } + if n != 0 { + t.Errorf("%s count = %d, want 0", q.name, n) + } + } + + // Idempotent: deleting an unknown id is a 0-affected non-error. + affected, err = st.DeleteDocument(ctx, "missing") + if err != nil { + t.Errorf("delete missing returned err: %v", err) + } + if affected != 0 { + t.Errorf("delete missing affected = %d, want 0", affected) + } +} diff --git a/internal/api/handlers.go b/internal/api/handlers.go index 751d496..fa5ca79 100644 --- a/internal/api/handlers.go +++ b/internal/api/handlers.go @@ -252,6 +252,64 @@ func (h *handlers) getDocumentChunks(w http.ResponseWriter, r *http.Request) { writeJSON(w, 200, out) } +// deleteDocument hard-deletes a document and cascades cleanup of +// chunks, embeddings, claims, doc-scoped relationships, and any +// entities that no longer have a relationship or claim referencing +// them. +// +// Communities are deliberately NOT recomputed inline. Louvain on a +// graph of any meaningful size is too slow for an interactive DELETE, +// and stale community titles/summaries are tolerable until the next +// manual finalize. The vector index is invalidated so the next search +// rebuilds against the post-delete chunk set. +// +// Returns 204 on success, 404 if the id does not exist, 500 on +// transactional failure (the store rolls back so the graph stays +// consistent). 
+func (h *handlers) deleteDocument(w http.ResponseWriter, r *http.Request) { + st, ok := h.resolveStore(w, r) + if !ok { + return + } + id := r.PathValue("id") + doc, err := st.GetDocument(r.Context(), id) + if err != nil { + writeError(w, r, 500, err.Error(), err) + return + } + if doc == nil { + writeError(w, r, 404, "document not found", nil) + return + } + + affected, err := st.DeleteDocument(r.Context(), id) + if err != nil { + writeError(w, r, 500, fmt.Errorf("delete document: %w", err).Error(), err) + return + } + if affected == 0 { + // Race: the row vanished between our GetDocument and the + // delete. Treat as 404 — the operation succeeded only if it + // actually removed something. + writeError(w, r, 404, "document not found", nil) + return + } + + // Drop the cached vector index for this project so the next local + // search rebuilds against the post-delete chunk set. Safe on a nil + // receiver. + slug := ProjectFromContext(r.Context()) + h.vecIndexes.Invalidate(slug) + + slog.Info("🗑️ document deleted", + "project", slug, + "doc_id", id, + "path", doc.Path, + ) + + w.WriteHeader(http.StatusNoContent) +} + func (h *handlers) entityGraph(w http.ResponseWriter, r *http.Request) { st, ok := h.resolveStore(w, r) if !ok { diff --git a/internal/api/router.go b/internal/api/router.go index 48f69ec..1791736 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -169,6 +169,7 @@ func NewRouter(prov llm.Provider, emb *embedder.Embedder, cfg *config.Config, re mux.HandleFunc("GET /api/stats", h.getStats) mux.HandleFunc("GET /api/documents", h.listDocuments) mux.HandleFunc("GET /api/documents/{id}", h.getDocument) + mux.HandleFunc("DELETE /api/documents/{id}", h.deleteDocument) mux.HandleFunc("GET /api/documents/{id}/chunks", h.getDocumentChunks) mux.HandleFunc("GET /api/documents/{id}/versions", h.getDocumentVersions) mux.HandleFunc("POST /api/search", h.search) diff --git a/internal/store/store.go b/internal/store/store.go index a6ed37f..5d5ce49
100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -441,16 +441,70 @@ func (s *Store) ListDocuments(ctx context.Context, docType string, limit, offset return docs, rows.Err() } -// DeleteDocument hard-deletes a document row. ON DELETE CASCADE on -// chunks/embeddings/relationships takes care of dependent rows. -// Returns the number of rows removed so callers can distinguish "did -// nothing" from "did something." +// DeleteDocument hard-deletes a document and cascades cleanup of +// graph artifacts that came from it. Runs as a single SQLite +// transaction so partial failures roll back. +// +// Cascade order: +// 1. relationships WHERE doc_id=? β drop edges sourced from this doc +// 2. claims WHERE doc_id=? β drop claims sourced from this doc +// 3. chunks WHERE doc_id=? β embeddings cascade via FK +// 4. documents WHERE id=? β the row itself +// 5. orphan entities β entities with no remaining relationship or +// claim references are deleted; community_members rows cascade +// via FK so the graph stays consistent +// +// Communities are deliberately NOT recomputed here. Stale community +// summaries are tolerable until the next manual `community.finalize`; +// recomputing on every delete would dominate the request budget for +// users dropping a bad upload. Trade-off documented in the API +// handler comment as well. +// +// Returns the number of document rows removed (0 if the id did not +// exist) so callers can distinguish "did nothing" from "did something." 
func (s *Store) DeleteDocument(ctx context.Context, id string) (int64, error) { - res, err := s.db.ExecContext(ctx, `DELETE FROM documents WHERE id=?`, id) + tx, err := s.db.BeginTx(ctx, nil) if err != nil { - return 0, err + return 0, fmt.Errorf("begin tx: %w", err) + } + defer tx.Rollback() + + if _, err := tx.ExecContext(ctx, `DELETE FROM relationships WHERE doc_id=?`, id); err != nil { + return 0, fmt.Errorf("delete relationships: %w", err) + } + if _, err := tx.ExecContext(ctx, `DELETE FROM claims WHERE doc_id=?`, id); err != nil { + return 0, fmt.Errorf("delete claims: %w", err) + } + // chunks cascade-deletes embeddings via embeddings.chunk_id FK. + if _, err := tx.ExecContext(ctx, `DELETE FROM chunks WHERE doc_id=?`, id); err != nil { + return 0, fmt.Errorf("delete chunks: %w", err) + } + res, err := tx.ExecContext(ctx, `DELETE FROM documents WHERE id=?`, id) + if err != nil { + return 0, fmt.Errorf("delete document: %w", err) + } + affected, err := res.RowsAffected() + if err != nil { + return 0, fmt.Errorf("rows affected: %w", err) + } + + // Orphan entity sweep: an entity is orphan when no remaining row + // in relationships (as source or target) or claims still + // references it. Entities are deduped by name across docs, so + // shared entities (e.g. "OpenAI" mentioned in two unrelated docs) + // remain after deleting one of those docs. 
+ if _, err := tx.ExecContext(ctx, ` + DELETE FROM entities + WHERE id NOT IN (SELECT source_id FROM relationships) + AND id NOT IN (SELECT target_id FROM relationships) + AND id NOT IN (SELECT entity_id FROM claims WHERE entity_id IS NOT NULL)`); err != nil { + return 0, fmt.Errorf("sweep orphan entities: %w", err) + } + + if err := tx.Commit(); err != nil { + return 0, fmt.Errorf("commit: %w", err) } - return res.RowsAffected() + return affected, nil } // AllDocuments returns every row in the documents table regardless of diff --git a/ui/dist/index.html b/ui/dist/index.html index 7377cc6..566bca9 100644 --- a/ui/dist/index.html +++ b/ui/dist/index.html @@ -11,10 +11,18 @@