diff --git a/internal/search/local.go b/internal/search/local.go index 3d4314c..e7a4d78 100644 --- a/internal/search/local.go +++ b/internal/search/local.go @@ -124,8 +124,10 @@ func LocalSearch(ctx context.Context, st *store.Store, emb *embedder.Embedder, i seenEntities[es.e.ID] = true result.Entities = append(result.Entities, es.e) - // Walk relationships - rels, err := st.RelationshipsForEntity(ctx, es.e.ID, graphDepth) + // Walk relationships scoped to the top-hit doc set so the + // graph expansion cannot leak edges from unrelated + // documents into a scoped local-search result. + rels, err := st.RelationshipsForEntityInDocs(ctx, es.e.ID, graphDepth, docIDList) if err != nil { continue } diff --git a/internal/search/local_scope_test.go b/internal/search/local_scope_test.go new file mode 100644 index 0000000..b7504c2 --- /dev/null +++ b/internal/search/local_scope_test.go @@ -0,0 +1,107 @@ +package search + +import ( + "context" + "testing" + + "github.com/RandomCodeSpace/docsiq/internal/store" +) + +// TestLocalSearch_GraphExpansionScopedToTopHitDocs is the RAN-35 regression +// guard. LocalSearch used to re-expand through every relationship a seed +// entity touched, regardless of doc_id, so a scoped query could surface +// unrelated-doc edges into the result set. After the fix, the graph walk +// must stay inside the top-hit doc set. +// +// Fixture: +// +// d-alpha: chunk "alpha" + entity "alpha" with one edge alpha -> beta +// (doc_id=d-alpha) +// d-delta: chunk "delta" + entity "alpha" shares a second edge +// alpha -> gamma (doc_id=d-delta) +// +// A query for "almost alpha" tops out on chunk "c-alpha" (d-alpha). With +// graphDepth=1 the result must include the d-alpha edge and must NOT +// include the d-delta edge, even though the seed entity "alpha" has a +// relationship row in d-delta. 
func TestLocalSearch_GraphExpansionScopedToTopHitDocs(t *testing.T) {
	st, emb, _ := seedCorpus(t)
	ctx := context.Background()

	// must fails the test immediately on any fixture-setup error so the
	// assertions below never run against a half-built corpus.
	must := func(err error) {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
	}

	// Seed entities with vectors close to query so they rank in the
	// top-K entity set. "alpha" shares an ID so scoped and unscoped
	// edges collide on a single seed.
	entAlpha := &store.Entity{ID: "ent-alpha", Name: "alpha", Vector: []float32{1, 0, 0, 0}}
	entBeta := &store.Entity{ID: "ent-beta", Name: "beta", Vector: []float32{0, 1, 0, 0}}
	entGamma := &store.Entity{ID: "ent-gamma", Name: "gamma", Vector: []float32{0, 0, 1, 0}}
	must(st.UpsertEntity(ctx, entAlpha))
	must(st.UpsertEntity(ctx, entBeta))
	must(st.UpsertEntity(ctx, entGamma))

	// In-scope edge: alpha -> beta in d-alpha (the top-hit doc).
	must(st.InsertRelationship(ctx, &store.Relationship{
		ID: "rel-in-scope", SourceID: "ent-alpha", TargetID: "ent-beta",
		Predicate: "knows", DocID: "d-alpha",
	}))
	// Out-of-scope edge: alpha -> gamma in d-delta (unrelated doc).
	must(st.InsertRelationship(ctx, &store.Relationship{
		ID: "rel-out-of-scope", SourceID: "ent-alpha", TargetID: "ent-gamma",
		Predicate: "knows", DocID: "d-delta",
	}))

	// NOTE(review): the two trailing int args look like topK=1,
	// graphDepth=1 — confirm against the LocalSearch signature.
	res, err := LocalSearch(ctx, st, emb, nil, "almost alpha", 1, 1)
	if err != nil {
		t.Fatalf("LocalSearch: %v", err)
	}

	// Expect the top chunk to belong to d-alpha.
	if len(res.Chunks) == 0 || res.Chunks[0].Chunk.DocID != "d-alpha" {
		t.Fatalf("top chunk: want doc d-alpha; got %+v", res.Chunks)
	}

	// Every returned relationship must belong to a top-hit doc.
	topHitDocs := map[string]bool{}
	for _, c := range res.Chunks {
		topHitDocs[c.Chunk.DocID] = true
	}
	for _, r := range res.Rels {
		if !topHitDocs[r.DocID] {
			t.Errorf("relationship %s leaked from unrelated doc %q (top-hit docs: %v)",
				r.ID, r.DocID, topHitDocs)
		}
		if r.ID == "rel-out-of-scope" {
			t.Errorf("scoped local search returned out-of-scope edge %s (doc=%s)", r.ID, r.DocID)
		}
	}

	// Sanity: the in-scope edge should actually be there — otherwise
	// the negative assertion above is vacuous.
	var sawInScope bool
	for _, r := range res.Rels {
		if r.ID == "rel-in-scope" {
			sawInScope = true
			break
		}
	}
	if !sawInScope {
		t.Errorf("scoped local search did not return the in-scope edge rel-in-scope; rels=%v", relIDs(res.Rels))
	}
}

// relIDs is a tiny helper so assertion failures above print readable ids
// instead of a slice of pointers.
func relIDs(rs []*store.Relationship) []string {
	out := make([]string, len(rs))
	for i, r := range rs {
		out[i] = r.ID
	}
	return out
}

diff --git a/internal/store/relationships_for_entity_in_docs_test.go b/internal/store/relationships_for_entity_in_docs_test.go
new file mode 100644
index 0000000..6f10a6b
--- /dev/null
+++ b/internal/store/relationships_for_entity_in_docs_test.go
@@ -0,0 +1,121 @@
package store

import (
	"context"
	"testing"
)

// Fixture layout:
//
//	docA: e1 -[rA1]-> e2 -[rA2]-> e3
//	docB: e1 -[rB1]-> e4 (entity e1 is shared across both docs)
//
// A depth-2 BFS from e1 scoped to docA must reach e2 and e3 via edges rA1
// and rA2, and must NOT return rB1 (from the unrelated document).
func TestRelationshipsForEntityInDocs_OnlyReturnsEdgesFromScopedDocs(t *testing.T) {
	t.Parallel()
	st := newTestStore(t)
	ctx := context.Background()

	// must fails fast on fixture-setup errors.
	must := func(err error) {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
	}

	must(st.UpsertDocument(ctx, &Document{ID: "docA", Path: "/a", Title: "A", DocType: "txt", FileHash: "hashA"}))
	must(st.UpsertDocument(ctx, &Document{ID: "docB", Path: "/b", Title: "B", DocType: "txt", FileHash: "hashB"}))

	for _, id := range []string{"e1", "e2", "e3", "e4"} {
		must(st.UpsertEntity(ctx, &Entity{ID: id, Name: id}))
	}
	must(st.InsertRelationship(ctx, &Relationship{ID: "rA1", SourceID: "e1", TargetID: "e2", Predicate: "p", DocID: "docA"}))
	must(st.InsertRelationship(ctx, &Relationship{ID: "rA2", SourceID: "e2", TargetID: "e3", Predicate: "p", DocID: "docA"}))
	must(st.InsertRelationship(ctx, &Relationship{ID: "rB1", SourceID: "e1", TargetID: "e4", Predicate: "p", DocID: "docB"}))

	// Sanity: the unscoped walk must surface the out-of-scope edge.
	// (That is precisely the leak RAN-35 is closing.)
	all, err := st.RelationshipsForEntity(ctx, "e1", 2)
	if err != nil {
		t.Fatal(err)
	}
	var sawLeakUnscoped bool
	for _, r := range all {
		if r.ID == "rB1" {
			sawLeakUnscoped = true
			break
		}
	}
	if !sawLeakUnscoped {
		t.Fatalf("fixture sanity: unscoped walk did not include rB1 — test setup is wrong")
	}

	// Scoped walk must exclude rB1.
	got, err := st.RelationshipsForEntityInDocs(ctx, "e1", 2, []string{"docA"})
	if err != nil {
		t.Fatal(err)
	}
	// ids maps relationship ID -> doc ID so the exact-count assertion
	// below can print a readable picture of what was returned.
	ids := map[string]string{}
	for _, r := range got {
		ids[r.ID] = r.DocID
		if r.DocID != "docA" {
			t.Errorf("scoped walk returned edge %s from unrelated doc %q", r.ID, r.DocID)
		}
	}
	for _, want := range []string{"rA1", "rA2"} {
		if _, ok := ids[want]; !ok {
			t.Errorf("scoped walk: missing expected in-scope edge %s", want)
		}
	}
	if _, leaked := ids["rB1"]; leaked {
		t.Errorf("scoped walk leaked out-of-scope edge rB1 from docB")
	}
	if len(got) != 2 {
		t.Errorf("scoped walk: want exactly 2 in-scope edges, got %d (%v)", len(got), ids)
	}
}

// An empty doc scope means there is nothing valid to expand into, so the
// scoped walk must return no relationships and no error.
func TestRelationshipsForEntityInDocs_EmptyDocsReturnsNil(t *testing.T) {
	t.Parallel()
	st := newTestStore(t)
	ctx := context.Background()

	got, err := st.RelationshipsForEntityInDocs(ctx, "anything", 2, nil)
	if err != nil {
		t.Fatalf("empty docIDs: want (nil, nil); got err=%v", err)
	}
	if len(got) != 0 {
		t.Fatalf("empty docIDs: want 0 relationships, got %d", len(got))
	}
}

// Depth must still bound traversal. With depth=1 we should see only the
// direct edge (rA1) out of e1, not rA2 which is one hop further out.
func TestRelationshipsForEntityInDocs_RespectsDepthLimit(t *testing.T) {
	t.Parallel()
	st := newTestStore(t)
	ctx := context.Background()

	// must fails fast on fixture-setup errors.
	must := func(err error) {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
	}

	must(st.UpsertDocument(ctx, &Document{ID: "docA", Path: "/a", Title: "A", DocType: "txt", FileHash: "hashA"}))
	for _, id := range []string{"e1", "e2", "e3"} {
		must(st.UpsertEntity(ctx, &Entity{ID: id, Name: id}))
	}
	must(st.InsertRelationship(ctx, &Relationship{ID: "rA1", SourceID: "e1", TargetID: "e2", Predicate: "p", DocID: "docA"}))
	must(st.InsertRelationship(ctx, &Relationship{ID: "rA2", SourceID: "e2", TargetID: "e3", Predicate: "p", DocID: "docA"}))

	// depth=1 must stop after the first hop: rA1 only, never rA2.
	got, err := st.RelationshipsForEntityInDocs(ctx, "e1", 1, []string{"docA"})
	if err != nil {
		t.Fatal(err)
	}
	if len(got) != 1 || got[0].ID != "rA1" {
		t.Fatalf("depth=1 from e1: want [rA1]; got %v", got)
	}
}
diff --git a/internal/store/store.go b/internal/store/store.go
index a2c56fa..a6ed37f 100644
--- a/internal/store/store.go
+++ b/internal/store/store.go
@@ -828,6 +828,104 @@ func (s *Store) RelationshipsForEntity(ctx context.Context, entityID string, dep
 	return all, nil
 }

// RelationshipsForEntityInDocs is the doc-scoped variant of
// RelationshipsForEntity. Each BFS hop only traverses relationships whose
// doc_id is in the provided set, so callers doing a scoped local search
// cannot leak edges from unrelated documents into the result set.
//
// Passing an empty docIDs slice returns no relationships — scoped search
// with no anchor documents has no valid expansion.
//
// The doc_id IN-list is chunked at 900 (below SQLite's default 999
// variable limit); combined with the frontier list, each hop may split
// across multiple queries.
+func (s *Store) RelationshipsForEntityInDocs(ctx context.Context, entityID string, depth int, docIDs []string) ([]*Relationship, error) { + if len(docIDs) == 0 { + return nil, nil + } + + visited := map[string]bool{entityID: true} + seenRel := make(map[string]struct{}) + frontier := []string{entityID} + var all []*Relationship + + const docChunkSize = 900 + const frontierChunkSize = 900 + + for d := 0; d < depth && len(frontier) > 0; d++ { + var nextFrontier []string + for fStart := 0; fStart < len(frontier); fStart += frontierChunkSize { + fEnd := fStart + frontierChunkSize + if fEnd > len(frontier) { + fEnd = len(frontier) + } + fChunk := frontier[fStart:fEnd] + fPlaceholders := strings.Repeat("?,", len(fChunk)) + fPlaceholders = fPlaceholders[:len(fPlaceholders)-1] + + for dStart := 0; dStart < len(docIDs); dStart += docChunkSize { + dEnd := dStart + docChunkSize + if dEnd > len(docIDs) { + dEnd = len(docIDs) + } + dChunk := docIDs[dStart:dEnd] + dPlaceholders := strings.Repeat("?,", len(dChunk)) + dPlaceholders = dPlaceholders[:len(dPlaceholders)-1] + + args := make([]any, 0, len(fChunk)*2+len(dChunk)) + for _, id := range fChunk { + args = append(args, id) + } + for _, id := range fChunk { + args = append(args, id) + } + for _, id := range dChunk { + args = append(args, id) + } + + q := fmt.Sprintf(`SELECT id,source_id,target_id,predicate,description,weight,doc_id + FROM relationships + WHERE (source_id IN (%s) OR target_id IN (%s)) + AND doc_id IN (%s)`, + fPlaceholders, fPlaceholders, dPlaceholders) + rows, err := s.db.QueryContext(ctx, q, args...) 
+ if err != nil { + return nil, err + } + for rows.Next() { + var r Relationship + var docID sql.NullString + if err := rows.Scan(&r.ID, &r.SourceID, &r.TargetID, &r.Predicate, &r.Description, &r.Weight, &docID); err != nil { + rows.Close() + return nil, err + } + if docID.Valid { + r.DocID = docID.String + } + if _, dup := seenRel[r.ID]; dup { + continue + } + seenRel[r.ID] = struct{}{} + all = append(all, &r) + for _, nid := range []string{r.SourceID, r.TargetID} { + if !visited[nid] { + visited[nid] = true + nextFrontier = append(nextFrontier, nid) + } + } + } + if err := rows.Err(); err != nil { + rows.Close() + return nil, err + } + rows.Close() + } + } + frontier = nextFrontier + } + return all, nil +} + func (s *Store) FindRelationships(ctx context.Context, fromID, toID, predicate string) ([]*Relationship, error) { q := `SELECT id,source_id,target_id,predicate,description,weight,doc_id FROM relationships WHERE 1=1` args := []any{}