Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions internal/search/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,10 @@ func LocalSearch(ctx context.Context, st *store.Store, emb *embedder.Embedder, i
seenEntities[es.e.ID] = true
result.Entities = append(result.Entities, es.e)

// Walk relationships
rels, err := st.RelationshipsForEntity(ctx, es.e.ID, graphDepth)
// Walk relationships scoped to the top-hit doc set so the
// graph expansion cannot leak edges from unrelated
// documents into a scoped local-search result.
rels, err := st.RelationshipsForEntityInDocs(ctx, es.e.ID, graphDepth, docIDList)
if err != nil {
continue
}
Expand Down
107 changes: 107 additions & 0 deletions internal/search/local_scope_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package search

import (
"context"
"testing"

"github.com/RandomCodeSpace/docsiq/internal/store"
)

// TestLocalSearch_GraphExpansionScopedToTopHitDocs is the RAN-35 regression
// guard. Before the fix, LocalSearch expanded through every relationship a
// seed entity touched regardless of doc_id, so a scoped query could pull
// edges from unrelated documents into the result. After the fix, the graph
// walk must stay inside the top-hit doc set.
//
// Fixture:
//
//	d-alpha: chunk "alpha" + entity "alpha" with one edge alpha -> beta
//	         (doc_id=d-alpha)
//	d-delta: chunk "delta" + entity "alpha" shares a second edge
//	         alpha -> gamma (doc_id=d-delta)
//
// A query for "almost alpha" tops out on chunk "c-alpha" (d-alpha). With
// graphDepth=1 the result must include the d-alpha edge and must NOT
// include the d-delta edge, even though the seed entity "alpha" has a
// relationship row in d-delta.
func TestLocalSearch_GraphExpansionScopedToTopHitDocs(t *testing.T) {
	st, emb, _ := seedCorpus(t)
	ctx := context.Background()

	must := func(err error) {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
	}

	// Seed entities whose vectors sit near the query so they land in the
	// top-K entity set. Entity "alpha" is shared, so the scoped and
	// unscoped edges collide on a single seed.
	for _, e := range []*store.Entity{
		{ID: "ent-alpha", Name: "alpha", Vector: []float32{1, 0, 0, 0}},
		{ID: "ent-beta", Name: "beta", Vector: []float32{0, 1, 0, 0}},
		{ID: "ent-gamma", Name: "gamma", Vector: []float32{0, 0, 1, 0}},
	} {
		must(st.UpsertEntity(ctx, e))
	}

	// One edge inside the top-hit doc (d-alpha), one in an unrelated doc
	// (d-delta); only the first may appear in a scoped result.
	for _, rel := range []*store.Relationship{
		{ID: "rel-in-scope", SourceID: "ent-alpha", TargetID: "ent-beta",
			Predicate: "knows", DocID: "d-alpha"},
		{ID: "rel-out-of-scope", SourceID: "ent-alpha", TargetID: "ent-gamma",
			Predicate: "knows", DocID: "d-delta"},
	} {
		must(st.InsertRelationship(ctx, rel))
	}

	res, err := LocalSearch(ctx, st, emb, nil, "almost alpha", 1, 1)
	if err != nil {
		t.Fatalf("LocalSearch: %v", err)
	}

	// The top chunk must come from d-alpha, or the fixture premise fails.
	if len(res.Chunks) == 0 || res.Chunks[0].Chunk.DocID != "d-alpha" {
		t.Fatalf("top chunk: want doc d-alpha; got %+v", res.Chunks)
	}

	// Collect the doc IDs of the returned chunks — the only docs the
	// graph expansion is allowed to traverse.
	topHitDocs := make(map[string]bool)
	for _, c := range res.Chunks {
		topHitDocs[c.Chunk.DocID] = true
	}

	// Single pass over the returned relationships: every edge must come
	// from a top-hit doc, the out-of-scope edge must be absent, and the
	// in-scope edge must be present (so the negative check isn't vacuous).
	sawInScope := false
	for _, r := range res.Rels {
		if !topHitDocs[r.DocID] {
			t.Errorf("relationship %s leaked from unrelated doc %q (top-hit docs: %v)",
				r.ID, r.DocID, topHitDocs)
		}
		switch r.ID {
		case "rel-out-of-scope":
			t.Errorf("scoped local search returned out-of-scope edge %s (doc=%s)", r.ID, r.DocID)
		case "rel-in-scope":
			sawInScope = true
		}
	}
	if !sawInScope {
		t.Errorf("scoped local search did not return the in-scope edge rel-in-scope; rels=%v", relIDs(res.Rels))
	}
}

// relIDs flattens a relationship slice to its ID strings so that test
// failure messages print readable identifiers instead of pointer values.
func relIDs(rs []*store.Relationship) []string {
	ids := make([]string, 0, len(rs))
	for _, r := range rs {
		ids = append(ids, r.ID)
	}
	return ids
}

121 changes: 121 additions & 0 deletions internal/store/relationships_for_entity_in_docs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package store

import (
"context"
"testing"
)

// Fixture layout:
//
//	docA: e1 -[rA1]-> e2 -[rA2]-> e3
//	docB: e1 -[rB1]-> e4 (entity e1 is shared across both docs)
//
// A depth-2 BFS from e1 scoped to docA must reach e2 and e3 via edges rA1
// and rA2, and must NOT return rB1 (from the unrelated document).
func TestRelationshipsForEntityInDocs_OnlyReturnsEdgesFromScopedDocs(t *testing.T) {
	t.Parallel()
	ctx := context.Background()
	st := newTestStore(t)

	must := func(err error) {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
	}

	// Two documents sharing entity e1, with one cross-doc edge (rB1)
	// that the scoped walk must never surface.
	for _, doc := range []*Document{
		{ID: "docA", Path: "/a", Title: "A", DocType: "txt", FileHash: "hashA"},
		{ID: "docB", Path: "/b", Title: "B", DocType: "txt", FileHash: "hashB"},
	} {
		must(st.UpsertDocument(ctx, doc))
	}
	for _, id := range []string{"e1", "e2", "e3", "e4"} {
		must(st.UpsertEntity(ctx, &Entity{ID: id, Name: id}))
	}
	for _, rel := range []*Relationship{
		{ID: "rA1", SourceID: "e1", TargetID: "e2", Predicate: "p", DocID: "docA"},
		{ID: "rA2", SourceID: "e2", TargetID: "e3", Predicate: "p", DocID: "docA"},
		{ID: "rB1", SourceID: "e1", TargetID: "e4", Predicate: "p", DocID: "docB"},
	} {
		must(st.InsertRelationship(ctx, rel))
	}

	// Sanity: the unscoped walk must surface the out-of-scope edge.
	// (That is precisely the leak RAN-35 is closing.)
	all, err := st.RelationshipsForEntity(ctx, "e1", 2)
	if err != nil {
		t.Fatal(err)
	}
	leakInUnscoped := false
	for _, r := range all {
		if r.ID == "rB1" {
			leakInUnscoped = true
			break
		}
	}
	if !leakInUnscoped {
		t.Fatalf("fixture sanity: unscoped walk did not include rB1 — test setup is wrong")
	}

	// Scoped walk must exclude rB1.
	got, err := st.RelationshipsForEntityInDocs(ctx, "e1", 2, []string{"docA"})
	if err != nil {
		t.Fatal(err)
	}
	byID := make(map[string]string, len(got))
	for _, r := range got {
		byID[r.ID] = r.DocID
		if r.DocID != "docA" {
			t.Errorf("scoped walk returned edge %s from unrelated doc %q", r.ID, r.DocID)
		}
	}
	for _, want := range []string{"rA1", "rA2"} {
		if _, ok := byID[want]; !ok {
			t.Errorf("scoped walk: missing expected in-scope edge %s", want)
		}
	}
	if _, leaked := byID["rB1"]; leaked {
		t.Errorf("scoped walk leaked out-of-scope edge rB1 from docB")
	}
	if len(got) != 2 {
		t.Errorf("scoped walk: want exactly 2 in-scope edges, got %d (%v)", len(got), byID)
	}
}

// An empty doc scope means there is nothing valid to expand into: the
// scoped walk must come back with no relationships and no error.
func TestRelationshipsForEntityInDocs_EmptyDocsReturnsNil(t *testing.T) {
	t.Parallel()
	ctx := context.Background()
	st := newTestStore(t)

	rels, err := st.RelationshipsForEntityInDocs(ctx, "anything", 2, nil)
	if err != nil {
		t.Fatalf("empty docIDs: want (nil, nil); got err=%v", err)
	}
	if len(rels) != 0 {
		t.Fatalf("empty docIDs: want 0 relationships, got %d", len(rels))
	}
}

// Depth must still bound traversal. With depth=1 we should see only the
// direct edge (rA1) out of e1, not rA2 which is one hop further out.
func TestRelationshipsForEntityInDocs_RespectsDepthLimit(t *testing.T) {
	t.Parallel()
	st := newTestStore(t)
	ctx := context.Background()

	must := func(err error) {
		t.Helper()
		if err != nil {
			t.Fatal(err)
		}
	}

	// Minimal two-hop chain in a single document: e1 -> e2 -> e3.
	must(st.UpsertDocument(ctx, &Document{ID: "docA", Path: "/a", Title: "A", DocType: "txt", FileHash: "hashA"}))
	for _, id := range []string{"e1", "e2", "e3"} {
		must(st.UpsertEntity(ctx, &Entity{ID: id, Name: id}))
	}
	must(st.InsertRelationship(ctx, &Relationship{ID: "rA1", SourceID: "e1", TargetID: "e2", Predicate: "p", DocID: "docA"}))
	must(st.InsertRelationship(ctx, &Relationship{ID: "rA2", SourceID: "e2", TargetID: "e3", Predicate: "p", DocID: "docA"}))

	got, err := st.RelationshipsForEntityInDocs(ctx, "e1", 1, []string{"docA"})
	if err != nil {
		t.Fatal(err)
	}
	if len(got) != 1 || got[0].ID != "rA1" {
		// Report relationship IDs, not %v of []*Relationship — the
		// latter prints pointer addresses and makes failures
		// undiagnosable (same reason local_scope_test.go has relIDs).
		ids := make([]string, 0, len(got))
		for _, r := range got {
			ids = append(ids, r.ID)
		}
		t.Fatalf("depth=1 from e1: want [rA1]; got %v", ids)
	}
}
98 changes: 98 additions & 0 deletions internal/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,104 @@ func (s *Store) RelationshipsForEntity(ctx context.Context, entityID string, dep
return all, nil
}

// RelationshipsForEntityInDocs is the doc-scoped variant of
// RelationshipsForEntity. Each BFS hop only traverses relationships whose
// doc_id is in the provided set, so callers doing a scoped local search
// cannot leak edges from unrelated documents into the result set.
//
// Passing an empty docIDs slice returns no relationships — scoped search
// with no anchor documents has no valid expansion.
//
// The doc_id IN-list is chunked at 900 (below SQLite's default 999
// variable limit); combined with the frontier list, each hop may split
// across multiple queries.
func (s *Store) RelationshipsForEntityInDocs(ctx context.Context, entityID string, depth int, docIDs []string) ([]*Relationship, error) {
if len(docIDs) == 0 {
return nil, nil
}

visited := map[string]bool{entityID: true}
seenRel := make(map[string]struct{})
frontier := []string{entityID}
var all []*Relationship

const docChunkSize = 900
const frontierChunkSize = 900
Comment thread
aksOps marked this conversation as resolved.

for d := 0; d < depth && len(frontier) > 0; d++ {
var nextFrontier []string
for fStart := 0; fStart < len(frontier); fStart += frontierChunkSize {
fEnd := fStart + frontierChunkSize
if fEnd > len(frontier) {
fEnd = len(frontier)
}
fChunk := frontier[fStart:fEnd]
fPlaceholders := strings.Repeat("?,", len(fChunk))
fPlaceholders = fPlaceholders[:len(fPlaceholders)-1]

for dStart := 0; dStart < len(docIDs); dStart += docChunkSize {
dEnd := dStart + docChunkSize
if dEnd > len(docIDs) {
dEnd = len(docIDs)
}
dChunk := docIDs[dStart:dEnd]
dPlaceholders := strings.Repeat("?,", len(dChunk))
dPlaceholders = dPlaceholders[:len(dPlaceholders)-1]

args := make([]any, 0, len(fChunk)*2+len(dChunk))
for _, id := range fChunk {
args = append(args, id)
}
for _, id := range fChunk {
args = append(args, id)
}
for _, id := range dChunk {
args = append(args, id)
}

q := fmt.Sprintf(`SELECT id,source_id,target_id,predicate,description,weight,doc_id
FROM relationships
WHERE (source_id IN (%s) OR target_id IN (%s))
AND doc_id IN (%s)`,
fPlaceholders, fPlaceholders, dPlaceholders)
rows, err := s.db.QueryContext(ctx, q, args...)
if err != nil {
return nil, err
}
for rows.Next() {
var r Relationship
var docID sql.NullString
if err := rows.Scan(&r.ID, &r.SourceID, &r.TargetID, &r.Predicate, &r.Description, &r.Weight, &docID); err != nil {
rows.Close()
return nil, err
}
if docID.Valid {
r.DocID = docID.String
}
if _, dup := seenRel[r.ID]; dup {
continue
}
seenRel[r.ID] = struct{}{}
all = append(all, &r)
for _, nid := range []string{r.SourceID, r.TargetID} {
if !visited[nid] {
visited[nid] = true
nextFrontier = append(nextFrontier, nid)
}
}
}
if err := rows.Err(); err != nil {
rows.Close()
return nil, err
}
rows.Close()
}
}
frontier = nextFrontier
}
return all, nil
}

func (s *Store) FindRelationships(ctx context.Context, fromID, toID, predicate string) ([]*Relationship, error) {
q := `SELECT id,source_id,target_id,predicate,description,weight,doc_id FROM relationships WHERE 1=1`
args := []any{}
Expand Down
Loading