diff --git a/Makefile b/Makefile index 25e398f..093debf 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,9 @@ VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev) COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) -LDFLAGS := -X github.com/RandomCodeSpace/docsiq/cmd.Version=$(VERSION) \ - -X github.com/RandomCodeSpace/docsiq/cmd.Commit=$(COMMIT) \ - -X github.com/RandomCodeSpace/docsiq/cmd.Date=$(DATE) +LDFLAGS := -X github.com/RandomCodeSpace/docsiq/internal/buildinfo.Version=$(VERSION) \ + -X github.com/RandomCodeSpace/docsiq/internal/buildinfo.Commit=$(COMMIT) \ + -X github.com/RandomCodeSpace/docsiq/internal/buildinfo.Date=$(DATE) ui-install: cd ui && npm install diff --git a/cmd/logformat_test.go b/cmd/logformat_test.go index 271accd..b2f9e4b 100644 --- a/cmd/logformat_test.go +++ b/cmd/logformat_test.go @@ -3,7 +3,10 @@ package cmd import ( "bytes" "encoding/json" + "io" "log/slog" + "os" + "path/filepath" "testing" ) @@ -28,3 +31,73 @@ func TestLogFormatJSON(t *testing.T) { t.Errorf("k = %v, want v", decoded["k"]) } } + +// TestBuildLogHandler_JSONStripsEmoji confirms buildLogHandler returns +// a handler chain that strips emoji from the message when format=json. +// buildLogHandler writes to os.Stderr so we redirect it through a pipe +// for the test (NOT parallel — shares global os.Stderr). 
+func TestBuildLogHandler_JSONStripsEmoji(t *testing.T) { + origStderr := os.Stderr + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("pipe: %v", err) + } + os.Stderr = w + t.Cleanup(func() { os.Stderr = origStderr }) + + h := buildLogHandler(slog.LevelInfo, "json") + slog.New(h).Info("✅ ready", "k", "v") + _ = w.Close() + + out, err := io.ReadAll(r) + if err != nil { + t.Fatalf("read: %v", err) + } + + var rec map[string]any + if err := json.Unmarshal(bytes.TrimSpace(out), &rec); err != nil { + t.Fatalf("not JSON: %v — raw=%q", err, out) + } + msg, _ := rec["msg"].(string) + if msg != "ready" { + t.Errorf("msg=%q want 'ready' (emoji stripped)", msg) + } +} + +// TestInitConfig_LogFormatFromConfigFile asserts the config file is +// consulted when neither --log-format nor DOCSIQ_LOG_FORMAT is set. +// NOT parallel — mutates env + HOME + package-level flags. +func TestInitConfig_LogFormatFromConfigFile(t *testing.T) { + origHome := os.Getenv("HOME") + origLogFormat := os.Getenv("DOCSIQ_LOG_FORMAT") + t.Cleanup(func() { + logLevel = "info" + logFormat = "" + cfgFile = "" + cfg = nil + os.Setenv("HOME", origHome) + if origLogFormat != "" { + os.Setenv("DOCSIQ_LOG_FORMAT", origLogFormat) + } + }) + + dir := t.TempDir() + os.Setenv("HOME", dir) + os.Unsetenv("DOCSIQ_LOG_FORMAT") + + yaml := filepath.Join(dir, "config.yaml") + if err := os.WriteFile(yaml, []byte("log:\n format: json\n"), 0o600); err != nil { + t.Fatal(err) + } + cfgFile = yaml + logFormat = "" + + initConfig() + + if cfg == nil { + t.Fatal("initConfig produced nil cfg") + } + if cfg.Log.Format != "json" { + t.Errorf("cfg.Log.Format=%q want json", cfg.Log.Format) + } +} diff --git a/cmd/root.go b/cmd/root.go index daebd97..1686490 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/RandomCodeSpace/docsiq/internal/config" + "github.com/RandomCodeSpace/docsiq/internal/obs" "github.com/spf13/cobra" ) @@ -41,7 +42,7 @@ func init() { } func initConfig() { - // Set up 
structured logger + // Set up structured logger. Level comes from --log-level only. var level slog.Level switch logLevel { case "debug": @@ -53,20 +54,21 @@ func initConfig() { default: level = slog.LevelInfo } - // Log format: --log-format wins, else DOCSIQ_LOG_FORMAT, else "text". + + // Format resolution order (highest wins): + // 1. --log-format flag + // 2. DOCSIQ_LOG_FORMAT env var + // 3. config file log.format + // 4. default "text" + // (3) requires config.Load() to have run; install a temporary + // handler first so config-load errors land somewhere, then + // upgrade once the final format is known. format := strings.ToLower(strings.TrimSpace(logFormat)) if format == "" { format = strings.ToLower(strings.TrimSpace(os.Getenv("DOCSIQ_LOG_FORMAT"))) } - handlerOpts := &slog.HandlerOptions{Level: level} - var handler slog.Handler - switch format { - case "json": - handler = slog.NewJSONHandler(os.Stderr, handlerOpts) - default: - handler = slog.NewTextHandler(os.Stderr, handlerOpts) - } - slog.SetDefault(slog.New(handler)) + + slog.SetDefault(slog.New(buildLogHandler(level, format))) var err error cfg, err = config.Load(cfgFile) @@ -74,11 +76,29 @@ func initConfig() { slog.Error("❌ config error", "err", err) os.Exit(1) } - if err := os.MkdirAll(cfg.DataDir, 0755); err != nil { + if err := os.MkdirAll(cfg.DataDir, 0o755); err != nil { slog.Error("❌ failed to create data directory", "path", cfg.DataDir, "err", err) os.Exit(1) } -} - + // If neither flag nor env specified format, use the value from + // the loaded config (falls back to default "text"). + if format == "" && cfg.Log.Format != "" { + format = strings.ToLower(strings.TrimSpace(cfg.Log.Format)) + slog.SetDefault(slog.New(buildLogHandler(level, format))) + } +} +// buildLogHandler assembles the slog handler chain. For "json" format +// we wrap the JSON handler in obs.NewProductionHandler to strip emoji +// prefixes from the message field (keeping them is harmless but noisy +// for log aggregators). 
"text" keeps emoji for human readability. +func buildLogHandler(level slog.Level, format string) slog.Handler { + opts := &slog.HandlerOptions{Level: level} + switch format { + case "json": + return obs.NewProductionHandler(slog.NewJSONHandler(os.Stderr, opts)) + default: + return slog.NewTextHandler(os.Stderr, opts) + } +} diff --git a/cmd/serve.go b/cmd/serve.go index d15e1fe..d936fb6 100644 --- a/cmd/serve.go +++ b/cmd/serve.go @@ -15,9 +15,11 @@ import ( "time" "github.com/RandomCodeSpace/docsiq/internal/api" + "github.com/RandomCodeSpace/docsiq/internal/buildinfo" "github.com/RandomCodeSpace/docsiq/internal/config" "github.com/RandomCodeSpace/docsiq/internal/embedder" "github.com/RandomCodeSpace/docsiq/internal/llm" + "github.com/RandomCodeSpace/docsiq/internal/obs" "github.com/RandomCodeSpace/docsiq/internal/project" "github.com/RandomCodeSpace/docsiq/internal/sqlitevec" "github.com/RandomCodeSpace/docsiq/internal/vectorindex" @@ -154,6 +156,19 @@ var serveCmd = &cobra.Command{ } pool := workq.New(workq.Config{Workers: workers, QueueDepth: depth}) + // Observability — initialise the Prometheus registry once per + // process and bind the workq stats provider so the /metrics + // scrape can read live queue depth + rejection count. + obs.Init() + obs.Workq.BindStatsProvider(func() obs.WorkqStats { + s := pool.Stats() + return obs.WorkqStats{Depth: s.Depth, Rejected: s.Rejected} + }) + { + info := buildinfo.Resolve(false) + api.SetBuildInfo(info.Version, info.Commit) + } + router := api.NewRouter(prov, emb, cfg, registry, api.WithProjectStores(stores), api.WithVectorIndexes(vecIndexes), diff --git a/cmd/version.go b/cmd/version.go index f7091aa..1a6d8ca 100644 --- a/cmd/version.go +++ b/cmd/version.go @@ -2,124 +2,22 @@ package cmd import ( "fmt" - "runtime/debug" + "github.com/RandomCodeSpace/docsiq/internal/buildinfo" "github.com/spf13/cobra" ) -// Set via -ldflags at build time (see Makefile). These act as overrides. 
-// When the binary is installed via `go install @`, go install -// cannot pass -ldflags, so these remain at their sentinel defaults and we fall -// back to runtime/debug.ReadBuildInfo() to populate version info from the VCS -// data that `go install` embeds automatically. -var ( - Version = "dev" - Commit = "unknown" - Date = "unknown" -) - -// VersionInfo holds resolved version metadata for the running binary. -type VersionInfo struct { - Version string - Commit string - Date string - Dirty string // "true", "false", or "unknown" -} - -// isSentinel reports whether an ldflags variable is empty or equal to a -// known default placeholder, meaning we should defer to ReadBuildInfo. -func isSentinel(v string) bool { - switch v { - case "", "dev", "unknown": - return true - } - return false -} - -// readBuildInfo is a package-level indirection so tests can substitute a -// stub when needed. It mirrors the signature of debug.ReadBuildInfo. -var readBuildInfo = debug.ReadBuildInfo - -// versionInfo resolves the current version metadata using the following order: -// 1. -ldflags overrides (if non-sentinel) -// 2. runtime/debug.ReadBuildInfo() (module version + VCS settings) -// 3. "unknown" for any remaining field -func versionInfo() VersionInfo { - vi := VersionInfo{ - Version: Version, - Commit: Commit, - Date: Date, - Dirty: "unknown", - } - - info, ok := readBuildInfo() - if !ok { - if isSentinel(vi.Version) { - vi.Version = "unknown" - } - if isSentinel(vi.Commit) { - vi.Commit = "unknown" - } - if isSentinel(vi.Date) { - vi.Date = "unknown" - } - return vi - } - - // Version: fall back to module version (e.g. "v0.5.0" or "(devel)"). - if isSentinel(vi.Version) { - if info.Main.Version != "" { - vi.Version = info.Main.Version - } else { - vi.Version = "unknown" - } - } - - // Walk VCS settings for commit/time/modified. 
- var vcsRev, vcsTime, vcsMod string - for _, s := range info.Settings { - switch s.Key { - case "vcs.revision": - vcsRev = s.Value - case "vcs.time": - vcsTime = s.Value - case "vcs.modified": - vcsMod = s.Value - } - } - - if isSentinel(vi.Commit) { - if vcsRev != "" { - vi.Commit = vcsRev - } else { - vi.Commit = "unknown" - } - } - if isSentinel(vi.Date) { - if vcsTime != "" { - vi.Date = vcsTime - } else { - vi.Date = "unknown" - } - } - if vcsMod != "" { - vi.Dirty = vcsMod - } - - return vi -} - var versionCmd = &cobra.Command{ Use: "version", Short: "Print the version of docsiq", Run: func(cmd *cobra.Command, args []string) { - vi := versionInfo() + info := buildinfo.Resolve(false) dirtySuffix := "" - if vi.Dirty == "true" { + if info.Dirty == "true" { dirtySuffix = " (dirty)" } fmt.Printf("docsiq %s (commit: %s, built: %s)%s\n", - vi.Version, vi.Commit, vi.Date, dirtySuffix) + info.Version, info.Commit, info.BuildDate, dirtySuffix) }, } diff --git a/cmd/version_test.go b/cmd/version_test.go index 5d12017..ae6b1c5 100644 --- a/cmd/version_test.go +++ b/cmd/version_test.go @@ -1,171 +1,37 @@ package cmd import ( - "runtime/debug" + "strings" "testing" ) -// withLdflags temporarily overrides the package-level ldflags vars. -func withLdflags(t *testing.T, ver, commit, date string) { - t.Helper() - origV, origC, origD := Version, Commit, Date - Version, Commit, Date = ver, commit, date - t.Cleanup(func() { - Version, Commit, Date = origV, origC, origD - }) -} - -// withBuildInfo swaps the readBuildInfo indirection. -func withBuildInfo(t *testing.T, fn func() (*debug.BuildInfo, bool)) { - t.Helper() - orig := readBuildInfo - readBuildInfo = fn - t.Cleanup(func() { readBuildInfo = orig }) -} - -func TestVersionInfo_LdflagsOverrideWins(t *testing.T) { - withLdflags(t, "v1.2.3", "deadbeef", "2026-01-02T03:04:05Z") - // Even with BuildInfo available, ldflags should take precedence. 
- withBuildInfo(t, func() (*debug.BuildInfo, bool) { - return &debug.BuildInfo{ - Main: debug.Module{Version: "v9.9.9"}, - Settings: []debug.BuildSetting{ - {Key: "vcs.revision", Value: "cafef00d"}, - {Key: "vcs.time", Value: "2099-12-31T00:00:00Z"}, - }, - }, true - }) - - vi := versionInfo() - if vi.Version != "v1.2.3" { - t.Errorf("Version = %q, want v1.2.3", vi.Version) - } - if vi.Commit != "deadbeef" { - t.Errorf("Commit = %q, want deadbeef", vi.Commit) - } - if vi.Date != "2026-01-02T03:04:05Z" { - t.Errorf("Date = %q, want date literal", vi.Date) - } -} - -func TestVersionInfo_EmptyLdflagsFallsThroughToBuildInfo(t *testing.T) { - withLdflags(t, "", "", "") - withBuildInfo(t, func() (*debug.BuildInfo, bool) { - return &debug.BuildInfo{ - Main: debug.Module{Version: "v0.5.0"}, - Settings: []debug.BuildSetting{ - {Key: "vcs.revision", Value: "abc123"}, - {Key: "vcs.time", Value: "2026-04-17T10:00:00Z"}, - {Key: "vcs.modified", Value: "false"}, - }, - }, true - }) - - vi := versionInfo() - if vi.Version != "v0.5.0" { - t.Errorf("Version = %q, want v0.5.0", vi.Version) - } - if vi.Commit != "abc123" { - t.Errorf("Commit = %q, want abc123", vi.Commit) - } - if vi.Date != "2026-04-17T10:00:00Z" { - t.Errorf("Date = %q, want timestamp", vi.Date) - } - if vi.Dirty != "false" { - t.Errorf("Dirty = %q, want false", vi.Dirty) - } -} - -func TestVersionInfo_DevSentinelTriggersBuildInfo(t *testing.T) { - // Default "dev"/"unknown" sentinels should be overridden by BuildInfo. 
- withLdflags(t, "dev", "unknown", "unknown") - withBuildInfo(t, func() (*debug.BuildInfo, bool) { - return &debug.BuildInfo{ - Main: debug.Module{Version: "(devel)"}, - Settings: []debug.BuildSetting{ - {Key: "vcs.revision", Value: "feedface"}, - {Key: "vcs.time", Value: "2026-02-02T02:02:02Z"}, - {Key: "vcs.modified", Value: "true"}, - }, - }, true - }) - - vi := versionInfo() - if vi.Version != "(devel)" { - t.Errorf("Version = %q, want (devel)", vi.Version) - } - if vi.Commit != "feedface" { - t.Errorf("Commit = %q, want feedface", vi.Commit) - } - if vi.Dirty != "true" { - t.Errorf("Dirty = %q, want true", vi.Dirty) - } -} - -func TestVersionInfo_LetterOnlyCommitPassedThrough(t *testing.T) { - withLdflags(t, "", "", "") - withBuildInfo(t, func() (*debug.BuildInfo, bool) { - return &debug.BuildInfo{ - Main: debug.Module{Version: "v1.0.0"}, - Settings: []debug.BuildSetting{ - {Key: "vcs.revision", Value: "ZZZZZZZZ"}, // not a hex hash - {Key: "vcs.time", Value: "2026-03-03T00:00:00Z"}, - }, - }, true - }) - - vi := versionInfo() - if vi.Commit != "ZZZZZZZZ" { - t.Errorf("Commit = %q, want ZZZZZZZZ (no validation)", vi.Commit) - } -} - -func TestVersionInfo_EmptyVcsTimeBecomesUnknown(t *testing.T) { - withLdflags(t, "", "", "") - withBuildInfo(t, func() (*debug.BuildInfo, bool) { - return &debug.BuildInfo{ - Main: debug.Module{Version: "v1.0.0"}, - Settings: []debug.BuildSetting{ - {Key: "vcs.revision", Value: "abc"}, - // vcs.time omitted - }, - }, true - }) - - vi := versionInfo() - if vi.Date != "unknown" { - t.Errorf("Date = %q, want unknown when vcs.time missing", vi.Date) - } - if vi.Dirty != "unknown" { - t.Errorf("Dirty = %q, want unknown when vcs.modified missing", vi.Dirty) - } -} - -func TestVersionInfo_NoBuildInfoAllSentinelsBecomeUnknown(t *testing.T) { - withLdflags(t, "dev", "unknown", "") - withBuildInfo(t, func() (*debug.BuildInfo, bool) { - return nil, false - }) - - vi := versionInfo() - if vi.Version != "unknown" { - t.Errorf("Version = %q, 
want unknown", vi.Version) - } - if vi.Commit != "unknown" { - t.Errorf("Commit = %q, want unknown", vi.Commit) - } - if vi.Date != "unknown" { - t.Errorf("Date = %q, want unknown", vi.Date) - } -} - -func TestVersionInfo_RealBuildInfoDuringGoTest(t *testing.T) { - // During `go test`, debug.ReadBuildInfo returns real data for the test - // binary. With empty ldflags, Version should be non-empty (typically - // "(devel)" when run from a local checkout, or a module version in CI). - withLdflags(t, "", "", "") - vi := versionInfo() - if vi.Version == "" { - t.Error("Version should be non-empty when falling through to real ReadBuildInfo") - } +// TestVersionCmd_Metadata verifies the `docsiq version` cobra command is +// registered and carries sensible documentation. Detailed version- +// resolution logic lives in internal/buildinfo and is covered there. +func TestVersionCmd_Metadata(t *testing.T) { + if versionCmd.Use != "version" { + t.Errorf("Use=%q want version", versionCmd.Use) + } + if !strings.Contains(strings.ToLower(versionCmd.Short), "version") { + t.Errorf("Short=%q does not mention version", versionCmd.Short) + } + // The Run func must be non-nil; a nil Run would panic on invocation. + if versionCmd.Run == nil { + t.Fatal("versionCmd.Run is nil") + } +} + +// TestVersionCmd_RunDoesNotPanic invokes the command in isolation to +// confirm the full buildinfo.Resolve → fmt.Printf path executes cleanly. +// Output goes to os.Stdout (Cobra's fmt.Printf path) so we don't +// capture it here — cmd-level tests intentionally stay shallow; the +// stdout shape is covered by the smoke test in the PR verification +// gate. 
+func TestVersionCmd_RunDoesNotPanic(t *testing.T) { + defer func() { + if rec := recover(); rec != nil { + t.Fatalf("versionCmd.Run panicked: %v", rec) + } + }() + versionCmd.Run(versionCmd, nil) } diff --git a/go.mod b/go.mod index c836a18..e3e825a 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/google/uuid v1.6.0 github.com/mark3labs/mcp-go v0.49.0 github.com/mattn/go-sqlite3 v1.14.42 + github.com/prometheus/client_golang v1.20.5 github.com/schollz/progressbar/v3 v3.19.0 github.com/spf13/cobra v1.10.2 github.com/spf13/viper v1.21.0 @@ -21,7 +22,9 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 // indirect github.com/andybalholm/cascadia v1.3.2 // indirect github.com/aymerick/douceur v0.2.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff v2.2.1+incompatible // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/dlclark/regexp2 v1.10.0 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect @@ -30,12 +33,17 @@ require ( github.com/gorilla/css v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/klauspost/compress v1.18.0 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 // indirect github.com/mattn/go-runewidth v0.0.19 // indirect github.com/microcosm-cc/bluemonday v1.0.26 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pkoukk/tiktoken-go v0.1.6 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.62.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/sagikazarmark/locafero v0.11.0 // indirect github.com/sourcegraph/conc 
v0.3.1-0.20240121214520-5f936abd7ae8 // indirect @@ -53,5 +61,6 @@ require ( golang.org/x/sys v0.43.0 // indirect golang.org/x/term v0.42.0 // indirect golang.org/x/text v0.36.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect nhooyr.io/websocket v1.8.7 // indirect ) diff --git a/go.sum b/go.sum index c41ddcf..d08a8db 100644 --- a/go.sum +++ b/go.sum @@ -7,8 +7,12 @@ github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsVi github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY= github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdohwgs8tY= @@ -75,6 +79,8 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug 
v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/leodido/go-urn v1.2.0 h1:hpXL4XnriNwQ/ABnpepYM/1vCLWNDfUNts8dX3xTG6Y= @@ -98,6 +104,8 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJ github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkoukk/tiktoken-go v0.1.6 h1:JF0TlJzhTbrI30wCvFuiw6FzP2+/bR+FIxUdgEAcUsw= @@ -105,6 +113,14 @@ github.com/pkoukk/tiktoken-go v0.1.6/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYde github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= 
+github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= +github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= @@ -213,11 +229,11 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.36.3 h1:82DV7MYdb8anAVi3qge1wSnMDrnKK7ebr+I0hHRN1BU= -google.golang.org/protobuf v1.36.3/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= 
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= diff --git a/internal/api/auth.go b/internal/api/auth.go index 185422f..f54c1b0 100644 --- a/internal/api/auth.go +++ b/internal/api/auth.go @@ -38,8 +38,11 @@ func bearerAuthMiddleware(apiKey string, next http.Handler) http.Handler { path := r.URL.Path - // /health is always public. - if path == "/health" { + // Observability + liveness probes are always public. Defense-in- + // depth even though /health, /healthz, /readyz, /metrics, and + // /api/version are registered on the mux directly. + switch path { + case "/health", "/healthz", "/readyz", "/metrics", "/api/version": next.ServeHTTP(w, r) return } diff --git a/internal/api/docs_integration_test.go b/internal/api/docs_integration_test.go index 86bb209..7ba4e0a 100644 --- a/internal/api/docs_integration_test.go +++ b/internal/api/docs_integration_test.go @@ -3,8 +3,11 @@ package api_test import ( + "bufio" "bytes" + "context" "encoding/json" + "errors" "fmt" "io" "mime/multipart" @@ -53,29 +56,51 @@ func uploadDoc(t *testing.T, e *itest.Env, slug, filename, content string) strin return out.JobID } -// waitUploadDone polls /api/upload/progress until the job is "done" or -// an error message appears. Returns the final status string. Bails on -// timeout so the test can skip or fail with context. +// waitUploadDone opens the /api/upload/progress SSE stream and returns +// the first terminal message ("done" or "error:..."). On timeout it +// returns "" so the caller can skip. The stream is long-lived; we +// parse "data: X\n\n" events incrementally and cancel via ctx when the +// caller deadline fires. 
func waitUploadDone(t *testing.T, e *itest.Env, jobID string, timeout time.Duration) string { t.Helper() - deadline := time.Now().Add(timeout) - for time.Now().Before(deadline) { - resp, body := e.GET(t, "/api/upload/progress?job_id="+jobID) - if resp.StatusCode != http.StatusOK { - time.Sleep(100 * time.Millisecond) - continue + + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, + e.URL("/api/upload/progress?job_id="+jobID), nil) + if err != nil { + t.Fatalf("itest: build progress req: %v", err) + } + req.Header.Set("Authorization", "Bearer "+e.APIKey) + req.Header.Set("Accept", "text/event-stream") + + client := &http.Client{} // no per-request Timeout — SSE is long-lived; ctx bounds it. + resp, err := client.Do(req) + if err != nil { + if errors.Is(err, context.DeadlineExceeded) { + return "" } - s := string(body) - if strings.Contains(s, `"done"`) || strings.Contains(s, `"status":"done"`) || strings.Contains(s, `done`) { - if strings.Contains(s, "done") && !strings.Contains(s, "error") { - return s - } + t.Fatalf("itest: open progress stream: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + b, _ := io.ReadAll(resp.Body) + t.Fatalf("itest: progress stream: status %d body=%s", resp.StatusCode, string(b)) + } + + scanner := bufio.NewScanner(resp.Body) + for scanner.Scan() { + line := scanner.Text() + if !strings.HasPrefix(line, "data: ") { + continue } - if strings.Contains(s, "error:") { - return s + msg := strings.TrimPrefix(line, "data: ") + if msg == "done" || strings.HasPrefix(msg, "error:") { + return msg } - time.Sleep(150 * time.Millisecond) } + // Scanner ended — either ctx timed out (empty return) or server closed the stream. 
return "" } diff --git a/internal/api/handlers.go b/internal/api/handlers.go index f3661c7..0cb6046 100644 --- a/internal/api/handlers.go +++ b/internal/api/handlers.go @@ -90,13 +90,6 @@ func writeError(w http.ResponseWriter, r *http.Request, status int, msg string, writeJSON(w, status, body) } -// health is a trivially-always-200 liveness probe. No store/config -// dependency so it works even if the backend is degraded, and the -// auth middleware explicitly whitelists /health. -func (h *handlers) health(w http.ResponseWriter, r *http.Request) { - writeJSON(w, 200, map[string]string{"status": "ok"}) -} - // projectsHandler is a thin read-only JSON shim around registry.List() // so the Phase-4 UI can populate its project-selector dropdown. type projectsHandler struct { diff --git a/internal/api/health.go b/internal/api/health.go new file mode 100644 index 0000000..d0a512f --- /dev/null +++ b/internal/api/health.go @@ -0,0 +1,193 @@ +package api + +import ( + "context" + "database/sql" + "errors" + "net/http" + "sync" + "time" + + "github.com/RandomCodeSpace/docsiq/internal/llm" +) + +// readyCheckTTL is how long the /readyz handler caches its aggregated +// verdict. Ten seconds is short enough to notice a real outage quickly +// and long enough to absorb a chatty Prometheus + Kubernetes probe loop +// without hammering SQLite or the LLM endpoint. +const readyCheckTTL = 10 * time.Second + +// sqliteCheckTimeout and llmCheckTimeout bound each individual probe. +// The SQLite ping is a microsecond-scale PRAGMA query; the LLM ping is +// network-bound, so we allow more headroom but still fail fast when the +// provider is wedged. +const ( + sqliteCheckTimeout = 500 * time.Millisecond + llmCheckTimeout = 2 * time.Second +) + +// healthPinger is the narrow interface readyz needs for SQLite +// reachability: a bounded Ping call that reports the first hard error. 
+type healthPinger interface { + Ping(ctx context.Context) error +} + +// llmPinger is the narrow interface readyz needs for LLM reachability. +// Implementations may issue a tiny Complete call or a model-list GET. +type llmPinger interface { + Ping(ctx context.Context) error +} + +// checkStatus holds per-component readiness. +type checkStatus struct { + Status string `json:"status"` // "ok" | "error" | "skipped" + Err string `json:"err,omitempty"` +} + +type readyzBody struct { + Status string `json:"status"` // "ready" | "not_ready" + Checks map[string]checkStatus `json:"checks"` +} + +// healthzHandler implements GET /healthz. Liveness: always 200 as long +// as the goroutine scheduler can run this function. No dependencies. +func healthzHandler() http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.Header().Set("Allow", "GET") + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + writeJSON(w, http.StatusOK, map[string]string{"status": "ok"}) + }) +} + +// readyzHandler implements GET /readyz. Readiness: aggregates a SQLite +// ping and an LLM reach check with a 10-second in-memory cache. When +// llmp is nil (provider=none), the LLM check is reported as "skipped" +// and does NOT fail readiness. +// +// Pass the default project's store adapter and an llmPinger wrapper +// around the configured provider (or nil). See router.go for wiring. +func readyzHandler(sq healthPinger, llmp llmPinger) http.Handler { + return readyzHandlerForTest(sq, llmp, readyCheckTTL) +} + +// readyzHandlerForTest is the injectable-TTL variant used only by tests. +// Production code must use readyzHandler. 
+func readyzHandlerForTest(sq healthPinger, llmp llmPinger, ttl time.Duration) http.Handler { + rc := &readyzCache{ttl: ttl} + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.Header().Set("Allow", "GET") + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + body, code := rc.check(r.Context(), sq, llmp) + writeJSON(w, code, body) + }) +} + +// readyzCache is the TTL-cached readiness result. Uses a single mutex +// because the hot path is cheap; lock contention only matters when the +// TTL expires and many requests arrive before the first unlock. +type readyzCache struct { + ttl time.Duration + + mu sync.Mutex + expiry time.Time + body readyzBody + code int +} + +func (c *readyzCache) check(ctx context.Context, sq healthPinger, llmp llmPinger) (readyzBody, int) { + c.mu.Lock() + defer c.mu.Unlock() + + if !c.expiry.IsZero() && time.Now().Before(c.expiry) { + return c.body, c.code + } + + // Decouple probe lifetime from the caller's request ctx: if the + // probing client (Kubernetes, Prometheus, curl) disconnects or its + // own deadline expires, an in-flight Ping would otherwise return + // context.Canceled / DeadlineExceeded and poison the 10-second + // cache for every subsequent caller. WithoutCancel preserves any + // values on ctx (e.g. req_id, for logging adapters inside Ping + // implementations) while detaching its cancellation. + probeCtx := context.WithoutCancel(ctx) + + body := readyzBody{ + Status: "ready", + Checks: map[string]checkStatus{}, + } + code := http.StatusOK + + // SQLite probe — mandatory. Failure fails readiness. 
+ { + sqCtx, cancel := context.WithTimeout(probeCtx, sqliteCheckTimeout) + err := sq.Ping(sqCtx) + cancel() + if err != nil { + body.Checks["sqlite"] = checkStatus{Status: "error", Err: err.Error()} + body.Status = "not_ready" + code = http.StatusServiceUnavailable + } else { + body.Checks["sqlite"] = checkStatus{Status: "ok"} + } + } + + // LLM probe — optional. Nil provider (provider=none) reports skipped. + switch { + case llmp == nil: + body.Checks["llm"] = checkStatus{Status: "skipped", Err: "provider=none"} + default: + llmCtx, cancel := context.WithTimeout(probeCtx, llmCheckTimeout) + err := llmp.Ping(llmCtx) + cancel() + if err != nil { + body.Checks["llm"] = checkStatus{Status: "error", Err: err.Error()} + body.Status = "not_ready" + code = http.StatusServiceUnavailable + } else { + body.Checks["llm"] = checkStatus{Status: "ok"} + } + } + + c.body = body + c.code = code + c.expiry = time.Now().Add(c.ttl) + return body, code +} + +// sqlDBPinger adapts a *sql.DB for the healthPinger interface. SQLite's +// driver treats PingContext as a cheap no-op when the handle is healthy. +type sqlDBPinger struct{ db *sql.DB } + +func (p sqlDBPinger) Ping(ctx context.Context) error { + if p.db == nil { + return errors.New("nil sql.DB") + } + return p.db.PingContext(ctx) +} + +// providerPinger adapts an llm.Provider for the llmPinger interface. It +// issues a tiny Complete call with a 1-token cap; providers that cannot +// produce tokens (e.g. misconfigured) fail the ping. +type providerPinger struct{ prov llm.Provider } + +func (p providerPinger) Ping(ctx context.Context) error { + if p.prov == nil { + return errors.New("nil provider") + } + // A minimal generation request — 1 token, temp 0, no JSON mode. + // Providers that stream will still return promptly when maxTokens=1. 
+ _, err := p.prov.Complete(ctx, "ping", llm.WithMaxTokens(1), llm.WithTemperature(0)) + return err +} + +// healthPingerFuncForRouter is the non-test counterpart of +// healthPingerFunc; lives in the prod file so the wiring in router.go +// does not have to depend on a test-only adapter. +type healthPingerFuncForRouter func(ctx context.Context) error + +func (f healthPingerFuncForRouter) Ping(ctx context.Context) error { return f(ctx) } diff --git a/internal/api/health_test.go b/internal/api/health_test.go new file mode 100644 index 0000000..e024a8f --- /dev/null +++ b/internal/api/health_test.go @@ -0,0 +1,235 @@ +package api + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" +) + +// healthPingerStub is a test double for whatever interface the ready +// probe accepts for "ping this SQLite handle". See health.go. +type healthPingerStub struct { + err error + hits atomic.Int32 +} + +func (p *healthPingerStub) Ping(ctx context.Context) error { + p.hits.Add(1) + return p.err +} + +// llmPingerStub is a test double for the LLM reachability probe. 
+type llmPingerStub struct { + err error + hits atomic.Int32 +} + +func (p *llmPingerStub) Ping(ctx context.Context) error { + p.hits.Add(1) + return p.err +} + +func TestHealthz_Always200(t *testing.T) { + t.Parallel() + h := healthzHandler() + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status=%d want 200", rec.Code) + } + var body map[string]string + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("body not JSON: %v", err) + } + if body["status"] != "ok" { + t.Errorf("status=%q want ok", body["status"]) + } +} + +func TestReadyz_AllChecksOKReturns200(t *testing.T) { + t.Parallel() + sq := &healthPingerStub{} + llm := &llmPingerStub{} + h := readyzHandler(sq, llm) + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status=%d want 200 body=%s", rec.Code, rec.Body.String()) + } + var body readyzBody + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("body not JSON: %v", err) + } + if body.Status != "ready" { + t.Errorf("status=%q want ready", body.Status) + } + if body.Checks["sqlite"].Status != "ok" { + t.Errorf("sqlite=%+v", body.Checks["sqlite"]) + } + if body.Checks["llm"].Status != "ok" { + t.Errorf("llm=%+v", body.Checks["llm"]) + } +} + +func TestReadyz_SQLiteDownReturns503(t *testing.T) { + t.Parallel() + sq := &healthPingerStub{err: errors.New("database is locked")} + llm := &llmPingerStub{} + h := readyzHandler(sq, llm) + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusServiceUnavailable { + t.Fatalf("status=%d want 503", rec.Code) + } + var body readyzBody + _ = json.Unmarshal(rec.Body.Bytes(), &body) + if body.Status != "not_ready" { + t.Errorf("status=%q want not_ready", 
body.Status)
+	}
+	if body.Checks["sqlite"].Status != "error" {
+		t.Errorf("sqlite status=%q want error", body.Checks["sqlite"].Status)
+	}
+	if body.Checks["sqlite"].Err == "" {
+		t.Errorf("sqlite err empty; should carry 'database is locked'")
+	}
+}
+
+func TestReadyz_NilLLMReportsSkippedAndStaysReady(t *testing.T) {
+	t.Parallel()
+	sq := &healthPingerStub{}
+	h := readyzHandler(sq, nil) // nil llm == provider:none
+
+	req := httptest.NewRequest(http.MethodGet, "/readyz", nil)
+	rec := httptest.NewRecorder()
+	h.ServeHTTP(rec, req)
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status=%d want 200", rec.Code)
+	}
+	var body readyzBody
+	_ = json.Unmarshal(rec.Body.Bytes(), &body)
+	if body.Checks["llm"].Status != "skipped" {
+		t.Errorf("llm=%+v want skipped", body.Checks["llm"])
+	}
+}
+
+func TestReadyz_CachesResultFor10s(t *testing.T) {
+	t.Parallel()
+	sq := &healthPingerStub{}
+	llm := &llmPingerStub{}
+	h := readyzHandler(sq, llm)
+
+	for i := 0; i < 5; i++ {
+		req := httptest.NewRequest(http.MethodGet, "/readyz", nil)
+		rec := httptest.NewRecorder()
+		h.ServeHTTP(rec, req)
+	}
+
+	// All 5 requests fall inside one TTL window, so together they may
+	// hit each pinger at most once: the first request populates the
+	// cache and the remaining four must be served from it.
+	if got := sq.hits.Load(); got > 1 {
+		t.Errorf("sqlite pinger called %d times in a single TTL window; want <=1", got)
+	}
+	if got := llm.hits.Load(); got > 1 {
+		t.Errorf("llm pinger called %d times in a single TTL window; want <=1", got)
+	}
+}
+
+func TestReadyz_PingerContextIsBounded(t *testing.T) {
+	t.Parallel()
+	var seenDeadline atomic.Bool
+	llm := &llmPingerStub{}
+
+	// Wrap the sq probe so it reports whether the caller bounded the context. 
+ wrapped := healthPingerFunc(func(ctx context.Context) error { + if _, ok := ctx.Deadline(); ok { + seenDeadline.Store(true) + } + return nil + }) + h := readyzHandler(wrapped, llm) + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + if !seenDeadline.Load() { + t.Errorf("readyzHandler must bound the pinger context with a deadline") + } + if rec.Code != http.StatusOK { + t.Errorf("status=%d", rec.Code) + } +} + +// healthPingerFunc is an adapter so the test above can use an inline +// closure without hand-rolling another stub type. +type healthPingerFunc func(ctx context.Context) error + +func (f healthPingerFunc) Ping(ctx context.Context) error { return f(ctx) } + +// TestReadyz_ProbeCtxDecoupledFromRequestCtx: if the probing client +// cancels the request (disconnect or its own deadline), the probe must +// still complete successfully so the cached result is not poisoned +// with context.Canceled for the whole TTL window. +func TestReadyz_ProbeCtxDecoupledFromRequestCtx(t *testing.T) { + t.Parallel() + + var seenCancel atomic.Bool + // Pinger: check whether the ctx passed in has already been canceled + // by the caller. If the probe ctx was derived from the request ctx, + // it would inherit cancellation and this flag would flip. + sq := healthPingerFunc(func(ctx context.Context) error { + if err := ctx.Err(); err != nil { + seenCancel.Store(true) + return err + } + return nil + }) + llm := &llmPingerStub{} + h := readyzHandler(sq, llm) + + // Request ctx that is already canceled. 
+ ctx, cancel := context.WithCancel(context.Background()) + cancel() + req := httptest.NewRequest(http.MethodGet, "/readyz", nil).WithContext(ctx) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + if seenCancel.Load() { + t.Errorf("probe saw caller's canceled ctx — readiness cache would be poisoned by client disconnects") + } + if rec.Code != http.StatusOK { + t.Errorf("status=%d; want 200 (probe succeeded despite canceled request ctx)", rec.Code) + } +} + +// Guardrail: test clock advance simulation ensures cached result refreshes. +func TestReadyz_RefreshesAfterTTL(t *testing.T) { + t.Parallel() + sq := &healthPingerStub{} + llm := &llmPingerStub{} + h := readyzHandlerForTest(sq, llm, 50*time.Millisecond) + + req := func() { + r := httptest.NewRequest(http.MethodGet, "/readyz", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, r) + } + + req() + req() + time.Sleep(120 * time.Millisecond) + req() + + if got := sq.hits.Load(); got != 2 { + t.Errorf("sqlite pinger hits=%d want 2 (one per TTL window)", got) + } +} diff --git a/internal/api/logging_middleware_test.go b/internal/api/logging_middleware_test.go new file mode 100644 index 0000000..a3e21a9 --- /dev/null +++ b/internal/api/logging_middleware_test.go @@ -0,0 +1,140 @@ +package api + +import ( + "bytes" + "encoding/json" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// captureLogs swaps the default slog handler for a JSON-to-buffer one +// for the duration of the test, then restores the previous default. +func captureLogs(t *testing.T) *bytes.Buffer { + t.Helper() + var buf bytes.Buffer + h := slog.NewJSONHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}) + prev := slog.Default() + slog.SetDefault(slog.New(h)) + t.Cleanup(func() { slog.SetDefault(prev) }) + return &buf +} + +func TestLoggingMiddleware_EmitsStructuredAccessLog(t *testing.T) { + // NOT parallel — mutates global slog. 
+ buf := captureLogs(t) + + h := loggingMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("hello world")) + })) + + req := httptest.NewRequest(http.MethodGet, "/api/stats", nil) + req.Header.Set("Authorization", "Bearer dev") + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + lines := strings.Split(strings.TrimSpace(buf.String()), "\n") + if len(lines) == 0 { + t.Fatal("no log lines emitted") + } + var last map[string]any + if err := json.Unmarshal([]byte(lines[len(lines)-1]), &last); err != nil { + t.Fatalf("last log line not JSON: %v — raw=%q", err, lines[len(lines)-1]) + } + + want := map[string]any{ + "msg": "http", + "method": "GET", + "path": "/api/stats", + "status": float64(200), + "auth": "bearer", + } + for k, v := range want { + if got := last[k]; got != v { + t.Errorf("log[%s]=%v want %v", k, got, v) + } + } + if _, ok := last["req_id"].(string); !ok { + t.Errorf("req_id missing or not string: %v", last["req_id"]) + } + if b, ok := last["bytes_out"].(float64); !ok || b != 11 { + t.Errorf("bytes_out=%v want 11", last["bytes_out"]) + } +} + +func TestLoggingMiddleware_PanicStillLogsAccessEntry(t *testing.T) { + // NOT parallel — mutates global slog. + buf := captureLogs(t) + + // Chain: loggingMiddleware wraps a handler that panics. Without + // recoveryMiddleware here the panic propagates — but the deferred + // access log must still have fired. 
+ h := loggingMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + panic("boom") + })) + + req := httptest.NewRequest(http.MethodGet, "/panic-path", nil) + rec := httptest.NewRecorder() + + func() { + defer func() { _ = recover() }() // swallow in test + h.ServeHTTP(rec, req) + }() + + if buf.Len() == 0 { + t.Fatal("access log not emitted through panic path") + } + if !strings.Contains(buf.String(), `"panic":"boom"`) { + t.Errorf("log should mention panic=boom; got: %s", buf.String()) + } + if !strings.Contains(buf.String(), `"level":"ERROR"`) { + t.Errorf("panic log should be ERROR level; got: %s", buf.String()) + } +} + +func TestLoggingMiddleware_ReqIDPassThrough(t *testing.T) { + t.Parallel() + h := loggingMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := RequestIDFromContext(r.Context()); got != "caller-id-abc" { + t.Errorf("ctx req_id=%q want caller-id-abc", got) + } + })) + req := httptest.NewRequest(http.MethodGet, "/", nil) + req.Header.Set("X-Request-ID", "caller-id-abc") + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + if got := rec.Header().Get("X-Request-ID"); got != "caller-id-abc" { + t.Errorf("echoed X-Request-ID=%q", got) + } +} + +func TestLoggingMiddleware_AnonCookieBearerClassification(t *testing.T) { + t.Parallel() + cases := []struct { + name string + setup func(r *http.Request) + want string + }{ + {name: "anon_no_auth", setup: func(r *http.Request) {}, want: "anon"}, + {name: "bearer_header", setup: func(r *http.Request) { + r.Header.Set("Authorization", "Bearer k") + }, want: "bearer"}, + {name: "session_cookie", setup: func(r *http.Request) { + r.AddCookie(&http.Cookie{Name: sessionCookieName, Value: "cookie-token"}) + }, want: "cookie"}, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, "/", nil) + tc.setup(req) + if got := classifyAuth(req); got != tc.want { + t.Errorf("classifyAuth=%q want 
%q", got, tc.want) + } + }) + } +} diff --git a/internal/api/metrics.go b/internal/api/metrics.go index 4e5c7f3..02c0e52 100644 --- a/internal/api/metrics.go +++ b/internal/api/metrics.go @@ -1,273 +1,47 @@ package api import ( - "context" - "fmt" - "maps" "net/http" - "sort" - "strconv" - "strings" - "sync" "github.com/RandomCodeSpace/docsiq/internal/config" + "github.com/RandomCodeSpace/docsiq/internal/obs" "github.com/RandomCodeSpace/docsiq/internal/project" + "github.com/prometheus/client_golang/prometheus/promhttp" ) -// numHistogramBuckets is the fixed number of finite upper-bound buckets -// in the request-duration histogram. Kept as a typed const so -// histogramCell.buckets can be a fixed-size array (vet rejects -// len(slice)+1 as an array size). -const numHistogramBuckets = 11 - -// histogramBuckets are the upper bounds (in seconds) used for the -// docsiq_request_duration_seconds histogram. Deliberately simple — -// Prometheus default-style powers of √10 tweaked for typical HTTP latency. -var histogramBuckets = [numHistogramBuckets]float64{ - 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, -} - -// labelKey is the composite key for a (method, path, status) counter -// cell. Using a struct rather than a stringified concat keeps the key -// comparable and avoids accidental collisions from delimiter-in-value. -type labelKey struct { - Method string - Path string - Status int -} - -type histogramCell struct { - buckets [numHistogramBuckets + 1]uint64 // +1 for +Inf - sum float64 - count uint64 -} - -type histogramKey struct { - Method string - Path string -} - -// metrics is the package-level collector. All mutations must take its -// mutex; reads under the same mutex guarantee a consistent scrape. 
-type metricsRegistry struct { - mu sync.Mutex - - // Monotonic counter: docsiq_requests_total{method,path,status} - requestsTotal map[labelKey]uint64 - - // Histogram: docsiq_request_duration_seconds{method,path} - requestDuration map[histogramKey]*histogramCell - - // Build info (set once at startup). - buildVersion string - buildCommit string -} - -func newMetricsRegistry() *metricsRegistry { - return &metricsRegistry{ - requestsTotal: map[labelKey]uint64{}, - requestDuration: map[histogramKey]*histogramCell{}, - buildVersion: "dev", - buildCommit: "unknown", - } -} - -// globalMetrics is the single collector shared by the logging middleware -// (writer) and /metrics handler (reader). -var globalMetrics = newMetricsRegistry() - -// SetBuildInfo lets cmd/ wire the binary version + commit into the -// docsiq_build_info gauge. Safe to call from init or main; zero-value -// defaults ("dev"/"unknown") are used if never called. -func SetBuildInfo(version, commit string) { - if version == "" { - version = "dev" - } - if commit == "" { - commit = "unknown" - } - globalMetrics.mu.Lock() - defer globalMetrics.mu.Unlock() - globalMetrics.buildVersion = version - globalMetrics.buildCommit = commit -} - -// recordRequest is called by loggingMiddleware after every HTTP request. -// method/path labels are NOT sanitized here — callers must pass values -// they are willing to expose on the scrape endpoint. 
-func recordRequest(method, path string, status int, durationSeconds float64) { - globalMetrics.mu.Lock() - defer globalMetrics.mu.Unlock() - - globalMetrics.requestsTotal[labelKey{method, path, status}]++ - - hk := histogramKey{method, path} - cell, ok := globalMetrics.requestDuration[hk] - if !ok { - cell = &histogramCell{} - globalMetrics.requestDuration[hk] = cell - } - cell.count++ - cell.sum += durationSeconds - placed := false - for i, ub := range histogramBuckets { - if durationSeconds <= ub { - cell.buckets[i]++ - placed = true - break - } - } - if !placed { - cell.buckets[len(histogramBuckets)]++ // +Inf - } -} - -// metricsHandler writes the Prometheus text exposition format. -// Ops endpoint — NOT gated by auth. Mounted on /metrics. +// metricsHandler returns the /metrics handler. Backed by the shared +// obs.Default registry. Public — no auth (Prometheus scrape cannot +// present a bearer token in typical configs). // -// The projects gauge + notes gauge require a registry + per-project -// store cache; those are resolved lazily via closures captured at -// NewRouter time. -// TODO(docsiq): P2-2 consider optional scrape token via cfg.Server.MetricsKey +// Retained signature (registry, stores, cfg) so NewRouter callers do +// not need to change; the args are currently unused here but kept as a +// seam for future per-project gauges. +// +// Note: obs.Init is idempotent and called on first scrape so tests +// that build a router without going through cmd/serve.go still get +// the full metric family set. func metricsHandler( - registry *project.Registry, - stores *projectStores, + _ *project.Registry, + _ *projectStores, _ *config.Config, ) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - globalMetrics.mu.Lock() - // Snapshot under the lock, release before formatting. 
- reqs := make(map[labelKey]uint64, len(globalMetrics.requestsTotal)) - maps.Copy(reqs, globalMetrics.requestsTotal) - hists := make(map[histogramKey]histogramCell, len(globalMetrics.requestDuration)) - for k, v := range globalMetrics.requestDuration { - hists[k] = *v - } - version := globalMetrics.buildVersion - commit := globalMetrics.buildCommit - globalMetrics.mu.Unlock() - - var b strings.Builder - writeBuildInfo(&b, version, commit) - writeProjectsGauge(&b, registry) - writeNotesGauge(r.Context(), &b, registry, stores) - writeRequestsTotal(&b, reqs) - writeRequestDuration(&b, hists) - - w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte(b.String())) + obs.Init() + return promhttp.HandlerFor(obs.Default, promhttp.HandlerOpts{ + EnableOpenMetrics: false, }) } -func writeBuildInfo(b *strings.Builder, version, commit string) { - b.WriteString("# HELP docsiq_build_info Build metadata (always 1)\n") - b.WriteString("# TYPE docsiq_build_info gauge\n") - fmt.Fprintf(b, "docsiq_build_info{version=%q,commit=%q} 1\n", - version, commit) -} - -func writeProjectsGauge(b *strings.Builder, registry *project.Registry) { - b.WriteString("# HELP docsiq_projects_total Number of registered projects\n") - b.WriteString("# TYPE docsiq_projects_total gauge\n") - count := 0 - if registry != nil { - if list, err := registry.List(); err == nil { - count = len(list) - } - } - fmt.Fprintf(b, "docsiq_projects_total %d\n", count) -} - -func writeNotesGauge( - ctx context.Context, - b *strings.Builder, - registry *project.Registry, - stores *projectStores, -) { - b.WriteString("# HELP docsiq_notes_total Number of notes indexed per project\n") - b.WriteString("# TYPE docsiq_notes_total gauge\n") - if registry == nil || stores == nil { - return - } - list, err := registry.List() - if err != nil { - return - } - // Sort for stable scrape output (makes diffs + tests deterministic). 
- sort.Slice(list, func(i, j int) bool { return list[i].Slug < list[j].Slug }) - for _, p := range list { - st, err := stores.Get(p.Slug) - if err != nil { - continue - } - n, err := st.CountNotes(ctx) - if err != nil { - continue - } - fmt.Fprintf(b, "docsiq_notes_total{project=%q} %d\n", p.Slug, n) - } -} - -func writeRequestsTotal(b *strings.Builder, reqs map[labelKey]uint64) { - b.WriteString("# HELP docsiq_requests_total Total HTTP requests processed\n") - b.WriteString("# TYPE docsiq_requests_total counter\n") - - keys := make([]labelKey, 0, len(reqs)) - for k := range reqs { - keys = append(keys, k) - } - sort.Slice(keys, func(i, j int) bool { - if keys[i].Method != keys[j].Method { - return keys[i].Method < keys[j].Method - } - if keys[i].Path != keys[j].Path { - return keys[i].Path < keys[j].Path - } - return keys[i].Status < keys[j].Status - }) - for _, k := range keys { - fmt.Fprintf(b, "docsiq_requests_total{method=%q,path=%q,status=%q} %d\n", - k.Method, k.Path, strconv.Itoa(k.Status), reqs[k]) - } -} - -func writeRequestDuration(b *strings.Builder, hists map[histogramKey]histogramCell) { - b.WriteString("# HELP docsiq_request_duration_seconds HTTP request latency distribution\n") - b.WriteString("# TYPE docsiq_request_duration_seconds histogram\n") - - keys := make([]histogramKey, 0, len(hists)) - for k := range hists { - keys = append(keys, k) +// SetBuildInfo publishes binary version + commit to the +// docsiq_build_info gauge. Kept with its pre-existing signature so +// cmd/serve.go callers do not have to change. Safe to call before the +// first scrape — obs.Init self-initialises. +func SetBuildInfo(version, commit string) { + if version == "" { + version = "dev" } - sort.Slice(keys, func(i, j int) bool { - if keys[i].Method != keys[j].Method { - return keys[i].Method < keys[j].Method - } - return keys[i].Path < keys[j].Path - }) - - for _, k := range keys { - cell := hists[k] - // Cumulative bucket counts — Prometheus requires le is cumulative. 
- var cum uint64 - for i, ub := range histogramBuckets { - cum += cell.buckets[i] - fmt.Fprintf(b, "docsiq_request_duration_seconds_bucket{method=%q,path=%q,le=%q} %d\n", - k.Method, k.Path, formatFloat(ub), cum) - } - cum += cell.buckets[len(histogramBuckets)] - fmt.Fprintf(b, "docsiq_request_duration_seconds_bucket{method=%q,path=%q,le=\"+Inf\"} %d\n", - k.Method, k.Path, cum) - fmt.Fprintf(b, "docsiq_request_duration_seconds_sum{method=%q,path=%q} %s\n", - k.Method, k.Path, formatFloat(cell.sum)) - fmt.Fprintf(b, "docsiq_request_duration_seconds_count{method=%q,path=%q} %d\n", - k.Method, k.Path, cell.count) + if commit == "" { + commit = "unknown" } -} - -// formatFloat renders a float in a Prometheus-friendly form: integer -// values drop the decimal, fractionals use minimum-precision "%g". -func formatFloat(f float64) string { - return strconv.FormatFloat(f, 'g', -1, 64) + obs.Init() + obs.Build.Set(version, commit) } diff --git a/internal/api/metrics_integration_test.go b/internal/api/metrics_integration_test.go index 136924e..b9c4146 100644 --- a/internal/api/metrics_integration_test.go +++ b/internal/api/metrics_integration_test.go @@ -35,7 +35,7 @@ func TestMetrics_EndpointReturns200PublicNoAuth(t *testing.T) { func TestMetrics_IsPrometheusText(t *testing.T) { e := itest.New(t) // Warm up at least one counter so multiple metric lines exist. - resp, _ := e.GET(t, "/health") + resp, _ := e.GET(t, "/healthz") resp.Body.Close() req, _ := http.NewRequest(http.MethodGet, e.URL("/metrics"), nil) @@ -46,19 +46,19 @@ func TestMetrics_IsPrometheusText(t *testing.T) { re := regexp.MustCompile(`(?m)^docsiq_\w+(?:\{[^}]*\})?\s+\S`) matches := re.FindAllString(string(body), -1) if len(matches) < 3 { - t.Fatalf("expected ≥3 docsiq_* metric lines, got %d. body=%s", len(matches), string(body)) + t.Fatalf("expected >=3 docsiq_* metric lines, got %d. 
body=%s", len(matches), string(body)) } } -// TestMetrics_RequestsCounterIncrements fires N /health requests then -// scrapes /metrics and asserts the docsiq_requests_total counter for -// /health saw at least N increments. -func TestMetrics_RequestsCounterIncrements(t *testing.T) { +// TestMetrics_HTTPRequestsCounterIncrements fires N /healthz requests +// then scrapes /metrics and asserts the docsiq_http_requests_total +// counter for the matching route saw at least N increments. +func TestMetrics_HTTPRequestsCounterIncrements(t *testing.T) { e := itest.New(t) const n = 5 for i := 0; i < n; i++ { - req, _ := http.NewRequest(http.MethodGet, e.URL("/health"), nil) + req, _ := http.NewRequest(http.MethodGet, e.URL("/healthz"), nil) resp := e.Do(t, req) io.Copy(io.Discard, resp.Body) resp.Body.Close() @@ -69,14 +69,11 @@ func TestMetrics_RequestsCounterIncrements(t *testing.T) { body, _ := io.ReadAll(resp.Body) resp.Body.Close() - // Find every docsiq_requests_total line mentioning /health and sum - // the trailing integer values — multiple rows differ by status - // label but all count toward /health traffic. - // Format: docsiq_requests_total{method="GET",path="/health",status="200"} 7 - re := regexp.MustCompile(`(?m)^docsiq_requests_total\{[^}]*path="/health"[^}]*\}\s+(\d+)`) + // Format: docsiq_http_requests_total{method="GET",route="GET /healthz",status="2xx"} 7 + re := regexp.MustCompile(`(?m)^docsiq_http_requests_total\{[^}]*route="GET /healthz"[^}]*\}\s+(\d+)`) matches := re.FindAllStringSubmatch(string(body), -1) if len(matches) == 0 { - t.Fatalf("no docsiq_requests_total row for /health. body=\n%s", string(body)) + t.Fatalf("no docsiq_http_requests_total row for /healthz. body=\n%s", string(body)) } total := 0 for _, m := range matches { @@ -87,7 +84,7 @@ func TestMetrics_RequestsCounterIncrements(t *testing.T) { total += v } if total < n { - t.Fatalf("/health counter %d < %d fired requests. 
matches=%v body_head=%s", + t.Fatalf("/healthz counter %d < %d fired requests. matches=%v body_head=%s", total, n, matches, firstLines(string(body), 20)) } } diff --git a/internal/api/metrics_test.go b/internal/api/metrics_test.go index 341a1b7..d30d9a1 100644 --- a/internal/api/metrics_test.go +++ b/internal/api/metrics_test.go @@ -4,159 +4,87 @@ import ( "net/http" "net/http/httptest" "strings" - "sync" "testing" "github.com/RandomCodeSpace/docsiq/internal/config" "github.com/RandomCodeSpace/docsiq/internal/store" ) -// metricsTestLock serializes tests that mutate the package-level -// globalMetrics collector so parallel runs don't race on counts. -var metricsTestLock sync.Mutex - -// resetMetrics wipes the global collector so a test starts from a -// deterministic state. Must hold metricsTestLock across the whole test -// body that inspects counters. -func resetMetrics() { - globalMetrics.mu.Lock() - defer globalMetrics.mu.Unlock() - globalMetrics.requestsTotal = map[labelKey]uint64{} - globalMetrics.requestDuration = map[histogramKey]*histogramCell{} - globalMetrics.buildVersion = "dev" - globalMetrics.buildCommit = "unknown" -} - +// newMetricsRouter builds a minimal router so the /metrics endpoint can +// be reached via the full middleware chain (including the scrape +// bypasses in bearerAuthMiddleware). 
func newMetricsRouter(t *testing.T) http.Handler { t.Helper() dir := t.TempDir() - st, err := store.OpenForProject(dir, "testproj") + st, err := store.OpenForProject(dir, "_default") if err != nil { - t.Fatalf("store.Open: %v", err) + t.Fatalf("store.OpenForProject: %v", err) } t.Cleanup(func() { _ = st.Close() }) cfg := &config.Config{} + cfg.Server.Host = "127.0.0.1" + cfg.Server.Port = 0 cfg.DataDir = dir return NewRouter(nil, nil, cfg, nil, WithProjectStores(testSingleStore(dir, st, "_default", "testproj"))) } -func TestMetricsEndpoint(t *testing.T) { - t.Run("exposes_expected_families", func(t *testing.T) { - metricsTestLock.Lock() - defer metricsTestLock.Unlock() - resetMetrics() - SetBuildInfo("v0.5.0", "abcdef1") - - h := newMetricsRouter(t) - - // Hit a couple of real endpoints so counters/histograms populate. - for _, path := range []string{"/health", "/api/nonexistent", "/health"} { - req := httptest.NewRequest(http.MethodGet, path, nil) - rec := httptest.NewRecorder() - h.ServeHTTP(rec, req) - } - - req := httptest.NewRequest(http.MethodGet, "/metrics", nil) - rec := httptest.NewRecorder() - h.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Fatalf("status = %d, want 200", rec.Code) - } - ct := rec.Header().Get("Content-Type") - if !strings.HasPrefix(ct, "text/plain") { - t.Errorf("Content-Type = %q, want text/plain*", ct) - } - body := rec.Body.String() - - // HELP/TYPE lines and each expected family must be present. 
- wants := []string{ - "# HELP docsiq_build_info", - "# TYPE docsiq_build_info gauge", - `docsiq_build_info{version="v0.5.0",commit="abcdef1"} 1`, - "# HELP docsiq_projects_total", - "# TYPE docsiq_projects_total gauge", - "docsiq_projects_total ", - "# HELP docsiq_notes_total", - "# HELP docsiq_requests_total", - "# TYPE docsiq_requests_total counter", - `docsiq_requests_total{method="GET",path="/health",status="200"} 2`, - "# HELP docsiq_request_duration_seconds", - "# TYPE docsiq_request_duration_seconds histogram", - `docsiq_request_duration_seconds_bucket{method="GET",path="/health",le="0.005"}`, - `docsiq_request_duration_seconds_bucket{method="GET",path="/health",le="+Inf"}`, - `docsiq_request_duration_seconds_sum{method="GET",path="/health"}`, - `docsiq_request_duration_seconds_count{method="GET",path="/health"} 2`, - } - for _, w := range wants { - if !strings.Contains(body, w) { - t.Errorf("missing %q in scrape body\n---\n%s\n---", w, body) - } - } - }) +func TestMetrics_EndpointReturns200(t *testing.T) { + h := newMetricsRouter(t) + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status=%d want 200", rec.Code) + } + ct := rec.Header().Get("Content-Type") + if !strings.HasPrefix(ct, "text/plain") { + t.Errorf("Content-Type=%q want text/plain*", ct) + } +} - t.Run("format_parses_as_prometheus_text", func(t *testing.T) { - metricsTestLock.Lock() - defer metricsTestLock.Unlock() - resetMetrics() +func TestMetrics_HTTPRequestsCounterIncrements(t *testing.T) { + h := newMetricsRouter(t) - h := newMetricsRouter(t) - req := httptest.NewRequest(http.MethodGet, "/health", nil) + // Warm up by hitting /healthz a few times so the HTTP counter + // picks up samples for the GET /healthz route pattern. 
+ const warmups = 3 + for i := 0; i < warmups; i++ { + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) rec := httptest.NewRecorder() h.ServeHTTP(rec, req) + } - req = httptest.NewRequest(http.MethodGet, "/metrics", nil) - rec = httptest.NewRecorder() - h.ServeHTTP(rec, req) + // Scrape. + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + body := rec.Body.String() - // Each non-comment, non-empty line must match the Prometheus - // exposition grammar: `{labels} ` or ` `. - // This is a structural sanity check, not a full parser. - for line := range strings.SplitSeq(rec.Body.String(), "\n") { - if line == "" || strings.HasPrefix(line, "#") { - continue - } - // Every sample line must contain at least one space between - // the series identifier and the value. - if !strings.Contains(line, " ") { - t.Errorf("invalid sample line: %q", line) - } - // Braces must balance (open count equals close count). - if strings.Count(line, "{") != strings.Count(line, "}") { - t.Errorf("unbalanced braces on line: %q", line) - } - } - }) + if !strings.Contains(body, "docsiq_http_requests_total") { + t.Fatalf("body missing docsiq_http_requests_total; body=\n%s", body) + } + if !strings.Contains(body, "docsiq_http_request_duration_seconds") { + t.Errorf("body missing docsiq_http_request_duration_seconds") + } +} - t.Run("histogram_buckets_are_cumulative", func(t *testing.T) { - metricsTestLock.Lock() - defer metricsTestLock.Unlock() - resetMetrics() +func TestMetrics_BuildInfoReflectsSetBuildInfo(t *testing.T) { + SetBuildInfo("v9.9.9", "abc1234") - // Record three synthetic requests at known durations. 
- recordRequest("GET", "/synthetic", 200, 0.001) // <= 0.005 - recordRequest("GET", "/synthetic", 200, 0.5) // <= 0.5 - recordRequest("GET", "/synthetic", 200, 10.0) // <= 10 + h := newMetricsRouter(t) + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + body := rec.Body.String() - h := newMetricsRouter(t) - req := httptest.NewRequest(http.MethodGet, "/metrics", nil) - rec := httptest.NewRecorder() - h.ServeHTTP(rec, req) - - body := rec.Body.String() - // Cumulative: le=0.005 should be >= 1, le=0.5 should be >= 2, - // le=+Inf should be 3. - mustContain(t, body, `docsiq_request_duration_seconds_bucket{method="GET",path="/synthetic",le="0.005"} 1`) - mustContain(t, body, `docsiq_request_duration_seconds_bucket{method="GET",path="/synthetic",le="0.5"} 2`) - mustContain(t, body, `docsiq_request_duration_seconds_bucket{method="GET",path="/synthetic",le="+Inf"} 3`) - mustContain(t, body, `docsiq_request_duration_seconds_count{method="GET",path="/synthetic"} 3`) - }) -} - -func mustContain(t *testing.T, body, needle string) { - t.Helper() - if !strings.Contains(body, needle) { - t.Errorf("missing %q in body", needle) + if !strings.Contains(body, "docsiq_build_info") { + t.Fatalf("missing docsiq_build_info; body=\n%s", body) + } + if !strings.Contains(body, `version="v9.9.9"`) { + t.Errorf("expected version=v9.9.9 label in %q", body) + } + if !strings.Contains(body, `commit="abc1234"`) { + t.Errorf("expected commit=abc1234 label in %q", body) } } diff --git a/internal/api/request_timeout.go b/internal/api/request_timeout.go index 75cf5f1..448d976 100644 --- a/internal/api/request_timeout.go +++ b/internal/api/request_timeout.go @@ -28,6 +28,25 @@ func isUploadRoute(r *http.Request) bool { return false } +// isStreamingRoute reports whether r serves an SSE / long-poll stream +// that must not be wrapped by http.TimeoutHandler. 
TimeoutHandler +// buffers the response and does not propagate http.Flusher, so any +// stream wrapped by it stalls until the handler returns — at which +// point the client has already timed out reading the body. +// +// Streaming routes rely on ctx cancellation (client disconnect or +// server shutdown) for teardown rather than a per-request wall clock. +func isStreamingRoute(r *http.Request) bool { + if r.Method != http.MethodGet { + return false + } + switch r.URL.Path { + case "/api/upload/progress", "/mcp": + return true + } + return false +} + // requestTimeoutMiddleware wraps inner in http.TimeoutHandler with // cfg.Server.RequestTimeout as the default bound, bumped to // cfg.Server.UploadTimeout for upload routes. @@ -55,6 +74,10 @@ func requestTimeoutMiddleware(cfg *config.Config) func(http.Handler) http.Handle } return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if isStreamingRoute(r) { + inner.ServeHTTP(w, r) + return + } if isUploadRoute(r) { uploadTO.ServeHTTP(w, r) return diff --git a/internal/api/router.go b/internal/api/router.go index c8b64cd..d3efad8 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -15,6 +15,7 @@ import ( "github.com/RandomCodeSpace/docsiq/internal/embedder" "github.com/RandomCodeSpace/docsiq/internal/llm" "github.com/RandomCodeSpace/docsiq/internal/mcp" + "github.com/RandomCodeSpace/docsiq/internal/obs" "github.com/RandomCodeSpace/docsiq/internal/project" "github.com/RandomCodeSpace/docsiq/internal/workq" "github.com/RandomCodeSpace/docsiq/ui" @@ -86,15 +87,58 @@ func NewRouter(prov llm.Provider, emb *embedder.Embedder, cfg *config.Config, re mux := http.NewServeMux() - // Public liveness probe — registered on the mux itself. The auth - // middleware also explicitly bypasses /health as defense-in-depth. - mux.HandleFunc("GET /health", h.health) + // Public liveness + readiness probes. 
/healthz is dependency-free + // (process-is-running); /readyz aggregates a SQLite ping + LLM reach + // check with a 10s in-memory cache. Both are registered on the mux + // and also explicitly bypassed by bearerAuthMiddleware. + mux.Handle("GET /healthz", healthzHandler()) + { + // Default-project store is the representative SQLite shard — a + // failure here means the whole server is hosed. Resolve lazily + // at handler-build time so tests that pass nil stores still work. + defaultSlug := cfg.DefaultProject + if defaultSlug == "" { + defaultSlug = "_default" + } + // Lazy SQLite pinger: resolve the default store at probe time, not + // at router-build time. This lets /readyz flip green once the + // default store becomes available without a restart, and — more + // importantly — surfaces a genuine Open failure (permissions, + // corrupt DB, disk full) as 503 instead of masking it with a + // hard-coded success. + sq := healthPingerFuncForRouter(func(ctx context.Context) error { + if stores == nil { + return fmt.Errorf("project stores not configured") + } + st, err := stores.Get(defaultSlug) + if err != nil { + return fmt.Errorf("open default store %q: %w", defaultSlug, err) + } + if st == nil { + return fmt.Errorf("nil default store %q", defaultSlug) + } + return st.DB().PingContext(ctx) + }) + var llmp llmPinger + if prov != nil { + llmp = providerPinger{prov: prov} + } + mux.Handle("GET /readyz", readyzHandler(sq, llmp)) + } + + // Back-compat alias: GET /health was the pre-Block-4 probe. Clients + // that haven't migrated to /healthz still get a 200. + mux.Handle("GET /health", healthzHandler()) // Prometheus scrape endpoint — public, NOT gated by auth or project // middleware (auth/project explicitly bypass /metrics below). // TODO(docsiq): P2-2 consider optional scrape token via cfg.Server.MetricsKey mux.Handle("GET /metrics", metricsHandler(registry, stores, cfg)) + // Version metadata — public, no auth. 
Used for operator diagnostics + // and CI tooling ("what's running in prod?"). No secrets exposed. + mux.Handle("GET /api/version", versionHandler()) + // MCP Streamable HTTP transport (POST /mcp, GET /mcp for SSE stream). // When prov is nil (provider=none) we omit the MCP server entirely and // return 503 on /mcp — the notes/graph/tree tools inside the MCP server @@ -249,13 +293,15 @@ func recoveryMiddleware(next http.Handler) http.Handler { }) } -// loggingMiddleware logs method, path, status code, and duration for every -// request, assigns a request ID (X-Request-ID passthrough or new hex), and -// feeds the Prometheus collector. +// loggingMiddleware assigns a per-request ID, records Prometheus +// metrics, and emits one structured "http" log line per request. The +// log emission is deferred so that a panic escaping recoveryMiddleware +// (e.g. a panic in securityHeadersMiddleware) is still observable. func loggingMiddleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // Request ID: header pass-through, otherwise generate fresh 16-hex - // (8 random bytes). Put on ctx + echo back as response header. + // Request ID: header pass-through, otherwise generate fresh + // 16-hex (8 random bytes). Put on ctx + echo back as response + // header. rid := strings.TrimSpace(r.Header.Get("X-Request-ID")) if rid == "" { rid = newRequestID() @@ -264,42 +310,112 @@ func loggingMiddleware(next http.Handler) http.Handler { r = r.WithContext(ctx) w.Header().Set("X-Request-ID", rid) - start := time.Now() rw := &responseWriter{ResponseWriter: w, status: http.StatusOK} - next.ServeHTTP(rw, r) - duration := time.Since(start) + start := time.Now() - // /metrics itself is noisy and self-referential — skip recording it - // as an observed request so a tight Prometheus scrape loop doesn't - // dominate the time series. 
- if r.URL.Path != "/metrics" { - recordRequest(r.Method, r.URL.Path, rw.status, duration.Seconds()) - } + // Defer the access log + metric emission so a panic still + // produces an observation. recoveryMiddleware catches the panic + // and writes a 500; we then see status=500 in the log. If a + // panic escapes recoveryMiddleware, Go still unwinds through + // our deferred func before the goroutine dies, so the log is + // emitted with whatever status had been set. + defer func() { + duration := time.Since(start) + recErr := recover() + + // /metrics is self-referential — exclude to avoid scraping + // feedback loops. + if r.URL.Path != "/metrics" && obs.HTTP != nil { + route := r.Pattern + if route == "" { + route = "unknown" + } + obs.HTTP.Observe(route, r.Method, rw.status, duration) + } - level := slog.LevelInfo - if rw.status >= 500 { - level = slog.LevelError - } else if rw.status >= 400 { - level = slog.LevelWarn - } + level := slog.LevelInfo + if rw.status >= 500 || recErr != nil { + level = slog.LevelError + } else if rw.status >= 400 { + level = slog.LevelWarn + } - slog.Log(r.Context(), level, "http", - "method", r.Method, - "path", r.URL.Path, - "status", rw.status, - "duration_ms", duration.Milliseconds(), - "request_id", rid, - ) + attrs := []any{ + "req_id", rid, + "method", r.Method, + "path", r.URL.Path, + "route", r.Pattern, + "status", rw.status, + "duration_ms", duration.Milliseconds(), + "bytes_out", rw.bytes, + "auth", classifyAuth(r), + } + if project := ProjectFromContext(r.Context()); project != "" { + attrs = append(attrs, "project", project) + } + if recErr != nil { + attrs = append(attrs, "panic", recErr) + } + + slog.Log(r.Context(), level, "http", attrs...) + + if recErr != nil { + // Re-raise: let upstream recovery middleware (or the + // std library) handle the actual HTTP error response. + panic(recErr) + } + }() + + next.ServeHTTP(rw, r) }) } -// responseWriter wraps http.ResponseWriter to capture the status code. 
+// classifyAuth reports a coarse auth-method label for the access log. +// docsiq uses a single shared API key (no per-user identity), so we +// emit the channel the client used rather than a user_id. +func classifyAuth(r *http.Request) string { + if strings.HasPrefix(strings.TrimSpace(r.Header.Get("Authorization")), "Bearer ") { + return "bearer" + } + if c, err := r.Cookie(sessionCookieName); err == nil && c.Value != "" { + return "cookie" + } + return "anon" +} + +// responseWriter wraps http.ResponseWriter to capture status code and +// bytes written. Both are read by loggingMiddleware for the access log +// and by Prometheus. bytes is tracked via Write; implicit 200-only +// writes go through WriteHeader, so we default status to 200. type responseWriter struct { http.ResponseWriter - status int + status int + bytes int64 + wroteHeader bool } func (rw *responseWriter) WriteHeader(code int) { rw.status = code + rw.wroteHeader = true rw.ResponseWriter.WriteHeader(code) } + +func (rw *responseWriter) Write(p []byte) (int, error) { + if !rw.wroteHeader { + // Implicit 200 per net/http contract. + rw.wroteHeader = true + } + n, err := rw.ResponseWriter.Write(p) + rw.bytes += int64(n) + return n, err +} + +// Flush passes through to the underlying writer when it supports it — +// required for SSE and streaming responses. Standard http.ResponseWriter +// does NOT have Flush in its interface, so the type assertion is the +// correct idiom. 
+func (rw *responseWriter) Flush() { + if f, ok := rw.ResponseWriter.(http.Flusher); ok { + f.Flush() + } +} diff --git a/internal/api/router_no_llm_test.go b/internal/api/router_no_llm_test.go index 0c2e312..7353f06 100644 --- a/internal/api/router_no_llm_test.go +++ b/internal/api/router_no_llm_test.go @@ -154,4 +154,24 @@ func TestRouterNoLLM(t *testing.T) { t.Errorf("GET /api/projects status = %d, want 200", rec.Code) } }) + + t.Run("version_returns_200_no_auth", func(t *testing.T) { + h, _ := newNoLLMRouter(t) + req := httptest.NewRequest(http.MethodGet, "/api/version", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("GET /api/version status = %d, want 200; body=%s", + rec.Code, rec.Body.String()) + } + var body map[string]any + if err := json.NewDecoder(rec.Body).Decode(&body); err != nil { + t.Fatalf("GET /api/version body not JSON: %v", err) + } + for _, k := range []string{"version", "commit", "build_date", "go_version"} { + if _, ok := body[k]; !ok { + t.Errorf("GET /api/version missing key %q", k) + } + } + }) } diff --git a/internal/api/timeout_test.go b/internal/api/timeout_test.go index 3b2f89c..f86cf60 100644 --- a/internal/api/timeout_test.go +++ b/internal/api/timeout_test.go @@ -116,3 +116,71 @@ func TestIsUploadRoute_Classification(t *testing.T) { }) } } + +// TestIsStreamingRoute_Classification: SSE / long-poll routes must +// bypass TimeoutHandler so http.Flusher propagates to the handler. 
+func TestIsStreamingRoute_Classification(t *testing.T) { + t.Parallel() + cases := []struct { + method, path string + want bool + }{ + {http.MethodGet, "/api/upload/progress", true}, + {http.MethodGet, "/mcp", true}, + {http.MethodPost, "/mcp", false}, // POST /mcp is a short JSON-RPC call + {http.MethodPost, "/api/upload/progress", false}, + {http.MethodGet, "/api/stats", false}, + {http.MethodGet, "/api/upload", false}, + } + for _, c := range cases { + c := c + t.Run(c.method+" "+c.path, func(t *testing.T) { + req := httptest.NewRequest(c.method, c.path, nil) + got := isStreamingRoute(req) + if got != c.want { + t.Fatalf("isStreamingRoute(%s %s) = %v; want %v", c.method, c.path, got, c.want) + } + }) + } +} + +// TestRequestTimeoutMiddleware_StreamingRouteBypassesTimeout: an SSE +// handler that streams with http.Flusher must not be wrapped by +// TimeoutHandler (which buffers and drops Flusher). Without the +// bypass, flusher.Flush() is a no-op and the client stalls. +func TestRequestTimeoutMiddleware_StreamingRouteBypassesTimeout(t *testing.T) { + t.Parallel() + cfg := &config.Config{} + cfg.Server.RequestTimeout = 50 * time.Millisecond + cfg.Server.UploadTimeout = 1 * time.Second + + streamed := make(chan struct{}, 1) + sse := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + f, ok := w.(http.Flusher) + if !ok { + http.Error(w, "no flusher", http.StatusInternalServerError) + return + } + w.Header().Set("Content-Type", "text/event-stream") + _, _ = w.Write([]byte("data: hello\n\n")) + f.Flush() + streamed <- struct{}{} + }) + handler := requestTimeoutMiddleware(cfg)(sse) + + req := httptest.NewRequest(http.MethodGet, "/api/upload/progress", nil) + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + + select { + case <-streamed: + default: + t.Fatalf("handler did not observe Flusher — SSE would stall behind TimeoutHandler") + } + if rec.Code != http.StatusOK { + t.Fatalf("status = %d; want 200", rec.Code) + } + if 
!strings.Contains(rec.Body.String(), "data: hello") { + t.Fatalf("body = %q; want SSE event", rec.Body.String()) + } +} diff --git a/internal/api/version.go b/internal/api/version.go new file mode 100644 index 0000000..bb11afd --- /dev/null +++ b/internal/api/version.go @@ -0,0 +1,23 @@ +package api + +import ( + "net/http" + + "github.com/RandomCodeSpace/docsiq/internal/buildinfo" +) + +// versionHandler serves GET /api/version. Returns buildinfo.Info as +// JSON, including the direct-dependency map. Public endpoint — no +// secrets are exposed; commit hash and Go version are considered +// non-sensitive for a self-hosted MCP server. +func versionHandler() http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + w.Header().Set("Allow", "GET") + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + info := buildinfo.Resolve(true) + writeJSON(w, http.StatusOK, info) + }) +} diff --git a/internal/api/version_test.go b/internal/api/version_test.go new file mode 100644 index 0000000..d9baef7 --- /dev/null +++ b/internal/api/version_test.go @@ -0,0 +1,59 @@ +package api + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestVersionHandler_ReturnsJSON(t *testing.T) { + t.Parallel() + h := versionHandler() + req := httptest.NewRequest(http.MethodGet, "/api/version", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status=%d want 200", rec.Code) + } + ct := rec.Header().Get("Content-Type") + if !strings.HasPrefix(ct, "application/json") { + t.Errorf("Content-Type=%q want application/json*", ct) + } + + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("body not JSON: %v — raw=%q", err, rec.Body.String()) + } + + for _, key := range []string{"version", "commit", "build_date", "go_version"} { + if _, ok := body[key]; !ok { + 
t.Errorf("response missing %q field; got keys=%v", key, mapKeys(body)) + } + } +} + +func TestVersionHandler_RejectsNonGET(t *testing.T) { + t.Parallel() + h := versionHandler() + req := httptest.NewRequest(http.MethodPost, "/api/version", nil) + rec := httptest.NewRecorder() + h.ServeHTTP(rec, req) + + if rec.Code != http.StatusMethodNotAllowed { + t.Errorf("POST /api/version status=%d want 405", rec.Code) + } + if got := rec.Header().Get("Allow"); got != "GET" { + t.Errorf("Allow=%q want GET", got) + } +} + +func mapKeys(m map[string]any) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} diff --git a/internal/buildinfo/buildinfo.go b/internal/buildinfo/buildinfo.go new file mode 100644 index 0000000..fefb0a1 --- /dev/null +++ b/internal/buildinfo/buildinfo.go @@ -0,0 +1,132 @@ +// Package buildinfo resolves the running binary's version metadata. It +// reads ldflags-injected overrides (set via `-X` at build time) and +// falls back to `runtime/debug.ReadBuildInfo()` when the overrides are +// sentinel values, so `go install module@version` binaries still report +// useful metadata. +package buildinfo + +import "runtime/debug" + +// Set via -ldflags at build time (see Makefile). These act as overrides. +// They are package-level var so the linker can write to them; keep the +// names stable — the Makefile's LDFLAGS refers to them by full symbol +// path (github.com/RandomCodeSpace/docsiq/internal/buildinfo.Version etc.). +var ( + Version = "dev" + Commit = "unknown" + Date = "unknown" +) + +// Info holds resolved version metadata for the running binary. +type Info struct { + Version string `json:"version"` + Commit string `json:"commit"` + BuildDate string `json:"build_date"` + GoVersion string `json:"go_version"` + Dirty string `json:"dirty"` // "true", "false", or "unknown" + Deps map[string]string `json:"deps,omitempty"` +} + +// readBuildInfo is a package-level indirection so tests can stub it. 
+var readBuildInfo = debug.ReadBuildInfo + +func isSentinel(v string) bool { + switch v { + case "", "dev", "unknown": + return true + } + return false +} + +// Resolve returns the current version metadata using: +// 1. -ldflags overrides (if non-sentinel) +// 2. runtime/debug.ReadBuildInfo() (module version + VCS settings) +// 3. "unknown" for any remaining field +// +// When includeDeps is true, the returned Info also lists the main +// module's direct dependencies (Path → Version). Transitive deps are +// omitted because they bloat the response without real diagnostic value. +func Resolve(includeDeps bool) Info { + info := Info{ + Version: Version, + Commit: Commit, + BuildDate: Date, + Dirty: "unknown", + } + + bi, ok := readBuildInfo() + if !ok { + if isSentinel(info.Version) { + info.Version = "unknown" + } + if isSentinel(info.Commit) { + info.Commit = "unknown" + } + if isSentinel(info.BuildDate) { + info.BuildDate = "unknown" + } + info.GoVersion = "unknown" + return info + } + + info.GoVersion = bi.GoVersion + + if isSentinel(info.Version) { + if bi.Main.Version != "" { + info.Version = bi.Main.Version + } else { + info.Version = "unknown" + } + } + + var vcsRev, vcsTime, vcsMod string + for _, s := range bi.Settings { + switch s.Key { + case "vcs.revision": + vcsRev = s.Value + case "vcs.time": + vcsTime = s.Value + case "vcs.modified": + vcsMod = s.Value + } + } + if isSentinel(info.Commit) { + if vcsRev != "" { + info.Commit = vcsRev + } else { + info.Commit = "unknown" + } + } + if isSentinel(info.BuildDate) { + if vcsTime != "" { + info.BuildDate = vcsTime + } else { + info.BuildDate = "unknown" + } + } + if vcsMod != "" { + info.Dirty = vcsMod + } + + if includeDeps { + deps := make(map[string]string, len(bi.Deps)) + for _, d := range bi.Deps { + if d == nil { + continue + } + // Skip replaced modules — the Replace struct's Version is + // what actually ships, not d.Version. 
+ v := d.Version + if d.Replace != nil { + v = d.Replace.Version + } + if v == "" { + v = "unknown" + } + deps[d.Path] = v + } + info.Deps = deps + } + + return info +} diff --git a/internal/buildinfo/buildinfo_test.go b/internal/buildinfo/buildinfo_test.go new file mode 100644 index 0000000..b77a635 --- /dev/null +++ b/internal/buildinfo/buildinfo_test.go @@ -0,0 +1,127 @@ +package buildinfo + +import ( + "runtime/debug" + "strings" + "testing" +) + +func TestResolve_SentinelFallsBackToBuildInfo(t *testing.T) { + origVersion, origCommit, origDate, origRead := + Version, Commit, Date, readBuildInfo + defer func() { + Version, Commit, Date, readBuildInfo = + origVersion, origCommit, origDate, origRead + }() + + Version, Commit, Date = "dev", "unknown", "unknown" + readBuildInfo = func() (*debug.BuildInfo, bool) { + return &debug.BuildInfo{ + GoVersion: "go1.25.5", + Main: debug.Module{ + Path: "github.com/RandomCodeSpace/docsiq", + Version: "v0.5.0", + }, + Settings: []debug.BuildSetting{ + {Key: "vcs.revision", Value: "abc123def"}, + {Key: "vcs.time", Value: "2026-04-23T10:00:00Z"}, + {Key: "vcs.modified", Value: "false"}, + }, + }, true + } + + got := Resolve(false) + if got.Version != "v0.5.0" { + t.Errorf("Version=%q want v0.5.0", got.Version) + } + if got.Commit != "abc123def" { + t.Errorf("Commit=%q want abc123def", got.Commit) + } + if got.BuildDate != "2026-04-23T10:00:00Z" { + t.Errorf("BuildDate=%q", got.BuildDate) + } + if got.GoVersion != "go1.25.5" { + t.Errorf("GoVersion=%q", got.GoVersion) + } + if got.Dirty != "false" { + t.Errorf("Dirty=%q", got.Dirty) + } +} + +func TestResolve_LdflagsOverridesWin(t *testing.T) { + origVersion, origCommit, origDate, origRead := + Version, Commit, Date, readBuildInfo + defer func() { + Version, Commit, Date, readBuildInfo = + origVersion, origCommit, origDate, origRead + }() + + Version, Commit, Date = "v9.9.9", "ffffff", "2026-01-01T00:00:00Z" + readBuildInfo = func() (*debug.BuildInfo, bool) { + return 
&debug.BuildInfo{ + GoVersion: "go1.25.5", + Main: debug.Module{Version: "v0.0.0"}, + Settings: []debug.BuildSetting{ + {Key: "vcs.revision", Value: "DO-NOT-USE"}, + }, + }, true + } + + got := Resolve(false) + if got.Version != "v9.9.9" { + t.Errorf("ldflags Version should win; got %q", got.Version) + } + if got.Commit != "ffffff" { + t.Errorf("ldflags Commit should win; got %q", got.Commit) + } +} + +func TestResolve_IncludeDepsPopulatesMap(t *testing.T) { + origRead := readBuildInfo + defer func() { readBuildInfo = origRead }() + readBuildInfo = func() (*debug.BuildInfo, bool) { + return &debug.BuildInfo{ + GoVersion: "go1.25.5", + Main: debug.Module{Version: "v0.5.0"}, + Deps: []*debug.Module{ + {Path: "github.com/spf13/cobra", Version: "v1.10.2"}, + {Path: "github.com/tmc/langchaingo", Version: "v0.1.14"}, + }, + }, true + } + + got := Resolve(true) + if got.Deps["github.com/spf13/cobra"] != "v1.10.2" { + t.Errorf("deps missing cobra; got %+v", got.Deps) + } + if got.Deps["github.com/tmc/langchaingo"] != "v0.1.14" { + t.Errorf("deps missing langchaingo") + } +} + +func TestResolve_ReadBuildInfoUnavailable(t *testing.T) { + origRead := readBuildInfo + origVersion, origCommit, origDate := + Version, Commit, Date + defer func() { + readBuildInfo = origRead + Version, Commit, Date = origVersion, origCommit, origDate + }() + + Version, Commit, Date = "dev", "unknown", "unknown" + readBuildInfo = func() (*debug.BuildInfo, bool) { return nil, false } + + got := Resolve(true) + if got.Version != "unknown" || got.Commit != "unknown" || got.BuildDate != "unknown" { + t.Errorf("all fields should be 'unknown'; got %+v", got) + } + if got.GoVersion != "unknown" { + t.Errorf("GoVersion should be 'unknown'; got %q", got.GoVersion) + } + if got.Deps != nil { + t.Errorf("Deps should be nil when ReadBuildInfo fails; got %+v", got.Deps) + } + if strings.Contains(got.Dirty, "true") { + t.Errorf("Dirty should default to 'unknown'; got %q", got.Dirty) + } +} diff --git 
a/internal/config/config.go b/internal/config/config.go index 81a457b..6924028 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -18,6 +18,7 @@ type Config struct { Indexing IndexingConfig `mapstructure:"indexing"` Community CommunityConfig `mapstructure:"community"` Server ServerConfig `mapstructure:"server"` + Log LogConfig `mapstructure:"log"` // Phase-5: per-project LLM overrides. Keyed by project slug. // When a slug is missing from the map, callers fall back to the @@ -26,6 +27,16 @@ type Config struct { LLMOverrides map[string]LLMConfig `mapstructure:"llm_overrides"` } +// LogConfig controls structured-log emission format. Lowest-priority +// source of truth — `--log-format` flag and `DOCSIQ_LOG_FORMAT` env +// var both outrank this value in cmd/root.go. +type LogConfig struct { + // Format chooses the slog handler. "text" (default) emits a + // human-readable single-line format with emoji prefixes; "json" + // strips emoji and emits machine-parseable JSON objects. + Format string `mapstructure:"format"` +} + // LLMConfigForProject returns the override for slug if present, otherwise // the root LLM config. A missing or empty slug yields the root config. // The Provider field is treated as the presence sentinel — a YAML block @@ -243,6 +254,9 @@ func Load(cfgFile string) (*Config, error) { v.SetDefault("server.request_timeout", 30*time.Second) v.SetDefault("server.upload_timeout", 10*time.Minute) + // Log format: "text" for human dev output, "json" for production. + v.SetDefault("log.format", "text") + // Config file search paths. Only ~/.docsiq and CWD are consulted. 
newCfgDir := filepath.Join(home, ".docsiq") if cfgFile != "" { @@ -271,6 +285,7 @@ func Load(cfgFile string) (*Config, error) { _ = v.BindEnv("llm.call_timeout", "DOCSIQ_LLM_CALL_TIMEOUT") _ = v.BindEnv("server.request_timeout", "DOCSIQ_SERVER_REQUEST_TIMEOUT") _ = v.BindEnv("server.upload_timeout", "DOCSIQ_SERVER_UPLOAD_TIMEOUT") + _ = v.BindEnv("log.format", "DOCSIQ_LOG_FORMAT") if err := v.ReadInConfig(); err != nil { if _, ok := err.(viper.ConfigFileNotFoundError); ok { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 6c8cb72..856892b 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -670,3 +670,52 @@ func captureSlog(t *testing.T) *syncBuffer { t.Cleanup(func() { slog.SetDefault(prev) }) return buf } + +func TestLoad_LogFormatDefaultText(t *testing.T) { + // NOT parallel — mutates env + HOME. + dir := t.TempDir() + isolateEnv(t, dir) + // Empty cfgFile → Load searches ~/.docsiq (empty) + cwd, falls + // through to defaults. + cfg, err := Load("") + if err != nil { + t.Fatalf("Load: %v", err) + } + if cfg.Log.Format != "text" { + t.Errorf("default Log.Format=%q want text", cfg.Log.Format) + } +} + +func TestLoad_LogFormatFromYAML(t *testing.T) { + // NOT parallel — mutates env + HOME. + dir := t.TempDir() + isolateEnv(t, dir) + f := filepath.Join(dir, "config.yaml") + if err := os.WriteFile(f, []byte("log:\n format: json\n"), 0o600); err != nil { + t.Fatal(err) + } + cfg, err := Load(f) + if err != nil { + t.Fatalf("Load: %v", err) + } + if cfg.Log.Format != "json" { + t.Errorf("Log.Format=%q want json", cfg.Log.Format) + } +} + +func TestLoad_LogFormatFromEnv(t *testing.T) { + // NOT parallel — mutates env + HOME. 
+ dir := t.TempDir() + isolateEnv(t, dir) + if err := os.Setenv("DOCSIQ_LOG_FORMAT", "json"); err != nil { + t.Fatal(err) + } + defer os.Unsetenv("DOCSIQ_LOG_FORMAT") + cfg, err := Load("") + if err != nil { + t.Fatalf("Load: %v", err) + } + if cfg.Log.Format != "json" { + t.Errorf("env Log.Format=%q want json", cfg.Log.Format) + } +} diff --git a/internal/embedder/embedder.go b/internal/embedder/embedder.go index 4581f96..0fc2b98 100644 --- a/internal/embedder/embedder.go +++ b/internal/embedder/embedder.go @@ -4,8 +4,10 @@ import ( "context" "fmt" "sync" + "time" "github.com/RandomCodeSpace/docsiq/internal/llm" + "github.com/RandomCodeSpace/docsiq/internal/obs" ) // Embedder batches text → []float32 vectors using an LLM provider. @@ -72,7 +74,11 @@ func (e *Embedder) EmbedTexts(ctx context.Context, texts []string) ([][]float32, defer wg.Done() defer func() { <-sem }() + start := time.Now() vecs, err := e.provider.EmbedBatch(ctx, work.texts) + if obs.Embed != nil { + obs.Embed.Observe(e.provider.Name(), time.Since(start)) + } if err != nil { errs[batchIdx] = fmt.Errorf("embed batch [%d:%d]: %w", work.idx, work.idx+len(work.texts), err) diff --git a/internal/llm/mock/mock.go b/internal/llm/mock/mock.go index a4b7833..73be53e 100644 --- a/internal/llm/mock/mock.go +++ b/internal/llm/mock/mock.go @@ -37,8 +37,9 @@ func New(dims int) *Provider { return &Provider{Dims: dims} } -func (p *Provider) Name() string { return "mock" } -func (p *Provider) ModelID() string { return "mock-llm" } +func (p *Provider) Name() string { return "mock" } +func (p *Provider) ModelID() string { return "mock-llm" } +func (p *Provider) BatchCeiling() int { return 64 } // Complete returns a deterministic response chosen by prompt substring. 
// Schema must match what internal/extractor and internal/community diff --git a/internal/llm/provider.go b/internal/llm/provider.go index 3dfc4d8..2335d12 100644 --- a/internal/llm/provider.go +++ b/internal/llm/provider.go @@ -7,6 +7,7 @@ import ( "time" "github.com/RandomCodeSpace/docsiq/internal/config" + "github.com/RandomCodeSpace/docsiq/internal/obs" "github.com/tmc/langchaingo/embeddings" "github.com/tmc/langchaingo/llms" "github.com/tmc/langchaingo/llms/ollama" @@ -125,7 +126,17 @@ func (p *lcProvider) Complete(ctx context.Context, prompt string, opts ...Option if o.jsonMode { callOpts = append(callOpts, llms.WithJSONMode()) } - return llms.GenerateFromSinglePrompt(ctx, p.llm, prompt, callOpts...) + resp, err := llms.GenerateFromSinglePrompt(ctx, p.llm, prompt, callOpts...) + if obs.LLM != nil { + // Approximation: 1 token ~= 4 bytes of UTF-8 for English prose. + // This is a coarse fallback until langchaingo's GenerationInfo + // usage data is threaded through the Provider interface + // (tracked as follow-up). Using "total" kind since we cannot + // split prompt vs completion here without the usage payload. + approx := (len(prompt) + len(resp)) / 4 + obs.LLM.RecordTokens(p.name, "total", approx) + } + return resp, err } func (p *lcProvider) Embed(ctx context.Context, text string) ([]float32, error) { diff --git a/internal/obs/metrics.go b/internal/obs/metrics.go new file mode 100644 index 0000000..54d3a87 --- /dev/null +++ b/internal/obs/metrics.go @@ -0,0 +1,289 @@ +// Package obs wires Prometheus metrics for docsiq. One registry per +// process (exposed via obs.Default). Metric families are grouped by +// subject (HTTP, pipeline, embed, LLM, workq, build-info) so handlers +// record through a thin typed API — callers never touch raw collectors. +package obs + +import ( + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +// Default is the process-wide registry. 
Tests construct their own +// prometheus.NewRegistry() to avoid Register-twice panics. +var ( + Default = prometheus.NewRegistry() + + HTTP *HTTPMetrics + Pipeline *PipelineMetrics + Embed *EmbedMetrics + LLM *LLMMetrics + Workq *WorkqMetrics + Build *BuildInfoMetric +) + +// Init wires the Default registry. Must be called exactly once at +// startup (from cmd/serve.go). Safe no-op on second call. +var ( + initOnce sync.Once +) + +func Init() { + initOnce.Do(func() { + HTTP = NewHTTPMetrics(Default) + Pipeline = NewPipelineMetrics(Default) + Embed = NewEmbedMetrics(Default) + LLM = NewLLMMetrics(Default) + Workq = NewWorkqMetrics(Default) + Build = NewBuildInfoMetric(Default) + }) +} + +// ---- HTTP --------------------------------------------------------------- + +// HTTPMetrics bundles the request counter + duration histogram. +type HTTPMetrics struct { + Requests *prometheus.CounterVec + Duration *prometheus.HistogramVec +} + +// NewHTTPMetrics constructs and registers the HTTP metric family on reg. +func NewHTTPMetrics(reg prometheus.Registerer) *HTTPMetrics { + m := &HTTPMetrics{ + Requests: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "docsiq_http_requests_total", + Help: "Total HTTP requests by route, method, and status.", + }, + []string{"route", "method", "status"}, + ), + Duration: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "docsiq_http_request_duration_seconds", + Help: "HTTP request duration in seconds, by route.", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10}, + }, + []string{"route"}, + ), + } + reg.MustRegister(m.Requests, m.Duration) + return m +} + +// Observe records one request. `route` MUST be the pattern (e.g. +// "GET /api/documents/{id}"), not r.URL.Path — raw paths have unbounded +// cardinality and will explode the scrape database. 
+func (m *HTTPMetrics) Observe(route, method string, status int, d time.Duration) { + m.Requests.WithLabelValues(route, method, statusLabel(status)).Inc() + m.Duration.WithLabelValues(route).Observe(d.Seconds()) +} + +// statusLabel collapses a status code to a two-digit bucket (2xx, 3xx, +// 4xx, 5xx). Bounds cardinality even when a handler emits non-standard +// codes. +func statusLabel(code int) string { + switch { + case code >= 500: + return "5xx" + case code >= 400: + return "4xx" + case code >= 300: + return "3xx" + case code >= 200: + return "2xx" + default: + return "1xx" + } +} + +// ---- Pipeline ----------------------------------------------------------- + +// PipelineMetrics bundles the pipeline-stage histogram. +type PipelineMetrics struct { + StageDuration *prometheus.HistogramVec +} + +// NewPipelineMetrics constructs + registers the pipeline-stage family. +func NewPipelineMetrics(reg prometheus.Registerer) *PipelineMetrics { + m := &PipelineMetrics{ + StageDuration: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "docsiq_pipeline_stage_duration_seconds", + Help: "GraphRAG pipeline stage duration in seconds.", + Buckets: prometheus.ExponentialBuckets(0.1, 2, 14), // 0.1s → ~27min + }, + []string{"stage"}, + ), + } + reg.MustRegister(m.StageDuration) + return m +} + +// TimeStage measures the wall-clock duration of fn and records it +// against the given stage label. The error from fn is propagated. +func (m *PipelineMetrics) TimeStage(stage string, fn func() error) error { + start := time.Now() + err := fn() + m.StageDuration.WithLabelValues(stage).Observe(time.Since(start).Seconds()) + return err +} + +// ---- Embed -------------------------------------------------------------- + +// EmbedMetrics bundles the embed-latency histogram. +type EmbedMetrics struct { + Latency *prometheus.HistogramVec +} + +// NewEmbedMetrics constructs + registers the embed-latency family. 
+func NewEmbedMetrics(reg prometheus.Registerer) *EmbedMetrics { + m := &EmbedMetrics{ + Latency: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "docsiq_embed_latency_seconds", + Help: "Per-batch embed call latency in seconds, by provider.", + Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, + }, + []string{"provider"}, + ), + } + reg.MustRegister(m.Latency) + return m +} + +// Observe records a single embed-batch call duration. +func (m *EmbedMetrics) Observe(provider string, d time.Duration) { + m.Latency.WithLabelValues(provider).Observe(d.Seconds()) +} + +// ---- LLM ---------------------------------------------------------------- + +// LLMMetrics bundles the token-counter family. +type LLMMetrics struct { + Tokens *prometheus.CounterVec +} + +// NewLLMMetrics constructs + registers the LLM-tokens family. +func NewLLMMetrics(reg prometheus.Registerer) *LLMMetrics { + m := &LLMMetrics{ + Tokens: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "docsiq_llm_tokens_total", + Help: "LLM tokens consumed, by provider and kind (prompt|completion|total).", + }, + []string{"provider", "kind"}, + ), + } + reg.MustRegister(m.Tokens) + return m +} + +// RecordTokens increments the counter by n. Use kind="prompt", +// "completion", or "total" (when the provider cannot split usage). +// Non-positive n is ignored so callers don't have to guard. +func (m *LLMMetrics) RecordTokens(provider, kind string, n int) { + if n <= 0 { + return + } + m.Tokens.WithLabelValues(provider, kind).Add(float64(n)) +} + +// ---- Workq -------------------------------------------------------------- + +// WorkqStats is the snapshot surface the obs layer needs from workq. +// Workq owns the concrete Pool.Stats() method; obs only reads. +type WorkqStats struct { + Depth int64 + Rejected int64 +} + +// WorkqStatsProvider is a closure over pool.Stats(). 
Injected from +// cmd/serve.go after the pool is constructed, so the obs package does +// not take a hard dep on internal/workq (keeps the import DAG acyclic). +type WorkqStatsProvider func() WorkqStats + +// WorkqMetrics wraps the workq gauges/counters; the actual values are +// read via a late-bound provider function so the pool can be swapped in +// after registration without re-registering collectors. +type WorkqMetrics struct { + mu sync.RWMutex + provider WorkqStatsProvider +} + +// NewWorkqMetrics registers the workq collectors. The provider is a +// no-op until BindStatsProvider is called; Prometheus scrapes before +// binding will see Depth=0, Rejected=0 (safe defaults). +func NewWorkqMetrics(reg prometheus.Registerer) *WorkqMetrics { + m := &WorkqMetrics{ + provider: func() WorkqStats { return WorkqStats{} }, + } + + depth := prometheus.NewGaugeFunc( + prometheus.GaugeOpts{ + Name: "docsiq_workq_depth", + Help: "Current depth of the workq submission queue (jobs waiting).", + }, + func() float64 { + m.mu.RLock() + defer m.mu.RUnlock() + return float64(m.provider().Depth) + }, + ) + + rejected := prometheus.NewCounterFunc( + prometheus.CounterOpts{ + Name: "docsiq_workq_rejected_total", + Help: "Total workq submissions rejected because the queue was full.", + }, + func() float64 { + m.mu.RLock() + defer m.mu.RUnlock() + return float64(m.provider().Rejected) + }, + ) + reg.MustRegister(depth, rejected) + return m +} + +// BindStatsProvider wires a live snapshot source. Call from cmd/serve.go +// after the pool is created, before starting the HTTP server. +func (m *WorkqMetrics) BindStatsProvider(p WorkqStatsProvider) { + if p == nil { + return + } + m.mu.Lock() + defer m.mu.Unlock() + m.provider = p +} + +// ---- Build info --------------------------------------------------------- + +// BuildInfoMetric wraps the docsiq_build_info gauge used for +// {version, commit} labels on dashboards. 
+type BuildInfoMetric struct { + Info *prometheus.GaugeVec +} + +// NewBuildInfoMetric constructs + registers the build-info gauge. +func NewBuildInfoMetric(reg prometheus.Registerer) *BuildInfoMetric { + m := &BuildInfoMetric{ + Info: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "docsiq_build_info", + Help: "Build metadata (value is always 1; labels carry version and commit).", + }, + []string{"version", "commit"}, + ), + } + reg.MustRegister(m.Info) + return m +} + +// Set publishes the current build metadata. Subsequent Set calls +// overwrite rather than accumulate labels. +func (m *BuildInfoMetric) Set(version, commit string) { + m.Info.Reset() + m.Info.WithLabelValues(version, commit).Set(1) +} diff --git a/internal/obs/metrics_test.go b/internal/obs/metrics_test.go new file mode 100644 index 0000000..ff257fa --- /dev/null +++ b/internal/obs/metrics_test.go @@ -0,0 +1,212 @@ +package obs + +import ( + "net/http/httptest" + "strings" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestHTTP_ObserveRecordsCounterAndHistogram(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + h := NewHTTPMetrics(reg) + + h.Observe("GET /api/documents", "GET", 200, 42*time.Millisecond) + h.Observe("GET /api/documents", "GET", 200, 80*time.Millisecond) + h.Observe("POST /api/search", "POST", 500, 2*time.Second) + + if got := testutil.ToFloat64(h.Requests.WithLabelValues("GET /api/documents", "GET", "2xx")); got != 2 { + t.Errorf("requests{...2xx}=%v want 2", got) + } + if got := testutil.ToFloat64(h.Requests.WithLabelValues("POST /api/search", "POST", "5xx")); got != 1 { + t.Errorf("requests{...5xx}=%v want 1", got) + } + + out, err := reg.Gather() + if err != nil { + t.Fatalf("gather: %v", err) + } + var found bool + for _, mf := range out { + if mf.GetName() == "docsiq_http_request_duration_seconds" 
{ + found = true + for _, m := range mf.Metric { + if h := m.GetHistogram(); h != nil { + if h.GetSampleCount() == 0 { + t.Errorf("histogram sample count=0") + } + } + } + } + } + if !found { + t.Errorf("docsiq_http_request_duration_seconds not registered") + } +} + +func TestPipeline_TimeStageRecordsBothSuccessAndError(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + p := NewPipelineMetrics(reg) + + _ = p.TimeStage("load", func() error { return nil }) + _ = p.TimeStage("chunk", func() error { return nil }) + _ = p.TimeStage("chunk", func() error { return nil }) + + count := func(stage string) uint64 { + out, _ := reg.Gather() + for _, mf := range out { + if mf.GetName() != "docsiq_pipeline_stage_duration_seconds" { + continue + } + for _, m := range mf.Metric { + var s string + for _, lp := range m.GetLabel() { + if lp.GetName() == "stage" { + s = lp.GetValue() + } + } + if s == stage { + return m.GetHistogram().GetSampleCount() + } + } + } + return 0 + } + if got := count("load"); got != 1 { + t.Errorf("load stage count=%d want 1", got) + } + if got := count("chunk"); got != 2 { + t.Errorf("chunk stage count=%d want 2", got) + } +} + +func TestEmbed_ObserveByProvider(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + e := NewEmbedMetrics(reg) + + e.Observe("openai", 120*time.Millisecond) + e.Observe("openai", 200*time.Millisecond) + e.Observe("ollama", 50*time.Millisecond) + + out, _ := reg.Gather() + perProvider := map[string]uint64{} + for _, mf := range out { + if mf.GetName() != "docsiq_embed_latency_seconds" { + continue + } + for _, m := range mf.Metric { + for _, lp := range m.GetLabel() { + if lp.GetName() == "provider" { + perProvider[lp.GetValue()] = m.GetHistogram().GetSampleCount() + } + } + } + } + if perProvider["openai"] != 2 { + t.Errorf("openai count=%d want 2", perProvider["openai"]) + } + if perProvider["ollama"] != 1 { + t.Errorf("ollama count=%d want 1", perProvider["ollama"]) + } +} + +func 
TestLLM_RecordTokensByKind(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + l := NewLLMMetrics(reg) + + l.RecordTokens("openai", "prompt", 512) + l.RecordTokens("openai", "completion", 128) + l.RecordTokens("azure", "total", 256) + l.RecordTokens("openai", "prompt", 0) // ignored + l.RecordTokens("openai", "prompt", -5) // ignored + + if got := testutil.ToFloat64(l.Tokens.WithLabelValues("openai", "prompt")); got != 512 { + t.Errorf("openai prompt=%v want 512", got) + } + if got := testutil.ToFloat64(l.Tokens.WithLabelValues("openai", "completion")); got != 128 { + t.Errorf("openai completion=%v want 128", got) + } + if got := testutil.ToFloat64(l.Tokens.WithLabelValues("azure", "total")); got != 256 { + t.Errorf("azure total=%v want 256", got) + } +} + +func TestWorkq_DepthAndRejectedFromStatsProvider(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + w := NewWorkqMetrics(reg) + + w.BindStatsProvider(func() WorkqStats { return WorkqStats{Depth: 7, Rejected: 3} }) + + out, _ := reg.Gather() + var depth, rejected float64 + for _, mf := range out { + switch mf.GetName() { + case "docsiq_workq_depth": + depth = mf.Metric[0].GetGauge().GetValue() + case "docsiq_workq_rejected_total": + rejected = mf.Metric[0].GetCounter().GetValue() + } + } + if depth != 7 { + t.Errorf("depth=%v want 7", depth) + } + if rejected != 3 { + t.Errorf("rejected=%v want 3", rejected) + } +} + +func TestExpose_ScrapeOutputContainsAllFamilies(t *testing.T) { + t.Parallel() + reg := prometheus.NewRegistry() + h := NewHTTPMetrics(reg) + p := NewPipelineMetrics(reg) + e := NewEmbedMetrics(reg) + l := NewLLMMetrics(reg) + _ = NewWorkqMetrics(reg) + + bi := NewBuildInfoMetric(reg) + bi.Set("v9.9.9", "abcdef") + + // Fire at least one observation per label-vec family so the + // Prometheus text exposition actually prints a line. 
Counter/ + // HistogramVec families are omitted from the scrape until at least + // one label combination has been touched — this is by design in + // client_golang. + h.Observe("GET /test", "GET", 200, time.Millisecond) + _ = p.TimeStage("load", func() error { return nil }) + e.Observe("openai", time.Millisecond) + l.RecordTokens("openai", "total", 1) + + body := renderForTest(t, reg) + for _, want := range []string{ + "docsiq_http_requests_total", + "docsiq_http_request_duration_seconds", + "docsiq_pipeline_stage_duration_seconds", + "docsiq_embed_latency_seconds", + "docsiq_llm_tokens_total", + "docsiq_workq_depth", + "docsiq_workq_rejected_total", + "docsiq_build_info", + } { + if !strings.Contains(body, want) { + t.Errorf("scrape output missing family %q", want) + } + } +} + +func renderForTest(t *testing.T, reg *prometheus.Registry) string { + t.Helper() + rec := httptest.NewRecorder() + promhttp.HandlerFor(reg, promhttp.HandlerOpts{}). + ServeHTTP(rec, httptest.NewRequest("GET", "/metrics", nil)) + return rec.Body.String() +} diff --git a/internal/obs/slogfmt.go b/internal/obs/slogfmt.go new file mode 100644 index 0000000..b375a08 --- /dev/null +++ b/internal/obs/slogfmt.go @@ -0,0 +1,81 @@ +package obs + +import ( + "context" + "log/slog" + "strings" + "unicode/utf8" +) + +// NewProductionHandler wraps an inner slog.Handler and strips a +// leading emoji + trailing space from each record's Message. docsiq +// uses emoji prefixes (OK KO WARN etc.) as visual cues in dev text +// format; in JSON these collide with log-aggregator indexing +// (Elasticsearch tokeniser, fluentd grep rules) and obscure the actual +// message string. The handler mutates only Message — attrs pass +// through. 
+func NewProductionHandler(inner slog.Handler) slog.Handler { + return &prodHandler{inner: inner} +} + +type prodHandler struct{ inner slog.Handler } + +func (h *prodHandler) Enabled(ctx context.Context, lvl slog.Level) bool { + return h.inner.Enabled(ctx, lvl) +} + +func (h *prodHandler) Handle(ctx context.Context, r slog.Record) error { + r.Message = stripLeadingEmoji(r.Message) + return h.inner.Handle(ctx, r) +} + +func (h *prodHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + return &prodHandler{inner: h.inner.WithAttrs(attrs)} +} + +func (h *prodHandler) WithGroup(name string) slog.Handler { + return &prodHandler{inner: h.inner.WithGroup(name)} +} + +// stripLeadingEmoji removes the first rune from msg if it is in a +// Unicode emoji-like range, plus any immediately-following whitespace. +// Also strips a VS16 variation selector (U+FE0F) that often follows +// warning signs etc. We intentionally do NOT use a dependency like +// mattn/go-emoji; docsiq ships under air-gap rules (see build.md). +func stripLeadingEmoji(msg string) string { + if msg == "" { + return msg + } + r, size := utf8.DecodeRuneInString(msg) + if r == utf8.RuneError { + return msg + } + if !isEmojiLike(r) { + return msg + } + rest := msg[size:] + rest = strings.TrimLeft(rest, " \t") + if len(rest) > 0 { + r2, size2 := utf8.DecodeRuneInString(rest) + if r2 == 0xFE0F { + rest = strings.TrimLeft(rest[size2:], " \t") + } + } + return rest +} + +// isEmojiLike is a conservative test for the emoji-range runes that +// appear in docsiq log messages today. Covers BMP symbols +// (U+2600-U+27BF), miscellaneous pictographs (U+1F300-U+1F6FF), and +// supplemental symbols (U+1F900-U+1F9FF). 
+func isEmojiLike(r rune) bool { + switch { + case r >= 0x2600 && r <= 0x27BF: + return true + case r >= 0x1F300 && r <= 0x1F6FF: + return true + case r >= 0x1F900 && r <= 0x1F9FF: + return true + } + return false +} diff --git a/internal/obs/slogfmt_test.go b/internal/obs/slogfmt_test.go new file mode 100644 index 0000000..cfdf047 --- /dev/null +++ b/internal/obs/slogfmt_test.go @@ -0,0 +1,61 @@ +package obs + +import ( + "bytes" + "encoding/json" + "log/slog" + "testing" +) + +func TestStripLeadingEmoji(t *testing.T) { + t.Parallel() + cases := []struct { + in, want string + }{ + {"✅ all good", "all good"}, // white heavy check mark + {"❌ panic recovered", "panic recovered"}, // cross mark + {"⚠️ auth disabled", "auth disabled"}, // warning + VS16 + {"\U0001f6d1 shutting down...", "shutting down..."}, + {"⚙️ LLM provider initialised", "LLM provider initialised"}, + {"plain log line", "plain log line"}, + {"", ""}, + {" leading spaces", " leading spaces"}, // no emoji → untouched + } + for _, c := range cases { + got := stripLeadingEmoji(c.in) + if got != c.want { + t.Errorf("stripLeadingEmoji(%q)=%q want %q", c.in, got, c.want) + } + } +} + +func TestProductionHandler_JSONOutputNoEmoji(t *testing.T) { + t.Parallel() + var buf bytes.Buffer + h := NewProductionHandler(slog.NewJSONHandler(&buf, nil)) + logger := slog.New(h) + + logger.Info("✅ ready", "port", 8080) + logger.Error("❌ connection failed", "err", "timeout") + + lines := bytes.Split(bytes.TrimSpace(buf.Bytes()), []byte("\n")) + if len(lines) != 2 { + t.Fatalf("got %d lines want 2", len(lines)) + } + for _, line := range lines { + var rec map[string]any + if err := json.Unmarshal(line, &rec); err != nil { + t.Fatalf("not JSON: %v — raw=%q", err, line) + } + msg, _ := rec["msg"].(string) + if msg == "" { + t.Errorf("missing msg: %s", line) + } + for _, r := range msg { + if isEmojiLike(r) { + t.Errorf("msg contains emoji %q; msg=%q", r, msg) + break + } + } + } +} diff --git a/internal/pipeline/pipeline.go 
b/internal/pipeline/pipeline.go index 00d0e02..75243b0 100644 --- a/internal/pipeline/pipeline.go +++ b/internal/pipeline/pipeline.go @@ -22,11 +22,23 @@ import ( "github.com/RandomCodeSpace/docsiq/internal/extractor" "github.com/RandomCodeSpace/docsiq/internal/llm" "github.com/RandomCodeSpace/docsiq/internal/loader" + "github.com/RandomCodeSpace/docsiq/internal/obs" "github.com/RandomCodeSpace/docsiq/internal/store" "github.com/google/uuid" "github.com/schollz/progressbar/v3" ) +// timeStage is a nil-safe wrapper around obs.Pipeline.TimeStage. The +// indexer CLI does not initialise obs (obs.Init is only called from +// cmd/serve.go), so the CLI path must not blow up on a nil +// obs.Pipeline. +func timeStage(stage string, fn func() error) error { + if obs.Pipeline == nil { + return fn() + } + return obs.Pipeline.TimeStage(stage, fn) +} + // ProgressEvent sent over progress channel. type ProgressEvent struct { Phase string @@ -69,6 +81,12 @@ type IndexOptions struct { // IndexPath indexes a file or directory. func (p *Pipeline) IndexPath(ctx context.Context, path string, opts IndexOptions) error { + return timeStage("index_path", func() error { + return p.indexPath(ctx, path, opts) + }) +} + +func (p *Pipeline) indexPath(ctx context.Context, path string, opts IndexOptions) error { workers := opts.Workers if workers <= 0 { workers = p.cfg.Indexing.Workers @@ -123,6 +141,12 @@ func (p *Pipeline) IndexPath(ctx context.Context, path string, opts IndexOptions // IndexURL crawls a documentation website and indexes all discovered pages. 
func (p *Pipeline) IndexURL(ctx context.Context, rootURL string, opts IndexOptions) error { + return timeStage("index_url", func() error { + return p.indexURL(ctx, rootURL, opts) + }) +} + +func (p *Pipeline) indexURL(ctx context.Context, rootURL string, opts IndexOptions) error { workers := opts.Workers if workers <= 0 { workers = p.cfg.Indexing.Workers @@ -570,6 +594,12 @@ func (p *Pipeline) structureDocument(ctx context.Context, docID, content string) // If force is true, the graph fingerprint cache is ignored and communities // are always regenerated. func (p *Pipeline) Finalize(ctx context.Context, verbose bool, force ...bool) error { + return timeStage("finalize", func() error { + return p.finalize(ctx, verbose, force...) + }) +} + +func (p *Pipeline) finalize(ctx context.Context, verbose bool, force ...bool) error { slog.Info("🧩 Phase 3: loading entities and relationships") entities, err := p.store.AllEntities(ctx) if err != nil { diff --git a/internal/workq/workq.go b/internal/workq/workq.go index 0204ae6..ef67d90 100644 --- a/internal/workq/workq.go +++ b/internal/workq/workq.go @@ -8,6 +8,7 @@ import ( "context" "errors" "sync" + "sync/atomic" ) // ErrQueueFull is returned by Submit when the job queue is saturated. @@ -41,6 +42,12 @@ type Pool struct { mu sync.RWMutex closeOnce sync.Once closed chan struct{} + + // rejectedTotal counts Submit calls that returned ErrQueueFull + // over the pool's lifetime. ErrClosed is NOT counted — it's a + // shutdown condition, not a capacity condition. Atomic so + // readers (Stats scrapers) never block senders. + rejectedTotal atomic.Int64 } // New constructs and starts a Pool. @@ -83,10 +90,28 @@ func (p *Pool) Submit(job Job) error { case p.jobs <- job: return nil default: + p.rejectedTotal.Add(1) return ErrQueueFull } } +// Stats is a point-in-time snapshot of pool utilisation. 
Depth is the +// count of jobs currently queued but not yet picked up by a worker; +// Rejected is the monotonic count of Submit calls that returned +// ErrQueueFull since process start. Safe to call concurrently. +type Stats struct { + Depth int64 + Rejected int64 +} + +// Stats returns a snapshot of pool utilisation. +func (p *Pool) Stats() Stats { + return Stats{ + Depth: int64(len(p.jobs)), + Rejected: p.rejectedTotal.Load(), + } +} + // Close stops accepting new work and waits for workers to drain. If // the caller's ctx fires before drain completes, the pool context is // cancelled so in-flight jobs honouring cancellation can abort, and diff --git a/internal/workq/workq_test.go b/internal/workq/workq_test.go index 84c4415..fe4914c 100644 --- a/internal/workq/workq_test.go +++ b/internal/workq/workq_test.go @@ -105,3 +105,50 @@ func TestPool_SubmitRaceDuringClose(t *testing.T) { wg.Wait() } } + +func TestPool_StatsReportsDepthAndRejected(t *testing.T) { + t.Parallel() + // The underlying jobs channel has capacity = Workers + QueueDepth. + // With 1 worker + 2 queue slots the buffer is 3 wide. The worker + // pulls the first job off (blocks on <-block) freeing a slot, so + // three more blocking submits fit before the channel is saturated. + p := New(Config{Workers: 1, QueueDepth: 2}) + block := make(chan struct{}) + defer func() { + select { + case <-block: + default: + close(block) + } + _ = p.Close(context.Background()) + }() + + started := make(chan struct{}) + if err := p.Submit(func(ctx context.Context) { close(started); <-block }); err != nil { + t.Fatalf("submit 1: %v", err) + } + <-started + // Fill the channel buffer with blocking jobs (3 slots available + // once the worker has pulled submit 1 off). + for i := 2; i <= 4; i++ { + if err := p.Submit(func(ctx context.Context) { <-block }); err != nil { + t.Fatalf("submit %d: %v", i, err) + } + } + + // Next two submissions must be rejected; Rejected must grow by 2. 
+ if err := p.Submit(func(ctx context.Context) {}); !errors.Is(err, ErrQueueFull) { + t.Fatalf("submit 5: want ErrQueueFull, got %v", err) + } + if err := p.Submit(func(ctx context.Context) {}); !errors.Is(err, ErrQueueFull) { + t.Fatalf("submit 6: want ErrQueueFull, got %v", err) + } + + stats := p.Stats() + if stats.Depth != 3 { + t.Errorf("Depth=%d want 3", stats.Depth) + } + if stats.Rejected != 2 { + t.Errorf("Rejected=%d want 2", stats.Rejected) + } +}