Skip to content

Commit d42c19c

Browse files
aksOpsclaude
andauthored
fix(storage): bound AutoMigrate with configurable deadline (#65)
Without a timeout, db.AutoMigrate inherits no deadline. On Postgres under load, an ALTER TABLE waits indefinitely for a relation lock — hanging the entire binary startup with no log output and requiring kill -9 to recover. Adds DB_MIGRATE_TIMEOUT_SECS (default 60s, max 1h, set 0 to opt out) and threads it through MigrateOptions.Timeout into a context applied to the AutoMigrate call only via db.WithContext. Pre/post hooks (FTS5 setup, legacy index drops) remain unbounded since they don't take long-held locks. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5e074f0 commit d42c19c

3 files changed

Lines changed: 126 additions & 1 deletion

File tree

internal/storage/factory.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package storage
22

33
import (
4+
"context"
45
"fmt"
56
"log"
67
"os"
@@ -204,6 +205,13 @@ type MigrateOptions struct {
204205
// PartitionLookaheadDays is the number of future daily partitions to
205206
// pre-create at boot. Defaults to 3 when zero.
206207
PartitionLookaheadDays int
208+
// Timeout, when > 0, bounds the AutoMigrate call. Without it,
209+
// db.AutoMigrate inherits no deadline and an ALTER TABLE waiting on a
210+
// Postgres relation lock can hang startup indefinitely. The timeout is
211+
// applied via db.WithContext to the AutoMigrate call only — pre/post
212+
// hooks (FTS5 triggers, legacy index drops) are not bounded since they
213+
// don't take long-held locks. Zero preserves legacy unbounded behaviour.
214+
Timeout time.Duration
207215
}
208216

209217
// AutoMigrateModelsWithOptions is the option-driven variant of
@@ -251,7 +259,17 @@ func AutoMigrateModelsWithOptions(db *gorm.DB, driver string, opts MigrateOption
251259
if !logsPartitioned {
252260
migrateModels = append(migrateModels, &Log{})
253261
}
254-
if err := db.AutoMigrate(migrateModels...); err != nil {
262+
// Apply a deadline to the AutoMigrate call when configured so a Postgres
263+
// relation-lock wait cannot hang startup indefinitely. WithContext returns
264+
// a session-scoped *gorm.DB; the parent db is unaffected for the post-
265+
// migration helpers below.
266+
migrator := db
267+
if opts.Timeout > 0 {
268+
ctx, cancel := context.WithTimeout(context.Background(), opts.Timeout)
269+
defer cancel()
270+
migrator = db.WithContext(ctx)
271+
}
272+
if err := migrator.AutoMigrate(migrateModels...); err != nil {
255273
return fmt.Errorf("failed to migrate database: %w", err)
256274
}
257275

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package storage
2+
3+
import (
4+
"testing"
5+
"time"
6+
)
7+
8+
func TestMigrateTimeoutFromEnv(t *testing.T) {
9+
cases := []struct {
10+
name string
11+
env string
12+
want time.Duration
13+
}{
14+
{"default when unset", "", 60 * time.Second},
15+
{"explicit 30s", "30", 30 * time.Second},
16+
{"explicit 0 = opt out", "0", 0},
17+
{"negative falls back", "-5", 60 * time.Second},
18+
{"malformed falls back", "abc", 60 * time.Second},
19+
{"caps at 1h", "7200", time.Hour},
20+
{"surrounding whitespace", " 120 ", 120 * time.Second},
21+
}
22+
for _, tc := range cases {
23+
t.Run(tc.name, func(t *testing.T) {
24+
if tc.env == "" {
25+
t.Setenv("DB_MIGRATE_TIMEOUT_SECS", "")
26+
} else {
27+
t.Setenv("DB_MIGRATE_TIMEOUT_SECS", tc.env)
28+
}
29+
if got := migrateTimeoutFromEnv(); got != tc.want {
30+
t.Errorf("env=%q -> got %v, want %v", tc.env, got, tc.want)
31+
}
32+
})
33+
}
34+
}
35+
36+
// TestAutoMigrateModels_RespectsTimeout exercises the timeout-bounded path
37+
// against an in-memory SQLite DB. SQLite migrations are too fast to actually
38+
// trip the deadline, so this test asserts: (a) Timeout > 0 doesn't break
39+
// the happy path, (b) Timeout=0 still works (legacy unbounded behaviour).
40+
//
41+
// A real Postgres lock-contention test would need a live Postgres + advisory
42+
// lock and lives in pg_integration_test.go behind an external-DB build tag.
43+
func TestAutoMigrateModels_RespectsTimeout(t *testing.T) {
44+
t.Run("WithTimeout", func(t *testing.T) {
45+
db, err := NewDatabase("sqlite", ":memory:")
46+
if err != nil {
47+
t.Fatalf("NewDatabase: %v", err)
48+
}
49+
defer func() { sqlDB, _ := db.DB(); _ = sqlDB.Close() }()
50+
opts := MigrateOptions{Timeout: 30 * time.Second}
51+
if err := AutoMigrateModelsWithOptions(db, "sqlite", opts); err != nil {
52+
t.Fatalf("AutoMigrate with timeout: %v", err)
53+
}
54+
if !db.Migrator().HasTable("traces") {
55+
t.Fatalf("expected traces table to be created")
56+
}
57+
})
58+
59+
t.Run("ZeroTimeoutLegacyPath", func(t *testing.T) {
60+
db, err := NewDatabase("sqlite", ":memory:")
61+
if err != nil {
62+
t.Fatalf("NewDatabase: %v", err)
63+
}
64+
defer func() { sqlDB, _ := db.DB(); _ = sqlDB.Close() }()
65+
opts := MigrateOptions{Timeout: 0}
66+
if err := AutoMigrateModelsWithOptions(db, "sqlite", opts); err != nil {
67+
t.Fatalf("AutoMigrate without timeout: %v", err)
68+
}
69+
if !db.Migrator().HasTable("traces") {
70+
t.Fatalf("expected traces table to be created")
71+
}
72+
})
73+
}

internal/storage/repository.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,39 @@ func partitionLookaheadFromEnv() int {
3030
return n
3131
}
3232

33+
// migrateTimeoutFromEnv reads DB_MIGRATE_TIMEOUT_SECS, defaulting to 60s
34+
// when unset, malformed, or non-positive. The migration timeout bounds
35+
// db.AutoMigrate so a Postgres relation-lock wait cannot hang startup
36+
// indefinitely. Set to 0 explicitly to opt out (legacy unbounded behaviour).
37+
//
38+
// Cap at 1 hour — anyone needing a longer migration window should run
39+
// schema changes out-of-band with versioned migrations and DB_AUTOMIGRATE=false.
40+
func migrateTimeoutFromEnv() time.Duration {
41+
const (
42+
defaultTimeout = 60 * time.Second
43+
maxTimeout = time.Hour
44+
)
45+
v := strings.TrimSpace(os.Getenv("DB_MIGRATE_TIMEOUT_SECS"))
46+
if v == "" {
47+
return defaultTimeout
48+
}
49+
n, err := strconv.Atoi(v)
50+
if err != nil {
51+
return defaultTimeout
52+
}
53+
if n == 0 {
54+
return 0 // explicit opt-out — preserves legacy behaviour
55+
}
56+
if n < 0 {
57+
return defaultTimeout
58+
}
59+
d := time.Duration(n) * time.Second
60+
if d > maxTimeout {
61+
return maxTimeout
62+
}
63+
return d
64+
}
65+
3366
// likeOpFor returns the case-insensitive LIKE operator for the given driver.
3467
// Postgres LIKE is case-sensitive; SQLite/MySQL LIKE is case-insensitive by default.
3568
// Callers should embed the returned token directly into SQL fragments.
@@ -109,6 +142,7 @@ func NewRepository(metrics *telemetry.Metrics) (*Repository, error) {
109142
opts := MigrateOptions{
110143
PostgresPartitioning: strings.ToLower(strings.TrimSpace(os.Getenv("DB_POSTGRES_PARTITIONING"))),
111144
PartitionLookaheadDays: partitionLookaheadFromEnv(),
145+
Timeout: migrateTimeoutFromEnv(),
112146
}
113147
if err := AutoMigrateModelsWithOptions(db, driver, opts); err != nil {
114148
return nil, err

0 commit comments

Comments
 (0)