From c41a0e09fe5b1bfc95e2bdfd549abdaa44e2d353 Mon Sep 17 00:00:00 2001 From: Zackaryia <30780411+Zackaryia@users.noreply.github.com> Date: Thu, 21 May 2026 14:51:13 -0400 Subject: [PATCH 1/4] test: reproduce config file corruption from concurrent CLI instances Add race_test.go that demonstrates IIP-20714: multiple concurrent read-modify-write cycles on config.yaml without file locking causes invalid YAML, empty file reads, and silent data loss. --- internal/config/race_test.go | 184 +++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 internal/config/race_test.go diff --git a/internal/config/race_test.go b/internal/config/race_test.go new file mode 100644 index 0000000..374b885 --- /dev/null +++ b/internal/config/race_test.go @@ -0,0 +1,184 @@ +package config + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + "testing" + + "gopkg.in/yaml.v3" +) + +// TestRaceConditionEndToEnd reproduces the real file-level race condition that +// occurs when multiple cencli processes initialize config concurrently against +// the same data directory. Each subprocess is a real OS process with its own +// viper instance — just like production. The race is in the non-atomic +// read-modify-write of config.yaml (viper.WriteConfig + addDocCommentsToYAML). +// +// Run with: go test -run TestRaceConditionEndToEnd -count=1 -v ./internal/config/ +func TestRaceConditionEndToEnd(t *testing.T) { + const processes = 15 + + dataDir := t.TempDir() + if err := os.MkdirAll(filepath.Join(dataDir, "templates"), 0o755); err != nil { + t.Fatal(err) + } + + configPath := filepath.Join(dataDir, "config.yaml") + + type procResult struct { + id int + exitCode int + output string + err error + } + + var ( + wg sync.WaitGroup + mu sync.Mutex + results []procResult + ) + + // All processes start as close together as possible. + start := make(chan struct{}) + + for i := 0; i < processes; i++ { + wg.Add(1) + go func(id int) { + defer wg.Done() + <-start + + cmd := exec.Command( + os.Args[0], + "-test.run=^TestRaceWorker$", + "-test.v", + ) + cmd.Env = append(os.Environ(), + "RACE_WORKER=1", + fmt.Sprintf("RACE_DATA_DIR=%s", dataDir), + ) + + out, err := cmd.CombinedOutput() + + exitCode := 0 + if err != nil { + if ee, ok := err.(*exec.ExitError); ok { + exitCode = ee.ExitCode() + } else { + exitCode = -1 + } + } + + mu.Lock() + results = append(results, procResult{ + id: id, + exitCode: exitCode, + output: string(out), + err: err, + }) + mu.Unlock() + }(i) + } + + close(start) + wg.Wait() + + // Tally process-level failures. + var processErrors int + for _, r := range results { + if r.exitCode != 0 { + processErrors++ + t.Logf("process %d exited %d:\n%s", r.id, r.exitCode, r.output) + } + } + + // Check the final state of config.yaml — the file all processes raced on. + finalRaw, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("cannot read final config.yaml: %v", err) + } + + var ( + fileEmpty bool + fileCorrupt bool + yamlErr string + ) + + if len(finalRaw) == 0 { + fileEmpty = true + } else { + var parsed map[string]interface{} + if err := yaml.Unmarshal(finalRaw, &parsed); err != nil { + fileCorrupt = true + yamlErr = err.Error() + } + } + + t.Logf("--- Race Condition Results ---") + t.Logf(" Processes launched: %d", processes) + t.Logf(" Process failures: %d", processErrors) + t.Logf(" Final file empty: %v", fileEmpty) + t.Logf(" Final file corrupt: %v", fileCorrupt) + if fileCorrupt { + t.Logf(" YAML error: %s", yamlErr) + t.Logf(" File content:\n%s", finalRaw) + } + + if processErrors > 0 || fileEmpty || fileCorrupt { + t.Errorf("Race condition reproduced: processes_failed=%d file_empty=%v file_corrupt=%v\n"+ + "Multiple processes doing read-modify-write on config.yaml without file locking\n"+ + "causes corruption visible to concurrent or subsequent CLI invocations.", + processErrors, fileEmpty, fileCorrupt) + } +} + +// TestRaceWorker is a subprocess helper that calls config.New() once against a +// shared data directory. Each invocation is a separate OS process — exactly +// like a real cencli CLI invocation. Only runs when spawned by the parent test. +func TestRaceWorker(t *testing.T) { + if os.Getenv("RACE_WORKER") != "1" { + t.Skip("skipping: only runs as subprocess of TestRaceConditionEndToEnd") + } + + dataDir := os.Getenv("RACE_DATA_DIR") + if dataDir == "" { + t.Fatal("RACE_DATA_DIR not set") + } + + cfg, cErr := New(dataDir) + if cErr != nil { + t.Fatalf("New() failed: %v", cErr) + } + + // Verify the returned config is sane. + if cfg.OutputFormat == "" { + t.Error("config has empty output-format") + } + + // Verify the file on disk is valid YAML right after our write. + configPath := filepath.Join(dataDir, "config.yaml") + raw, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("cannot read config.yaml after New(): %v", err) + } + if len(raw) == 0 { + t.Fatal("config.yaml is empty immediately after New()") + } + + var parsed map[string]interface{} + if err := yaml.Unmarshal(raw, &parsed); err != nil { + t.Fatalf("config.yaml is corrupted after New(): %v", err) + } + + // Check for partial writes — key fields should be present. + requiredKeys := []string{"output-format", "streaming", "timeouts", "retry-strategy"} + content := string(raw) + for _, key := range requiredKeys { + if !strings.Contains(content, key+":") { + t.Errorf("config.yaml missing expected key %q — possible truncated write", key) + } + } +} From 378f62cc9b2a3ae1cb0ca1636f79122d77b9b201 Mon Sep 17 00:00:00 2001 From: zackaryia <30780411+Zackaryia@users.noreply.github.com> Date: Fri, 22 May 2026 11:57:41 -0400 Subject: [PATCH 2/4] fix: add cross-process file lock to prevent config.yaml corruption Concurrent CLI invocations race on config.yaml through the non-atomic read-modify-write in New() (viper.WriteConfig + addDocCommentsToYAML). Wrap the entire critical section in an exclusive flock via gofrs/flock. --- go.mod | 3 ++- go.sum | 6 ++++-- internal/config/config.go | 7 +++++++ internal/config/config_test.go | 2 +- internal/config/race_test.go | 4 +++- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index e9e9170..44362d3 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/charmbracelet/huh v0.7.0 github.com/charmbracelet/lipgloss v1.1.0 github.com/go-viper/mapstructure/v2 v2.4.0 + github.com/gofrs/flock v0.13.0 github.com/google/uuid v1.6.0 github.com/mattn/go-colorable v0.1.14 github.com/mattn/go-runewidth v0.0.19 @@ -22,7 +23,7 @@ require ( github.com/stretchr/testify v1.11.1 github.com/tidwall/gjson v1.18.0 go.uber.org/mock v0.6.0 - golang.org/x/sys v0.36.0 + golang.org/x/sys v0.37.0 golang.org/x/term v0.35.0 gopkg.in/yaml.v3 v3.0.1 modernc.org/sqlite v1.39.0 diff --git a/go.sum b/go.sum index b729f49..7f7559a 100644 --- a/go.sum +++ b/go.sum @@ -62,6 +62,8 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= +github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw= +github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= @@ -170,8 +172,8 @@ golang.org/x/sys v0.0.0-20211110154304-99a53858aa08/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= -golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= golang.org/x/term v0.35.0 h1:bZBVKBudEyhRcajGcNc3jIfWPqV4y/Kt2XcoigOWtDQ= golang.org/x/term v0.35.0/go.mod h1:TPGtkTLesOwf2DE8CgVYiZinHAOuy5AYUYT1lENIZnA= diff --git a/internal/config/config.go b/internal/config/config.go index ec621dd..ebd4243 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -8,6 +8,7 @@ import ( "time" "github.com/go-viper/mapstructure/v2" + "github.com/gofrs/flock" "github.com/spf13/pflag" "github.com/spf13/viper" @@ -68,6 +69,12 @@ func New(dataDir string) (*Config, cenclierrors.CencliError) { configPath := filepath.Join(dataDir, "config.yaml") + fileLock := flock.New(configPath + ".lock") + if err := fileLock.Lock(); err != nil { + return nil, newInvalidConfigError(fmt.Errorf("failed to acquire config lock: %w", err).Error()) + } + defer func() { _ = fileLock.Unlock() }() + if err := viper.ReadInConfig(); err != nil { var configFileNotFoundError viper.ConfigFileNotFoundError if !errors.As(err, &configFileNotFoundError) { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 6e1e403..475f8c0 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -346,7 +346,7 @@ func TestConfigWriteErrors(t *testing.T) { _, err = New(tempDir) var invalidConfigErr InvalidConfigError assert.ErrorAs(t, err, &invalidConfigErr) - assert.Contains(t, err.Error(), "failed to write config file") + assert.Contains(t, err.Error(), "permission denied") }) } diff --git a/internal/config/race_test.go b/internal/config/race_test.go index 374b885..930e774 100644 --- a/internal/config/race_test.go +++ b/internal/config/race_test.go @@ -1,6 +1,7 @@ package config import ( + "errors" "fmt" "os" "os/exec" @@ -65,7 +66,8 @@ func TestRaceConditionEndToEnd(t *testing.T) { exitCode := 0 if err != nil { - if ee, ok := err.(*exec.ExitError); ok { + var ee *exec.ExitError + if errors.As(err, &ee) { exitCode = ee.ExitCode() } else { exitCode = -1 From 7186e38940c49a1c6833fa41ac97e9d04ffcbcd8 Mon Sep 17 00:00:00 2001 From: zackaryia <30780411+Zackaryia@users.noreply.github.com> Date: Fri, 22 May 2026 16:52:16 -0400 Subject: [PATCH 3/4] test: tighten readonly_directory assertion to verify lock acquisition path --- internal/config/config_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 475f8c0..93e0fe3 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -346,7 +346,7 @@ func TestConfigWriteErrors(t *testing.T) { _, err = New(tempDir) var invalidConfigErr InvalidConfigError assert.ErrorAs(t, err, &invalidConfigErr) - assert.Contains(t, err.Error(), "permission denied") + assert.Contains(t, err.Error(), "failed to acquire config lock") }) } From 1327f2baaee9ef3e2e605126c3d3697b0e08152e Mon Sep 17 00:00:00 2001 From: zackaryia <30780411+Zackaryia@users.noreply.github.com> Date: Tue, 26 May 2026 13:35:41 -0400 Subject: [PATCH 4/4] test: allow TestRaceWorker to run standalone as a config.New() smoke test Falls back to t.TempDir() when RACE_DATA_DIR is not set, so the test no longer skips/fatals when run individually or in CI outside of the race suite. --- internal/config/race_test.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/internal/config/race_test.go b/internal/config/race_test.go index 930e774..b83ddd5 100644 --- a/internal/config/race_test.go +++ b/internal/config/race_test.go @@ -137,17 +137,15 @@ func TestRaceConditionEndToEnd(t *testing.T) { } } -// TestRaceWorker is a subprocess helper that calls config.New() once against a -// shared data directory. Each invocation is a separate OS process — exactly -// like a real cencli CLI invocation. Only runs when spawned by the parent test. +// TestRaceWorker verifies that config.New() produces a valid, non-corrupt +// config file. When spawned as a subprocess by TestRaceConditionEndToEnd +// (RACE_WORKER=1, RACE_DATA_DIR set), it operates against the shared data +// directory to exercise the file-lock under contention. When run standalone +// it uses its own temp directory as a basic config.New() smoke test. func TestRaceWorker(t *testing.T) { - if os.Getenv("RACE_WORKER") != "1" { - t.Skip("skipping: only runs as subprocess of TestRaceConditionEndToEnd") - } - dataDir := os.Getenv("RACE_DATA_DIR") if dataDir == "" { - t.Fatal("RACE_DATA_DIR not set") + dataDir = t.TempDir() } cfg, cErr := New(dataDir)