Skip to content

Commit bdadb79

Browse files
committed
kai 0.12.5: MCP single-instance guard + demo teardown
MCP reconcile-on-start: scan .kai/mcp-session-*.json and decide per entry: (1) dead PID -> delete the stale file; (2) alive, parent dead -> orphan: SIGTERM it and take over; (3) alive, parent alive -> live sibling: refuse to start with a clear error. This prevents N MCP servers from stacking fsnotify watchers on the same workdir (each server was logging its own push/skip for every event, flooding the sync feed). Gating takeover on "parent dead" makes a kill ping-pong impossible when two Claude windows share a repo. Demo livesync scripts: the new docs/teardown-livesync.sh kills the livesync tmux session and any `kai mcp serve` process whose cwd is under /tmp/demo-*; setup-livesync.sh and layout-livesync.sh call it before rebuilding state. Also replace sed with awk in the feed (macOS sed has no line-buffer flag) and drop SKIP events from the sync feed so push/receive/merge aren't drowned out.
1 parent bca2d59 commit bdadb79

5 files changed

Lines changed: 194 additions & 39 deletions

File tree

docs/layout-livesync.sh

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ for d in a b c d; do
1919
done
2020

2121
SESSION="livesync"
22-
tmux kill-session -t "$SESSION" 2>/dev/null || true
22+
23+
# Kill any MCP servers and the tmux session from a prior demo run. Skipping
24+
# this leaves stale fsnotify watchers attached to /tmp/demo-* that flood the
25+
# sync feed with skip events.
26+
bash "$(dirname "$0")/teardown-livesync.sh"
2327

2428
# ── Build the layout ─────────────────────────────────────────────────
2529
# Final shape:
@@ -35,64 +39,75 @@ tmux kill-session -t "$SESSION" 2>/dev/null || true
3539
tmux new-session -d -s "$SESSION" -x 240 -y 60 -c /tmp/demo-a
3640
tmux rename-window -t "$SESSION:0" livesync
3741

38-
# Reserve the bottom ~18% for the sync feed.
39-
tmux split-window -v -l 18% -t "$SESSION:0" -c /tmp/demo-a
42+
# Capture pane IDs as we create them so we don't depend on numeric
43+
# indices (which shift under `pane-base-index 1` and across tmux
44+
# versions). Pane IDs look like "%17" and are stable.
45+
A=$(tmux display-message -p -t "$SESSION:0" '#{pane_id}')
46+
47+
# Reserve the bottom ~35% for the sync feed (it's the narration track
48+
# of the demo — it needs to be tall enough to actually read).
49+
FEED=$(tmux split-window -v -l 35% -t "$A" -c /tmp/demo-a -P -F '#{pane_id}')
4050

41-
# Top pane (now pane 0) becomes the 2×2. Pane 1 is the feed.
42-
tmux split-window -h -l 50% -t "$SESSION:0.0" -c /tmp/demo-b
43-
tmux split-window -v -l 50% -t "$SESSION:0.0" -c /tmp/demo-c
44-
tmux split-window -v -l 50% -t "$SESSION:0.2" -c /tmp/demo-d
51+
# Turn the top pane into a 2×2.
52+
B=$(tmux split-window -h -l 50% -t "$A" -c /tmp/demo-b -P -F '#{pane_id}')
53+
C=$(tmux split-window -v -l 50% -t "$A" -c /tmp/demo-c -P -F '#{pane_id}')
54+
D=$(tmux split-window -v -l 50% -t "$B" -c /tmp/demo-d -P -F '#{pane_id}')
4555

4656
# Wire each agent pane: clear + show which agent + cd into its dir.
47-
# Pane indices after the splits above:
48-
# 0 = Agent A (top-left)
49-
# 2 = Agent B (top-right)
50-
# 3 = Agent C (bottom-left)
51-
# 4 = Agent D (bottom-right)
52-
# 1 = Sync feed (bottom strip)
53-
tmux send-keys -t "$SESSION:0.0" 'clear; printf "\033[1;31mAGENT A — backend\033[0m (/tmp/demo-a)\n"' C-m
54-
tmux send-keys -t "$SESSION:0.2" 'clear; printf "\033[1;34mAGENT B — tests\033[0m (/tmp/demo-b)\n"' C-m
55-
tmux send-keys -t "$SESSION:0.3" 'clear; printf "\033[1;32mAGENT C — frontend\033[0m (/tmp/demo-c)\n"' C-m
56-
tmux send-keys -t "$SESSION:0.4" 'clear; printf "\033[1;33mAGENT D — docs\033[0m (/tmp/demo-d)\n"' C-m
57+
tmux send-keys -t "$A" 'clear; printf "\033[1;31mAGENT A — backend\033[0m (/tmp/demo-a)\n"' C-m
58+
tmux send-keys -t "$B" 'clear; printf "\033[1;34mAGENT B — tests\033[0m (/tmp/demo-b)\n"' C-m
59+
tmux send-keys -t "$C" 'clear; printf "\033[1;32mAGENT C — frontend\033[0m (/tmp/demo-c)\n"' C-m
60+
tmux send-keys -t "$D" 'clear; printf "\033[1;33mAGENT D — docs\033[0m (/tmp/demo-d)\n"' C-m
5761

5862
# Sync-feed pane: tail every agent's sync log, prefixed with the agent
5963
# letter. jq is used when available to pretty-print; otherwise raw JSONL.
60-
FEED_CMD=$(cat <<'EOS'
64+
#
65+
# We write the feed program to a file and exec it instead of piping the
66+
# whole multi-line if/then/fi block through `send-keys` — typing that
67+
# into an interactive shell is fragile (PS2 continuation hazards,
68+
# quoting interactions). A script file is parsed as a single unit.
69+
FEED_SCRIPT="/tmp/demo-livesync-feed.sh"
70+
cat >"$FEED_SCRIPT" <<'EOS'
71+
#!/usr/bin/env bash
6172
clear
6273
printf "\033[1mSYNC FEED\033[0m (push/recv across all four agents)\n\n"
6374
TODAY=$(date +%Y-%m-%d)
6475
if command -v jq >/dev/null 2>&1; then
6576
{
6677
for d in a b c d; do
67-
tail -F /tmp/demo-$d/.kai/sync-log/$TODAY.jsonl 2>/dev/null | \
68-
sed "s/^/$d /" &
78+
tail -F "/tmp/demo-$d/.kai/sync-log/$TODAY.jsonl" 2>/dev/null \
79+
| awk -v p="$d " '{print p $0; fflush()}' &
6980
done
7081
wait
71-
} | jq -r --unbuffered '
82+
} | jq -R -r --unbuffered '
7283
def color(e):
73-
if e == "push" then "\u001b[1;32m"
74-
elif e == "recv" then "\u001b[2m"
75-
elif e == "merge" then "\u001b[1;33m"
76-
elif e == "conflict" then "\u001b[1;31m"
77-
else "\u001b[0m" end;
78-
def tag(l): if l=="a" then "\u001b[31mA" elif l=="b" then "\u001b[34mB"
79-
elif l=="c" then "\u001b[32mC" elif l=="d" then "\u001b[33mD"
80-
else l end;
81-
(input_line_number|tostring) as $ln |
82-
. as $line |
83-
($line | split(" ") | .[0]) as $letter |
84-
($line | .[2:] | fromjson? // {event:"(parse err)", file:$line}) as $ev |
85-
"\(tag($letter))\u001b[0m \(color($ev.event))\( ($ev.timestamp // 0) / 1000 | strftime("%H:%M:%S") ) \($ev.event | ascii_upcase) \($ev.file // "")\u001b[0m"
84+
if e == "push" then ""
85+
elif e == "receive" then ""
86+
elif e == "merge" then ""
87+
elif e == "conflict" then ""
88+
else "" end;
89+
def tag(l):
90+
if l == "a" then "A"
91+
elif l == "b" then "B"
92+
elif l == "c" then "C"
93+
elif l == "d" then "D"
94+
else l end;
95+
. as $line
96+
| ($line | split(" ") | .[0]) as $letter
97+
| ($line | .[2:] | fromjson? // {event:"(parse err)", file:$line}) as $ev
98+
| select($ev.event != "skip")
99+
| "\(tag($letter)) \(color($ev.event))\(($ev.timestamp // 0) / 1000 | strftime("%H:%M:%S")) \($ev.event | ascii_upcase) \($ev.file // "")"
86100
'
87101
else
88102
for d in a b c d; do
89-
tail -F /tmp/demo-$d/.kai/sync-log/$TODAY.jsonl 2>/dev/null | sed "s/^/[$d] /" &
103+
tail -F "/tmp/demo-$d/.kai/sync-log/$TODAY.jsonl" 2>/dev/null \
104+
| awk -v p="[$d] " '{print p $0; fflush()}' &
90105
done
91106
wait
92107
fi
93108
EOS
94-
)
95-
tmux send-keys -t "$SESSION:0.1" "$FEED_CMD" C-m
109+
chmod +x "$FEED_SCRIPT"
110+
tmux send-keys -t "$FEED" "exec bash $FEED_SCRIPT" C-m
96111

97112
echo "Attaching to tmux session '$SESSION'…"
98113
echo " Ctrl-B q = show pane numbers"

docs/setup-livesync.sh

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ kai version | grep -qE '0\.(1[2-9]|[2-9][0-9])' || {
1010
echo "need kai >= 0.12.3"; exit 1
1111
}
1212

13+
# Kill any MCP servers / tmux session left from a prior demo run. Otherwise
14+
# their fsnotify watchers stay attached and flood the new sync log with
15+
# skip-events (see docs/teardown-livesync.sh).
16+
bash "$(dirname "$0")/teardown-livesync.sh"
17+
1318
rm -rf /tmp/demo-a /tmp/demo-b /tmp/demo-c /tmp/demo-d
1419

1520
# ── Agent A's dir is the seed: init, push once, then clone for b/c/d. ──
@@ -55,9 +60,22 @@ eval "$(kai remote get origin | awk '
5560
FULL_URL="${URL%/}/$TENANT/$REPO"
5661
echo "seed published at: $FULL_URL"
5762

58-
# ── Clone the same kai repo into /tmp/demo-{b,c,d} (working tree only). ──
63+
# ── Clone the same kai repo into /tmp/demo-{b,c,d}, then init a local
64+
# git repo in each so all four dirs are symmetric. We use --kai-only
65+
# because the kaicontext server doesn't serve git refs (kai push
66+
# uploads the snapshot, not a git remote). Each B/C/D gets an
67+
# independent local git history rooted at the scaffold state — fine
68+
# for the demo, since agents sync via kai, not git. ──
5969
for d in b c d; do
6070
kai clone "$FULL_URL" "/tmp/demo-$d" --kai-only
71+
(
72+
cd "/tmp/demo-$d"
73+
git init -q -b main
74+
git config user.email demo@demo
75+
git config user.name Demo
76+
git config commit.gpgsign false
77+
git add -A && git commit -q -m "scaffold"
78+
)
6179
done
6280

6381
echo

docs/teardown-livesync.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env bash
# Kills MCP servers and the tmux session left over from a previous livesync
# demo. Scoped to the demo dirs (/tmp/demo-{a,b,c,d}) so it can't touch real
# MCP servers you use for work.
#
# Called from setup-livesync.sh and layout-livesync.sh before they rebuild
# state. Safe to run standalone: `bash docs/teardown-livesync.sh`.

set -u

# Best effort: the session may simply not exist.
tmux kill-session -t livesync 2>/dev/null || true

# Find `kai mcp serve` processes whose cwd is inside a demo dir. We resolve
# cwd via lsof because ps on macOS doesn't expose it. Anything outside the
# demo dirs is left alone — a developer may have a real MCP server open.
pids=$(pgrep -f "kai mcp serve" 2>/dev/null || true)
for pid in $pids; do
  # -a ANDs the -p and -d selections. Without it lsof ORs them and also
  # lists the cwd of every other process, so the first "n" record could
  # belong to an unrelated PID.
  cwd=$(lsof -a -p "$pid" -d cwd -Fn 2>/dev/null | awk '/^n/ {print substr($0,2); exit}')
  case "$cwd" in
    # On macOS /tmp is a symlink to /private/tmp and the kernel reports the
    # resolved path, so both spellings must be accepted.
    /tmp/demo-[a-d]|/tmp/demo-[a-d]/*|/private/tmp/demo-[a-d]|/private/tmp/demo-[a-d]/*)
      kill "$pid" 2>/dev/null || true
      ;;
  esac
done

kai-cli/cmd/kai/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ const (
7070
)
7171

7272
// Version is the current kai CLI version
73-
var Version = "0.12.4"
73+
var Version = "0.12.5"
7474

7575
// verbose enables debug output when --verbose/-v flag or KAI_VERBOSE env var is set
7676
var verbose bool

kai-cli/internal/mcp/server.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ import (
1515
"os/exec"
1616
"path/filepath"
1717
"sort"
18+
"strconv"
1819
"strings"
1920
"sync"
2021
"sync/atomic"
22+
"syscall"
2123
"time"
2224
"unicode"
2325

@@ -211,6 +213,13 @@ func (s *Server) Serve(ctx context.Context) error {
211213

212214
s.registerTools(srv)
213215

216+
// Reconcile any stale or orphaned MCP sessions from prior runs before
217+
// we take over the workdir. If a sibling session is alive and healthy,
218+
// this returns an error and we refuse to start.
219+
if err := s.reconcileExistingSessions(); err != nil {
220+
return err
221+
}
222+
214223
// Write MCP session file so kai capture can detect active AI sessions.
215224
// The initial write uses the "mcp-client" fallback; the after-initialize
216225
// hook rewrites it with the real client identity as soon as the
@@ -500,6 +509,96 @@ func (s *Server) removeSessionFile() {
500509
os.Remove(s.sessionFilePath())
501510
}
502511

512+
// reconcileExistingSessions handles any other MCP session files present in
513+
// s.kaiDir before we start watching files. Running multiple MCP servers in
514+
// the same workdir causes every one of them to attach a separate fsnotify
515+
// watcher and log its own push/skip for every event — the sync feed fills
516+
// with noise.
517+
//
518+
// For each stale session file we find:
519+
//
520+
// - If the PID is dead → leftover from a prior crash. Delete the file.
521+
// - If the PID is alive and its parent is dead → the parent Claude exited
522+
// without taking its MCP child with it. Safe to take over: SIGTERM the
523+
// orphan and delete its session file.
524+
// - If the PID is alive and its parent is also alive → a legitimate
525+
// concurrent Claude session is running here. Refuse to start so we
526+
// don't fight over fsnotify events (and so we don't enter a ping-pong
527+
// kill loop where each MCP kills the other on restart).
528+
//
529+
// The "parent alive" check is what makes this safe: once an MCP is
530+
// orphaned we take over permanently; once a real sibling exists, we
531+
// always refuse. There's no state that flips back and forth.
532+
func (s *Server) reconcileExistingSessions() error {
533+
entries, err := os.ReadDir(s.kaiDir)
534+
if err != nil {
535+
return nil
536+
}
537+
myPid := os.Getpid()
538+
for _, e := range entries {
539+
if e.IsDir() {
540+
continue
541+
}
542+
name := e.Name()
543+
if !strings.HasPrefix(name, "mcp-session-") || !strings.HasSuffix(name, ".json") {
544+
continue
545+
}
546+
pidStr := strings.TrimSuffix(strings.TrimPrefix(name, "mcp-session-"), ".json")
547+
pid, convErr := strconv.Atoi(pidStr)
548+
if convErr != nil || pid == myPid {
549+
continue
550+
}
551+
path := filepath.Join(s.kaiDir, name)
552+
553+
if !processAlive(pid) {
554+
os.Remove(path)
555+
continue
556+
}
557+
558+
ppid, ok := parentPID(pid)
559+
if !ok || !processAlive(ppid) {
560+
// Orphan — parent Claude is gone. Take over.
561+
_ = syscall.Kill(pid, syscall.SIGTERM)
562+
os.Remove(path)
563+
continue
564+
}
565+
566+
// Live sibling. Refuse to start; let the user pick which Claude to close.
567+
return fmt.Errorf("another MCP server is already running in %s (pid %d, parent pid %d). "+
568+
"Close one of the Claude sessions for this workdir, or kill pid %d, then retry",
569+
s.workDir, pid, ppid, pid)
570+
}
571+
return nil
572+
}
573+
574+
// processAlive reports whether pid refers to an existing process, probed
// with a signal-0 kill. Success means the process exists and is signalable;
// EPERM means it exists but belongs to someone else, which still counts as
// alive. Any other error, or a non-positive pid, yields false.
func processAlive(pid int) bool {
	// Non-positive pids are never treated as a live process.
	if pid <= 0 {
		return false
	}
	switch probeErr := syscall.Kill(pid, 0); probeErr {
	case nil, syscall.EPERM:
		return true
	default:
		return false
	}
}
587+
588+
// parentPID looks up the parent process ID of pid by running
// `ps -o ppid= -p <pid>` and parsing its trimmed output as an integer.
// Shelling out to ps keeps this portable across macOS and Linux without
// platform-specific sysctl code.
//
// The second result is false (and the first is 0) when ps fails or its
// output is not a valid integer.
func parentPID(pid int) (int, bool) {
	lookup := exec.Command("ps", "-o", "ppid=", "-p", strconv.Itoa(pid))
	raw, runErr := lookup.Output()
	if runErr != nil {
		return 0, false
	}
	field := strings.TrimSpace(string(raw))
	if parsed, parseErr := strconv.Atoi(field); parseErr == nil {
		return parsed, true
	}
	return 0, false
}
601+
503602
// Close cleans up resources.
504603
func (s *Server) Close() error {
505604
s.mu.Lock()

0 commit comments

Comments
 (0)