Skip to content
32 changes: 23 additions & 9 deletions backend/internal/adapters/telemetry/posthog.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,37 @@ var remotePayloadAllowlist = map[string]map[string]struct{}{
"command_path": {},
},
"ao.cli.usage_errors": {
"component": {},
"command": {},
"command_path": {},
"error_kind": {},
"fingerprint": {},
"operation": {},
},
"ao.daemon.panic": {
"method": {},
"path": {},
"panic_kind": {},
"component": {},
"fingerprint": {},
"method": {},
"operation": {},
"path": {},
"panic_kind": {},
"stack_fingerprint": {},
},
"ao.daemon.started": {
"agent": {},
"port": {},
},
"ao.http.5xx": {
"duration": {},
"error_code": {},
"error_kind": {},
"method": {},
"path": {},
"status": {},
"component": {},
"duration": {},
"error_code": {},
"error_kind": {},
"fingerprint": {},
"method": {},
"operation": {},
"path": {},
"status": {},
"status_family": {},
},
"ao.onboarding.first_project_added": {
"has_git_remote": {},
Expand All @@ -70,11 +81,14 @@ var remotePayloadAllowlist = map[string]map[string]struct{}{
"kind": {},
},
"ao.session.spawn_failed": {
"component": {},
"duration_ms": {},
"error_code": {},
"error_kind": {},
"fingerprint": {},
"harness": {},
"kind": {},
"operation": {},
},
"ao.session.spawned": {
"duration_ms": {},
Expand Down
20 changes: 15 additions & 5 deletions backend/internal/adapters/telemetry/posthog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,15 @@ func TestPostHogSinkSanitizesPayloads(t *testing.T) {
OccurredAt: time.Unix(1700000000, 0).UTC(),
Level: ports.TelemetryLevelError,
Payload: map[string]any{
"method": http.MethodGet,
"path": "/api/v1/sessions/demo",
"panic_kind": "error",
"panic": "open /Users/name/private: no such file",
"stack": "stack trace with local path",
"component": "httpd",
"operation": "http_request_panic",
"method": http.MethodGet,
"path": "/api/v1/sessions/demo",
"panic_kind": "error",
"fingerprint": "abc123",
"stack_fingerprint": "def456",
"panic": "open /Users/name/private: no such file",
"stack": "stack trace with local path",
},
})
if err := sink.Close(context.Background()); err != nil {
Expand All @@ -110,9 +114,15 @@ func TestPostHogSinkSanitizesPayloads(t *testing.T) {
if !ok {
t.Fatalf("properties type = %T, want map[string]any", req["properties"])
}
if props["component"] != "httpd" || props["operation"] != "http_request_panic" {
t.Fatalf("sanitized properties = %#v, want allowlisted metadata", props)
}
if props["method"] != http.MethodGet || props["path"] != "/api/v1/sessions/demo" || props["panic_kind"] != "error" {
t.Fatalf("sanitized properties = %#v, want allowlisted fields", props)
}
if props["fingerprint"] != "abc123" || props["stack_fingerprint"] != "def456" {
t.Fatalf("sanitized properties = %#v, want exported fingerprints", props)
}
if _, ok := props["panic"]; ok {
t.Fatalf("panic property should be dropped: %#v", props)
}
Expand Down
41 changes: 15 additions & 26 deletions backend/internal/httpd/log.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
package httpd

import (
"errors"
"log/slog"
"net/http"
"strconv"
"time"

"github.com/go-chi/chi/v5/middleware"

"github.com/aoagents/agent-orchestrator/backend/internal/httpd/apierr"
"github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope"
"github.com/aoagents/agent-orchestrator/backend/internal/ports"
"github.com/aoagents/agent-orchestrator/backend/internal/telemetrymeta"
)

// requestLogger emits one structured access-log line per request via the
Expand Down Expand Up @@ -48,21 +48,23 @@ func requestLogger(log *slog.Logger, sink ports.EventSink) func(http.Handler) ht
}
log.Info("http request", attrs...)
if sink != nil && ww.Status() >= http.StatusInternalServerError {
path := telemetrymeta.RoutePattern(r)
payload := map[string]any{
"method": r.Method,
"path": r.URL.Path,
"status": ww.Status(),
"duration": time.Since(start).Milliseconds(),
"component": "httpd",
"operation": "http_request",
"method": r.Method,
"path": path,
"status": ww.Status(),
"status_family": telemetrymeta.StatusFamily(ww.Status()),
"duration": time.Since(start).Milliseconds(),
}
if err := capturedErr(); err != nil {
payload["error_kind"] = "internal"
var apiErr *apierr.Error
if errors.As(err, &apiErr) {
payload["error_kind"] = telemetryErrorKind(apiErr.Kind)
if apiErr.Code != "" {
payload["error_code"] = apiErr.Code
}
errorKind, errorCode := telemetrymeta.ErrorKindAndCode(err)
payload["error_kind"] = errorKind
if errorCode != "" {
payload["error_code"] = errorCode
}
payload["fingerprint"] = telemetrymeta.Fingerprint("httpd", "http_request", r.Method, path, strconv.Itoa(ww.Status()), errorKind, errorCode)
}
sink.Emit(r.Context(), ports.TelemetryEvent{
Name: "ao.http.5xx",
Expand All @@ -78,16 +80,3 @@ func requestLogger(log *slog.Logger, sink ports.EventSink) func(http.Handler) ht
})
}
}

func telemetryErrorKind(kind apierr.Kind) string {
switch kind {
case apierr.KindInvalid:
return "invalid"
case apierr.KindNotFound:
return "not_found"
case apierr.KindConflict:
return "conflict"
default:
return "internal"
}
}
25 changes: 25 additions & 0 deletions backend/internal/httpd/log_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,31 @@ func TestRequestLoggerRecords5xxCause(t *testing.T) {
if len(sink.events) != 1 || sink.events[0].Name != "ao.http.5xx" {
t.Fatalf("telemetry events = %#v, want one ao.http.5xx event", sink.events)
}
payload := sink.events[0].Payload
if got := payload["component"]; got != "httpd" {
t.Fatalf("payload.component = %#v, want httpd", got)
}
if got := payload["operation"]; got != "http_request" {
t.Fatalf("payload.operation = %#v, want http_request", got)
}
if got := payload["method"]; got != http.MethodPost {
t.Fatalf("payload.method = %#v, want POST", got)
}
if got := payload["path"]; got != "/api/v1/sessions/x/kill" {
t.Fatalf("payload.path = %#v, want request path fallback", got)
}
if got := payload["status"]; got != http.StatusInternalServerError {
t.Fatalf("payload.status = %#v, want 500", got)
}
if got := payload["status_family"]; got != "5xx" {
t.Fatalf("payload.status_family = %#v, want 5xx", got)
}
if got := payload["error_kind"]; got != "internal" {
t.Fatalf("payload.error_kind = %#v, want internal", got)
}
if got := payload["fingerprint"]; got == "" {
t.Fatalf("payload.fingerprint = %#v, want non-empty", got)
}
})
}
}
Expand Down
24 changes: 10 additions & 14 deletions backend/internal/httpd/recover.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

"github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope"
"github.com/aoagents/agent-orchestrator/backend/internal/ports"
"github.com/aoagents/agent-orchestrator/backend/internal/telemetrymeta"
)

func recoverTelemetry(log *slog.Logger, sink ports.EventSink) func(http.Handler) http.Handler {
Expand All @@ -29,16 +30,22 @@ func recoverTelemetry(log *slog.Logger, sink ports.EventSink) func(http.Handler)
"stack", stack,
)
if sink != nil {
path := telemetrymeta.RoutePattern(r)
panicKind := telemetrymeta.PanicKind(rec)
sink.Emit(r.Context(), ports.TelemetryEvent{
Name: "ao.daemon.panic",
Source: "http",
OccurredAt: time.Now().UTC(),
Level: ports.TelemetryLevelError,
RequestID: middleware.GetReqID(r.Context()),
Payload: map[string]any{
"method": r.Method,
"path": r.URL.Path,
"panic_kind": telemetryPanicKind(rec),
"component": "httpd",
"operation": "http_request_panic",
"method": r.Method,
"path": path,
"panic_kind": panicKind,
"stack_fingerprint": telemetrymeta.Fingerprint("httpd", "http_request_panic", path, panicKind, stack),
"fingerprint": telemetrymeta.Fingerprint("httpd", "http_request_panic", r.Method, path, panicKind),
},
})
}
Expand All @@ -50,17 +57,6 @@ func recoverTelemetry(log *slog.Logger, sink ports.EventSink) func(http.Handler)
}
}

func telemetryPanicKind(rec any) string {
switch rec.(type) {
case error:
return "error"
case string:
return "string"
default:
return "other"
}
}

func writeRecoveredError(w http.ResponseWriter, r *http.Request) {
if strings.HasPrefix(r.URL.Path, "/api/") {
envelope.WriteAPIError(w, r, http.StatusInternalServerError, "internal_error", "INTERNAL_ERROR", "Internal server error", nil)
Expand Down
4 changes: 4 additions & 0 deletions backend/internal/httpd/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta"
"github.com/aoagents/agent-orchestrator/backend/internal/httpd/envelope"
"github.com/aoagents/agent-orchestrator/backend/internal/ports"
"github.com/aoagents/agent-orchestrator/backend/internal/telemetrymeta"
"github.com/aoagents/agent-orchestrator/backend/internal/terminal"
)

Expand Down Expand Up @@ -187,9 +188,12 @@ func mountTelemetry(r chi.Router, sink ports.EventSink) {
Level: ports.TelemetryLevelWarn,
RequestID: middleware.GetReqID(req.Context()),
Payload: map[string]any{
"component": "cli",
"operation": "command_parse",
"command": body.Command,
"command_path": body.CommandPath,
"error_kind": "usage",
"fingerprint": telemetrymeta.Fingerprint("cli", "command_parse", body.CommandPath, "usage"),
},
})
w.WriteHeader(http.StatusAccepted)
Expand Down
53 changes: 48 additions & 5 deletions backend/internal/httpd/telemetry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,25 @@ func TestCLIUsageErrorRouteEmitsTelemetry(t *testing.T) {
if len(sink.events) != 1 || sink.events[0].Name != "ao.cli.usage_errors" {
t.Fatalf("events = %#v, want one ao.cli.usage_errors event", sink.events)
}
payload := sink.events[0].Payload
if got := payload["component"]; got != "cli" {
t.Fatalf("payload.component = %#v, want cli", got)
}
if got := payload["operation"]; got != "command_parse" {
t.Fatalf("payload.operation = %#v, want command_parse", got)
}
if got := payload["command_path"]; got != "ao status" {
t.Fatalf("payload.command_path = %#v, want ao status", got)
}
if got := payload["error_kind"]; got != "usage" {
t.Fatalf("payload.error_kind = %#v, want usage", got)
}
if got := payload["fingerprint"]; got == "" {
t.Fatalf("payload.fingerprint = %#v, want non-empty", got)
}
if _, ok := payload["error"]; ok {
t.Fatalf("payload leaked raw error text: %#v", payload)
}
}

func TestRecoverTelemetryEmitsPanicEvent(t *testing.T) {
Expand All @@ -90,19 +109,43 @@ func TestRecoverTelemetryEmitsPanicEvent(t *testing.T) {
if rec.Code != http.StatusInternalServerError {
t.Fatalf("status = %d, want 500", rec.Code)
}
var sawPanic, saw5xx bool
var panicPayload, fiveXXPayload map[string]any
for _, ev := range sink.events {
switch ev.Name {
case "ao.daemon.panic":
sawPanic = true
panicPayload = ev.Payload
case "ao.http.5xx":
saw5xx = true
fiveXXPayload = ev.Payload
}
}
if !sawPanic {
if panicPayload == nil {
t.Fatalf("events = %#v, want ao.daemon.panic", sink.events)
}
if !saw5xx {
if fiveXXPayload == nil {
t.Fatalf("events = %#v, want ao.http.5xx after recovery", sink.events)
}
if got := panicPayload["component"]; got != "httpd" {
t.Fatalf("panic payload.component = %#v, want httpd", got)
}
if got := panicPayload["operation"]; got != "http_request_panic" {
t.Fatalf("panic payload.operation = %#v, want http_request_panic", got)
}
if got := panicPayload["path"]; got != "/panic" {
t.Fatalf("panic payload.path = %#v, want /panic", got)
}
if got := panicPayload["panic_kind"]; got != "string" {
t.Fatalf("panic payload.panic_kind = %#v, want string", got)
}
if got := panicPayload["fingerprint"]; got == "" {
t.Fatalf("panic payload.fingerprint = %#v, want non-empty", got)
}
if got := panicPayload["stack_fingerprint"]; got == "" {
t.Fatalf("panic payload.stack_fingerprint = %#v, want non-empty", got)
}
if got := fiveXXPayload["path"]; got != "/panic" {
t.Fatalf("5xx payload.path = %#v, want /panic", got)
}
if got := fiveXXPayload["status_family"]; got != "5xx" {
t.Fatalf("5xx payload.status_family = %#v, want 5xx", got)
}
}
Loading