Skip to content

Commit 8276228

Browse files
authored
[client] Add health check flag to status command and expose daemon status in output (#5650)
1 parent b550a2f commit 8276228

9 files changed

Lines changed: 197 additions & 76 deletions

File tree

client/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,8 @@ ENV \
1717
NETBIRD_BIN="/usr/local/bin/netbird" \
1818
NB_LOG_FILE="console,/var/log/netbird/client.log" \
1919
NB_DAEMON_ADDR="unix:///var/run/netbird.sock" \
20-
NB_ENTRYPOINT_SERVICE_TIMEOUT="5" \
21-
NB_ENTRYPOINT_LOGIN_TIMEOUT="5"
20+
NB_ENTRYPOINT_SERVICE_TIMEOUT="30" \
21+
NB_ENTRYPOINT_LOGIN_TIMEOUT="30"
2222

2323
ENTRYPOINT [ "/usr/local/bin/netbird-entrypoint.sh" ]
2424

client/Dockerfile-rootless

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,8 @@ ENV \
2323
NB_DAEMON_ADDR="unix:///var/lib/netbird/netbird.sock" \
2424
NB_LOG_FILE="console,/var/lib/netbird/client.log" \
2525
NB_DISABLE_DNS="true" \
26-
NB_ENTRYPOINT_SERVICE_TIMEOUT="5" \
27-
NB_ENTRYPOINT_LOGIN_TIMEOUT="1"
26+
NB_ENTRYPOINT_SERVICE_TIMEOUT="30" \
27+
NB_ENTRYPOINT_LOGIN_TIMEOUT="30"
2828

2929
ENTRYPOINT [ "/usr/local/bin/netbird-entrypoint.sh" ]
3030

client/cmd/status.go

Lines changed: 101 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ var (
2828
ipsFilterMap map[string]struct{}
2929
prefixNamesFilterMap map[string]struct{}
3030
connectionTypeFilter string
31+
checkFlag string
3132
)
3233

3334
var statusCmd = &cobra.Command{
@@ -49,13 +50,18 @@ func init() {
4950
statusCmd.PersistentFlags().StringSliceVar(&prefixNamesFilter, "filter-by-names", []string{}, "filters the detailed output by a list of one or more peer FQDN or hostnames, e.g., --filter-by-names peer-a,peer-b.netbird.cloud")
5051
statusCmd.PersistentFlags().StringVar(&statusFilter, "filter-by-status", "", "filters the detailed output by connection status(idle|connecting|connected), e.g., --filter-by-status connected")
5152
statusCmd.PersistentFlags().StringVar(&connectionTypeFilter, "filter-by-connection-type", "", "filters the detailed output by connection type (P2P|Relayed), e.g., --filter-by-connection-type P2P")
53+
statusCmd.PersistentFlags().StringVar(&checkFlag, "check", "", "run a health check and exit with code 0 on success, 1 on failure (live|ready|startup)")
5254
}
5355

5456
func statusFunc(cmd *cobra.Command, args []string) error {
5557
SetFlagsFromEnvVars(rootCmd)
5658

5759
cmd.SetOut(cmd.OutOrStdout())
5860

61+
if checkFlag != "" {
62+
return runHealthCheck(cmd)
63+
}
64+
5965
err := parseFilters()
6066
if err != nil {
6167
return err
@@ -68,15 +74,17 @@ func statusFunc(cmd *cobra.Command, args []string) error {
6874

6975
ctx := internal.CtxInitState(cmd.Context())
7076

71-
resp, err := getStatus(ctx, false)
77+
resp, err := getStatus(ctx, true, false)
7278
if err != nil {
7379
return err
7480
}
7581

7682
status := resp.GetStatus()
7783

78-
if status == string(internal.StatusNeedsLogin) || status == string(internal.StatusLoginFailed) ||
79-
status == string(internal.StatusSessionExpired) {
84+
needsAuth := status == string(internal.StatusNeedsLogin) || status == string(internal.StatusLoginFailed) ||
85+
status == string(internal.StatusSessionExpired)
86+
87+
if needsAuth && !jsonFlag && !yamlFlag {
8088
cmd.Printf("Daemon status: %s\n\n"+
8189
"Run UP command to log in with SSO (interactive login):\n\n"+
8290
" netbird up \n\n"+
@@ -99,7 +107,17 @@ func statusFunc(cmd *cobra.Command, args []string) error {
99107
profName = activeProf.Name
100108
}
101109

102-
var outputInformationHolder = nbstatus.ConvertToStatusOutputOverview(resp.GetFullStatus(), anonymizeFlag, resp.GetDaemonVersion(), statusFilter, prefixNamesFilter, prefixNamesFilterMap, ipsFilterMap, connectionTypeFilter, profName)
110+
var outputInformationHolder = nbstatus.ConvertToStatusOutputOverview(resp.GetFullStatus(), nbstatus.ConvertOptions{
111+
Anonymize: anonymizeFlag,
112+
DaemonVersion: resp.GetDaemonVersion(),
113+
DaemonStatus: nbstatus.ParseDaemonStatus(status),
114+
StatusFilter: statusFilter,
115+
PrefixNamesFilter: prefixNamesFilter,
116+
PrefixNamesFilterMap: prefixNamesFilterMap,
117+
IPsFilter: ipsFilterMap,
118+
ConnectionTypeFilter: connectionTypeFilter,
119+
ProfileName: profName,
120+
})
103121
var statusOutputString string
104122
switch {
105123
case detailFlag:
@@ -121,7 +139,7 @@ func statusFunc(cmd *cobra.Command, args []string) error {
121139
return nil
122140
}
123141

124-
func getStatus(ctx context.Context, shouldRunProbes bool) (*proto.StatusResponse, error) {
142+
func getStatus(ctx context.Context, fullPeerStatus bool, shouldRunProbes bool) (*proto.StatusResponse, error) {
125143
conn, err := DialClientGRPCServer(ctx, daemonAddr)
126144
if err != nil {
127145
//nolint
@@ -131,7 +149,7 @@ func getStatus(ctx context.Context, shouldRunProbes bool) (*proto.StatusResponse
131149
}
132150
defer conn.Close()
133151

134-
resp, err := proto.NewDaemonServiceClient(conn).Status(ctx, &proto.StatusRequest{GetFullPeerStatus: true, ShouldRunProbes: shouldRunProbes})
152+
resp, err := proto.NewDaemonServiceClient(conn).Status(ctx, &proto.StatusRequest{GetFullPeerStatus: fullPeerStatus, ShouldRunProbes: shouldRunProbes})
135153
if err != nil {
136154
return nil, fmt.Errorf("status failed: %v", status.Convert(err).Message())
137155
}
@@ -185,6 +203,83 @@ func enableDetailFlagWhenFilterFlag() {
185203
}
186204
}
187205

206+
func runHealthCheck(cmd *cobra.Command) error {
207+
check := strings.ToLower(checkFlag)
208+
switch check {
209+
case "live", "ready", "startup":
210+
default:
211+
return fmt.Errorf("unknown check %q, must be one of: live, ready, startup", checkFlag)
212+
}
213+
214+
if err := util.InitLog(logLevel, util.LogConsole); err != nil {
215+
return fmt.Errorf("init log: %w", err)
216+
}
217+
218+
ctx := internal.CtxInitState(cmd.Context())
219+
220+
isStartup := check == "startup"
221+
resp, err := getStatus(ctx, isStartup, isStartup)
222+
if err != nil {
223+
return err
224+
}
225+
226+
switch check {
227+
case "live":
228+
return nil
229+
case "ready":
230+
return checkReadiness(resp)
231+
case "startup":
232+
return checkStartup(resp)
233+
default:
234+
return nil
235+
}
236+
}
237+
238+
func checkReadiness(resp *proto.StatusResponse) error {
239+
daemonStatus := internal.StatusType(resp.GetStatus())
240+
switch daemonStatus {
241+
case internal.StatusIdle, internal.StatusConnecting, internal.StatusConnected:
242+
return nil
243+
case internal.StatusNeedsLogin, internal.StatusLoginFailed, internal.StatusSessionExpired:
244+
return fmt.Errorf("readiness check: daemon status is %s", daemonStatus)
245+
default:
246+
return fmt.Errorf("readiness check: unexpected daemon status %q", daemonStatus)
247+
}
248+
}
249+
250+
func checkStartup(resp *proto.StatusResponse) error {
251+
fullStatus := resp.GetFullStatus()
252+
if fullStatus == nil {
253+
return fmt.Errorf("startup check: no full status available")
254+
}
255+
256+
if !fullStatus.GetManagementState().GetConnected() {
257+
return fmt.Errorf("startup check: management not connected")
258+
}
259+
260+
if !fullStatus.GetSignalState().GetConnected() {
261+
return fmt.Errorf("startup check: signal not connected")
262+
}
263+
264+
var relayCount, relaysConnected int
265+
for _, r := range fullStatus.GetRelays() {
266+
uri := r.GetURI()
267+
if !strings.HasPrefix(uri, "rel://") && !strings.HasPrefix(uri, "rels://") {
268+
continue
269+
}
270+
relayCount++
271+
if r.GetAvailable() {
272+
relaysConnected++
273+
}
274+
}
275+
276+
if relayCount > 0 && relaysConnected == 0 {
277+
return fmt.Errorf("startup check: no relay servers available (0/%d connected)", relayCount)
278+
}
279+
280+
return nil
281+
}
282+
188283
func parseInterfaceIP(interfaceIP string) string {
189284
ip, _, err := net.ParseCIDR(interfaceIP)
190285
if err != nil {

client/internal/debug/debug.go

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,6 @@ import (
3131
nbstatus "github.com/netbirdio/netbird/client/status"
3232
mgmProto "github.com/netbirdio/netbird/shared/management/proto"
3333
"github.com/netbirdio/netbird/util"
34-
"github.com/netbirdio/netbird/version"
3534
)
3635

3736
const readmeContent = `Netbird debug bundle
@@ -418,7 +417,10 @@ func (g *BundleGenerator) addStatus() error {
418417
fullStatus := g.statusRecorder.GetFullStatus()
419418
protoFullStatus := nbstatus.ToProtoFullStatus(fullStatus)
420419
protoFullStatus.Events = g.statusRecorder.GetEventHistory()
421-
overview := nbstatus.ConvertToStatusOutputOverview(protoFullStatus, g.anonymize, version.NetbirdVersion(), "", nil, nil, nil, "", profName)
420+
overview := nbstatus.ConvertToStatusOutputOverview(protoFullStatus, nbstatus.ConvertOptions{
421+
Anonymize: g.anonymize,
422+
ProfileName: profName,
423+
})
422424
statusOutput := overview.FullDetailSummary()
423425

424426
statusReader := strings.NewReader(statusOutput)

client/netbird-entrypoint.sh

Lines changed: 33 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,11 @@
11
#!/usr/bin/env bash
22
set -eEuo pipefail
33

4-
: ${NB_ENTRYPOINT_SERVICE_TIMEOUT:="5"}
5-
: ${NB_ENTRYPOINT_LOGIN_TIMEOUT:="5"}
4+
: ${NB_ENTRYPOINT_SERVICE_TIMEOUT:="30"}
5+
: ${NB_ENTRYPOINT_LOGIN_TIMEOUT:="30"}
66
NETBIRD_BIN="${NETBIRD_BIN:-"netbird"}"
77
export NB_LOG_FILE="${NB_LOG_FILE:-"console,/var/log/netbird/client.log"}"
88
service_pids=()
9-
log_file_path=""
109

1110
_log() {
1211
# mimic Go logger's output for easier parsing
@@ -33,60 +32,50 @@ on_exit() {
3332
fi
3433
}
3534

36-
wait_for_message() {
37-
local timeout="${1}" message="${2}"
38-
if test "${timeout}" -eq 0; then
39-
info "not waiting for log line ${message@Q} due to zero timeout."
40-
elif test -n "${log_file_path}"; then
41-
info "waiting for log line ${message@Q} for ${timeout} seconds..."
42-
grep -E -q "${message}" <(timeout "${timeout}" tail -F "${log_file_path}" 2>/dev/null)
43-
else
44-
info "log file unsupported, sleeping for ${timeout} seconds..."
45-
sleep "${timeout}"
46-
fi
47-
}
48-
49-
locate_log_file() {
50-
local log_files_string="${1}"
51-
52-
while read -r log_file; do
53-
case "${log_file}" in
54-
console | syslog) ;;
55-
*)
56-
log_file_path="${log_file}"
57-
return
58-
;;
59-
esac
60-
done < <(sed 's#,#\n#g' <<<"${log_files_string}")
61-
62-
warn "log files parsing for ${log_files_string@Q} is not supported by debug bundles"
63-
warn "please consider removing the \$NB_LOG_FILE or setting it to real file, before gathering debug bundles."
64-
}
65-
6635
wait_for_daemon_startup() {
6736
local timeout="${1}"
37+
if [[ "${timeout}" -eq 0 ]]; then
38+
info "not waiting for daemon startup due to zero timeout."
39+
return
40+
fi
6841

69-
if test -n "${log_file_path}"; then
70-
if ! wait_for_message "${timeout}" "started daemon server"; then
71-
warn "log line containing 'started daemon server' not found after ${timeout} seconds"
72-
warn "daemon failed to start, exiting..."
73-
exit 1
42+
local deadline=$((SECONDS + timeout))
43+
while [[ "${SECONDS}" -lt "${deadline}" ]]; do
44+
if "${NETBIRD_BIN}" status --check live 2>/dev/null; then
45+
return
7446
fi
75-
else
76-
warn "daemon service startup not discovered, sleeping ${timeout} instead"
77-
sleep "${timeout}"
78-
fi
47+
sleep 1
48+
done
49+
50+
warn "daemon did not become responsive after ${timeout} seconds, exiting..."
51+
exit 1
7952
}
8053

8154
login_if_needed() {
8255
local timeout="${1}"
8356

84-
if test -n "${log_file_path}" && wait_for_message "${timeout}" 'peer has been successfully registered|management connection state READY'; then
57+
if "${NETBIRD_BIN}" status --check ready 2>/dev/null; then
8558
info "already logged in, skipping 'netbird up'..."
86-
else
59+
return
60+
fi
61+
62+
if [[ "${timeout}" -eq 0 ]]; then
8763
info "logging in..."
8864
"${NETBIRD_BIN}" up
65+
return
8966
fi
67+
68+
local deadline=$((SECONDS + timeout))
69+
while [[ "${SECONDS}" -lt "${deadline}" ]]; do
70+
if "${NETBIRD_BIN}" status --check ready 2>/dev/null; then
71+
info "already logged in, skipping 'netbird up'..."
72+
return
73+
fi
74+
sleep 1
75+
done
76+
77+
info "logging in..."
78+
"${NETBIRD_BIN}" up
9079
}
9180

9281
main() {
@@ -95,7 +84,6 @@ main() {
9584
service_pids+=("$!")
9685
info "registered new service process 'netbird service run', currently running: ${service_pids[@]@Q}"
9786

98-
locate_log_file "${NB_LOG_FILE}"
9987
wait_for_daemon_startup "${NB_ENTRYPOINT_SERVICE_TIMEOUT}"
10088
login_if_needed "${NB_ENTRYPOINT_LOGIN_TIMEOUT}"
10189

0 commit comments

Comments
 (0)