Skip to content

Commit 50a8cc1

Browse files
committed
Add option to handle watchdog alerts
1 parent 12a28b2 commit 50a8cc1

4 files changed

Lines changed: 87 additions & 4 deletions

File tree

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,41 @@ $ check_prometheus alert --name "HostHighCpuLoad" --name "PrometheusTargetMissin
208208
OK - Alerts inactive | total=2 firing=0 pending=0 inactive=2
209209
```
210210
211+
#### Checking alerts via their labels
212+
213+
The `--include-label` and `--exclude-label` options can be used to filter alerts:
214+
215+
```bash
216+
$ check_prometheus alert --include-label severity=warning
217+
OK - 2 Alerts: 0 Firing - 0 Pending - 2 Inactive
218+
\_[OK] [MysqlTooManyConnections] is inactive
219+
\_[OK] [MysqlHighPreparedStatementsUtilization] is inactive
220+
```
221+
222+
```bash
223+
$ check_prometheus alert --include-label namespace=production --exclude-label severity=info
224+
OK - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive
225+
\_[OK] [ApacheDown] is inactive
226+
```
227+
228+
#### Checking watchdog alerts
229+
230+
In Prometheus, a "watchdog" or "dead man's switch" is an alert that is always firing to ensure that the alerting pipeline is working. The `-W, --watchdog` flag can be used to flip/negate the exit state of the plugin for these kinds of alerts:
231+
232+
```bash
233+
$ check_prometheus alert --name Watchdog -W --no-alerts-state 2
234+
[OK] - 1 Alerts: 1 Firing - 0 Pending - 0 Inactive
235+
\_ [OK] [Watchdog] is firing - value: 1.00 - {"alertname":"Watchdog","severity":"none"}
236+
|total=1 firing=1 pending=0 inactive=0
237+
```
238+
239+
```bash
240+
$ check_prometheus alert --name Watchdog -W --no-alerts-state 2
241+
[CRITICAL] - 0 Alerts: 0 Firing - 0 Pending - 0 Inactive
242+
\_ [CRITICAL] No alerts retrieved
243+
|total=0 firing=0 pending=0 inactive=0
244+
```
245+
211246
## License
212247
213248
Copyright (c) 2022 [NETWAYS GmbH](mailto:info@netways.de)

cmd/alert.go

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ type AlertConfig struct {
2222
ExcludeLabels []string
2323
IncludeLabels []string
2424
ProblemsOnly bool
25+
FlipExitState bool
2526
StateLabelKey string
2627
NoAlertsState string
2728
}
@@ -163,7 +164,13 @@ inactive = 0`,
163164

164165
sc := result.NewPartialResult()
165166

166-
_ = sc.SetState(rl.GetStatus(cliAlertConfig.StateLabelKey))
167+
rlStatus := rl.GetStatus(cliAlertConfig.StateLabelKey)
168+
// If the negate flag is set we negate this state
169+
if cliAlertConfig.FlipExitState {
170+
rlStatus = negateStatus(rlStatus)
171+
}
172+
173+
_ = sc.SetState(rlStatus)
167174
sc.Output = rl.GetOutput()
168175
overall.AddSubcheck(sc)
169176
}
@@ -185,7 +192,13 @@ inactive = 0`,
185192

186193
sc := result.NewPartialResult()
187194

188-
_ = sc.SetState(rl.GetStatus(cliAlertConfig.StateLabelKey))
195+
rlStatus := rl.GetStatus(cliAlertConfig.StateLabelKey)
196+
// If the negate flag is set we negate this state
197+
if cliAlertConfig.FlipExitState {
198+
rlStatus = negateStatus(rlStatus)
199+
}
200+
201+
_ = sc.SetState(rlStatus)
189202
// Set the alert in the internal Type to generate the output
190203
rl.Alert = alert
191204
sc.Output = rl.GetOutput()
@@ -257,9 +270,12 @@ func init() {
257270
fs.BoolVarP(&cliAlertConfig.ProblemsOnly, "problems", "P", false,
258271
"Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed")
259272

273+
fs.BoolVarP(&cliAlertConfig.FlipExitState, "watchdog", "W", false,
274+
"Flip the exit state for firing alerts. When this flag is set firing alerts will be OK and inactive alerts will be CRITICAL. This is intended for handling watchdog alerts")
275+
260276
fs.StringVarP(&cliAlertConfig.StateLabelKey, "label-key-state", "S", "",
261277
"Use the given AlertRule label to override the exit state for firing alerts."+
262-
"\nIf this flag is set the plugin looks for warning/critical/ok in the provided label key")
278+
"\nIf this flag is set the plugin looks for the strings 'warning/critical/ok' in the provided label key")
263279
}
264280

265281
// Function to convert state to integer.
@@ -314,3 +330,19 @@ func matchesLabel(labels model.LabelSet, labelsToMatch []string) bool {
314330

315331
return false
316332
}
333+
334+
// negateStatus flips a check state for watchdog-style alerts: an OK state
// becomes Critical, while both Warning and Critical become OK. An Unknown
// state, and any state not listed in the switch, is returned as Unknown.
335+
func negateStatus(state int) int {
336+
switch state {
337+
case check.OK:
338+
return check.Critical
339+
case check.Critical:
340+
return check.OK
341+
case check.Warning:
342+
return check.OK
343+
case check.Unknown:
344+
return check.Unknown
345+
default:
346+
return check.Unknown
347+
}
348+
}

cmd/alert_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,15 @@ exit status 2
225225
args: []string{"run", "../main.go", "alert", "--name", "InactiveAlert"},
226226
expected: "[OK] - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive\n\\_ [OK] [InactiveAlert] is inactive\n|total=1 firing=0 pending=0 inactive=1\n\n",
227227
},
228+
{
229+
name: "alert-watchdog",
230+
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
231+
w.WriteHeader(http.StatusOK)
232+
w.Write(loadTestdata(alertTestDataSet2))
233+
})),
234+
args: []string{"run", "../main.go", "alert", "--name", "InactiveAlert", "-W"},
235+
expected: "[CRITICAL] - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive\n\\_ [CRITICAL] [InactiveAlert] is inactive\n|total=1 firing=0 pending=0 inactive=1\n\nexit status 2\n",
236+
},
228237
{
229238
name: "alert-recording-rule",
230239
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

testdata/alertmanager/alert.rules

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
groups:
22
- name: Test Alerts for check_plugin
33
rules:
4-
4+
- alert: Watchdog
5+
annotations:
6+
message: |
7+
This is an alert meant to ensure that the entire alerting pipeline is functional.
8+
This alert is always firing.
9+
expr: vector(1)
10+
labels:
11+
severity: none
512
- alert: PrometheusTargetMissing
613
expr: up == 0
714
for: 0m

0 commit comments

Comments
 (0)