Skip to content

Commit b043308

Browse files
authored
Merge pull request #124 from NETWAYS/watchdogs
Add option to handle watchdog alerts
2 parents 12a28b2 + e7e52c2 commit b043308

12 files changed

Lines changed: 111 additions & 15 deletions

File tree

.github/workflows/golangci-lint.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ jobs:
1717
- name: golangci-lint
1818
uses: golangci/golangci-lint-action@v9
1919
with:
20-
version: v2.1.6
20+
version: v2.9.0

.golangci.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ run:
33
tests: false
44
linters:
55
default: all
6+
enable:
7+
- wsl_v5
68
disable:
9+
- wsl
710
- cyclop
811
- depguard
912
- err113
@@ -25,6 +28,13 @@ linters:
2528
- varnamelen
2629
- wrapcheck
2730
- funlen
31+
settings:
32+
wsl_v5:
33+
allow-first-in-block: true
34+
allow-whole-block: true
35+
branch-max-lines: 2
36+
disable:
37+
- err
2838
exclusions:
2939
generated: lax
3040
presets:

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,41 @@ $ check_prometheus alert --name "HostHighCpuLoad" --name "PrometheusTargetMissin
208208
OK - Alerts inactive | total=2 firing=0 pending=0 inactive=2
209209
```
210210
211+
#### Checking alerts via their labels
212+
213+
The `--include-label` and `--exclude-label` options can be used to filter alerts:
214+
215+
```bash
216+
$ check_prometheus alert --include-label severity=warning
217+
OK - 2 Alerts: 0 Firing - 0 Pending - 2 Inactive
218+
\_[OK] [MysqlTooManyConnections] is inactive
219+
\_[OK] [MysqlHighPreparedStatementsUtilization] is inactive
220+
```
221+
222+
```bash
223+
$ check_prometheus alert --include-label namespace=production --exclude-label severity=info
224+
OK - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive
225+
\_[OK] [ApacheDown] is inactive
226+
```
227+
228+
#### Checking watchdog alerts
229+
230+
In Prometheus a "watchdog" or "dead man's switch" is an alert that is always firing to ensure that the alerting pipeline is working. The `-W, --watchdog` flag can be used to flip/negate the exit state of the plugin for these kinds of alerts:
231+
232+
```bash
233+
$ check_prometheus alert --name Watchdog -W --no-alerts-state 2
234+
[OK] - 1 Alerts: 1 Firing - 0 Pending - 0 Inactive
235+
\_ [OK] [Watchdog] is firing - value: 1.00 - {"alertname":"Watchdog","severity":"none"}
236+
|total=1 firing=1 pending=0 inactive=0
237+
```
238+
239+
```bash
240+
$ check_prometheus alert --name Watchdog -W --no-alerts-state 2
241+
[CRITICAL] - 0 Alerts: 0 Firing - 0 Pending - 0 Inactive
242+
\_ [CRITICAL] No alerts retrieved
243+
|total=0 firing=0 pending=0 inactive=0
244+
```
245+
211246
## License
212247
213248
Copyright (c) 2022 [NETWAYS GmbH](mailto:info@netways.de)

cmd/alert.go

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ type AlertConfig struct {
2222
ExcludeLabels []string
2323
IncludeLabels []string
2424
ProblemsOnly bool
25+
FlipExitState bool
2526
StateLabelKey string
2627
NoAlertsState string
2728
}
@@ -99,6 +100,7 @@ inactive = 0`,
99100
if cliAlertConfig.AlertName != nil {
100101
check.ExitRaw(check.Unknown, "No such alert defined", "|", pdlist.String())
101102
}
103+
102104
check.ExitRaw(noAlertsState, "No alerts defined", "|", pdlist.String())
103105
}
104106

@@ -163,7 +165,13 @@ inactive = 0`,
163165

164166
sc := result.NewPartialResult()
165167

166-
_ = sc.SetState(rl.GetStatus(cliAlertConfig.StateLabelKey))
168+
rlStatus := rl.GetStatus(cliAlertConfig.StateLabelKey)
169+
// If the negate flag is set we negate this state
170+
if cliAlertConfig.FlipExitState {
171+
rlStatus = negateStatus(rlStatus)
172+
}
173+
174+
_ = sc.SetState(rlStatus)
167175
sc.Output = rl.GetOutput()
168176
overall.AddSubcheck(sc)
169177
}
@@ -185,7 +193,13 @@ inactive = 0`,
185193

186194
sc := result.NewPartialResult()
187195

188-
_ = sc.SetState(rl.GetStatus(cliAlertConfig.StateLabelKey))
196+
rlStatus := rl.GetStatus(cliAlertConfig.StateLabelKey)
197+
// If the negate flag is set we negate this state
198+
if cliAlertConfig.FlipExitState {
199+
rlStatus = negateStatus(rlStatus)
200+
}
201+
202+
_ = sc.SetState(rlStatus)
189203
// Set the alert in the internal Type to generate the output
190204
rl.Alert = alert
191205
sc.Output = rl.GetOutput()
@@ -257,9 +271,12 @@ func init() {
257271
fs.BoolVarP(&cliAlertConfig.ProblemsOnly, "problems", "P", false,
258272
"Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed")
259273

274+
fs.BoolVarP(&cliAlertConfig.FlipExitState, "watchdog", "W", false,
275+
"Flip the exit state for firing alerts. When this flag is set firing alerts will be OK and inactive alerts will be CRITICAL. This is intended for handling watchdog alerts")
276+
260277
fs.StringVarP(&cliAlertConfig.StateLabelKey, "label-key-state", "S", "",
261278
"Use the given AlertRule label to override the exit state for firing alerts."+
262-
"\nIf this flag is set the plugin looks for warning/critical/ok in the provided label key")
279+
"\nIf this flag is set the plugin looks for the strings 'warning/critical/ok' in the provided label key")
263280
}
264281

265282
// Function to convert state to integer.
@@ -314,3 +331,19 @@ func matchesLabel(labels model.LabelSet, labelsToMatch []string) bool {
314331

315332
return false
316333
}
334+
335+
// negateStatus turns an OK state into Critical and a Warning/Critical state into OK. Unknown and any unrecognized state are returned as Unknown.
336+
func negateStatus(state int) int {
337+
switch state {
338+
case check.OK:
339+
return check.Critical
340+
case check.Critical:
341+
return check.OK
342+
case check.Warning:
343+
return check.OK
344+
case check.Unknown:
345+
return check.Unknown
346+
default:
347+
return check.Unknown
348+
}
349+
}

cmd/alert_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,15 @@ exit status 2
225225
args: []string{"run", "../main.go", "alert", "--name", "InactiveAlert"},
226226
expected: "[OK] - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive\n\\_ [OK] [InactiveAlert] is inactive\n|total=1 firing=0 pending=0 inactive=1\n\n",
227227
},
228+
{
229+
name: "alert-watchdog",
230+
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
231+
w.WriteHeader(http.StatusOK)
232+
w.Write(loadTestdata(alertTestDataSet2))
233+
})),
234+
args: []string{"run", "../main.go", "alert", "--name", "InactiveAlert", "-W"},
235+
expected: "[CRITICAL] - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive\n\\_ [CRITICAL] [InactiveAlert] is inactive\n|total=1 firing=0 pending=0 inactive=1\n\nexit status 2\n",
236+
},
228237
{
229238
name: "alert-recording-rule",
230239
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

cmd/config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ func (c *Config) NewClient() *client.Client {
9090
// Using a Bearer Token for authentication
9191
if c.Bearer != "" {
9292
var t = config.NewInlineSecret(c.Bearer)
93+
9394
rt = config.NewAuthorizationCredentialsRoundTripper("Bearer", t, rt)
9495
}
9596

cmd/health.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Ready: Checks the readiness of an endpoint, which returns OK if the Prometheus s
2929

3030
// Creating a client and connecting to the API
3131
c := cliConfig.NewClient()
32+
3233
err := c.Connect()
3334
if err != nil {
3435
check.ExitError(err)
@@ -61,6 +62,7 @@ Ready: Checks the readiness of an endpoint, which returns OK if the Prometheus s
6162
if err != nil {
6263
check.ExitError(err)
6364
}
65+
6466
partialResult := result.NewPartialResult()
6567

6668
_ = partialResult.SetState(rc)

cmd/query.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Note: Time range values e.G. 'go_memstats_alloc_bytes_total[0s]' only the latest
7979
}
8080

8181
c := cliConfig.NewClient()
82+
8283
err = c.Connect()
8384
if err != nil {
8485
check.ExitError(err)
@@ -93,6 +94,7 @@ Note: Time range values e.G. 'go_memstats_alloc_bytes_total[0s]' only the latest
9394
if strings.Contains(err.Error(), "unmarshalerDecoder: unexpected value type \"string\"") {
9495
err = errors.New("string value results are not supported")
9596
}
97+
9698
check.ExitError(err)
9799
}
98100

@@ -112,10 +114,8 @@ Note: Time range values e.G. 'go_memstats_alloc_bytes_total[0s]' only the latest
112114
case model.ValVector:
113115
// Instant vector - a set of time series containing a single sample for each time series, all sharing the same timestamp
114116
vectorVal := result.(model.Vector)
115-
116117
// Set initial capacity to reduce memory allocations
117118
for _, sample := range vectorVal {
118-
119119
numberValue := float64(sample.Value)
120120
partial := goresult.NewPartialResult()
121121

@@ -185,6 +185,7 @@ Note: Time range values e.G. 'go_memstats_alloc_bytes_total[0s]' only the latest
185185
appendum := fmt.Sprintf("HTTP Warnings: %v", strings.Join(warnings, ", "))
186186
overall.Summary = overall.GetOutput() + appendum
187187
}
188+
188189
check.ExitRaw(overall.GetStatus(), overall.GetOutput())
189190
},
190191
}

cmd/root.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ func Execute(version string) {
2424
rootCmd.Version = version
2525
rootCmd.VersionTemplate()
2626

27-
if err := rootCmd.Execute(); err != nil {
27+
err := rootCmd.Execute()
28+
29+
if err != nil {
2830
check.ExitError(err)
2931
}
3032
}

internal/alert/alert.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ const (
1616
alertnameLabelKey = "alertname"
1717
)
1818

19-
// Internal representation of Prometheus Rules.
19+
// Rule is the internal representation of a Prometheus rule.
2020
// Alert attribute will be used when iterating over multiple AlertingRules.
2121
type Rule struct {
2222
AlertingRule v1.AlertingRule
@@ -139,7 +139,6 @@ func (a *Rule) GetOutput() (output string) {
139139
// Add current value to output
140140
value, _ = strconv.ParseFloat(a.Alert.Value, 32)
141141
out.WriteString(fmt.Sprintf(" is %s - value: %.2f", a.AlertingRule.State, value))
142-
143142
// Add labels to the output
144143
l, err := json.Marshal(a.Alert.Labels)
145144

0 commit comments

Comments
 (0)