Skip to content

Commit 36d7490

Browse files
committed
Add option to include/exclude alerts via their labels
1 parent 7b26e70 commit 36d7490

5 files changed

Lines changed: 144 additions & 21 deletions

File tree

README.md

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -153,16 +153,21 @@ Examples:
153153
| total=2 firing=1 pending=0 inactive=1
154154
155155
Flags:
156-
--exclude-alert stringArray Alerts to ignore. Can be used multiple times and supports regex.
157-
-h, --help help for alert
158-
-n, --name strings The name of one or more specific alerts to check.
159-
This parameter can be repeated e.G.: '--name alert1 --name alert2'
160-
If no name is given, all alerts will be evaluated
161-
-g, --group strings The name of one or more specific groups to check.
162-
This parameter can be repeated e.G.: '--group group1 --group group2'
163-
If no group is given, all groups will be scanned for alerts
164-
-T, --no-alerts-state string State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK (default "OK")
165-
-P, --problems Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed
156+
--exclude-alert stringArray Alerts to ignore. Can be used multiple times and supports regex.
157+
--exclude-label stringArray The label of one or more specific alerts to exclude.
158+
This parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example'
159+
-g, --group strings The name of one or more specific groups to check for alerts.
160+
This parameter can be repeated e.g.: '--group group1 --group group2'
161+
If no group is given, all groups will be scanned for alerts
162+
-h, --help help for alert
163+
--include-label stringArray The label of one or more specific alerts to include.
164+
This parameter can be repeated e.g.: '--include-label prio=high --include-label another=example'
165+
Note that repeated --include-label are combined using a union.
166+
-n, --name strings The name of one or more specific alerts to check.
167+
This parameter can be repeated e.g.: '--name alert1 --name alert2'
168+
If no name is given, all alerts will be evaluated
169+
-T, --no-alerts-state string State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK (default "OK")
170+
-P, --problems Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed
166171
```
167172
168173
#### Checking all defined alerts

cmd/alert.go

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@ import (
1111
"github.com/NETWAYS/go-check"
1212
"github.com/NETWAYS/go-check/perfdata"
1313
"github.com/NETWAYS/go-check/result"
14+
"github.com/prometheus/common/model"
1415
"github.com/spf13/cobra"
1516
)
1617

1718
// AlertConfig groups the options of the alert check.
// NOTE(review): presumably the type of cliAlertConfig, whose fields are bound
// to command-line flags in init() — confirm against the declaration.
type AlertConfig struct {
1819
// AlertName lists alert rule names to check (--name); when set, rules whose
// name is not in the list are skipped.
AlertName []string
1920
// Group lists rule group names to check for alerts (--group).
Group []string
2021
// ExcludeAlerts holds regular expressions (--exclude-alert); rules whose
// name matches one of them are skipped.
ExcludeAlerts []string
22+
// ExcludeLabels holds "key=value" pairs (--exclude-label); rules carrying
// any of these labels are skipped.
ExcludeLabels []string
23+
// IncludeLabels holds "key=value" pairs (--include-label); when non-empty,
// only rules carrying at least one of these labels are kept.
IncludeLabels []string
2124
// ProblemsOnly skips rules that have no active alerts (--problems).
ProblemsOnly bool
2225
// NoAlertsState is the state to assign when no alerts are found
// (--no-alerts-state, default "OK").
NoAlertsState string
2326
}
@@ -102,30 +105,43 @@ inactive = 0`,
102105
var overall result.Overall
103106

104107
for _, rl := range rules {
105-
106108
// If it's not the Alert we're looking for, Skip!
107109
if cliAlertConfig.AlertName != nil {
108110
if !slices.Contains(cliAlertConfig.AlertName, rl.AlertingRule.Name) {
109111
continue
110112
}
111113
}
112114

115+
labelsMatchedInclude := matchesLabel(rl.AlertingRule.Labels, cliAlertConfig.IncludeLabels)
116+
117+
if len(cliAlertConfig.IncludeLabels) > 0 && !labelsMatchedInclude {
118+
// If the alert labels don't match here we can skip it.
119+
continue
120+
}
121+
113122
// Skip inactive alerts if flag is set
114123
if len(rl.AlertingRule.Alerts) == 0 && cliAlertConfig.ProblemsOnly {
115124
continue
116125
}
117126

118-
alertMatched, regexErr := matches(rl.AlertingRule.Name, cliAlertConfig.ExcludeAlerts)
127+
alertMatchedExclude, regexErr := matches(rl.AlertingRule.Name, cliAlertConfig.ExcludeAlerts)
119128

120129
if regexErr != nil {
121130
check.ExitRaw(check.Unknown, "Invalid regular expression provided:", regexErr.Error())
122131
}
123132

124-
if alertMatched {
133+
if alertMatchedExclude {
125134
// If the alert matches a regex from the list we can skip it.
126135
continue
127136
}
128137

138+
labelsMatchedExclude := matchesLabel(rl.AlertingRule.Labels, cliAlertConfig.ExcludeLabels)
139+
140+
if len(cliAlertConfig.ExcludeLabels) > 0 && labelsMatchedExclude {
141+
// If the alert's labels match here we can skip it.
142+
continue
143+
}
144+
129145
// Handle Inactive Alerts
130146
if len(rl.AlertingRule.Alerts) == 0 {
131147
// Counting states for perfdata
@@ -208,18 +224,28 @@ func init() {
208224

209225
fs.StringVarP(&cliAlertConfig.NoAlertsState, "no-alerts-state", "T", "OK", "State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set this defaults to OK")
210226

211-
fs.StringArrayVar(&cliAlertConfig.ExcludeAlerts, "exclude-alert", []string{}, "Alerts to ignore. Can be used multiple times and supports regex.")
227+
fs.StringArrayVar(&cliAlertConfig.ExcludeAlerts, "exclude-alert", []string{},
228+
"Alerts to ignore. Can be used multiple times and supports regex.")
212229

213230
fs.StringSliceVarP(&cliAlertConfig.AlertName, "name", "n", nil,
214231
"The name of one or more specific alerts to check."+
215-
"\nThis parameter can be repeated e.G.: '--name alert1 --name alert2'"+
232+
"\nThis parameter can be repeated e.g.: '--name alert1 --name alert2'"+
216233
"\nIf no name is given, all alerts will be evaluated")
217234

218235
fs.StringSliceVarP(&cliAlertConfig.Group, "group", "g", nil,
219236
"The name of one or more specific groups to check for alerts."+
220-
"\nThis parameter can be repeated e.G.: '--group group1 --group group2'"+
237+
"\nThis parameter can be repeated e.g.: '--group group1 --group group2'"+
221238
"\nIf no group is given, all groups will be scanned for alerts")
222239

240+
fs.StringArrayVar(&cliAlertConfig.IncludeLabels, "include-label", []string{},
241+
"The label of one or more specific alerts to include. "+
242+
"\nThis parameter can be repeated e.g.: '--include-label prio=high --include-label another=example'"+
243+
"\nNote that repeated --include-label are combined using a union.")
244+
245+
fs.StringArrayVar(&cliAlertConfig.ExcludeLabels, "exclude-label", []string{},
246+
"The label of one or more specific alerts to exclude."+
247+
"\nThis parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example'")
248+
223249
fs.BoolVarP(&cliAlertConfig.ProblemsOnly, "problems", "P", false,
224250
"Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed")
225251
}
@@ -257,3 +283,22 @@ func matches(input string, regexToExclude []string) (bool, error) {
257283

258284
return false, nil
259285
}
286+
287+
// Matches a list of labels against a list of labels
288+
func matchesLabel(labels model.LabelSet, labelsToMatch []string) bool {
289+
for _, lb := range labelsToMatch {
290+
kv := strings.SplitN(lb, "=", 2)
291+
292+
if len(kv) != 2 {
293+
continue
294+
}
295+
296+
key, value := model.LabelName(kv[0]), model.LabelValue(kv[1])
297+
298+
if val, ok := labels[key]; ok && val == value {
299+
return true
300+
}
301+
}
302+
303+
return false
304+
}

cmd/alert_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,76 @@ exit status 2
234234
args: []string{"run", "../main.go", "alert", "--name", "InactiveAlert"},
235235
expected: "[OK] - 1 Alerts: 0 Firing - 0 Pending - 1 Inactive\n\\_ [OK] [InactiveAlert] is inactive\n|total=1 firing=0 pending=0 inactive=1\n\n",
236236
},
237+
{
238+
name: "alert-include-label",
239+
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
240+
w.WriteHeader(http.StatusOK)
241+
w.Write(loadTestdata(alertTestDataSet1))
242+
})),
243+
args: []string{"run", "../main.go", "alert", "--include-label", "severity=critical"},
244+
expected: `[CRITICAL] - 2 Alerts: 1 Firing - 0 Pending - 1 Inactive
245+
\_ [OK] [HostOutOfMemory] is inactive
246+
\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"}
247+
|total=2 firing=1 pending=0 inactive=1
248+
249+
exit status 2
250+
`,
251+
},
252+
{
253+
name: "alert-exclude-label",
254+
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
255+
w.WriteHeader(http.StatusOK)
256+
w.Write(loadTestdata(alertTestDataSet1))
257+
})),
258+
args: []string{"run", "../main.go", "alert", "--exclude-label", "severity=critical"},
259+
expected: `[WARNING] - 1 Alerts: 0 Firing - 1 Pending - 0 Inactive
260+
\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"}
261+
|total=1 firing=0 pending=1 inactive=0
262+
263+
exit status 1
264+
`,
265+
},
266+
{
267+
name: "alert-include-label-multiple",
268+
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
269+
w.WriteHeader(http.StatusOK)
270+
w.Write(loadTestdata(alertTestDataSet1))
271+
})),
272+
args: []string{"run", "../main.go", "alert", "--include-label", "team=database", "--include-label", "severity=critical"},
273+
expected: `[CRITICAL] - 3 Alerts: 1 Firing - 1 Pending - 1 Inactive
274+
\_ [OK] [HostOutOfMemory] is inactive
275+
\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"}
276+
\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"}
277+
|total=3 firing=1 pending=1 inactive=1
278+
279+
exit status 2
280+
`,
281+
},
282+
{
283+
name: "alert-include-label-multiple-similar",
284+
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
285+
w.WriteHeader(http.StatusOK)
286+
w.Write(loadTestdata(alertTestDataSet1))
287+
})),
288+
args: []string{"run", "../main.go", "alert", "--include-label", "severity=warning", "--include-label", "severity=critical"},
289+
expected: `[CRITICAL] - 3 Alerts: 1 Firing - 1 Pending - 1 Inactive
290+
\_ [OK] [HostOutOfMemory] is inactive
291+
\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"}
292+
\_ [CRITICAL] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"}
293+
|total=3 firing=1 pending=1 inactive=1
294+
295+
exit status 2
296+
`,
297+
},
298+
{
299+
name: "alert-exclude-label-multiple",
300+
server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
301+
w.WriteHeader(http.StatusOK)
302+
w.Write(loadTestdata(alertTestDataSet1))
303+
})),
304+
args: []string{"run", "../main.go", "alert", "--exclude-label", "team=database", "--exclude-label", "severity=critical"},
305+
expected: "[OK] - 0 Alerts: 0 Firing - 0 Pending - 0 Inactive\n\\_ [OK] No alerts retrieved\n|total=0 firing=0 pending=0 inactive=0\n\n",
306+
},
237307
}
238308

239309
for _, test := range tests {

testdata/alertmanager/alert.rules

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ groups:
1515
expr: absent(up{job="alertmanager"})
1616
for: 0m
1717
labels:
18-
severity: warning
18+
severity: low
1919
annotations:
2020
summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
2121
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -33,7 +33,7 @@ groups:
3333
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
3434
for: 0m
3535
labels:
36-
severity: warning
36+
severity: extreme
3737
annotations:
3838
summary: Host high CPU load (instance {{ $labels.instance }})
3939
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

testdata/unittest/alertDataset1.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
"query": "up",
1313
"duration": 120,
1414
"labels": {
15-
"severity": "critical"
15+
"severity": "critical",
16+
"team": "network"
1617
},
1718
"annotations": {
1819
"description": "Foo",
@@ -40,7 +41,8 @@
4041
"query": "mysql",
4142
"duration": 17280000,
4243
"labels": {
43-
"severity": "warning"
44+
"severity": "warning",
45+
"team": "database"
4446
},
4547
"annotations": {
4648
"description": "MySQL",
@@ -84,7 +86,8 @@
8486
"query": "SSL",
8587
"duration": 0,
8688
"labels": {
87-
"severity": "critical"
89+
"severity": "critical",
90+
"team": "network"
8891
},
8992
"annotations": {
9093
"description": "TLS",

0 commit comments

Comments
 (0)