Skip to content

Commit e41d4f4

Browse files
author
Anandkumar Patel
committed
add prom files
1 parent 0133222 commit e41d4f4

5 files changed

Lines changed: 267 additions & 0 deletions

File tree

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
---
# ConfigMap holding the two files the Prometheus server reads:
#   alerts.conf    - alerting rules in the Prometheus 1.x rule syntax
#                    (ALERT / IF / FOR / LABELS / ANNOTATIONS).
#   prometheus.yml - main server config; loads alerts.conf through rule_files
#                    and discovers scrape targets through the EC2 API.
#
# Alerts come in pairs: a "Hook*" rule labelled reportTo="drake" with a lower
# threshold and/or shorter FOR, and a rule labelled reportTo="pagerduty" with a
# higher threshold and/or longer FOR.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-conf
data:
  alerts.conf: |
    ALERT HookDockUnresponsive
      IF up == 0
      FOR 10m
      LABELS {
        reportTo = "drake",
        type = "unresponsive"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock unresponsive host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "(hook) Dock unresponsive host={{ $labels.hostIp }} labels={{ $labels }}"
      }

    ALERT DockUnresponsive
      IF up == 0
      FOR 1h
      LABELS {
        reportTo = "pagerduty"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock unresponsive host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "Dock unresponsive host={{ $labels.hostIp }} labels={{ $labels }}"
      }

    ALERT HookDockDockerDiskFull
      IF (node_filesystem_size{device="/dev/xvdb"} - node_filesystem_free{device="/dev/xvdb"}) / node_filesystem_size{device="/dev/xvdb"} * 100 > 70
      FOR 5m
      LABELS {
        reportTo = "drake",
        type = "disk_filled"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock /docker disk 70% host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "(hook) Dock /docker disk 70% host={{ $labels.hostIp }} labels={{ $labels }}"
      }

    ALERT DockDockerDiskFull
      IF (node_filesystem_size{device="/dev/xvdb"} - node_filesystem_free{device="/dev/xvdb"}) / node_filesystem_size{device="/dev/xvdb"} * 100 > 90
      FOR 30m
      LABELS {
        reportTo = "pagerduty"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock /docker disk 90% host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "Playbook here: https://github.com/CodeNow/devops-scripts/wiki/server-out-of-disk"
      }

    ALERT DockRootDiskFull
      IF (node_filesystem_size{device="/dev/xvda1"} - node_filesystem_free{device="/dev/xvda1"}) / node_filesystem_size{device="/dev/xvda1"} * 100 > 90
      FOR 5m
      LABELS {
        reportTo = "pagerduty"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock root disk 90% host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "Playbook here: https://github.com/CodeNow/devops-scripts/wiki/server-out-of-disk"
      }

    ALERT HookDockOutOfRam
      IF (node_memory_MemFree + node_memory_Buffers + node_memory_Cached) < 150000000
      FOR 5m
      LABELS {
        reportTo = "drake",
        type = "memory_exhausted"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock out of ram host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "(hook) Dock out of ram host={{ $labels.hostIp }} labels={{ $labels }}"
      }

    ALERT DockOutOfRam
      IF (node_memory_MemFree + node_memory_Buffers + node_memory_Cached) < 130000000
      FOR 30m
      LABELS {
        reportTo = "pagerduty"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock out of ram host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "unhealthy dock {{ $labels.hostIp }} using dock-cli and message slack #customer channel with labels={{ $labels }}"
      }

    ALERT DockHighLoad
      IF node_load15 > 90
      FOR 30m
      LABELS {
        reportTo = "pagerduty"
      }
      ANNOTATIONS {
        summary = "({{ $labels.env }}) Dock is experiencing high load host={{ $labels.hostIp }} labels={{ $labels }}",
        description = "ssh {{ $labels.hostIp }} into dock make sure it is responsive, if it is not, unhealthy. `docks unhealthy -e delta {{ $labels.hostIp }}`"
      }

  prometheus.yml: |
    # Global defaults: scrape targets and evaluate rules once a minute.
    global:
      scrape_interval: 1m
      evaluation_interval: 1m

    # Resolved relative to the directory this config file is loaded from.
    rule_files:
      - "alerts.conf"

    scrape_configs:
      # Prometheus scrapes its own /metrics endpoint.
      - job_name: prometheus
        metrics_path: /metrics
        static_configs:
          - targets: ['localhost:9090']

      # Discovers instances through the EC2 API and scrapes port 29007 on the
      # ones that survive the relabel rules below. This job overrides the
      # global interval and is scraped every 30 minutes.
      - job_name: container_info
        scrape_interval: 30m
        ec2_sd_configs:
          # AWS credentials are intentionally NOT stored in this ConfigMap.
          # With access_key/secret_key omitted, Prometheus reads
          # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from its environment
          # (inject them from a Kubernetes Secret) or uses the credentials of
          # the instance it runs on. The key pair that used to be committed
          # here must be treated as leaked and rotated.
          - region: us-west-2
            port: 29007

        relabel_configs:
          # Drop instances whose autoscaling-group tag matches
          # delta-asg-dock-pool.
          - source_labels: [__meta_ec2_tag_aws_autoscaling_groupName]
            regex: delta-asg-dock-pool
            action: drop

          # Keep only instances tagged env=production-delta ...
          - source_labels: [__meta_ec2_tag_env]
            regex: production-delta
            action: keep

          # ... and role=dock.
          - source_labels: [__meta_ec2_tag_role]
            regex: dock
            action: keep

          # Copy EC2 metadata onto the labels used by the alert templates.
          - source_labels: [__meta_ec2_tag_org]
            target_label: githubOrgId

          - source_labels: [__meta_ec2_private_ip]
            target_label: hostIp

          - source_labels: [__meta_ec2_tag_env]
            target_label: env

      # Same discovery and relabelling as container_info, but scrapes port
      # 29006 at the global interval. Keeps only production-delta docks.
      - job_name: server_info
        ec2_sd_configs:
          # Credentials come from the environment or the instance, exactly as
          # for container_info; do not add access_key/secret_key here.
          - region: us-west-2
            port: 29006

        relabel_configs:
          # Drop instances whose autoscaling-group tag matches
          # delta-asg-dock-pool.
          - source_labels: [__meta_ec2_tag_aws_autoscaling_groupName]
            regex: delta-asg-dock-pool
            action: drop

          # Keep only instances tagged env=production-delta ...
          - source_labels: [__meta_ec2_tag_env]
            regex: production-delta
            action: keep

          # ... and role=dock.
          - source_labels: [__meta_ec2_tag_role]
            regex: dock
            action: keep

          # Copy EC2 metadata onto the labels used by the alert templates.
          - source_labels: [__meta_ec2_tag_org]
            target_label: githubOrgId

          - source_labels: [__meta_ec2_private_ip]
            target_label: hostIp

          - source_labels: [__meta_ec2_tag_env]
            target_label: env
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
---
# Runs a single Prometheus v1.4.1 server.
#   /prometheus - the prometheus-conf ConfigMap (prometheus.yml, alerts.conf)
#   /data       - time-series storage on the prometheus-db-claim volume
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: prometheus
spec:
  replicas: 1
  # The data volume is claimed ReadWriteOnce, so the old pod has to release it
  # before a replacement can mount it. Recreate stops the old pod first; the
  # default rolling update would start the new pod while the old one still
  # holds the volume.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      imagePullSecrets:
        - name: runnable-kubernetes-pull-secret
      hostname: prometheus
      containers:
        - image: prom/prometheus:v1.4.1
          imagePullPolicy: Always
          name: prometheus
          resources:
            requests:
              cpu: "250m"
              memory: "500M"
            limits:
              cpu: "1550m"
              memory: "15G"
          args:
            # Alerts are forwarded to the prometheus-alerts service.
            - -alertmanager.url
            - "http://prometheus-alerts:9093"
            # Config file provided by the prometheus-conf ConfigMap mount.
            - -config.file=/prometheus/prometheus.yml
            - -storage.local.path
            - "/data"
            # 168h = 7 days of retained samples.
            - -storage.local.retention
            - 168h0m0s
            - -web.external-url
            - http://localhost
            - -web.listen-address
            - ":9090"

          ports:
            - containerPort: 9090
          volumeMounts:
            - name: prometheus-conf
              mountPath: /prometheus
            - name: prometheus-db-claim
              mountPath: /data
      volumes:
        - name: prometheus-conf
          configMap:
            name: prometheus-conf
        - name: prometheus-db-claim
          persistentVolumeClaim:
            claimName: prometheus-db-claim
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
# Exposes TCP port 9090 of the pods labelled app=prometheus.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
spec:
  ports:
    # The port name is quoted so it stays a string rather than an integer.
    - name: "9090"
      protocol: TCP
      port: 9090
  selector:
    app: prometheus
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
# Claim for 100Gi of ReadWriteOnce storage; the prometheus Deployment mounts it
# at /data.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-db-claim
  labels:
    type: amazonEBS
spec:
  resources:
    requests:
      storage: 100Gi
  accessModes:
    - ReadWriteOnce
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
---
# Statically provisioned 100Gi volume backed by an existing EBS volume,
# formatted as ext4. Capacity and access mode match prometheus-db-claim.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-db
  labels:
    type: amazonEBS
spec:
  accessModes:
    - ReadWriteOnce
  capacity:
    storage: 100Gi
  awsElasticBlockStore:
    fsType: ext4
    volumeID: vol-0dc9ca42481538a30

0 commit comments

Comments
 (0)