Skip to content

Commit 612a13f

Browse files
committed
add gpu monitoring
1 parent 2d3722d commit 612a13f

3 files changed

Lines changed: 16 additions & 18 deletions

File tree

app/controllers/app.py

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -33,38 +33,37 @@ def post(self, resource):
3333
if not is_valid:
3434
return dict(status='fail', message=message), 400
3535

36+
QUERY = f'''{{container!="POD", image!="", namespace="{namespace}", pod=~"{app_alias}.*"}}'''
37+
3638
try:
3739
if resource == 'cpu':
3840
prom_data = prometheus.query_rang(
3941
start=start,
4042
end=end,
4143
step=step,
42-
metric='sum(rate(container_cpu_usage_seconds_total{container!="POD", image!="", namespace="' +
43-
namespace + '", pod=~"' + app_alias + '.*"}[5m]))')
44+
metric=f'''sum(rate(container_cpu_usage_seconds_total{QUERY}[5m]))''')
4445
elif resource == 'memory':
4546
prom_data = prometheus.query_rang(
4647
start=start,
4748
end=end,
4849
step=step,
49-
metric='sum(rate(container_memory_usage_bytes{container_name!="POD", image!="",pod=~"' + app_alias + '.*", namespace="' + namespace + '"}[5m]))')
50+
metric=f'''sum(rate(container_memory_usage_bytes{QUERY}[5m]))''')
5051
elif resource == 'network':
5152
prom_data = prometheus.query_rang(
5253
start=start,
5354
end=end,
5455
step=step,
55-
metric='sum(rate(container_network_receive_bytes_total{namespace="' +
56-
namespace + '", pod=~"' + app_alias + '.*"}[5m]))'
57-
)
56+
metric=f'''sum(rate(container_network_receive_bytes_total{QUERY}[5m]))''')
57+
5858
elif resource == 'gpu':
5959
prom_data = prometheus.query_rang(
6060
start=start,
6161
end=end,
6262
step=step,
63-
metric='sum(rate(container_gpu_usage_seconds_total{namespace="' +
64-
namespace+'"}[5m]))'
63+
metric=f'''sum(rate(container_gpu_usage_seconds_total{QUERY}[5m]))'''
6564
)
6665
else:
67-
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network'), 400
66+
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network, gpu'), 400
6867
except Exception as error:
6968
return dict(status='fail', message=str(error)), 500
7069

app/controllers/project.py

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -29,40 +29,37 @@ def post(self, resource):
2929
is_valid, message = is_valid_prometheus_query(step, start, end)
3030
if not is_valid:
3131
return dict(status='fail', message=message), 400
32-
32+
QUERY = f'''{{container!="POD", image!="", namespace="{namespace}"}}'''
3333
try:
3434
if resource == 'cpu':
3535
prom_data = prometheus.query_rang(
3636
start=start,
3737
end=end,
3838
step=step,
39-
metric='sum(rate(container_cpu_usage_seconds_total{container!="POD", image!="",namespace="' +
40-
namespace+'"}[5m]))'
39+
metric=f'''sum(rate(container_cpu_usage_seconds_total{QUERY}[5m]))'''
4140
)
4241
elif resource == 'memory':
4342
prom_data = prometheus.query_rang(
4443
start=start,
4544
end=end,
4645
step=step,
47-
metric='sum(rate(container_memory_usage_bytes{container_name!="POD", image!="", namespace="'+namespace+'"}[5m]))')
46+
metric=f'''sum(rate(container_memory_usage_bytes{QUERY}[5m]))''')
4847
elif resource == 'network':
4948
prom_data = prometheus.query_rang(
5049
start=start,
5150
end=end,
5251
step=step,
53-
metric='sum(rate(container_network_receive_bytes_total{namespace="' +
54-
namespace+'"}[5m]))'
52+
metric=f'''sum(rate(container_network_receive_bytes_total{QUERY}[5m]))'''
5553
)
5654
elif resource == 'gpu':
5755
prom_data = prometheus.query_rang(
5856
start=start,
5957
end=end,
6058
step=step,
61-
metric='sum(rate(container_gpu_usage_seconds_total{namespace="' +
62-
namespace+'"}[5m]))'
59+
metric=f'''sum(rate(container_gpu_usage_seconds_total{QUERY}[5m]))'''
6360
)
6461
else:
65-
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network'), 400
62+
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network, gpu'), 400
6663
except Exception as error:
6764
return dict(status='fail', message=str(error)), 500
6865

app/helpers/utils.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -93,6 +93,8 @@ def get_app_data(request):
9393
)
9494

9595
if not app_response.ok:
96+
if app_response.status_code == 404:
97+
return SimpleNamespace(status='failed', message=app_response.json()['message'], status_code=404)
9698
return SimpleNamespace(status='failed', message="Failed to fetch app for current user", status_code=400)
9799

98100
app_response = app_response.json()

0 commit comments

Comments
 (0)