Skip to content

Commit 15da119

Browse files
authored
Merge pull request #10 from crane-cloud/ft-add-ml-monitoring
feat: add gpu monitoring for ml apps
2 parents 37bc250 + 612a13f commit 15da119

5 files changed

Lines changed: 57 additions & 32 deletions

File tree

app/controllers/app.py

Lines changed: 20 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -11,13 +11,15 @@
1111
class AppUsageView(Resource):
1212
@jwt_required
1313
def post(self, resource):
14-
if resource not in ['cpu', 'memory', 'network']:
15-
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network'), 400
14+
if resource not in ['cpu', 'memory', 'network', 'gpu']:
15+
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network, gpu'), 400
1616

1717
app = get_app_data(request)
18-
19-
if app.status_code != 200:
20-
return dict(status='fail', message=app.message), app.status_code
18+
try:
19+
if app.status_code != 200:
20+
return dict(status='fail', message=app.message), app.status_code
21+
except:
22+
return app
2123

2224
start = app.start
2325
end = app.end
@@ -31,30 +33,37 @@ def post(self, resource):
3133
if not is_valid:
3234
return dict(status='fail', message=message), 400
3335

36+
QUERY = f'''{{container!="POD", image!="", namespace="{namespace}", pod=~"{app_alias}.*"}}'''
37+
3438
try:
3539
if resource == 'cpu':
3640
prom_data = prometheus.query_rang(
3741
start=start,
3842
end=end,
3943
step=step,
40-
metric='sum(rate(container_cpu_usage_seconds_total{container!="POD", image!="", namespace="' +
41-
namespace + '", pod=~"' + app_alias + '.*"}[5m]))')
44+
metric=f'''sum(rate(container_cpu_usage_seconds_total{QUERY}[5m]))''')
4245
elif resource == 'memory':
4346
prom_data = prometheus.query_rang(
4447
start=start,
4548
end=end,
4649
step=step,
47-
metric='sum(rate(container_memory_usage_bytes{container_name!="POD", image!="",pod=~"' + app_alias + '.*", namespace="' + namespace + '"}[5m]))')
50+
metric=f'''sum(rate(container_memory_usage_bytes{QUERY}[5m]))''')
4851
elif resource == 'network':
4952
prom_data = prometheus.query_rang(
5053
start=start,
5154
end=end,
5255
step=step,
53-
metric='sum(rate(container_network_receive_bytes_total{namespace="' +
54-
namespace + '", pod=~"' + app_alias + '.*"}[5m]))'
56+
metric=f'''sum(rate(container_network_receive_bytes_total{QUERY}[5m]))''')
57+
58+
elif resource == 'gpu':
59+
prom_data = prometheus.query_rang(
60+
start=start,
61+
end=end,
62+
step=step,
63+
metric=f'''sum(rate(container_gpu_usage_seconds_total{QUERY}[5m]))'''
5564
)
5665
else:
57-
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network'), 400
66+
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network, gpu'), 400
5867
except Exception as error:
5968
return dict(status='fail', message=str(error)), 500
6069

app/controllers/project.py

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import json
22
from prometheus_http_client import Prometheus
33
from flask_restful import Resource, request
4-
from app.helpers.utils import get_project_data,is_valid_prometheus_query
4+
from app.helpers.utils import get_project_data, is_valid_prometheus_query
55

66
from app.helpers.authenticate import (
77
jwt_required
@@ -11,8 +11,8 @@
1111
class ProjectUsageView(Resource):
1212
@jwt_required
1313
def post(self, resource):
14-
if resource not in ['cpu', 'memory', 'network']:
15-
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network'), 400
14+
if resource not in ['cpu', 'memory', 'network', 'gpu']:
15+
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network, gpu'), 400
1616

1717
project = get_project_data(request)
1818

@@ -29,32 +29,37 @@ def post(self, resource):
2929
is_valid, message = is_valid_prometheus_query(step, start, end)
3030
if not is_valid:
3131
return dict(status='fail', message=message), 400
32-
32+
QUERY = f'''{{container!="POD", image!="", namespace="{namespace}"}}'''
3333
try:
3434
if resource == 'cpu':
3535
prom_data = prometheus.query_rang(
3636
start=start,
3737
end=end,
3838
step=step,
39-
metric='sum(rate(container_cpu_usage_seconds_total{container!="POD", image!="",namespace="' +
40-
namespace+'"}[5m]))'
39+
metric=f'''sum(rate(container_cpu_usage_seconds_total{QUERY}[5m]))'''
4140
)
4241
elif resource == 'memory':
4342
prom_data = prometheus.query_rang(
4443
start=start,
4544
end=end,
4645
step=step,
47-
metric='sum(rate(container_memory_usage_bytes{container_name!="POD", image!="", namespace="'+namespace+'"}[5m]))')
46+
metric=f'''sum(rate(container_memory_usage_bytes{QUERY}[5m]))''')
4847
elif resource == 'network':
4948
prom_data = prometheus.query_rang(
5049
start=start,
5150
end=end,
5251
step=step,
53-
metric='sum(rate(container_network_receive_bytes_total{namespace="' +
54-
namespace+'"}[5m]))'
52+
metric=f'''sum(rate(container_network_receive_bytes_total{QUERY}[5m]))'''
53+
)
54+
elif resource == 'gpu':
55+
prom_data = prometheus.query_rang(
56+
start=start,
57+
end=end,
58+
step=step,
59+
metric=f'''sum(rate(container_gpu_usage_seconds_total{QUERY}[5m]))'''
5560
)
5661
else:
57-
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network'), 400
62+
return dict(status='fail', message='Invalid resource name, pass cpu, memory, network, gpu'), 400
5863
except Exception as error:
5964
return dict(status='fail', message=str(error)), 500
6065

app/helpers/utils.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -93,6 +93,8 @@ def get_app_data(request):
9393
)
9494

9595
if not app_response.ok:
96+
if app_response.status_code == 404:
97+
return SimpleNamespace(status='failed', message=app_response.json()['message'], status_code=404)
9698
return SimpleNamespace(status='failed', message="Failed to fetch app for current user", status_code=400)
9799

98100
app_response = app_response.json()
@@ -109,7 +111,6 @@ def get_app_data(request):
109111
)
110112

111113

112-
113114
# default max data points for prometheus
114115
MAX_DATA_POINTS = 11000
115116
STEP_UNITS_IN_SECONDS = {
@@ -120,16 +121,19 @@ def get_app_data(request):
120121
"w": 604800,
121122
}
122123

124+
123125
def parse_step_to_seconds(step: str) -> int:
124126
"""
125127
Parses a Prometheus step string like '1m', '2h', '4d' into seconds.
126128
"""
127129
match = re.match(r"^(\d+)([smhdw])$", step)
128130
if not match:
129-
raise ValueError("Invalid step format. Use formats like 30s, 5m, 2h, 1d.")
131+
raise ValueError(
132+
"Invalid step format. Use formats like 30s, 5m, 2h, 1d.")
130133
value, unit = match.groups()
131134
return int(value) * STEP_UNITS_IN_SECONDS[unit]
132135

136+
133137
def is_valid_prometheus_query(step: str, start_ts: int, end_ts: int) -> (bool, str):
134138
"""
135139
Validates if the number of points in a Prometheus query is within the allowed range.
@@ -138,15 +142,14 @@ def is_valid_prometheus_query(step: str, start_ts: int, end_ts: int) -> (bool, s
138142
step_seconds = parse_step_to_seconds(step)
139143
except ValueError as e:
140144
return False, str(e)
141-
145+
142146
if start_ts >= end_ts:
143147
return False, "Start timestamp must be less than end timestamp."
144-
148+
145149
total_duration = end_ts - start_ts
146150
num_points = total_duration // step_seconds
147151

148152
if num_points > MAX_DATA_POINTS:
149153
return False, f"Query returns {num_points} points, which exceeds the limit of {MAX_DATA_POINTS}. Increase the step or reduce the time range."
150-
151-
return True, f"Query valid: {num_points} data points."
152154

155+
return True, f"Query valid: {num_points} data points."

docker-compose.yml

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ services:
88
POSTGRES_USER: postgres
99
POSTGRES_DB: cc-monitoring
1010
ports:
11-
- "4200:5432"
11+
- "4240:5432"
1212
volumes:
1313
- db-data:/var/lib/postgresql/data
1414

@@ -18,17 +18,21 @@ services:
1818
context: .
1919
dockerfile: Dockerfile
2020
container_name: monitoring-api
21+
networks:
22+
- cranecloud
23+
- default
2124
environment:
2225
FLASK_APP_SECRET:
2326
JWT_SALT:
2427
PYTHONDONTWRITEBYTECODE: 1
2528
PYTHONUNBUFFERED: 1
2629
FLASK_ENV: development
30+
FLASK_DEBUG: 1
2731
DATABASE_USER: postgres
2832
DATABASE_URI: ${DATABASE_URI:-postgresql://postgres:postgres@database:5432/monitor}
2933
TEST_DATABASE_URI: ${TEST_DATABASE_URI:-postgresql://postgres:postgres@database:5432/monitor_test}
3034
FLASK_APP: server.py
31-
PRODUCT_BASE_URL: ${PRODUCT_BASE_URL:-http://127.0.0.1:5000}
35+
PRODUCT_BASE_URL: ${PRODUCT_BASE_URL:-http://flask-api:5000}
3236
ports:
3337
- "${FLASK_PORT:-4000}:5000"
3438
volumes:
@@ -38,6 +42,10 @@ services:
3842
links:
3943
- database
4044

41-
4245
volumes:
43-
db-data:
46+
db-data:
47+
48+
networks:
49+
cranecloud:
50+
external: true
51+
name: cranecloud_default

scripts/start.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,4 +6,4 @@
66
flask db upgrade
77

88
# start server
9-
flask run --host=0.0.0.0 --port=5000
9+
flask run --host=0.0.0.0 --port=5000 --debug --reload

0 commit comments

Comments
 (0)