Skip to content

Commit efd4a4e

Browse files
authored
tools/lab_bisect: Add retry logic for server connection errors (#75)
When the Batfish server fails to start, or connection errors occur during testing, retry once with a clean server restart before skipping the commit. This prevents transient server issues from being treated as test failures during bisection.

- Add a `retry_count` parameter to the `test_commit()` method
- Retry server startup failures once before giving up
- Detect connection errors in test output and retry once
- Skip commits (return `None`) after a failed retry instead of treating them as test failures (return `False`)
- Add an `_is_connection_error()` helper to detect various connection-error patterns

---

**Stack**:
- #76
- #75

⚠️ *Part of a stack created by [spr](https://github.com/ejoffe/spr). Do not merge manually using the UI - doing so may have unexpected results.*
1 parent 61d7762 commit efd4a4e

1 file changed

Lines changed: 38 additions & 4 deletions

File tree

tools/lab_bisect.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,20 @@ def finalize_log(self, first_bad_commit: Optional[str] = None):
9696
self.logger.info(f"Bisection complete. Full log saved to {self.log_file}")
9797
return self.log_file
9898

99+
def _is_connection_error(self, test_output: str) -> bool:
100+
"""Check if test output indicates a connection error to Batfish server."""
101+
connection_error_indicators = [
102+
"Connection refused",
103+
"ConnectionRefusedError",
104+
"Failed to establish a new connection",
105+
"NewConnectionError",
106+
"Max retries exceeded",
107+
"HTTPConnectionPool(host='localhost', port=9996)",
108+
]
109+
return any(
110+
indicator in test_output for indicator in connection_error_indicators
111+
)
112+
99113
def cleanup_containers(self) -> bool:
100114
"""Remove containers directory to clear cached files."""
101115
try:
@@ -263,9 +277,9 @@ def get_commit_from_date(self, date: str) -> Optional[str]:
263277
self.logger.error(f"Error getting commit for date {date}: {e}")
264278
return None
265279

266-
def test_commit(self, commit: str) -> Optional[bool]:
280+
def test_commit(self, commit: str, retry_count: int = 0) -> Optional[bool]:
267281
"""Test a specific commit. Returns True if good, False if bad, None if error."""
268-
self.logger.info(f"Testing commit: {commit}")
282+
self.logger.info(f"Testing commit: {commit} (attempt {retry_count + 1})")
269283
error_msg = ""
270284
test_output = ""
271285

@@ -288,12 +302,32 @@ def test_commit(self, commit: str) -> Optional[bool]:
288302
# Start server
289303
if not self.start_batfish_server():
290304
error_msg = "Failed to start Batfish server"
291-
self.log_commit_test(commit, None, test_output, error_msg)
292-
return None
305+
# Retry once for server startup failures
306+
if retry_count == 0:
307+
self.logger.warning(
308+
f"Batfish server failed to start, retrying once..."
309+
)
310+
return self.test_commit(commit, retry_count + 1)
311+
else:
312+
self.log_commit_test(commit, None, test_output, error_msg)
313+
return None
293314

294315
# Test the lab
295316
result, test_output = self.test_lab()
296317

318+
# Check if the test failed due to connection errors (indicating server issues)
319+
if not result and self._is_connection_error(test_output):
320+
# Stop server and retry once for connection errors
321+
self.stop_batfish_server()
322+
if retry_count == 0:
323+
self.logger.warning(f"Connection error detected, retrying once...")
324+
return self.test_commit(commit, retry_count + 1)
325+
else:
326+
# After retry, treat as skip (return None) rather than failure (return False)
327+
error_msg = "Connection error persisted after retry - skipping"
328+
self.log_commit_test(commit, None, test_output, error_msg)
329+
return None
330+
297331
# Stop server
298332
self.stop_batfish_server()
299333

0 commit comments

Comments
 (0)