Skip to content

Commit 57b27a7

Browse files
committed
Add retry for the machine ssh
There is a gap between the time the machine is assigned an IP address and the time its ssh service is up and listening, which creates a race for the ssh command (i.e. we may run the ssh command after the machine gets the IP but before the ssh service is up). To mitigate this, we retry the command a limited number of times, until either the ssh command succeeds or we run out of attempts (i.e. something else is going on).
1 parent 92ab788 commit 57b27a7

1 file changed

Lines changed: 29 additions & 7 deletions

File tree

juju/machine.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,21 @@ async def _scp(self, source, destination, scp_opts):
136136
]
137137
cmd.extend(scp_opts.split() if isinstance(scp_opts, str) else scp_opts)
138138
cmd.extend([source, destination])
139-
process = await jasyncio.create_subprocess_exec(*cmd)
140-
await process.wait()
139+
# There's a bit of a gap between the time that the machine is assigned an IP and the ssh
140+
# service is up and listening, which creates a race for the ssh command. So we retry a
141+
# couple of times until either we run out of attempts, or the ssh command succeeds to
142+
# mitigate that effect.
143+
# TODO (cderici): refactor the ssh and scp subcommand processing into a single method.
144+
retry_backoff = 1
145+
retries = 10
146+
for _ in range(retries):
147+
process = await jasyncio.create_subprocess_exec(*cmd)
148+
await process.wait()
149+
if process.returncode == 0:
150+
break
151+
await jasyncio.sleep(retry_backoff)
141152
if process.returncode != 0:
142-
raise JujuError("command failed: %s" % cmd)
153+
raise JujuError(f"command failed after {retries} attempts: {cmd}")
143154

144155
async def ssh(
145156
self, command, user='ubuntu', proxy=False, ssh_opts=None, wait_for_active=False, timeout=None):
@@ -169,11 +180,22 @@ async def ssh(
169180
if ssh_opts:
170181
cmd.extend(ssh_opts.split() if isinstance(ssh_opts, str) else ssh_opts)
171182
cmd.extend([command])
172-
process = await jasyncio.create_subprocess_exec(
173-
*cmd, stdout=jasyncio.subprocess.PIPE, stderr=jasyncio.subprocess.PIPE)
174-
stdout, stderr = await process.communicate()
183+
184+
# There's a bit of a gap between the time that the machine is assigned an IP and the ssh
185+
# service is up and listening, which creates a race for the ssh command. So we retry a
186+
# couple of times until either we run out of attempts, or the ssh command succeeds to
187+
# mitigate that effect.
188+
retry_backoff = 1
189+
retries = 10
190+
for _ in range(retries):
191+
process = await jasyncio.create_subprocess_exec(
192+
*cmd, stdout=jasyncio.subprocess.PIPE, stderr=jasyncio.subprocess.PIPE)
193+
stdout, stderr = await process.communicate()
194+
if process.returncode == 0:
195+
break
196+
await jasyncio.sleep(retry_backoff)
175197
if process.returncode != 0:
176-
raise JujuError("command failed: %s with %s" % (cmd, stderr.decode()))
198+
raise JujuError(f"command failed: {cmd} after {retries} attempts, with {stderr.decode()}")
177199
# stdout is a bytes-like object, returning a string might be more useful
178200
return stdout.decode()
179201

0 commit comments

Comments
 (0)