Harden container cancel-and-wait supervisor loop

This should hopefully make teardown more reliable and includes better
logic to fail over to a hard kill path after a 3 second timeout waiting
for the instance to complete using the `docker-py` wait API. Also
generalize the supervisor teardown loop by allowing the container config
endpoint to return 2 msgs to expect:
- a startup message that can be read from the container's internal
  process logging that indicates it is fully up and ready.
- a teardown msg that can be polled for that indicates the container has
  gracefully terminated after a cancellation request which is passed to
  our container wrappers `.cancel()` method.

Make the marketstore config endpoint return the 2 messages we previously
had hard coded and use this new api.
contain_mkts
Tyler Goodlet 2022-06-23 10:23:14 -04:00
parent dcee0ddd55
commit b8b76a32a6
2 changed files with 62 additions and 32 deletions

View File

@ -19,6 +19,7 @@ Supervisor for docker with included specific-image service helpers.
''' '''
import os import os
import time
from typing import ( from typing import (
Optional, Optional,
Callable, Callable,
@ -186,30 +187,30 @@ class Container:
async def cancel( async def cancel(
self, self,
stop_msg: str,
) -> None: ) -> None:
cid = self.cntr.id cid = self.cntr.id
# first try a graceful cancel
log.cancel(
f'SIGINT cancelling container: {cid}\n'
f'waiting on stop msg: "{stop_msg}"'
)
self.try_signal('SIGINT') self.try_signal('SIGINT')
with trio.move_on_after(0.5) as cs: start = time.time()
cs.shield = True for _ in range(30):
await self.process_logs_until('initiating graceful shutdown')
await self.process_logs_until('exiting...',)
for _ in range(10):
with trio.move_on_after(0.5) as cs: with trio.move_on_after(0.5) as cs:
cs.shield = True cs.shield = True
await self.process_logs_until('exiting...',) await self.process_logs_until(stop_msg)
# if we aren't cancelled on above checkpoint then we
# assume we read the expected stop msg and terminated.
break break
if cs.cancelled_caught:
# get out the big guns, bc apparently marketstore
# doesn't actually know how to terminate gracefully
# :eyeroll:...
self.try_signal('SIGKILL')
try: try:
log.info('Waiting on container shutdown: {cid}') log.info(f'Polling for container shutdown:\n{cid}')
self.cntr.wait( self.cntr.wait(
timeout=0.1, timeout=0.1,
condition='not-running', condition='not-running',
@ -218,13 +219,30 @@ class Container:
except ( except (
ReadTimeout, ReadTimeout,
):
log.info(f'Still waiting on container:\n{cid}')
continue
except (
docker.errors.APIError,
ConnectionError, ConnectionError,
): ):
log.error(f'failed to wait on container {cid}') log.exception(f'Docker connection failure')
raise break
else: else:
raise RuntimeError('Failed to cancel container {cid}') delay = time.time() - start
log.error(
f'Failed to kill container {cid} after {delay}s\n'
'sending SIGKILL..'
)
# get out the big guns, bc apparently marketstore
# doesn't actually know how to terminate gracefully
# :eyeroll:...
self.try_signal('SIGKILL')
self.cntr.wait(
timeout=3,
condition='not-running',
)
log.cancel(f'Container stopped: {cid}') log.cancel(f'Container stopped: {cid}')
@ -245,13 +263,18 @@ async def open_ahabd(
# params, etc. passing to ``Containter.run()``? # params, etc. passing to ``Containter.run()``?
# call into endpoint for container config/init # call into endpoint for container config/init
ep_func = NamespacePath(endpoint).load_ref() ep_func = NamespacePath(endpoint).load_ref()
dcntr, cntr_config = ep_func(client) (
dcntr,
cntr_config,
start_msg,
stop_msg,
) = ep_func(client)
cntr = Container(dcntr) cntr = Container(dcntr)
with trio.move_on_after(1): with trio.move_on_after(1):
found = await cntr.process_logs_until( found = await cntr.process_logs_until(start_msg)
"launching tcp listener for all services...", # "launching tcp listener for all services...",
) # )
if not found and cntr not in client.containers.list(): if not found and cntr not in client.containers.list():
raise RuntimeError( raise RuntimeError(
@ -278,7 +301,7 @@ async def open_ahabd(
): ):
with trio.CancelScope(shield=True): with trio.CancelScope(shield=True):
await cntr.cancel() await cntr.cancel(stop_msg)
raise raise

View File

@ -185,7 +185,14 @@ def start_marketstore(
init=True, init=True,
# remove=True, # remove=True,
) )
return dcntr, _config return (
dcntr,
_config,
# expected startup and stop msgs
"launching tcp listener for all services...",
"exiting...",
)
_tick_tbk_ids: tuple[str, str] = ('1Sec', 'TICK') _tick_tbk_ids: tuple[str, str] = ('1Sec', 'TICK')