From 139aea67b1a29ed5fe301391297a5741db12ede3 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Sun, 11 Dec 2022 08:32:25 -0300
Subject: [PATCH] Add gpu worker tests to debug tractor lifecycle stuff, also
 logging

Minor tweak to cuda docker image
Pin tractor branch
Add triton to cuda reqs
---
 Dockerfile.runtime-cuda |   2 +-
 requirements.cuda.0.txt |   1 +
 requirements.txt        |   2 +-
 skynet_bot/dgpu.py      |   3 +-
 skynet_bot/gpu.py       |   4 +-
 test.sh                 |   3 +-
 tests/test_dgpu.py      | 106 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 116 insertions(+), 5 deletions(-)

diff --git a/Dockerfile.runtime-cuda b/Dockerfile.runtime-cuda
index c39aefc..af7c77f 100644
--- a/Dockerfile.runtime-cuda
+++ b/Dockerfile.runtime-cuda
@@ -5,7 +5,7 @@ env DEBIAN_FRONTEND=noninteractive
 
 workdir /skynet
 
-copy requirements.* .
+copy requirements.* ./
 
 run pip install -U pip ninja
 run pip install -r requirements.cuda.0.txt
diff --git a/requirements.cuda.0.txt b/requirements.cuda.0.txt
index e91ed18..3c1ce2a 100644
--- a/requirements.cuda.0.txt
+++ b/requirements.cuda.0.txt
@@ -1,5 +1,6 @@
 pdbpp
 scipy
+triton
 accelerate
 transformers
 huggingface_hub
diff --git a/requirements.txt b/requirements.txt
index 7831fd3..1fee2d2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,4 @@ aiohttp
 msgspec
 trio_asyncio
 
-git+https://github.com/goodboy/tractor.git@master#egg=tractor
+git+https://github.com/goodboy/tractor.git@piker_pin#egg=tractor
diff --git a/skynet_bot/dgpu.py b/skynet_bot/dgpu.py
index 016aa53..c454717 100644
--- a/skynet_bot/dgpu.py
+++ b/skynet_bot/dgpu.py
@@ -93,8 +93,9 @@ async def open_dgpu_node(
 ):
     logging.info(f'starting {dgpu_max_tasks} gpu workers')
     async with tractor.gather_contexts((
-        ctx.open_context(
+        portal.open_context(
             open_gpu_worker, algo, 1.0 / dgpu_max_tasks)
+        for portal in portal_map.values()
     )) as contexts:
         contexts = {i: ctx for i, ctx in enumerate(contexts)}
         for i, ctx in contexts.items():
diff --git a/skynet_bot/gpu.py b/skynet_bot/gpu.py
index b805bab..2756e0b 100644
--- a/skynet_bot/gpu.py
+++ b/skynet_bot/gpu.py
@@ -45,11 +45,13 @@ async def open_gpu_worker(
     start_algo: str,
     mem_fraction: float
 ):
+    log = tractor.log.get_logger(name='gpu', _root_name='skynet')
+    log.info(f'starting gpu worker with algo {start_algo}...')
     current_algo = start_algo
     with torch.no_grad():
         pipe = pipeline_for(current_algo, mem_fraction)
+        log.info('pipeline loaded')
         await ctx.started()
-
         async with ctx.open_stream() as bus:
             async for ireq in bus:
                 if ireq.algo != current_algo:
diff --git a/test.sh b/test.sh
index 4cbef69..7cc11fd 100755
--- a/test.sh
+++ b/test.sh
@@ -1,8 +1,9 @@
 docker run \
     -it \
     --rm \
+    --gpus=all \
     --mount type=bind,source="$(pwd)",target=/skynet \
     skynet:runtime-cuda \
     bash -c \
     "cd /skynet && pip install -e . && \
-    pytest tests/test_dgpu.py --log-cli-level=info"
+    pytest $1 --log-cli-level=info"
diff --git a/tests/test_dgpu.py b/tests/test_dgpu.py
index 421eb18..d4ae034 100644
--- a/tests/test_dgpu.py
+++ b/tests/test_dgpu.py
@@ -6,8 +6,10 @@ import logging
 
 import trio
 import pynng
+import tractor
 import trio_asyncio
 
+from skynet_bot.gpu import open_gpu_worker
 from skynet_bot.dgpu import open_dgpu_node
 from skynet_bot.types import *
 from skynet_bot.brain import run_skynet
@@ -15,6 +17,110 @@ from skynet_bot.constants import *
 
 from skynet_bot.frontend import open_skynet_rpc, rpc_call
 
+@tractor.context
+async def open_fake_worker(
+    ctx: tractor.Context,
+    start_algo: str,
+    mem_fraction: float
+):
+    log = tractor.log.get_logger(name='gpu', _root_name='skynet')
+    log.info(f'starting gpu worker with algo {start_algo}...')
+    current_algo = start_algo
+    log.info('pipeline loaded')
+    await ctx.started()
+    async with ctx.open_stream() as bus:
+        async for ireq in bus:
+            if ireq:
+                await bus.send('hello!')
+            else:
+                break
+
+def test_gpu_worker():
+    log = tractor.log.get_logger(name='root', _root_name='skynet')
+    async def main():
+        async with (
+            tractor.open_nursery(debug_mode=True) as an,
+            trio.open_nursery() as n
+        ):
+            portal = await an.start_actor(
+                'gpu_worker',
+                enable_modules=[__name__],
+                debug_mode=True
+            )
+
+            log.info('portal opened')
+            async with (
+                portal.open_context(
+                    open_fake_worker,
+                    start_algo='midj',
+                    mem_fraction=0.6
+                ) as (ctx, _),
+                ctx.open_stream() as stream,
+            ):
+                log.info('opened worker, sending req...')
+                ireq = ImageGenRequest(
+                    prompt='a red tractor on a wheat field',
+                    step=28,
+                    width=512, height=512,
+                    guidance=10, seed=None,
+                    algo='midj', upscaler=None)
+
+                await stream.send(ireq)
+                log.info('sent, awaiting response')
+                async for msg in stream:
+                    log.info(f'got {msg}')
+                    break
+
+                assert msg == 'hello!'
+                await stream.send(None)
+                log.info('done.')
+
+            await portal.cancel_actor()
+
+    trio.run(main)
+
+
+def test_gpu_two_workers():
+    async def main():
+        outputs = []
+        async with (
+            tractor.open_actor_cluster(
+                modules=[__name__],
+                count=2,
+                names=[0, 1]) as portal_map,
+            tractor.trionics.gather_contexts((
+                portal.open_context(
+                    open_fake_worker,
+                    start_algo='midj',
+                    mem_fraction=0.333)
+                for portal in portal_map.values()
+            )) as contexts,
+            trio.open_nursery() as n
+        ):
+            ireq = ImageGenRequest(
+                prompt='a red tractor on a wheat field',
+                step=28,
+                width=512, height=512,
+                guidance=10, seed=None,
+                algo='midj', upscaler=None)
+
+            async def get_img(i):
+                ctx = contexts[i]
+                async with ctx.open_stream() as stream:
+                    await stream.send(ireq)
+                    async for img in stream:
+                        outputs.append(img)
+                        await portal_map[i].cancel_actor()
+
+            n.start_soon(get_img, 0)
+            n.start_soon(get_img, 1)
+
+
+        assert len(outputs) == 2
+
+    trio.run(main)
+
+
 def test_dgpu_simple():
     async def main():
         async with trio.open_nursery() as n:
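
A usage sketch for the updated test.sh (a minimal example, assuming the skynet:runtime-cuda image has already been built and the script is invoked from the repository root): the script's first argument is now forwarded straight to pytest inside the container, so a whole module or a single test can be selected with the usual pytest node-id syntax:

    # run the whole dgpu test module
    ./test.sh tests/test_dgpu.py
    # or target one test by its pytest node id
    ./test.sh tests/test_dgpu.py::test_gpu_worker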