Make GPU work cancellable using the trio threading APIs; also make Docker always reinstall the package for easier development

pull/26/head
Guillermo Rodriguez 2023-10-05 15:07:42 -03:00
parent 47d9f59dbe
commit 01c78b5d20
No known key found for this signature in database
GPG Key ID: EC3AB66D5D83B392
3 changed files with 34 additions and 4 deletions

View File

@ -3,4 +3,6 @@
export VIRTUAL_ENV='/skynet/.venv' export VIRTUAL_ENV='/skynet/.venv'
poetry env use $VIRTUAL_ENV/bin/python poetry env use $VIRTUAL_ENV/bin/python
poetry install
exec poetry run "$@" exec poetry run "$@"

View File

@ -3,12 +3,15 @@
# Skynet Memory Manager # Skynet Memory Manager
import gc import gc
from hashlib import sha256
import json import json
import logging import logging
from hashlib import sha256
from diffusers import DiffusionPipeline from diffusers import DiffusionPipeline
import trio
import torch import torch
from skynet.constants import DEFAULT_INITAL_MODELS, MODELS from skynet.constants import DEFAULT_INITAL_MODELS, MODELS
from skynet.dgpu.errors import DGPUComputeError from skynet.dgpu.errors import DGPUComputeError
@ -122,10 +125,17 @@ class SkynetMM:
def compute_one( def compute_one(
self, self,
should_cancel_work,
method: str, method: str,
params: dict, params: dict,
binary: bytes | None = None binary: bytes | None = None
): ):
def callback_fn(step: int, timestep: int, latents: torch.FloatTensor):
    """Per-inference-step callback passed to the diffusers pipeline.

    Runs inside the worker thread; bridges back into the trio event
    loop to ask whether this request has been taken by a competitor,
    and aborts the GPU work by raising if so.

    Raises:
        DGPUComputeError: when the work should be cancelled.
    """
    # should_cancel_work is an async fn owned by the trio loop; we are
    # on a trio.to_thread worker, so call it via trio.from_thread.run.
    should_raise = trio.from_thread.run(should_cancel_work)
    if should_raise:
        # logging.warn is a deprecated alias (removed in Python 3.13);
        # logging.warning is the supported spelling.
        logging.warning(f'cancelling work at step {step}')
        raise DGPUComputeError('Inference cancelled')
try: try:
match method: match method:
case 'diffuse': case 'diffuse':
@ -140,6 +150,8 @@ class SkynetMM:
guidance_scale=guidance, guidance_scale=guidance,
num_inference_steps=step, num_inference_steps=step,
generator=seed, generator=seed,
callback=callback_fn,
callback_steps=1,
**extra_params **extra_params
).images[0] ).images[0]

View File

@ -5,6 +5,7 @@ import logging
import traceback import traceback
from hashlib import sha256 from hashlib import sha256
from functools import partial
import trio import trio
@ -26,6 +27,16 @@ class SkynetDGPUDaemon:
config['auto_withdraw'] config['auto_withdraw']
if 'auto_withdraw' in config else False if 'auto_withdraw' in config else False
) )
self.non_compete = set(('testworker2', 'animus2.boid', 'animus1.boid'))
self.current_request = None
async def should_cancel_work(self) -> bool:
    """Return True when a non-compete worker is already on our request.

    Fetches the status list for ``self.current_request`` from the
    connection and intersects the set of reporting workers with
    ``self.non_compete``.

    Returns:
        bool: True if any non-compete worker has reported status for
        the request currently being processed.
    """
    statuses = await self.conn.get_status_by_request_id(self.current_request)
    competitors = {status['worker'] for status in statuses}
    # Explicit bool: callers use this as a cancellation flag, so don't
    # leak the intersection set (truthiness is preserved either way).
    return bool(self.non_compete & competitors)
async def serve_forever(self): async def serve_forever(self):
try: try:
@ -43,7 +54,7 @@ class SkynetDGPUDaemon:
statuses = await self.conn.get_status_by_request_id(rid) statuses = await self.conn.get_status_by_request_id(rid)
if len(statuses) == 0: if len(statuses) == 0:
self.current_request = rid
# parse request # parse request
body = json.loads(req['body']) body = json.loads(req['body'])
@ -70,8 +81,13 @@ class SkynetDGPUDaemon:
else: else:
try: try:
img_sha, img_raw = self.mm.compute_one( img_sha, img_raw = await trio.to_thread.run_sync(
body['method'], body['params'], binary=binary) partial(
self.mm.compute_one,
self.should_cancel_work,
body['method'], body['params'], binary=binary
)
)
ipfs_hash = await self.conn.publish_on_ipfs(img_raw) ipfs_hash = await self.conn.publish_on_ipfs(img_raw)