Suggest `skynet.dgpu` docs, typing, pythonisms

From the deep-ish dive drafting our first set of design/architecture diagrams in https://github.com/skygpu/cyberdyne/pull/2, this adds a bunch of suggestions, typing, and styling adjustments. Namely, the code tweaks include: - changing to multi-line import tuples where appropriate (since they're much handier to modify ;) - adding typing in many spots where it wasn't clear to me which types were being returned/operated-with in various (internal) methods. - doc strings (in mostly random spots Xp ) where I had the need to remember the impl's purpose but didn't want to re-read the code in detail again. - A LOT of TODOs surrounding various potential style changes, re-factorings, naming and in some cases "modernization" according to the latest python3.12 feats/spec/stdlib.
2025-02-03 10:25:14 -05:00 · 2025-02-03 10:25:14 -05:00 · 7cb9f09d95
parent c0ac6298a9
commit 7cb9f09d95
5 changed files with 201 additions and 61 deletions
--- a/skynet/dgpu/init.py
+++ b/skynet/dgpu/init.py
@ -14,8 +14,8 @@ from skynet.dgpu.network import SkynetGPUConnector
 async def open_dgpu_node(config: dict) -> None:
    '''
    Open a top level "GPU mgmt daemon", keep the
-    `SkynetDGPUDaemon._snap: dict[str, list|dict]` table and *maybe*
+    `SkynetDGPUDaemon._snap: dict[str, list|dict]` table
-    serve a `hypercorn` web API.
+    and *maybe* serve a `hypercorn` web API.
    '''
    conn = SkynetGPUConnector(config)
@ -32,6 +32,8 @@ async def open_dgpu_node(config: dict) -> None:
    async with trio.open_nursery() as tn:
        tn.start_soon(daemon.snap_updater_task)
        # TODO, consider a more explicit `as hypercorn_serve`
        # to clarify?
        if api:
            tn.start_soon(serve, api, api_conf)
--- a/skynet/dgpu/compute.py
+++ b/skynet/dgpu/compute.py
@ -1,20 +1,36 @@
 #!/usr/bin/python
 # ^TODO? again, why..
 #
 # Do we expect this mod
 # to be invoked? if so why is there no
 # `if __name__ == '__main__'` guard?
 #
 # if anything this should contain a license header ;)
-# Skynet Memory Manager
+'''
 Skynet Memory Manager
 '''
 import gc
 import logging
 from hashlib import sha256
-import zipfile
+# import zipfile
-from PIL import Image
+# from PIL import Image
-from diffusers import DiffusionPipeline
+# from diffusers import DiffusionPipeline
 import trio
 import torch
-from skynet.constants import DEFAULT_INITAL_MODEL, MODELS
+# from skynet.constants import (
-from skynet.dgpu.errors import DGPUComputeError, DGPUInferenceCancelled
+#     DEFAULT_INITAL_MODEL,
 #     MODELS,
 # )
 from skynet.dgpu.errors import (
    DGPUComputeError,
    DGPUInferenceCancelled,
 )
 from skynet.utils import crop_image, convert_from_cv2_to_image, convert_from_image_to_cv2, convert_from_img_to_bytes, init_upscaler, pipeline_for
@ -66,15 +82,20 @@ def prepare_params_for_diffuse(
    )
 # TODO, yet again - drop the redundant prefix ;)
 class SkynetMM:
    '''
    (AI algo) Model manager for loading models, computing outputs,
    checking load state, and unloading when no-longer-needed/finished.
    '''
    def __init__(self, config: dict):
        self.cache_dir = None
        if 'hf_home' in config:
            self.cache_dir = config['hf_home']
-        self._model_name = ''
+        self._model_name: str = ''
-        self._model_mode = ''
+        self._model_mode: str = ''
        # self.load_model(DEFAULT_INITAL_MODEL, 'txt2img')
@ -89,7 +110,7 @@ class SkynetMM:
        return False
-    def unload_model(self):
+    def unload_model(self) -> None:
        if getattr(self, '_model', None):
            del self._model
@ -103,7 +124,7 @@ class SkynetMM:
        self,
        name: str,
        mode: str
-    ):
+    ) -> None:
        logging.info(f'loading model {name}...')
        self.unload_model()
        self._model = pipeline_for(
@ -111,7 +132,6 @@ class SkynetMM:
        self._model_mode = mode
        self._model_name = name
    def compute_one(
        self,
        request_id: int,
@ -124,6 +144,9 @@ class SkynetMM:
                should_raise = trio.from_thread.run(self._should_cancel, request_id)
                if should_raise:
                    logging.warn(f'cancelling work at step {step}')
                    # ?TODO, this is never caught, so why is it
                    # raised specially?
                    raise DGPUInferenceCancelled()
            return {}
@ -199,9 +222,10 @@ class SkynetMM:
                case _:
                    raise DGPUComputeError('Unsupported compute method')
-        except BaseException as e:
+        except BaseException as err:
-            logging.error(e)
+            logging.error(err)
-            raise DGPUComputeError(str(e))
+            # to see the src exc in tb
            raise DGPUComputeError(str(err)) from err
        finally:
            torch.cuda.empty_cache()
--- a/skynet/dgpu/daemon.py
+++ b/skynet/dgpu/daemon.py
@ -1,23 +1,25 @@
 #!/usr/bin/python
 import json
 import random
 import logging
 import time
 import traceback
 from hashlib import sha256
 from datetime import datetime
 from functools import partial
 from hashlib import sha256
 import json
 import logging
 import random
 # import traceback
 import time
 import trio
 from quart import jsonify
 from quart_trio import QuartTrio as Quart
-from skynet.constants import MODELS, VERSION
+from skynet.constants import (
-
+    MODELS,
-from skynet.dgpu.errors import *
+    VERSION,
 )
 from skynet.dgpu.errors import (
    DGPUComputeError,
 )
 from skynet.dgpu.compute import SkynetMM
 from skynet.dgpu.network import SkynetGPUConnector
@ -30,22 +32,29 @@ def convert_reward_to_int(reward_str):
    return int(int_part + decimal_part)
 # prolly don't need the `Skynet` prefix since that's kinda implied ;p
 class SkynetDGPUDaemon:
    '''
    The root "GPU daemon".
    Contains/manages underlying subsystems:
    - a GPU connector
    '''
    def __init__(
        self,
        mm: SkynetMM,
        conn: SkynetGPUConnector,
        config: dict
    ):
-        self.mm = mm
+        self.mm: SkynetMM = mm
-        self.conn = conn
+        self.conn: SkynetGPUConnector = conn
        self.auto_withdraw = (
            config['auto_withdraw']
            if 'auto_withdraw' in config else False
        )
-        self.account = config['account']
+        self.account: str = config['account']
        self.non_compete = set()
        if 'non_compete' in config:
@ -67,13 +76,20 @@ class SkynetDGPUDaemon:
            'queue': [],
            'requests': {},
            'my_results': []
            # ^and here i thot they were **my** results..
            # :sadcat:
        }
-        self._benchmark = []
+        self._benchmark: list[float] = []
-        self._last_benchmark = None
+        self._last_benchmark: list[float]|None = None
-        self._last_generation_ts = None
+        self._last_generation_ts: str|None = None
    def _get_benchmark_speed(self) -> float:
        '''
        Return the (arithmetic) average work-iterations-per-second
        conducted by this compute worker.
        '''
        if not self._last_benchmark:
            return 0
@ -99,11 +115,26 @@ class SkynetDGPUDaemon:
    async def snap_updater_task(self):
        '''
        Busy loop update the local `._snap: dict` table from
        the connector's `.get_full_queue_snapshot()` output,
        re-polling after a 1 second sleep.
        '''
        while True:
            self._snap = await self.conn.get_full_queue_snapshot()
            await trio.sleep(1)
-    async def generate_api(self):
+    # TODO, design suggestion, just make this a lazily accessed
    # `@class_property` if we're 3.12+
    # |_ https://docs.python.org/3/library/functools.html#functools.cached_property
    async def generate_api(self) -> Quart:
        '''
        Gen a `Quart`-compat web API spec which (for now) simply
        serves a small monitoring ep that reports,
        - iso-time-stamp of the last served model-output
        - the worker's average "compute-iterations-per-second"
        '''
        app = Quart(__name__)
        @app.route('/')
@ -117,21 +148,34 @@ class SkynetDGPUDaemon:
        return app
-    async def maybe_serve_one(self, req):
+    # TODO? this func is kinda big and maybe is better at module
    # level to reduce indentation?
    # -[ ] just pass `daemon: SkynetDGPUDaemon` vs. `self`
    async def maybe_serve_one(
        self,
        req: dict,
    ):
        rid = req['id']
        # parse request
        body = json.loads(req['body'])
        model = body['params']['model']
-        # if model not known
+        # if model not known, ignore.
-        if model != 'RealESRGAN_x4plus' and model not in MODELS:
+        if (
            model != 'RealESRGAN_x4plus'
            and
            model not in MODELS
        ):
            logging.warning(f'Unknown model {model}')
            return False
-        # if whitelist enabled and model not in it continue
+        # only handle whitelisted models
-        if (len(self.model_whitelist) > 0 and
+        if (
-            not model in self.model_whitelist):
+            len(self.model_whitelist) > 0
            and
            model not in self.model_whitelist
        ):
            return False
        # if blacklist contains model skip
@ -139,21 +183,29 @@ class SkynetDGPUDaemon:
            return False
        my_results = [res['id'] for res in self._snap['my_results']]
-        if rid not in my_results and rid in self._snap['requests']:
+        if (
            rid not in my_results
            and
            rid in self._snap['requests']
        ):
            statuses = self._snap['requests'][rid]
            if len(statuses) == 0:
                inputs = []
                for _input in req['binary_data'].split(','):
                    if _input:
                        for _ in range(3):
                            try:
                                # use `GPUConnector` to do IO with the
                                # storage layer to seed the compute
                                # task.
                                img = await self.conn.get_input_data(_input)
                                inputs.append(img)
                                break
-                            except:
+                            except BaseException:
-                                ...
+                                logging.exception(
                                    'Model input error !?!\n'
                                )
                hash_str = (
                    str(req['nonce'])
@ -172,7 +224,7 @@ class SkynetDGPUDaemon:
                resp = await self.conn.begin_work(rid)
                if not resp or 'code' in resp:
-                    logging.info(f'probably being worked on already... skip.')
+                    logging.info('probably being worked on already... skip.')
                else:
                    try:
@ -195,25 +247,37 @@ class SkynetDGPUDaemon:
                                )
                            case _:
-                                raise DGPUComputeError(f'Unsupported backend {self.backend}')
+                                raise DGPUComputeError(
-                        self._last_generation_ts = datetime.now().isoformat()
+                                    f'Unsupported backend {self.backend}'
-                        self._last_benchmark = self._benchmark
+                                )
-                        self._benchmark = []
+
                        self._last_generation_ts: str = datetime.now().isoformat()
                        self._last_benchmark: list[float] = self._benchmark
                        self._benchmark: list[float] = []
                        ipfs_hash = await self.conn.publish_on_ipfs(output, typ=output_type)
                        await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash)
-                    except BaseException as e:
+                    except BaseException as err:
-                        traceback.print_exc()
+                        logging.exception('Failed to serve model request !?\n')
-                        await self.conn.cancel_work(rid, str(e))
+                        # traceback.print_exc()  # TODO? <- replaced by above ya?
                        await self.conn.cancel_work(rid, str(err))
                    finally:
                        return True
        # TODO, i would inverse this case logic to avoid an indent
        # level in above block ;)
        else:
            logging.info(f'request {rid} already beign worked on, skip...')
    # TODO, as per above on `.maybe_serve_one()`, it's likely a bit
    # more *trionic* to define this all as a module level task-func
    # which operates on a `daemon: SkynetDGPUDaemon`?
    #
    # -[ ] keeps tasks-as-funcs style prominent
    # -[ ] avoids so much indentation due to methods
    async def serve_forever(self):
        try:
            while True:
@ -230,6 +294,8 @@ class SkynetDGPUDaemon:
                )
                for req in queue:
                    # TODO, as mentioned above just inline this once
                    # converted to a mod level func.
                    if (await self.maybe_serve_one(req)):
                        break
--- a/skynet/dgpu/errors.py
+++ b/skynet/dgpu/errors.py
@ -1,4 +1,5 @@
 #!/usr/bin/python
 # ^TODO, why..
 class DGPUComputeError(BaseException):
--- a/skynet/dgpu/network.py
+++ b/skynet/dgpu/network.py
@ -13,40 +13,72 @@ import leap
 import anyio
 import httpx
-from PIL import Image, UnidentifiedImageError
+from PIL import (
    Image,
    # UnidentifiedImageError,  # TODO, remove?
 )
 from leap.cleos import CLEOS
 from leap.protocol import Asset
-from skynet.constants import DEFAULT_IPFS_DOMAIN, GPU_CONTRACT_ABI
+from skynet.constants import (
    DEFAULT_IPFS_DOMAIN,
    GPU_CONTRACT_ABI,
 )
-from skynet.ipfs import AsyncIPFSHTTP, get_ipfs_file
+from skynet.ipfs import (
-from skynet.dgpu.errors import DGPUComputeError
+    AsyncIPFSHTTP,
    get_ipfs_file,
 )
 # TODO, remove?
 # from skynet.dgpu.errors import DGPUComputeError
-REQUEST_UPDATE_TIME = 3
+REQUEST_UPDATE_TIME: int = 3
-async def failable(fn: partial, ret_fail=None):
+# TODO, consider using the `outcome` lib instead?
 # - it's already purpose built for exactly this, boxing (async)
 #  function invocations..
 # |_ https://outcome.readthedocs.io/en/latest/api.html#outcome.capture
 async def failable(
    fn: partial,
    ret_fail=None,
 ):
    try:
        return await fn()
    except (
        OSError,
        json.JSONDecodeError,
        anyio.BrokenResourceError,
        httpx.ReadError,
        leap.errors.TransactionPushError
-    ) as e:
+    ):
        return ret_fail
 # TODO, again the prefix XD
 # -[ ] better name than `GPUConnector` ??
 # |_ `Compute[Net]IO[Mngr]`
 class SkynetGPUConnector:
    '''
    An API for connecting to and conducting various "high level"
    network-service operations in the skynet.
    - skynet user account creds
    - hyperion API
    - IPFs client
    - CLEOS client
    '''
    def __init__(self, config: dict):
        # TODO, why these extra instance vars for an (unsynced)
        # copy of the `config` state?
        self.account = config['account']
        self.permission = config['permission']
        self.key = config['key']
        # TODO, neither of these instance vars are used anywhere in
        # methods? so why are they set on this type?
        self.node_url = config['node_url']
        self.hyperion_url = config['hyperion_url']
@ -125,7 +157,9 @@ class SkynetGPUConnector:
        logging.info(f'competitors: {competitors}')
        return set(competitors)
-
+    # TODO, consider making this a NON-method and instead
    # handing in the `snap['queue']` output beforehand?
    # -> since that call is the only usage of `self`?
    async def get_full_queue_snapshot(self):
        snap = {
            'requests': {},
@ -146,6 +180,11 @@ class SkynetGPUConnector:
        return snap
    async def begin_work(self, request_id: int):
        '''
        Publish to the bc that the worker is beginning a model-computation
        step.
        '''
        logging.info('begin_work')
        return await failable(
            partial(
@ -269,6 +308,14 @@ class SkynetGPUConnector:
        return file_cid
    async def get_input_data(self, ipfs_hash: str) -> Image:
        '''
        Retrieve an input (image) from the IPFs layer.
        Normally used to retrieve seed (visual) content previously
        generated/validated by the network to be fed to some
        consuming AI model.
        '''
        link = f'https://{self.ipfs_domain}/ipfs/{ipfs_hash}'
        res = await get_ipfs_file(link, timeout=1)