From 01e6bfe25252938863ac8b26ee7f479f53c9978b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 Jan 2025 15:29:56 -0500 Subject: [PATCH 01/35] Doc and type `skynet.dgpu` pkg mod --- skynet/dgpu/__init__.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) mode change 100644 => 100755 skynet/dgpu/__init__.py diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py old mode 100644 new mode 100755 index 173dc32..aaf84f7 --- a/skynet/dgpu/__init__.py +++ b/skynet/dgpu/__init__.py @@ -4,27 +4,36 @@ import trio from hypercorn.config import Config from hypercorn.trio import serve +from quart_trio import QuartTrio as Quart from skynet.dgpu.compute import SkynetMM from skynet.dgpu.daemon import SkynetDGPUDaemon from skynet.dgpu.network import SkynetGPUConnector -async def open_dgpu_node(config: dict): +async def open_dgpu_node(config: dict) -> None: + ''' + Open a top level "GPU mgmt daemon", keep the + `SkynetDGPUDaemon._snap: dict[str, list|dict]` table and *maybe* + serve a `hypercorn` web API. + + ''' conn = SkynetGPUConnector(config) mm = SkynetMM(config) daemon = SkynetDGPUDaemon(mm, conn, config) - api = None + api: Quart|None = None if 'api_bind' in config: api_conf = Config() api_conf.bind = [config['api_bind']] - api = await daemon.generate_api() + api: Quart = await daemon.generate_api() - async with trio.open_nursery() as n: - n.start_soon(daemon.snap_updater_task) + tn: trio.Nursery + async with trio.open_nursery() as tn: + tn.start_soon(daemon.snap_updater_task) if api: - n.start_soon(serve, api, api_conf) + tn.start_soon(serve, api, api_conf) + # block until cancelled await daemon.serve_forever() From e0704e3787340b618415a0902a45be78faf9250e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 3 Feb 2025 10:25:14 -0500 Subject: [PATCH 02/35] Suggest `skynet.dgpu` docs, typing, pythonisms From the deep-ish dive drafting our first set of design/architecture diagrams in https://github.com/skygpu/cyberdyne/pull/2, this adds a buncha suggestions, typing, and styling adjustments. Namely the code tweaks include, - changing to multi-line import tuples where appropriate (since they're much handier to modify ;) - adding typing in many spots where it wasn't clear to me the types being returned/operated-with in various (internal) methods. - doc strings (in mostly random spots Xp ) where i had the need to remember the impl's purpose but didn't want to re-read the code in detail again. - ALOT of TODOs surrounding various potential style changes, re-factorings, naming and in some cases "modernization" according to the latest python3.12 feats/spec/stdlib. --- skynet/dgpu/__init__.py | 6 +- skynet/dgpu/compute.py | 52 +++++++++++---- skynet/dgpu/daemon.py | 138 +++++++++++++++++++++++++++++----------- skynet/dgpu/errors.py | 1 + skynet/dgpu/network.py | 65 ++++++++++++++++--- 5 files changed, 201 insertions(+), 61 deletions(-) mode change 100644 => 100755 skynet/dgpu/compute.py mode change 100644 => 100755 skynet/dgpu/daemon.py mode change 100644 => 100755 skynet/dgpu/errors.py diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py index aaf84f7..b454a95 100755 --- a/skynet/dgpu/__init__.py +++ b/skynet/dgpu/__init__.py @@ -14,8 +14,8 @@ from skynet.dgpu.network import SkynetGPUConnector async def open_dgpu_node(config: dict) -> None: ''' Open a top level "GPU mgmt daemon", keep the - `SkynetDGPUDaemon._snap: dict[str, list|dict]` table and *maybe* - serve a `hypercorn` web API. 
+ `SkynetDGPUDaemon._snap: dict[str, list|dict]` table + and *maybe* serve a `hypercorn` web API. ''' conn = SkynetGPUConnector(config) @@ -32,6 +32,8 @@ async def open_dgpu_node(config: dict) -> None: async with trio.open_nursery() as tn: tn.start_soon(daemon.snap_updater_task) + # TODO, consider a more explicit `as hypercorn_serve` + # to clarify? if api: tn.start_soon(serve, api, api_conf) diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py old mode 100644 new mode 100755 index 4523d48..535dfcc --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -1,20 +1,36 @@ #!/usr/bin/python +# ^TODO? again, why.. +# +# Do we expect this mod +# to be invoked? if so why is there no +# `if __name__ == '__main__'` guard? +# +# if anything this should contain a license header ;) -# Skynet Memory Manager +''' +Skynet Memory Manager + +''' import gc import logging from hashlib import sha256 -import zipfile -from PIL import Image -from diffusers import DiffusionPipeline +# import zipfile +# from PIL import Image +# from diffusers import DiffusionPipeline import trio import torch -from skynet.constants import DEFAULT_INITAL_MODEL, MODELS -from skynet.dgpu.errors import DGPUComputeError, DGPUInferenceCancelled +# from skynet.constants import ( +# DEFAULT_INITAL_MODEL, +# MODELS, +# ) +from skynet.dgpu.errors import ( + DGPUComputeError, + DGPUInferenceCancelled, +) from skynet.utils import crop_image, convert_from_cv2_to_image, convert_from_image_to_cv2, convert_from_img_to_bytes, init_upscaler, pipeline_for @@ -66,15 +82,20 @@ def prepare_params_for_diffuse( ) +# TODO, yet again - drop the redundant prefix ;) class SkynetMM: + ''' + (AI algo) Model manager for loading models, computing outputs, + checking load state, and unloading when no-longer-needed/finished. + ''' def __init__(self, config: dict): self.cache_dir = None if 'hf_home' in config: self.cache_dir = config['hf_home'] - self._model_name = '' - self._model_mode = '' + self._model_name: str = '' + self._model_mode: str = '' # self.load_model(DEFAULT_INITAL_MODEL, 'txt2img') @@ -89,7 +110,7 @@ class SkynetMM: return False - def unload_model(self): + def unload_model(self) -> None: if getattr(self, '_model', None): del self._model @@ -103,7 +124,7 @@ class SkynetMM: self, name: str, mode: str - ): + ) -> None: logging.info(f'loading model {name}...') self.unload_model() self._model = pipeline_for( @@ -111,7 +132,6 @@ class SkynetMM: self._model_mode = mode self._model_name = name - def compute_one( self, request_id: int, @@ -124,6 +144,9 @@ class SkynetMM: should_raise = trio.from_thread.run(self._should_cancel, request_id) if should_raise: logging.warn(f'cancelling work at step {step}') + + # ?TODO, this is never caught, so why is it + # raised specially? 
raise DGPUInferenceCancelled()

             return {}

@@ -199,9 +222,10 @@ class SkynetMM:
             case _:
                 raise DGPUComputeError('Unsupported compute method')

-        except BaseException as e:
-            logging.error(e)
-            raise DGPUComputeError(str(e))
+        except BaseException as err:
+            logging.error(err)
+            # to see the src exc in tb
+            raise DGPUComputeError(str(err)) from err

         finally:
             torch.cuda.empty_cache()

diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py
old mode 100644
new mode 100755
index 220fe3c..db8da86
--- a/skynet/dgpu/daemon.py
+++ b/skynet/dgpu/daemon.py
@@ -1,23 +1,25 @@
 #!/usr/bin/python

-import json
-import random
-import logging
-import time
-import traceback
-
-from hashlib import sha256
 from datetime import datetime
 from functools import partial
+from hashlib import sha256
+import json
+import logging
+import random
+# import traceback
+import time

 import trio
-
 from quart import jsonify
 from quart_trio import QuartTrio as Quart

-from skynet.constants import MODELS, VERSION
-
-from skynet.dgpu.errors import *
+from skynet.constants import (
+    MODELS,
+    VERSION,
+)
+from skynet.dgpu.errors import (
+    DGPUComputeError,
+)
 from skynet.dgpu.compute import SkynetMM
 from skynet.dgpu.network import SkynetGPUConnector

@@ -30,22 +32,29 @@ def convert_reward_to_int(reward_str):
     return int(int_part + decimal_part)


+# prolly don't need the `Skynet` prefix since that's kinda implied ;p
 class SkynetDGPUDaemon:
+    '''
+    The root "GPU daemon".
+
+    Contains/manages underlying subsystems:
+    - a GPU connector
+
+    '''
     def __init__(
         self,
         mm: SkynetMM,
         conn: SkynetGPUConnector,
         config: dict
     ):
-        self.mm = mm
-        self.conn = conn
+        self.mm: SkynetMM = mm
+        self.conn: SkynetGPUConnector = conn

         self.auto_withdraw = (
             config['auto_withdraw']
             if 'auto_withdraw' in config else False
        )

-        self.account = config['account']
+        self.account: str = config['account']

         self.non_compete = set()
         if 'non_compete' in config:
@@ -67,13 +76,20 @@ class SkynetDGPUDaemon:
             'queue': [],
             'requests': {},
             'my_results': []
+            # ^and here i thot they were **my** results..
+            # :sadcat:
         }

-        self._benchmark = []
-        self._last_benchmark = None
-        self._last_generation_ts = None
+        self._benchmark: list[float] = []
+        self._last_benchmark: list[float]|None = None
+        self._last_generation_ts: str|None = None

     def _get_benchmark_speed(self) -> float:
+        '''
+        Return the (arithmetic) average work-iterations-per-second
+        conducted by this compute worker.
+
+        '''
         if not self._last_benchmark:
             return 0

@@ -99,11 +115,26 @@ class SkynetDGPUDaemon:


     async def snap_updater_task(self):
+        '''
+        Busy loop update the local `._snap: dict` table from
+        the connector's `.get_full_queue_snapshot()`.
+
+        '''
         while True:
             self._snap = await self.conn.get_full_queue_snapshot()
             await trio.sleep(1)

-    async def generate_api(self):
+    # TODO, design suggestion, just make this a lazily accessed
+    # `@class_property` if we're 3.12+
+    # |_ https://docs.python.org/3/library/functools.html#functools.cached_property
+    async def generate_api(self) -> Quart:
+        '''
+        Gen a `Quart`-compat web API spec which (for now) simply
+        serves a small monitoring ep that reports,
+
+        - iso-time-stamp of the last served model-output
+        - the worker's average "compute-iterations-per-second"
+
+        '''
         app = Quart(__name__)

         @app.route('/')
@@ -117,21 +148,34 @@ class SkynetDGPUDaemon:

         return app

-    async def maybe_serve_one(self, req):
+    # TODO? this func is kinda big and maybe is better at module
+    # level to reduce indentation?
+    # -[ ] just pass `daemon: SkynetDGPUDaemon` vs. 
`self` + async def maybe_serve_one( + self, + req: dict, + ): rid = req['id'] # parse request body = json.loads(req['body']) model = body['params']['model'] - # if model not known - if model != 'RealESRGAN_x4plus' and model not in MODELS: + # if model not known, ignore. + if ( + model != 'RealESRGAN_x4plus' + and + model not in MODELS + ): logging.warning(f'Unknown model {model}') return False - # if whitelist enabled and model not in it continue - if (len(self.model_whitelist) > 0 and - not model in self.model_whitelist): + # only handle whitelisted models + if ( + len(self.model_whitelist) > 0 + and + model not in self.model_whitelist + ): return False # if blacklist contains model skip @@ -139,21 +183,29 @@ class SkynetDGPUDaemon: return False my_results = [res['id'] for res in self._snap['my_results']] - if rid not in my_results and rid in self._snap['requests']: + if ( + rid not in my_results + and + rid in self._snap['requests'] + ): statuses = self._snap['requests'][rid] - if len(statuses) == 0: inputs = [] for _input in req['binary_data'].split(','): if _input: for _ in range(3): try: + # user `GPUConnector` to IO with + # storage layer to seed the compute + # task. img = await self.conn.get_input_data(_input) inputs.append(img) break - except: - ... + except BaseException: + logging.exception( + 'Model input error !?!\n' + ) hash_str = ( str(req['nonce']) @@ -172,7 +224,7 @@ class SkynetDGPUDaemon: resp = await self.conn.begin_work(rid) if not resp or 'code' in resp: - logging.info(f'probably being worked on already... skip.') + logging.info('probably being worked on already... skip.') else: try: @@ -195,25 +247,37 @@ class SkynetDGPUDaemon: ) case _: - raise DGPUComputeError(f'Unsupported backend {self.backend}') - self._last_generation_ts = datetime.now().isoformat() - self._last_benchmark = self._benchmark - self._benchmark = [] + raise DGPUComputeError( + f'Unsupported backend {self.backend}' + ) + + self._last_generation_ts: str = datetime.now().isoformat() + self._last_benchmark: list[float] = self._benchmark + self._benchmark: list[float] = [] ipfs_hash = await self.conn.publish_on_ipfs(output, typ=output_type) await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash) - except BaseException as e: - traceback.print_exc() - await self.conn.cancel_work(rid, str(e)) + except BaseException as err: + logging.exception('Failed to serve model request !?\n') + # traceback.print_exc() # TODO? <- replaced by above ya? + await self.conn.cancel_work(rid, str(err)) finally: return True + # TODO, i would inverse this case logic to avoid an indent + # level in above block ;) else: logging.info(f'request {rid} already beign worked on, skip...') + # TODO, as per above on `.maybe_serve_one()`, it's likely a bit + # more *trionic* to define this all as a module level task-func + # which operates on a `daemon: SkynetDGPUDaemon`? + # + # -[ ] keeps tasks-as-funcs style prominent + # -[ ] avoids so much indentation due to methods async def serve_forever(self): try: while True: @@ -230,6 +294,8 @@ class SkynetDGPUDaemon: ) for req in queue: + # TODO, as mentioned above just inline this once + # converted to a mod level func. if (await self.maybe_serve_one(req)): break diff --git a/skynet/dgpu/errors.py b/skynet/dgpu/errors.py old mode 100644 new mode 100755 index 91db585..3c4992c --- a/skynet/dgpu/errors.py +++ b/skynet/dgpu/errors.py @@ -1,4 +1,5 @@ #!/usr/bin/python +# ^TODO, why.. 
class DGPUComputeError(BaseException):
     ...

diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py
index 55fc786..6b2ed5a 100755
--- a/skynet/dgpu/network.py
+++ b/skynet/dgpu/network.py
@@ -13,23 +13,39 @@ import leap
 import anyio
 import httpx

-from PIL import Image, UnidentifiedImageError
+from PIL import (
+    Image,
+    # UnidentifiedImageError,  # TODO, remove?
+)

 from leap.cleos import CLEOS
 from leap.protocol import Asset
-from skynet.constants import DEFAULT_IPFS_DOMAIN, GPU_CONTRACT_ABI
+from skynet.constants import (
+    DEFAULT_IPFS_DOMAIN,
+    GPU_CONTRACT_ABI,
+)

-from skynet.ipfs import AsyncIPFSHTTP, get_ipfs_file
-from skynet.dgpu.errors import DGPUComputeError
+from skynet.ipfs import (
+    AsyncIPFSHTTP,
+    get_ipfs_file,
+)
+# TODO, remove?
+# from skynet.dgpu.errors import DGPUComputeError

-REQUEST_UPDATE_TIME = 3
+REQUEST_UPDATE_TIME: int = 3

-async def failable(fn: partial, ret_fail=None):
+# TODO, consider using the `outcome` lib instead?
+# - it's already purpose built for exactly this, boxing (async)
+#   function invocations..
+# |_ https://outcome.readthedocs.io/en/latest/api.html#outcome.capture
+async def failable(
+    fn: partial,
+    ret_fail=None,
+):
     try:
         return await fn()
-
     except (
         OSError,
         json.JSONDecodeError,
@@ -39,17 +55,33 @@ async def failable(fn: partial, ret_fail=None):
         httpx.ReadError,
         httpx.ReadTimeout,
         leap.errors.TransactionPushError
-    ) as e:
+    ):
         return ret_fail


+# TODO, again the prefix XD
+# -[ ] better name than `GPUConnector` ??
+#  |_ `Compute[Net]IO[Mngr]`
 class SkynetGPUConnector:
+    '''
+    An API for connecting to and conducting various "high level"
+    network-service operations in the skynet.
+    - skynet user account creds
+    - hyperion API
+    - IPFS client
+    - CLEOS client
+
+    '''
     def __init__(self, config: dict):
+        # TODO, why these extra instance vars for an (unsynced)
+        # copy of the `config` state?
         self.account = config['account']
         self.permission = config['permission']
         self.key = config['key']

+        # TODO, neither of these instance vars are used anywhere in
+        # methods? so why are they set on this type?
         self.node_url = config['node_url']
         self.hyperion_url = config['hyperion_url']

@@ -128,7 +160,9 @@ class SkynetGPUConnector:
         logging.info(f'competitors: {competitors}')
         return set(competitors)

-
+    # TODO, consider making this a NON-method and instead
+    # handing in the `snap['queue']` output beforehand?
+    # -> since that call is the only usage of `self`?
     async def get_full_queue_snapshot(self):
         snap = {
             'requests': {},
@@ -149,6 +183,11 @@ class SkynetGPUConnector:
         return snap

     async def begin_work(self, request_id: int):
+        '''
+        Publish to the bc that the worker is beginning a model-computation
+        step.
+
+        '''
         logging.info('begin_work')
         return await failable(
             partial(
@@ -272,6 +311,14 @@ class SkynetGPUConnector:
         return file_cid

     async def get_input_data(self, ipfs_hash: str) -> Image:
+        '''
+        Retrieve an input (image) from the IPFS layer.
+
+        Normally used to retrieve seed (visual) content previously
+        generated/validated by the network to be fed to some
+        consuming AI model.
+
+        '''
         link = f'https://{self.ipfs_domain}/ipfs/{ipfs_hash}'

         res = await get_ipfs_file(link, timeout=1)

From bb4be24facaacfeffc44ba393c7a4d73c0d4a986 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Mon, 3 Feb 2025 10:44:01 -0500
Subject: [PATCH 03/35] Some more `import` fixes

Removing unused imports in a few modules as well as converting a few
more tuple imports to multi-line style.
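For instance the target multi-line tuple-import style (as applied in the
diffs below) reads:

    from contextlib import (
        ExitStack,
        AsyncExitStack,
    )

which makes adding/removing a name a single-line change.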
We should prolly consider a linter as part of pre-merge machinery; I noticed a few pretty low hanging pep8 violations just spelunking rando modules ;) --- skynet/config.py | 2 -- skynet/constants.py | 6 ++++-- skynet/frontend/discord/__init__.py | 11 +++++++++-- skynet/utils.py | 5 ++--- 4 files changed, 15 insertions(+), 9 deletions(-) mode change 100644 => 100755 skynet/frontend/discord/__init__.py diff --git a/skynet/config.py b/skynet/config.py index 21cc113..109c54c 100755 --- a/skynet/config.py +++ b/skynet/config.py @@ -3,8 +3,6 @@ import os import toml -from pathlib import Path - from .constants import DEFAULT_CONFIG_PATH diff --git a/skynet/constants.py b/skynet/constants.py index 6bc1867..480a5eb 100755 --- a/skynet/constants.py +++ b/skynet/constants.py @@ -1,22 +1,24 @@ #!/usr/bin/python +import msgspec +from typing import Literal VERSION = '0.1a12' DOCKER_RUNTIME_CUDA = 'skynet:runtime-cuda' -import msgspec -from typing import Literal class Size(msgspec.Struct): w: int h: int + class ModelDesc(msgspec.Struct): short: str mem: float size: Size tags: list[Literal['txt2img', 'img2img', 'inpaint']] + MODELS: dict[str, ModelDesc] = { 'runwayml/stable-diffusion-v1-5': ModelDesc( short='stable', diff --git a/skynet/frontend/discord/__init__.py b/skynet/frontend/discord/__init__.py old mode 100644 new mode 100755 index a6bc0f3..9bfcf08 --- a/skynet/frontend/discord/__init__.py +++ b/skynet/frontend/discord/__init__.py @@ -8,11 +8,18 @@ import asyncio from decimal import Decimal from hashlib import sha256 from datetime import datetime -from contextlib import ExitStack, AsyncExitStack +from contextlib import ( + ExitStack, + AsyncExitStack, +) from contextlib import asynccontextmanager as acm from leap.cleos import CLEOS -from leap.sugar import Name, asset_from_str, collect_stdout +from leap.sugar import ( + Name, + asset_from_str, + collect_stdout, +) from leap.hyperion import HyperionAPI # from telebot.types import InputMediaPhoto diff --git a/skynet/utils.py b/skynet/utils.py index 0662aca..24ac04f 100755 --- a/skynet/utils.py +++ b/skynet/utils.py @@ -9,9 +9,7 @@ import logging import importlib from typing import Optional -from pathlib import Path -import trio import torch import numpy as np @@ -112,6 +110,7 @@ def pipeline_for( return custom_pipeline.pipeline_for(model, mode, mem_fraction=mem_fraction, cache_dir=cache_dir) except ImportError: + # TODO, uhh why not warn/error log this? ... @@ -159,7 +158,7 @@ def pipeline_for( if mode == 'txt2img': pipe.vae.enable_tiling() pipe.vae.enable_slicing() - + pipe.enable_model_cpu_offload() else: From 2eb398bb8d3028b6448fde6276eab9789b84e80b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 3 Feb 2025 10:46:09 -0500 Subject: [PATCH 04/35] Bit more multi-line styling in `.cli` Such that i could more easily read the flag specs in a vsplit `vim` buffer set ;) Also includes the same for some (internal) `tuple`-imports. 
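E.g. a flag spec now reads like (snippet taken from the `skynet/cli.py`
diff below):

    @click.option(
        '--prompt',
        '-p',
        default='a red old tractor in a sunny wheat field',
    )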
--- skynet/cli.py | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/skynet/cli.py b/skynet/cli.py index 3adefdb..8da96bf 100755 --- a/skynet/cli.py +++ b/skynet/cli.py @@ -8,10 +8,25 @@ from functools import partial import click -from leap.protocol import Name, Asset +from leap.protocol import ( + Name, + Asset, +) -from .config import * -from .constants import * +from .config import ( + load_skynet_toml, + load_key, + set_hf_vars, + ConfigParsingError, +) +from .constants import ( + # TODO, more conventional to make these private i'm pretty + # sure according to pep8? + DEFAULT_IPFS_DOMAIN, + DEFAULT_EXPLORER_DOMAIN, + DEFAULT_CONFIG_PATH, + MODELS, +) @click.group() @@ -22,7 +37,10 @@ def skynet(*args, **kwargs): @click.command() @click.option('--model', '-m', default=list(MODELS.keys())[-1]) @click.option( - '--prompt', '-p', default='a red old tractor in a sunny wheat field') + '--prompt', + '-p', + default='a red old tractor in a sunny wheat field', +) @click.option('--output', '-o', default='output.png') @click.option('--width', '-w', default=512) @click.option('--height', '-h', default=512) @@ -30,7 +48,7 @@ def skynet(*args, **kwargs): @click.option('--steps', '-s', default=26) @click.option('--seed', '-S', default=None) def txt2img(*args, **kwargs): - from . import utils + from . import utils # TODO? why here, import cycle? config = load_skynet_toml() hf_token = load_key(config, 'skynet.dgpu.hf_token') @@ -38,10 +56,18 @@ def txt2img(*args, **kwargs): set_hf_vars(hf_token, hf_home) utils.txt2img(hf_token, **kwargs) + @click.command() -@click.option('--model', '-m', default=list(MODELS.keys())[-2]) @click.option( - '--prompt', '-p', default='a red old tractor in a sunny wheat field') + '--model', + '-m', + default=list(MODELS.keys())[-2] +) +@click.option( + '--prompt', + '-p', + default='a red old tractor in a sunny wheat field', +) @click.option('--input', '-i', default='input.png') @click.option('--output', '-o', default='output.png') @click.option('--strength', '-Z', default=1.0) @@ -118,6 +144,7 @@ def download(): set_hf_vars(hf_token, hf_home) utils.download_all_models(hf_token, hf_home) + @skynet.command() @click.option( '--reward', '-r', default='20.0000 GPU') @@ -315,6 +342,7 @@ def config( def deposit(quantity: str): import trio from leap.cleos import CLEOS + from leap.sugar import asset_from_str config = load_skynet_toml() @@ -365,7 +393,10 @@ def nodeos(): @run.command() @click.option('--loglevel', '-l', default='INFO', help='Logging level') @click.option( - '--config-path', '-c', default=DEFAULT_CONFIG_PATH) + '--config-path', + '-c', + default=DEFAULT_CONFIG_PATH, +) def dgpu( loglevel: str, config_path: str From 286d49a7f5ba533352da85b1a477f96b39812ed7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 3 Feb 2025 10:48:37 -0500 Subject: [PATCH 05/35] Drop unused `as err`, suggest `logging.exception()` --- skynet/ipfs/docker.py | 2 ++ skynet/ipfs/pinner.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) mode change 100644 => 100755 skynet/ipfs/pinner.py diff --git a/skynet/ipfs/docker.py b/skynet/ipfs/docker.py index 7cdb3e9..69f564a 100755 --- a/skynet/ipfs/docker.py +++ b/skynet/ipfs/docker.py @@ -63,6 +63,8 @@ def open_ipfs_node( if ec != 0: logging.error(out) + # TODO, why not deliver some kinda API here for controlling the + # ipfs node? 
yield if teardown and container: diff --git a/skynet/ipfs/pinner.py b/skynet/ipfs/pinner.py old mode 100644 new mode 100755 index 0cc2836..a50acb0 --- a/skynet/ipfs/pinner.py +++ b/skynet/ipfs/pinner.py @@ -118,7 +118,8 @@ class SkynetPinner: for cid in cids: n.start_soon(self.task_pin, cid) - except OSError as e: + except OSError: + # TODO, use `logging.exception()` here instead ?? traceback.print_exc() except KeyboardInterrupt: From 1b528e1fa40f6ab0ad7e942ee9d0d00a79ab3441 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 3 Feb 2025 10:49:03 -0500 Subject: [PATCH 06/35] Fix non-f-string --- skynet/nodeos.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 skynet/nodeos.py diff --git a/skynet/nodeos.py b/skynet/nodeos.py old mode 100644 new mode 100755 index 10fb76d..f541a52 --- a/skynet/nodeos.py +++ b/skynet/nodeos.py @@ -88,6 +88,8 @@ def open_nodeos(cleanup: bool = True): logging.info(f'GPU KEYS: {(priv, pub)}') cleos.new_account('telos.gpu', ram=4200000, key=pub) + # lol, magic #s much XD ? + # TODO, why is there 4 workers/keys? for i in range(1, 4): priv, pub = cleos.create_key_pair() cleos.import_key(priv) @@ -116,7 +118,7 @@ def open_nodeos(cleanup: bool = True): 'telos.gpu', 'config', ['eosio.token', '4,GPU'], - f'telos.gpu@active' + 'telos.gpu@active' ) assert ec == 0 From 18cdffa70041a6553bee9f571f824f7429015f7e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 4 Feb 2025 12:07:43 -0500 Subject: [PATCH 07/35] Doc `GPUConnector.get_full_queue_snapshot()` --- skynet/dgpu/network.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index 6b2ed5a..7229e9d 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -164,6 +164,11 @@ class SkynetGPUConnector: # handing in the `snap['queue']` output beforehand? # -> since that call is the only usage of `self`? async def get_full_queue_snapshot(self): + ''' + Keep in-sync with latest (telos chain's smart-contract) table + state by polling (currently with period 1s). + + ''' snap = { 'requests': {}, 'my_results': [] From 336c0122eb737bbd3de620937baa886437049f57 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 4 Feb 2025 12:08:46 -0500 Subject: [PATCH 08/35] Tell vim about `uv.lock` toml-ness --- uv.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/uv.lock b/uv.lock index 51d6441..dd706db 100644 --- a/uv.lock +++ b/uv.lock @@ -1,3 +1,4 @@ +# vim: ft=toml version = 1 requires-python = ">=3.10, <3.13" resolution-markers = [ From f2a8f0367fbc970dde45c7bd273976b3e2a5608e Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 3 Feb 2025 16:02:38 -0300 Subject: [PATCH 09/35] Switch failable to use outcome --- pyproject.toml | 1 + skynet/dgpu/network.py | 39 ++++++++++++++++++--------------------- uv.lock | 2 ++ 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0eccc28..73884a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "zstandard>=0.23.0,<0.24", "click>=8.1.8,<9", "httpx>=0.28.1,<0.29", + "outcome>=1.3.0.post0", ] [project.scripts] diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index 7229e9d..22ee76b 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -12,6 +12,7 @@ import trio import leap import anyio import httpx +import outcome from PIL import ( Image, @@ -36,27 +37,23 @@ from skynet.ipfs import ( REQUEST_UPDATE_TIME: int = 3 -# TODO, consider using the `outcome` lib instead? 
-# - it's already purpose built for exactly this, boxing (async) -# function invocations.. -# |_ https://outcome.readthedocs.io/en/latest/api.html#outcome.capture -async def failable( - fn: partial, - ret_fail=None, -): - try: - return await fn() - except ( - OSError, - json.JSONDecodeError, - anyio.BrokenResourceError, - httpx.ConnectError, - httpx.ConnectTimeout, - httpx.ReadError, - httpx.ReadTimeout, - leap.errors.TransactionPushError - ): - return ret_fail +async def failable(fn: partial, ret_fail=None): + o = await outcome.acapture(fn) + match o: + case outcome.Error(error=( + OSError() | + json.JSONDecodeError() | + anyio.BrokenResourceError() | + httpx.ConnectError() | + httpx.ConnectTimeout() | + httpx.ReadError() | + httpx.ReadTimeout() | + leap.errors.TransactionPushError() + )): + return ret_fail + + case _: + return o.unwrap() # TODO, again the prefix XD diff --git a/uv.lock b/uv.lock index dd706db..aa03f4a 100644 --- a/uv.lock +++ b/uv.lock @@ -2233,6 +2233,7 @@ dependencies = [ { name = "httpx" }, { name = "msgspec" }, { name = "numpy" }, + { name = "outcome" }, { name = "pillow" }, { name = "protobuf" }, { name = "py-leap" }, @@ -2283,6 +2284,7 @@ requires-dist = [ { name = "httpx", specifier = ">=0.28.1,<0.29" }, { name = "msgspec", specifier = ">=0.19.0,<0.20" }, { name = "numpy", specifier = "<2.1" }, + { name = "outcome", specifier = ">=1.3.0.post0" }, { name = "pillow", specifier = ">=10.0.1,<11" }, { name = "protobuf", specifier = ">=5.29.3,<6" }, { name = "py-leap", git = "https://github.com/guilledk/py-leap.git?rev=v0.1a32" }, From 5f5314cd352a7cf834809cfe6a8ee7d854cc18e4 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 3 Feb 2025 16:59:51 -0300 Subject: [PATCH 10/35] Remove all shebangs --- skynet/__init__.py | 2 -- skynet/cli.py | 2 -- skynet/config.py | 2 -- skynet/constants.py | 1 - skynet/db/__init__.py | 2 -- skynet/db/functions.py | 2 -- skynet/dgpu/__init__.py | 2 -- skynet/dgpu/compute.py | 9 --------- skynet/dgpu/daemon.py | 2 -- skynet/dgpu/errors.py | 3 --- skynet/dgpu/network.py | 2 -- skynet/dgpu/pipes/flux.py | 2 -- skynet/dgpu/pipes/flux_inpaint.py | 2 -- skynet/frontend/__init__.py | 2 -- skynet/frontend/discord/__init__.py | 2 -- skynet/frontend/discord/handlers.py | 2 -- skynet/frontend/discord/utils.py | 2 -- skynet/frontend/telegram/__init__.py | 2 -- skynet/frontend/telegram/handlers.py | 2 -- skynet/frontend/telegram/utils.py | 2 -- skynet/ipfs/__init__.py | 2 -- skynet/ipfs/docker.py | 2 -- skynet/ipfs/pinner.py | 2 -- skynet/nodeos.py | 2 -- skynet/utils.py | 2 -- tests/conftest.py | 2 -- tests/test_deploy.py | 2 -- tests/test_ipfs_client.py | 3 --- 28 files changed, 64 deletions(-) diff --git a/skynet/__init__.py b/skynet/__init__.py index 8d5063a..e69de29 100644 --- a/skynet/__init__.py +++ b/skynet/__init__.py @@ -1,2 +0,0 @@ -#!/usr/bin/python - diff --git a/skynet/cli.py b/skynet/cli.py index 8da96bf..9de9b98 100755 --- a/skynet/cli.py +++ b/skynet/cli.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import json import logging import random diff --git a/skynet/config.py b/skynet/config.py index 109c54c..9470877 100755 --- a/skynet/config.py +++ b/skynet/config.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import os import toml diff --git a/skynet/constants.py b/skynet/constants.py index 480a5eb..7431d28 100755 --- a/skynet/constants.py +++ b/skynet/constants.py @@ -1,4 +1,3 @@ -#!/usr/bin/python import msgspec from typing import Literal diff --git a/skynet/db/__init__.py b/skynet/db/__init__.py index ae1dd6b..6ffe5c7 100644 --- 
a/skynet/db/__init__.py +++ b/skynet/db/__init__.py @@ -1,3 +1 @@ -#!/usr/bin/python - from .functions import open_new_database, open_database_connection diff --git a/skynet/db/functions.py b/skynet/db/functions.py index 91a493e..c4a0331 100644 --- a/skynet/db/functions.py +++ b/skynet/db/functions.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import time import random import string diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py index b454a95..e906418 100755 --- a/skynet/dgpu/__init__.py +++ b/skynet/dgpu/__init__.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import trio from hypercorn.config import Config diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index 535dfcc..6b2f092 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -1,12 +1,3 @@ -#!/usr/bin/python -# ^TODO? again, why.. -# -# Do we expect this mod -# to be invoked? if so why is there no -# `if __name__ == '__main__'` guard? -# -# if anything this should contain a license header ;) - ''' Skynet Memory Manager diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index db8da86..c95e1ca 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - from datetime import datetime from functools import partial from hashlib import sha256 diff --git a/skynet/dgpu/errors.py b/skynet/dgpu/errors.py index 3c4992c..fa92e8d 100755 --- a/skynet/dgpu/errors.py +++ b/skynet/dgpu/errors.py @@ -1,6 +1,3 @@ -#!/usr/bin/python -# ^TODO, why.. - class DGPUComputeError(BaseException): ... diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index 22ee76b..1a8ebd6 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import io import json import time diff --git a/skynet/dgpu/pipes/flux.py b/skynet/dgpu/pipes/flux.py index 57642c0..4b02a49 100644 --- a/skynet/dgpu/pipes/flux.py +++ b/skynet/dgpu/pipes/flux.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import torch from diffusers import ( diff --git a/skynet/dgpu/pipes/flux_inpaint.py b/skynet/dgpu/pipes/flux_inpaint.py index 2f88d58..d13a2b5 100644 --- a/skynet/dgpu/pipes/flux_inpaint.py +++ b/skynet/dgpu/pipes/flux_inpaint.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import torch from diffusers import ( diff --git a/skynet/frontend/__init__.py b/skynet/frontend/__init__.py index bb5e9bc..0404f2f 100644 --- a/skynet/frontend/__init__.py +++ b/skynet/frontend/__init__.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import random from ..constants import * diff --git a/skynet/frontend/discord/__init__.py b/skynet/frontend/discord/__init__.py index 9bfcf08..bae9a8d 100755 --- a/skynet/frontend/discord/__init__.py +++ b/skynet/frontend/discord/__init__.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - from json import JSONDecodeError import random import logging diff --git a/skynet/frontend/discord/handlers.py b/skynet/frontend/discord/handlers.py index 95bfc52..a2b677e 100644 --- a/skynet/frontend/discord/handlers.py +++ b/skynet/frontend/discord/handlers.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import io import json import logging diff --git a/skynet/frontend/discord/utils.py b/skynet/frontend/discord/utils.py index 8f6d484..1fdcc32 100644 --- a/skynet/frontend/discord/utils.py +++ b/skynet/frontend/discord/utils.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import json import logging import traceback diff --git a/skynet/frontend/telegram/__init__.py b/skynet/frontend/telegram/__init__.py index 2dd54a3..540a240 100644 --- a/skynet/frontend/telegram/__init__.py +++ b/skynet/frontend/telegram/__init__.py @@ -1,5 
+1,3 @@ -#!/usr/bin/python - import io import random import logging diff --git a/skynet/frontend/telegram/handlers.py b/skynet/frontend/telegram/handlers.py index 6a1eb20..e9eaebb 100644 --- a/skynet/frontend/telegram/handlers.py +++ b/skynet/frontend/telegram/handlers.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import io import json import logging diff --git a/skynet/frontend/telegram/utils.py b/skynet/frontend/telegram/utils.py index 9d6f927..13271fb 100644 --- a/skynet/frontend/telegram/utils.py +++ b/skynet/frontend/telegram/utils.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import json import logging import traceback diff --git a/skynet/ipfs/__init__.py b/skynet/ipfs/__init__.py index 5a6384f..125c225 100644 --- a/skynet/ipfs/__init__.py +++ b/skynet/ipfs/__init__.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import logging from pathlib import Path diff --git a/skynet/ipfs/docker.py b/skynet/ipfs/docker.py index 69f564a..c2da5fb 100755 --- a/skynet/ipfs/docker.py +++ b/skynet/ipfs/docker.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import sys import logging diff --git a/skynet/ipfs/pinner.py b/skynet/ipfs/pinner.py index a50acb0..dc9fe2d 100755 --- a/skynet/ipfs/pinner.py +++ b/skynet/ipfs/pinner.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import logging import traceback diff --git a/skynet/nodeos.py b/skynet/nodeos.py index f541a52..d5d05b4 100755 --- a/skynet/nodeos.py +++ b/skynet/nodeos.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import json import time import logging diff --git a/skynet/utils.py b/skynet/utils.py index 24ac04f..f29bea2 100755 --- a/skynet/utils.py +++ b/skynet/utils.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import io import os import sys diff --git a/tests/conftest.py b/tests/conftest.py index 0ea4821..34309dd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - import pytest from skynet.config import * diff --git a/tests/test_deploy.py b/tests/test_deploy.py index 62ef635..bfd93d9 100644 --- a/tests/test_deploy.py +++ b/tests/test_deploy.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import time import json diff --git a/tests/test_ipfs_client.py b/tests/test_ipfs_client.py index 5ad4409..a400cdf 100644 --- a/tests/test_ipfs_client.py +++ b/tests/test_ipfs_client.py @@ -1,6 +1,3 @@ -#!/usr/bin/python - - from pathlib import Path From f0604f54fb3e47f13b0b3876c9b4185d767d0640 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 3 Feb 2025 17:00:44 -0300 Subject: [PATCH 11/35] Fix some import related TODOs --- skynet/dgpu/compute.py | 7 ------- skynet/dgpu/daemon.py | 7 +++---- skynet/dgpu/network.py | 10 +--------- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index 6b2f092..f069e60 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -7,17 +7,10 @@ import gc import logging from hashlib import sha256 -# import zipfile -# from PIL import Image -# from diffusers import DiffusionPipeline import trio import torch -# from skynet.constants import ( -# DEFAULT_INITAL_MODEL, -# MODELS, -# ) from skynet.dgpu.errors import ( DGPUComputeError, DGPUInferenceCancelled, diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index c95e1ca..0f49ff7 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -1,11 +1,10 @@ -from datetime import datetime -from functools import partial -from hashlib import sha256 import json import logging import random -# import traceback import time +from datetime import datetime +from functools import partial +from hashlib import sha256 
 import trio
 from quart import jsonify
 from quart_trio import QuartTrio as Quart
diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py
index 1a8ebd6..a6753aa 100755
--- a/skynet/dgpu/network.py
+++ b/skynet/dgpu/network.py
@@ -2,7 +2,6 @@ import io
 import json
 import time
 import logging
-
 from pathlib import Path
 from functools import partial

@@ -11,12 +10,7 @@ import leap
 import anyio
 import httpx
 import outcome
-
-from PIL import (
-    Image,
-    # UnidentifiedImageError,  # TODO, remove?
-)
-
+from PIL import Image
 from leap.cleos import CLEOS
 from leap.protocol import Asset
 from skynet.constants import (
@@ -28,8 +22,6 @@ from skynet.ipfs import (
     AsyncIPFSHTTP,
     get_ipfs_file,
 )
-# TODO, remove?
-# from skynet.dgpu.errors import DGPUComputeError

 REQUEST_UPDATE_TIME: int = 3

From 1b437b761b9d9ba8e69694d15b3ffc50e90dc2c5 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Mon, 3 Feb 2025 18:31:12 -0300
Subject: [PATCH 12/35] Change dgpu submodule class names per fomo's
 suggestion

---
 skynet/dgpu/__init__.py | 14 +++++++-------
 skynet/dgpu/compute.py  |  3 +--
 skynet/dgpu/daemon.py   | 19 +++++++++----------
 skynet/dgpu/network.py  |  5 +----
 tests/conftest.py       | 12 ++++++------
 tests/test_reqs.py      |  2 +-
 6 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py
index e906418..eb93697 100755
--- a/skynet/dgpu/__init__.py
+++ b/skynet/dgpu/__init__.py
@@ -4,21 +4,21 @@ from hypercorn.config import Config
 from hypercorn.trio import serve
 from quart_trio import QuartTrio as Quart

-from skynet.dgpu.compute import SkynetMM
-from skynet.dgpu.daemon import SkynetDGPUDaemon
-from skynet.dgpu.network import SkynetGPUConnector
+from skynet.dgpu.compute import ModelMngr
+from skynet.dgpu.daemon import WorkerDaemon
+from skynet.dgpu.network import NetConnector


 async def open_dgpu_node(config: dict) -> None:
     '''
     Open a top level "GPU mgmt daemon", keep the
-    `SkynetDGPUDaemon._snap: dict[str, list|dict]` table
+    `WorkerDaemon._snap: dict[str, list|dict]` table
     and *maybe* serve a `hypercorn` web API.

     '''
-    conn = SkynetGPUConnector(config)
-    mm = SkynetMM(config)
-    daemon = SkynetDGPUDaemon(mm, conn, config)
+    conn = NetConnector(config)
+    mm = ModelMngr(config)
+    daemon = WorkerDaemon(mm, conn, config)

     api: Quart|None = None
     if 'api_bind' in config:
diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py
index f069e60..085b2a3 100755
--- a/skynet/dgpu/compute.py
+++ b/skynet/dgpu/compute.py
@@ -66,8 +66,7 @@ def prepare_params_for_diffuse(
     )


-# TODO, yet again - drop the redundant prefix ;)
-class SkynetMM:
+class ModelMngr:
     '''
     (AI algo) Model manager for loading models, computing outputs,
     checking load state, and unloading when no-longer-needed/finished.
diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py
index 0f49ff7..a709d0b 100755
--- a/skynet/dgpu/daemon.py
+++ b/skynet/dgpu/daemon.py
@@ -17,8 +17,8 @@ from skynet.constants import (
 from skynet.dgpu.errors import (
     DGPUComputeError,
 )
-from skynet.dgpu.compute import SkynetMM
-from skynet.dgpu.network import SkynetGPUConnector
+from skynet.dgpu.compute import ModelMngr
+from skynet.dgpu.network import NetConnector


 def convert_reward_to_int(reward_str):
@@ -29,8 +29,7 @@ def convert_reward_to_int(reward_str):
     return int(int_part + decimal_part)


-# prolly don't need the `Skynet` prefix since that's kinda implied ;p
-class SkynetDGPUDaemon:
+class WorkerDaemon:
     '''
     The root "GPU daemon".
@@ -40,12 +39,12 @@ class SkynetDGPUDaemon: ''' def __init__( self, - mm: SkynetMM, - conn: SkynetGPUConnector, + mm: ModelMngr, + conn: NetConnector, config: dict ): - self.mm: SkynetMM = mm - self.conn: SkynetGPUConnector = conn + self.mm: ModelMngr = mm + self.conn: NetConnector = conn self.auto_withdraw = ( config['auto_withdraw'] if 'auto_withdraw' in config else False @@ -147,7 +146,7 @@ class SkynetDGPUDaemon: # TODO? this func is kinda big and maybe is better at module # level to reduce indentation? - # -[ ] just pass `daemon: SkynetDGPUDaemon` vs. `self` + # -[ ] just pass `daemon: WorkerDaemon` vs. `self` async def maybe_serve_one( self, req: dict, @@ -271,7 +270,7 @@ class SkynetDGPUDaemon: # TODO, as per above on `.maybe_serve_one()`, it's likely a bit # more *trionic* to define this all as a module level task-func - # which operates on a `daemon: SkynetDGPUDaemon`? + # which operates on a `daemon: WorkerDaemon`? # # -[ ] keeps tasks-as-funcs style prominent # -[ ] avoids so much indentation due to methods diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index a6753aa..e80bea3 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -46,10 +46,7 @@ async def failable(fn: partial, ret_fail=None): return o.unwrap() -# TODO, again the prefix XD -# -[ ] better name then `GPUConnector` ?? -# |_ `Compute[Net]IO[Mngr]` -class SkynetGPUConnector: +class NetConnector: ''' An API for connecting to and conducting various "high level" network-service operations in the skynet. diff --git a/tests/conftest.py b/tests/conftest.py index 34309dd..f990935 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,17 +24,17 @@ def cleos(): @pytest.fixture(scope='session') def dgpu(): - from skynet.dgpu.network import SkynetGPUConnector - from skynet.dgpu.compute import SkynetMM - from skynet.dgpu.daemon import SkynetDGPUDaemon + from skynet.dgpu.network import NetConnector + from skynet.dgpu.compute import ModelMngr + from skynet.dgpu.daemon import WorkerDaemon config = load_skynet_toml(file_path='skynet.toml') hf_token = load_key(config, 'skynet.dgpu.hf_token') hf_home = load_key(config, 'skynet.dgpu.hf_home') set_hf_vars(hf_token, hf_home) config = config['skynet']['dgpu'] - conn = SkynetGPUConnector(config) - mm = SkynetMM(config) - daemon = SkynetDGPUDaemon(mm, conn, config) + conn = NetConnector(config) + mm = ModelMngr(config) + daemon = WorkerDaemon(mm, conn, config) yield conn, mm, daemon diff --git a/tests/test_reqs.py b/tests/test_reqs.py index d55c940..48ca886 100644 --- a/tests/test_reqs.py +++ b/tests/test_reqs.py @@ -1,6 +1,6 @@ import json -from skynet.dgpu.compute import SkynetMM +from skynet.dgpu.compute import ModelMngr from skynet.constants import * from skynet.config import * From 62f891c0174c01a9e622e3e1c84444994f9e401b Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 3 Feb 2025 18:43:42 -0300 Subject: [PATCH 13/35] Rename my_results --- skynet/dgpu/daemon.py | 8 +++----- skynet/dgpu/network.py | 8 ++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index a709d0b..d874e37 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -71,9 +71,7 @@ class WorkerDaemon: self._snap = { 'queue': [], 'requests': {}, - 'my_results': [] - # ^and here i thot they were **my** results.. 
- # :sadcat: + 'results': [] } self._benchmark: list[float] = [] @@ -178,9 +176,9 @@ class WorkerDaemon: if model in self.model_blacklist: return False - my_results = [res['id'] for res in self._snap['my_results']] + results = [res['id'] for res in self._snap['results']] if ( - rid not in my_results + rid not in results and rid in self._snap['requests'] ): diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index e80bea3..7915b3d 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -155,7 +155,7 @@ class NetConnector: ''' snap = { 'requests': {}, - 'my_results': [] + 'results': [] } snap['queue'] = await self.get_work_requests_last_hour() @@ -164,7 +164,7 @@ class NetConnector: d[key] = await fn(*args, **kwargs) async with trio.open_nursery() as n: - n.start_soon(_run_and_save, snap, 'my_results', self.find_my_results) + n.start_soon(_run_and_save, snap, 'results', self.find_results) for req in snap['queue']: n.start_soon( _run_and_save, snap['requests'], req['id'], self.get_status_by_request_id, req['id']) @@ -232,8 +232,8 @@ class NetConnector: ) ) - async def find_my_results(self): - logging.info('find_my_results') + async def find_results(self): + logging.info('find_results') return await failable( partial( self.cleos.aget_table, From eaad7d9112d8a1c0c6c22933d87d6f648b9269c5 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 3 Feb 2025 18:45:18 -0300 Subject: [PATCH 14/35] Remove todo about hardcoded numbers in test setup --- skynet/nodeos.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/skynet/nodeos.py b/skynet/nodeos.py index d5d05b4..6b9f756 100755 --- a/skynet/nodeos.py +++ b/skynet/nodeos.py @@ -86,8 +86,6 @@ def open_nodeos(cleanup: bool = True): logging.info(f'GPU KEYS: {(priv, pub)}') cleos.new_account('telos.gpu', ram=4200000, key=pub) - # lol, magic #s much XD ? - # TODO, why is there 4 workers/keys? 
for i in range(1, 4): priv, pub = cleos.create_key_pair() cleos.import_key(priv) From e09652eaaed8136300a8db1c9bf4ea9d7eaf5a3a Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 3 Feb 2025 18:48:29 -0300 Subject: [PATCH 15/35] Drop old ipfs node docker managment --- skynet/ipfs/docker.py | 69 ------------------------------------------- tests/conftest.py | 4 +-- 2 files changed, 1 insertion(+), 72 deletions(-) delete mode 100755 skynet/ipfs/docker.py diff --git a/skynet/ipfs/docker.py b/skynet/ipfs/docker.py deleted file mode 100755 index c2da5fb..0000000 --- a/skynet/ipfs/docker.py +++ /dev/null @@ -1,69 +0,0 @@ -import sys -import logging - -from pathlib import Path -from contextlib import contextmanager as cm - -import docker - -from docker.types import Mount - - -@cm -def open_ipfs_node( - name: str = 'skynet-ipfs', - teardown: bool = False, - peers: list[str] = [] -): - dclient = docker.from_env() - - container = None - try: - container = dclient.containers.get(name) - - except docker.errors.NotFound: - data_dir = Path().resolve() / 'ipfs-docker-data' - data_dir.mkdir(parents=True, exist_ok=True) - - data_target = '/data/ipfs' - - container = dclient.containers.run( - 'ipfs/go-ipfs:latest', - name='skynet-ipfs', - ports={ - '8080/tcp': 8080, - '4001/tcp': 4001, - '5001/tcp': ('127.0.0.1', 5001) - }, - mounts=[ - Mount(data_target, str(data_dir), 'bind') - ], - detach=True, - remove=True - ) - - uid, gid = 1000, 1000 - - if sys.platform != 'win32': - ec, out = container.exec_run(['chown', f'{uid}:{gid}', '-R', data_target]) - logging.info(out) - assert ec == 0 - - for log in container.logs(stream=True): - log = log.decode().rstrip() - logging.info(log) - if 'Daemon is ready' in log: - break - - for peer in peers: - ec, out = container.exec_run( - ['ipfs', 'swarm', 'connect', peer]) - if ec != 0: - logging.error(out) - - # TODO, why not deliver some kinda API here for controlling the - # ipfs node? - yield - - if teardown and container: - container.stop() diff --git a/tests/conftest.py b/tests/conftest.py index f990935..56c0780 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,14 +2,12 @@ import pytest from skynet.config import * from skynet.ipfs import AsyncIPFSHTTP -from skynet.ipfs.docker import open_ipfs_node from skynet.nodeos import open_nodeos @pytest.fixture(scope='session') def ipfs_client(): - with open_ipfs_node(teardown=True): - yield AsyncIPFSHTTP('http://127.0.0.1:5001') + yield AsyncIPFSHTTP('http://127.0.0.1:5001') @pytest.fixture(scope='session') def postgres_db(): From cc7015eb0350546fb4ada92f3faa87466d07744a Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 3 Feb 2025 18:58:55 -0300 Subject: [PATCH 16/35] Rework if statement to reduce indentation, add comment about logic --- skynet/dgpu/daemon.py | 169 +++++++++++++++++++++--------------------- 1 file changed, 83 insertions(+), 86 deletions(-) diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index d874e37..1e7cdb5 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -177,94 +177,91 @@ class WorkerDaemon: return False results = [res['id'] for res in self._snap['results']] - if ( - rid not in results - and - rid in self._snap['requests'] - ): - statuses = self._snap['requests'][rid] - if len(statuses) == 0: - inputs = [] - for _input in req['binary_data'].split(','): - if _input: - for _ in range(3): - try: - # user `GPUConnector` to IO with - # storage layer to seed the compute - # task. 
- img = await self.conn.get_input_data(_input) - inputs.append(img) - break - except BaseException: - logging.exception( - 'Model input error !?!\n' - ) - - hash_str = ( - str(req['nonce']) - + - req['body'] - + - req['binary_data'] - ) - logging.info(f'hashing: {hash_str}') - request_hash = sha256(hash_str.encode('utf-8')).hexdigest() - - # TODO: validate request - - # perform work - logging.info(f'working on {body}') - - resp = await self.conn.begin_work(rid) - if not resp or 'code' in resp: - logging.info('probably being worked on already... skip.') - - else: - try: - output_type = 'png' - if 'output_type' in body['params']: - output_type = body['params']['output_type'] - - output = None - output_hash = None - match self.backend: - case 'sync-on-thread': - self.mm._should_cancel = self.should_cancel_work - output_hash, output = await trio.to_thread.run_sync( - partial( - self.mm.compute_one, - rid, - body['method'], body['params'], - inputs=inputs - ) - ) - - case _: - raise DGPUComputeError( - f'Unsupported backend {self.backend}' - ) - - self._last_generation_ts: str = datetime.now().isoformat() - self._last_benchmark: list[float] = self._benchmark - self._benchmark: list[float] = [] - - ipfs_hash = await self.conn.publish_on_ipfs(output, typ=output_type) - - await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash) - - except BaseException as err: - logging.exception('Failed to serve model request !?\n') - # traceback.print_exc() # TODO? <- replaced by above ya? - await self.conn.cancel_work(rid, str(err)) - - finally: - return True - - # TODO, i would inverse this case logic to avoid an indent - # level in above block ;) - else: + # if worker is already on that request or + # if worker has a stale status for that request + if rid in results or rid not in self._snap['requests']: logging.info(f'request {rid} already beign worked on, skip...') + return + + statuses = self._snap['requests'][rid] + if len(statuses) == 0: + inputs = [] + for _input in req['binary_data'].split(','): + if _input: + for _ in range(3): + try: + # user `GPUConnector` to IO with + # storage layer to seed the compute + # task. + img = await self.conn.get_input_data(_input) + inputs.append(img) + break + + except BaseException: + logging.exception( + 'Model input error !?!\n' + ) + + hash_str = ( + str(req['nonce']) + + + req['body'] + + + req['binary_data'] + ) + logging.info(f'hashing: {hash_str}') + request_hash = sha256(hash_str.encode('utf-8')).hexdigest() + + # TODO: validate request + + # perform work + logging.info(f'working on {body}') + + resp = await self.conn.begin_work(rid) + if not resp or 'code' in resp: + logging.info('probably being worked on already... 
skip.')
+
+        else:
+            try:
+                output_type = 'png'
+                if 'output_type' in body['params']:
+                    output_type = body['params']['output_type']
+
+                output = None
+                output_hash = None
+                match self.backend:
+                    case 'sync-on-thread':
+                        self.mm._should_cancel = self.should_cancel_work
+                        output_hash, output = await trio.to_thread.run_sync(
+                            partial(
+                                self.mm.compute_one,
+                                rid,
+                                body['method'], body['params'],
+                                inputs=inputs
+                            )
+                        )
+
+                    case _:
+                        raise DGPUComputeError(
+                            f'Unsupported backend {self.backend}'
+                        )
+
+                self._last_generation_ts: str = datetime.now().isoformat()
+                self._last_benchmark: list[float] = self._benchmark
+                self._benchmark: list[float] = []
+
+                ipfs_hash = await self.conn.publish_on_ipfs(output, typ=output_type)
+
+                await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash)
+
+            except BaseException as err:
+                logging.exception('Failed to serve model request !?\n')
+                await self.conn.cancel_work(rid, str(err))
+
+            finally:
+                return True

     # TODO, as per above on `.maybe_serve_one()`, it's likely a bit
     # more *trionic* to define this all as a module level task-func
     # which operates on a `daemon: WorkerDaemon`?

From a5dbe5ab123355967d11e943ee1e4c90f419124a Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Mon, 3 Feb 2025 20:07:45 -0300
Subject: [PATCH 17/35] Further improvements to indentation and logic in the
 daemon's maybe_serve_one; also might have fixed a bug related to using id
 instead of request_id in the existing-results search phase, and add way more
 logging

---
 skynet/dgpu/__init__.py |   7 ++
 skynet/dgpu/compute.py  |  14 ++--
 skynet/dgpu/daemon.py   | 171 ++++++++++++++++++++++------------------
 skynet/dgpu/network.py  |  53 ++++++-------
 4 files changed, 130 insertions(+), 115 deletions(-)

diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py
index eb93697..4371f83 100755
--- a/skynet/dgpu/__init__.py
+++ b/skynet/dgpu/__init__.py
@@ -1,3 +1,5 @@
+import logging
+
 import trio

 from hypercorn.config import Config
@@ -16,6 +18,10 @@ async def open_dgpu_node(config: dict) -> None:
     and *maybe* serve a `hypercorn` web API.

     '''
+
+    # suppress logs from httpx (logs url + status after every query)
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
     conn = NetConnector(config)
     mm = ModelMngr(config)
     daemon = WorkerDaemon(mm, conn, config)
@@ -33,6 +39,7 @@ async def open_dgpu_node(config: dict) -> None:
         # TODO, consider a more explicit `as hypercorn_serve`
         # to clarify?
         if api:
+            logging.info(f'serving api @ {config["api_bind"]}')
             tn.start_soon(serve, api, api_conf)

         # block until cancelled
diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py
index 085b2a3..8b10bd7 100755
--- a/skynet/dgpu/compute.py
+++ b/skynet/dgpu/compute.py
@@ -83,8 +83,8 @@ class ModelMngr:
         # self.load_model(DEFAULT_INITAL_MODEL, 'txt2img')

     def log_debug_info(self):
-        logging.info('memory summary:')
-        logging.info('\n' + torch.cuda.memory_summary())
+        logging.debug('memory summary:')
+        logging.debug('\n' + torch.cuda.memory_summary())

     def is_model_loaded(self, name: str, mode: str):
         if (name == self._model_name and
@@ -114,6 +114,8 @@ class ModelMngr:
             name, mode, cache_dir=self.cache_dir)
         self._model_mode = mode
         self._model_name = name
+        logging.info(f'{name} loaded!')
+        self.log_debug_info()

     def compute_one(
         self,
@@ -126,11 +128,7 @@ class ModelMngr:
         if self._should_cancel:
             should_raise = trio.from_thread.run(self._should_cancel, request_id)
             if should_raise:
-                logging.warn(f'cancelling work at step {step}')
+                logging.warn(f'CANCELLING work at step {step}')
-
-                # ?TODO, this is never caught, so why is it
-                # raised specially?
-                raise DGPUInferenceCancelled()

             return {}

@@ -206,8 +204,6 @@ class ModelMngr:
             case _:
                 raise DGPUComputeError('Unsupported compute method')

         except BaseException as err:
-            logging.error(err)
-            # to see the src exc in tb
             raise DGPUComputeError(str(err)) from err

         finally:
             torch.cuda.empty_cache()
diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py
index 1e7cdb5..137259b 100755
--- a/skynet/dgpu/daemon.py
+++ b/skynet/dgpu/daemon.py
@@ -105,7 +105,11 @@ class WorkerDaemon:
             for status in self._snap['requests'][request_id]
             if status['worker'] != self.account
         ])
-        return bool(self.non_compete & competitors)
+        logging.info('should cancel work?')
+        logging.info(f'competitors: {competitors}')
+        should_cancel = bool(self.non_compete & competitors)
+        logging.info(f'cancel: {should_cancel}')
+        return should_cancel


     async def snap_updater_task(self):
@@ -150,6 +154,7 @@ class WorkerDaemon:
         req: dict,
     ):
         rid = req['id']
+        logging.info(f'maybe serve request #{rid}')

         # parse request
         body = json.loads(req['body'])
         model = body['params']['model']

         # if model not known, ignore.
         if (
             model != 'RealESRGAN_x4plus'
             and
             model not in MODELS
         ):
-            logging.warning(f'Unknown model {model}')
+            logging.warning(f'unknown model {model}!, skip...')
             return False

         # only handle whitelisted models
         if (
             len(self.model_whitelist) > 0
             and
             model not in self.model_whitelist
         ):
+            logging.warning('model not whitelisted!, skip...')
             return False

         # if blacklist contains model skip
         if model in self.model_blacklist:
+            logging.warning('model blacklisted!, skip...')
             return False

-        results = [res['id'] for res in self._snap['results']]
+        results = [res['request_id'] for res in self._snap['results']]

-        # if worker is already on that request or
-        # if worker has a stale status for that request
-        if rid in results or rid not in self._snap['requests']:
-            logging.info(f'request {rid} already beign worked on, skip...')
-            return
+        # if worker already produced a result for this request
+        if rid in results:
+            logging.info(f'worker already submitted a result for request #{rid}, skip...')
+            return False

         statuses = self._snap['requests'][rid]

-        if len(statuses) == 0:
-            inputs = []
-            for _input in req['binary_data'].split(','):
-                if _input:
-                    for _ in range(3):
-                        try:
-                            # user `GPUConnector` to IO with
-                            # storage layer to seed the compute
-                            # task.
-                            img = await self.conn.get_input_data(_input)
-                            inputs.append(img)
-                            break
-
-                        except BaseException:
-                            logging.exception(
-                                'Model input error !?!\n'
-                            )
+        # skip if workers in non_compete already on it
+        competitors = set((status['worker'] for status in statuses))
+        if bool(self.non_compete & competitors):
+            logging.info('worker in configured non_compete list already working on request, skip...')
+            return False

+        # resolve the ipfs hashes into the actual data behind them
+        inputs = []
+        raw_inputs = req['binary_data'].split(',')
+        if raw_inputs:
+            logging.info(f'fetching IPFS inputs: {raw_inputs}')
+
+        retry = 3
+        for _input in req['binary_data'].split(','):
+            if _input:
+                for r in range(retry):
+                    try:
+                        # use `NetConnector` to IO with
+                        # storage layer to seed the compute
+                        # task.
+                        img = await self.conn.get_input_data(_input)
+                        inputs.append(img)
+                        logging.info(f'retrieved {_input}!')
+                        break
+
+                    except BaseException:
+                        logging.exception(
+                            f'IPFS fetch input error !?! 
retries left {retry - r - 1}\n' + ) + + # compute unique request hash used on submit + hash_str = ( + str(req['nonce']) + + + req['body'] + + + req['binary_data'] + ) + logging.debug(f'hashing: {hash_str}') + request_hash = sha256(hash_str.encode('utf-8')).hexdigest() + logging.info(f'calculated request hash: {request_hash}') + + # TODO: validate request + + resp = await self.conn.begin_work(rid) + if not resp or 'code' in resp: + logging.info('begin_work error, probably being worked on already... skip.') + + else: + try: + output_type = 'png' + if 'output_type' in body['params']: + output_type = body['params']['output_type'] + + output = None + output_hash = None + match self.backend: + case 'sync-on-thread': + self.mm._should_cancel = self.should_cancel_work + output_hash, output = await trio.to_thread.run_sync( + partial( + self.mm.compute_one, + rid, + body['method'], body['params'], + inputs=inputs ) + ) - hash_str = ( - str(req['nonce']) - + - req['body'] - + - req['binary_data'] - ) - logging.info(f'hashing: {hash_str}') - request_hash = sha256(hash_str.encode('utf-8')).hexdigest() + case _: + raise DGPUComputeError( + f'Unsupported backend {self.backend}' + ) - # TODO: validate request + self._last_generation_ts: str = datetime.now().isoformat() + self._last_benchmark: list[float] = self._benchmark + self._benchmark: list[float] = [] - # perform work - logging.info(f'working on {body}') + ipfs_hash = await self.conn.publish_on_ipfs(output, typ=output_type) - resp = await self.conn.begin_work(rid) - if not resp or 'code' in resp: - logging.info('probably being worked on already... skip.') + await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash) - else: - try: - output_type = 'png' - if 'output_type' in body['params']: - output_type = body['params']['output_type'] + except BaseException as err: + logging.exception('Failed to serve model request !?\n') + await self.conn.cancel_work(rid, str(err)) - output = None - output_hash = None - match self.backend: - case 'sync-on-thread': - self.mm._should_cancel = self.should_cancel_work - output_hash, output = await trio.to_thread.run_sync( - partial( - self.mm.compute_one, - rid, - body['method'], body['params'], - inputs=inputs - ) - ) - - case _: - raise DGPUComputeError( - f'Unsupported backend {self.backend}' - ) - - self._last_generation_ts: str = datetime.now().isoformat() - self._last_benchmark: list[float] = self._benchmark - self._benchmark: list[float] = [] - - ipfs_hash = await self.conn.publish_on_ipfs(output, typ=output_type) - - await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash) - - except BaseException as err: - logging.exception('Failed to serve model request !?\n') - # traceback.print_exc() # TODO? <- replaced by above ya? 
- await self.conn.cancel_work(rid, str(err)) - - finally: - return True + finally: + return True # TODO, as per above on `.maybe_serve_one()`, it's likely a bit # more *trionic* to define this all as a module level task-func diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index 7915b3d..45c19fe 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -72,9 +72,6 @@ class NetConnector: self.cleos = CLEOS(endpoint=self.node_url) self.cleos.load_abi('gpu.scd', GPU_CONTRACT_ABI) - self.ipfs_gateway_url = None - if 'ipfs_gateway_url' in config: - self.ipfs_gateway_url = config['ipfs_gateway_url'] self.ipfs_url = config['ipfs_url'] self.ipfs_client = AsyncIPFSHTTP(self.ipfs_url) @@ -89,7 +86,7 @@ class NetConnector: async def get_work_requests_last_hour(self): logging.info('get_work_requests_last_hour') - return await failable( + rows = await failable( partial( self.cleos.aget_table, 'gpu.scd', 'gpu.scd', 'queue', @@ -98,13 +95,19 @@ class NetConnector: lower_bound=int(time.time()) - 3600 ), ret_fail=[]) + logging.info(f'found {len(rows)} requests on queue') + return rows + async def get_status_by_request_id(self, request_id: int): logging.info('get_status_by_request_id') - return await failable( + rows = await failable( partial( self.cleos.aget_table, 'gpu.scd', request_id, 'status'), ret_fail=[]) + logging.info(f'found status for workers: {[r["worker"] for r in rows]}') + return rows + async def get_global_config(self): logging.info('get_global_config') rows = await failable( @@ -113,8 +116,11 @@ class NetConnector: 'gpu.scd', 'gpu.scd', 'config')) if rows: - return rows[0] + cfg = rows[0] + logging.info(f'config found: {cfg}') + return cfg else: + logging.error('global config not found, is the contract initialized?') return None async def get_worker_balance(self): @@ -130,20 +136,13 @@ class NetConnector: )) if rows: - return rows[0]['balance'] + b = rows[0]['balance'] + logging.info(f'balance: {b}') + return b else: + logging.info('no balance info found') return None - async def get_competitors_for_req(self, request_id: int) -> set: - competitors = [ - status['worker'] - for status in - (await self.get_status_by_request_id(request_id)) - if status['worker'] != self.account - ] - logging.info(f'competitors: {competitors}') - return set(competitors) - # TODO, considery making this a NON-method and instead # handing in the `snap['queue']` output beforehand? # -> since that call is the only usage of `self`? @@ -177,7 +176,7 @@ class NetConnector: step. 
'''
-        logging.info('begin_work')
+        logging.info(f'begin_work on #{request_id}')
         return await failable(
             partial(
                 self.cleos.a_push_action,
@@ -194,7 +193,7 @@ class NetConnector:
         )

     async def cancel_work(self, request_id: int, reason: str):
-        logging.info('cancel_work')
+        logging.info(f'cancel_work on #{request_id}')
         return await failable(
             partial(
                 self.cleos.a_push_action,
@@ -234,7 +233,7 @@ class NetConnector:

     async def find_results(self):
         logging.info('find_results')
-        return await failable(
+        rows = await failable(
             partial(
                 self.cleos.aget_table,
                 'gpu.scd', 'gpu.scd', 'results',
@@ -244,6 +243,7 @@ class NetConnector:
                 upper_bound=self.account
             )
         )
+        return rows

     async def submit_work(
         self,
@@ -252,7 +252,7 @@ class NetConnector:
         result_hash: str,
         ipfs_hash: str
     ):
-        logging.info('submit_work')
+        logging.info(f'submit_work #{request_id}')
         return await failable(
             partial(
                 self.cleos.a_push_action,
@@ -285,17 +285,12 @@ class NetConnector:
             case _:
                 raise ValueError(f'Unsupported output type: {typ}')

-        if self.ipfs_gateway_url:
-            # check peer connections, reconnect to skynet gateway if not
-            gateway_id = Path(self.ipfs_gateway_url).name
-            peers = await self.ipfs_client.peers()
-            if gateway_id not in [p['Peer'] for p in peers]:
-                await self.ipfs_client.connect(self.ipfs_gateway_url)
-
         file_info = await self.ipfs_client.add(Path(target_file))
         file_cid = file_info['Hash']
+        logging.info(f'added file to ipfs, CID: {file_cid}')

         await self.ipfs_client.pin(file_cid)
+        logging.info(f'pinned {file_cid}')

         return file_cid

@@ -311,11 +306,11 @@ class NetConnector:

         link = f'https://{self.ipfs_domain}/ipfs/{ipfs_hash}'

         res = await get_ipfs_file(link, timeout=1)
-        logging.info(f'got response from {link}')
         if not res or res.status_code != 200:
             logging.warning(f'couldn\'t get ipfs binary data at {link}!')

         # attempt to decode as image
         input_data = Image.open(io.BytesIO(res.raw))
+        logging.info('decoded as image successfully')

         return input_data

From 4c9be4e63e69ce2e5a7a4756ef4488feba412aa5 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Mon, 3 Feb 2025 21:08:54 -0300
Subject: [PATCH 18/35] Fix cancellation system and provide a reason for the raise
---
 skynet/dgpu/compute.py |  8 ++++++--
 skynet/dgpu/daemon.py  | 13 ++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py
index 8b10bd7..56403a1 100755
--- a/skynet/dgpu/compute.py
+++ b/skynet/dgpu/compute.py
@@ -114,7 +114,7 @@ class ModelMngr:
             name, mode, cache_dir=self.cache_dir)
         self._model_mode = mode
         self._model_name = name
-        logging.info('{name} loaded!')
+        logging.info(f'{name} loaded!')
         self.log_debug_info()

     def compute_one(
@@ -125,10 +125,14 @@ class ModelMngr:
         inputs: list[bytes] = []
     ):
         def maybe_cancel_work(step, *args, **kwargs):
+            '''This is a callback function that gets invoked every inference step,
+            we need to raise an exception here if we need to cancel work
+            '''
             if self._should_cancel:
                 should_raise = trio.from_thread.run(self._should_cancel, request_id)
                 if should_raise:
-                    logging.warn(f'CANCELLING work at step {step}')
+                    logging.warning(f'CANCELLING work at step {step}')
+                    raise DGPUInferenceCancelled('network cancel')

             return {}

diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py
index 137259b..bfcab79 100755
--- a/skynet/dgpu/daemon.py
+++ b/skynet/dgpu/daemon.py
@@ -100,12 +100,16 @@ class WorkerDaemon:

     async def should_cancel_work(self, request_id: int):
         self._benchmark.append(time.time())
+        logging.info('should cancel work?')
+        if request_id not in 
self._snap['requests']:
+            logging.info(f'request #{request_id} no longer in queue, likely it has been filled by another worker, cancelling work...')
+            return True
+
         competitors = set([
             status['worker']
             for status in self._snap['requests'][request_id]
             if status['worker'] != self.account
         ])
-        logging.info('should cancel work?')
         logging.info(f'competitors: {competitors}')
         should_cancel = bool(self.non_compete & competitors)
         logging.info(f'cancel: {should_cancel}')
@@ -274,8 +278,11 @@ class WorkerDaemon:
                     await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash)

                 except BaseException as err:
-                    logging.exception('Failed to serve model request !?\n')
-                    await self.conn.cancel_work(rid, str(err))
+                    if 'network cancel' not in str(err):
+                        logging.exception('Failed to serve model request !?\n')
+
+                    if rid in self._snap['requests']:
+                        await self.conn.cancel_work(rid, 'reason not provided')

                 finally:
                     return True

From e66f8d74fde71821522815f7baa0fdf626e8f966 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Wed, 5 Feb 2025 00:40:37 -0300
Subject: [PATCH 19/35] Fix asks -> httpx bug on get_ipfs_link response handling
---
 skynet/dgpu/network.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py
index 45c19fe..2b67362 100755
--- a/skynet/dgpu/network.py
+++ b/skynet/dgpu/network.py
@@ -310,7 +310,7 @@ class NetConnector:
             logging.warning(f'couldn\'t get ipfs binary data at {link}!')

         # attempt to decode as image
-        input_data = Image.open(io.BytesIO(res.raw))
+        input_data = Image.open(io.BytesIO(res.read()))
         logging.info('decoded as image successfully')

         return input_data

From 8b45fb597995da6b6ee7f77914f6b22a5c196cd2 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Wed, 5 Feb 2025 15:35:40 -0300
Subject: [PATCH 20/35] Begin adding TUI
---
 pyproject.toml          |   1 +
 skynet/dgpu/__init__.py |  43 ++++++-
 skynet/dgpu/compute.py  |  39 +++++--
 skynet/dgpu/daemon.py   |  23 +++-
 skynet/dgpu/network.py  |  10 +-
 skynet/dgpu/tui.py      | 248 ++++++++++++++++++++++++++++++++++++++++
 skynet/utils.py         |  25 +++-
 uv.lock                 |  24 ++++
 8 files changed, 391 insertions(+), 22 deletions(-)
 create mode 100644 skynet/dgpu/tui.py

diff --git a/pyproject.toml b/pyproject.toml
index 73884a7..94a9740 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,6 +61,7 @@ cuda = [
   "basicsr>=1.4.2,<2",
   "realesrgan>=0.3.0,<0.4",
   "sentencepiece>=0.2.0",
+  "urwid>=2.6.16",
 ]

 [tool.uv]
diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py
index 4371f83..59af61c 100755
--- a/skynet/dgpu/__init__.py
+++ b/skynet/dgpu/__init__.py
@@ -1,4 +1,5 @@
 import logging
+import warnings

 import trio

@@ -6,11 +7,31 @@ from hypercorn.config import Config
 from hypercorn.trio import serve
 from quart_trio import QuartTrio as Quart

+from skynet.dgpu.tui import WorkerMonitor
 from skynet.dgpu.compute import ModelMngr
 from skynet.dgpu.daemon import WorkerDaemon
 from skynet.dgpu.network import NetConnector


+def setup_logging_for_tui(level):
+    warnings.filterwarnings("ignore")
+
+    logger = logging.getLogger()
+    logger.setLevel(level)
+
+    fh = logging.FileHandler('dgpu.log')
+    fh.setLevel(level)
+
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    fh.setFormatter(formatter)
+
+    logger.addHandler(fh)
+
+    for handler in logger.handlers:
+        if isinstance(handler, logging.StreamHandler):
+            logger.removeHandler(handler)
+
+
 async def open_dgpu_node(config: dict) 
-> None: and *maybe* serve a `hypercorn` web API. ''' - # suppress logs from httpx (logs url + status after every query) logging.getLogger("httpx").setLevel(logging.WARNING) - conn = NetConnector(config) - mm = ModelMngr(config) - daemon = WorkerDaemon(mm, conn, config) + tui = None + if config['tui']: + setup_logging_for_tui(logging.INFO) + tui = WorkerMonitor() + + conn = NetConnector(config, tui=tui) + mm = ModelMngr(config, tui=tui) + daemon = WorkerDaemon(mm, conn, config, tui=tui) api: Quart|None = None if 'api_bind' in config: @@ -35,6 +60,8 @@ async def open_dgpu_node(config: dict) -> None: tn: trio.Nursery async with trio.open_nursery() as tn: tn.start_soon(daemon.snap_updater_task) + if tui: + tn.start_soon(tui.run) # TODO, consider a more explicit `as hypercorn_serve` # to clarify? @@ -42,5 +69,9 @@ async def open_dgpu_node(config: dict) -> None: logging.info(f'serving api @ {config["api_bind"]}') tn.start_soon(serve, api, api_conf) - # block until cancelled - await daemon.serve_forever() + try: + # block until cancelled + await daemon.serve_forever() + + except *urwid.ExitMainLoop in ex_group: + ... diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index 56403a1..d0e8689 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -11,6 +11,7 @@ from hashlib import sha256 import trio import torch +from skynet.dgpu.tui import WorkerMonitor from skynet.dgpu.errors import ( DGPUComputeError, DGPUInferenceCancelled, @@ -72,7 +73,8 @@ class ModelMngr: checking load state, and unloading when no-longer-needed/finished. ''' - def __init__(self, config: dict): + def __init__(self, config: dict, tui: WorkerMonitor | None = None): + self._tui = tui self.cache_dir = None if 'hf_home' in config: self.cache_dir = config['hf_home'] @@ -80,8 +82,6 @@ class ModelMngr: self._model_name: str = '' self._model_mode: str = '' - # self.load_model(DEFAULT_INITAL_MODEL, 'txt2img') - def log_debug_info(self): logging.debug('memory summary:') logging.debug('\n' + torch.cuda.memory_summary()) @@ -110,6 +110,7 @@ class ModelMngr: ) -> None: logging.info(f'loading model {name}...') self.unload_model() + self._model = pipeline_for( name, mode, cache_dir=self.cache_dir) self._model_mode = mode @@ -124,19 +125,30 @@ class ModelMngr: params: dict, inputs: list[bytes] = [] ): - def maybe_cancel_work(step, *args, **kwargs): + total_steps = params['step'] + def inference_step_wakeup(*args, **kwargs): '''This is a callback function that gets invoked every inference step, we need to raise an exception here if we need to cancel work ''' - if self._should_cancel: - should_raise = trio.from_thread.run(self._should_cancel, request_id) - if should_raise: - logging.warning(f'CANCELLING work at step {step}') - raise DGPUInferenceCancelled('network cancel') + step = args[0] + # compat with callback_on_step_end + if not isinstance(step, int): + step = args[1] + + if self._tui: + self._tui.set_progress(step, done=total_steps) + + should_raise = trio.from_thread.run(self._should_cancel, request_id) + if should_raise: + logging.warning(f'CANCELLING work at step {step}') + raise DGPUInferenceCancelled('network cancel') return {} - maybe_cancel_work(0) + if self._tui: + self._tui.set_status(f'Request #{request_id}') + + inference_step_wakeup(0) output_type = 'png' if 'output_type' in params: @@ -157,10 +169,10 @@ class ModelMngr: prompt, guidance, step, seed, upscaler, extra_params = arguments if 'flux' in name.lower(): - extra_params['callback_on_step_end'] = maybe_cancel_work + 
extra_params['callback_on_step_end'] = inference_step_wakeup else: - extra_params['callback'] = maybe_cancel_work + extra_params['callback'] = inference_step_wakeup extra_params['callback_steps'] = 1 output = self._model( @@ -213,4 +225,7 @@ class ModelMngr: finally: torch.cuda.empty_cache() + if self._tui: + self._tui.set_status('') + return output_hash, output diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index bfcab79..98d3eda 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -17,6 +17,7 @@ from skynet.constants import ( from skynet.dgpu.errors import ( DGPUComputeError, ) +from skynet.dgpu.tui import WorkerMonitor from skynet.dgpu.compute import ModelMngr from skynet.dgpu.network import NetConnector @@ -41,10 +42,12 @@ class WorkerDaemon: self, mm: ModelMngr, conn: NetConnector, - config: dict + config: dict, + tui: WorkerMonitor | None = None ): self.mm: ModelMngr = mm self.conn: NetConnector = conn + self._tui = tui self.auto_withdraw = ( config['auto_withdraw'] if 'auto_withdraw' in config else False @@ -150,6 +153,12 @@ class WorkerDaemon: return app + async def _update_balance(self): + if self._tui: + # update balance + balance = await self.conn.get_worker_balance() + self._tui.set_header_text(new_balance=f'balance: {balance}') + # TODO? this func is kinda big and maybe is better at module # level to reduce indentation? # -[ ] just pass `daemon: WorkerDaemon` vs. `self` @@ -238,6 +247,8 @@ class WorkerDaemon: request_hash = sha256(hash_str.encode('utf-8')).hexdigest() logging.info(f'calculated request hash: {request_hash}') + total_step = body['params']['step'] + # TODO: validate request resp = await self.conn.begin_work(rid) @@ -246,6 +257,9 @@ class WorkerDaemon: else: try: + if self._tui: + self._tui.set_progress(0, done=total_step) + output_type = 'png' if 'output_type' in body['params']: output_type = body['params']['output_type'] @@ -269,6 +283,9 @@ class WorkerDaemon: f'Unsupported backend {self.backend}' ) + if self._tui: + self._tui.set_progress(total_step) + self._last_generation_ts: str = datetime.now().isoformat() self._last_benchmark: list[float] = self._benchmark self._benchmark: list[float] = [] @@ -277,6 +294,9 @@ class WorkerDaemon: await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash) + await self._update_balance() + + except BaseException as err: if 'network cancel' not in str(err): logging.exception('Failed to serve model request !?\n') @@ -294,6 +314,7 @@ class WorkerDaemon: # -[ ] keeps tasks-as-funcs style prominent # -[ ] avoids so much indentation due to methods async def serve_forever(self): + await self._update_balance() try: while True: if self.auto_withdraw: diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index 2b67362..e80a5c3 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -13,6 +13,7 @@ import outcome from PIL import Image from leap.cleos import CLEOS from leap.protocol import Asset +from skynet.dgpu.tui import WorkerMonitor from skynet.constants import ( DEFAULT_IPFS_DOMAIN, GPU_CONTRACT_ABI, @@ -57,7 +58,7 @@ class NetConnector: - CLEOS client ''' - def __init__(self, config: dict): + def __init__(self, config: dict, tui: WorkerMonitor | None = None): # TODO, why these extra instance vars for an (unsynced) # copy of the `config` state? 
self.account = config['account'] @@ -81,6 +82,10 @@ class NetConnector: self.ipfs_domain = config['ipfs_domain'] self._wip_requests = {} + self._tui = tui + if self._tui: + self._tui.set_header_text(new_worker_name=self.account) + # blockchain helpers @@ -168,6 +173,9 @@ class NetConnector: n.start_soon( _run_and_save, snap['requests'], req['id'], self.get_status_by_request_id, req['id']) + if self._tui: + self._tui.network_update(snap) + return snap async def begin_work(self, request_id: int): diff --git a/skynet/dgpu/tui.py b/skynet/dgpu/tui.py new file mode 100644 index 0000000..6530b58 --- /dev/null +++ b/skynet/dgpu/tui.py @@ -0,0 +1,248 @@ +import urwid +import trio +import json + + +class WorkerMonitor: + def __init__(self): + self.requests = [] + self.header_info = {} + + self.palette = [ + ('headerbar', 'white', 'dark blue'), + ('request_row', 'white', 'dark gray'), + ('worker_row', 'light gray', 'black'), + ('progress_normal', 'black', 'light gray'), + ('progress_complete', 'black', 'dark green'), + ('body', 'white', 'black'), + ] + + # --- Top bar (header) --- + worker_name = self.header_info.get('left', "unknown") + balance = self.header_info.get('right', "balance: unknown") + + self.worker_name_widget = urwid.Text(worker_name) + self.balance_widget = urwid.Text(balance, align='right') + + header = urwid.Columns([self.worker_name_widget, self.balance_widget]) + header_attr = urwid.AttrMap(header, 'headerbar') + + # --- Body (List of requests) --- + self.body_listbox = self._create_listbox_body(self.requests) + + # --- Bottom bar (progress) --- + self.status_text = urwid.Text("Request: none", align='left') + self.progress_bar = urwid.ProgressBar( + 'progress_normal', + 'progress_complete', + current=0, + done=100 + ) + + footer_cols = urwid.Columns([ + ('fixed', 20, self.status_text), + self.progress_bar, + ]) + + # Build the main frame + frame = urwid.Frame( + self.body_listbox, + header=header_attr, + footer=footer_cols + ) + + # Set up the main loop with Trio + self.event_loop = urwid.TrioEventLoop() + self.main_loop = urwid.MainLoop( + frame, + palette=self.palette, + event_loop=self.event_loop, + unhandled_input=self._exit_on_q + ) + + def _create_listbox_body(self, requests): + """ + Build a ListBox (vertical list) of requests & workers using SimpleFocusListWalker. + """ + widgets = self._build_request_widgets(requests) + walker = urwid.SimpleFocusListWalker(widgets) + return urwid.ListBox(walker) + + def _build_request_widgets(self, requests): + """ + Build a list of Urwid widgets (one row per request + per worker). + """ + row_widgets = [] + + for req in requests: + # Build a columns widget for the request row + columns = urwid.Columns([ + ('fixed', 5, urwid.Text(f"#{req['id']}")), # e.g. 
"#12" + ('weight', 3, urwid.Text(req['model'])), + ('weight', 3, urwid.Text(req['prompt'])), + ('fixed', 13, urwid.Text(req['user'])), + ('fixed', 13, urwid.Text(req['reward'])), + ], dividechars=1) + + # Wrap the columns with an attribute map for coloring + request_row = urwid.AttrMap(columns, 'request_row') + row_widgets.append(request_row) + + # Then add each worker in its own line below + for w in req["workers"]: + worker_line = urwid.Text(f" {w}") + worker_row = urwid.AttrMap(worker_line, 'worker_row') + row_widgets.append(worker_row) + + # Optional blank line after each request + row_widgets.append(urwid.Text("")) + + return row_widgets + + def _exit_on_q(self, key): + """Exit the TUI on 'q' or 'Q'.""" + if key in ('q', 'Q'): + raise urwid.ExitMainLoop() + + async def run(self): + """ + Run the TUI in an async context (Trio). + This method blocks until the user quits (pressing q/Q). + """ + with self.main_loop.start(): + await self.event_loop.run_async() + + raise urwid.ExitMainLoop() + + # ------------------------------------------------------------------------- + # Public Methods to Update Various Parts of the UI + # ------------------------------------------------------------------------- + def set_status(self, status: str): + self.status_text.set_text(status) + + def set_progress(self, current, done=None): + """ + Update the bottom progress bar. + - `current`: new current progress value (int). + - `done`: max progress value (int). If None, we don’t change it. + """ + if done is not None: + self.progress_bar.done = done + + self.progress_bar.current = current + + pct = 0 + if self.progress_bar.done != 0: + pct = int((self.progress_bar.current / self.progress_bar.done) * 100) + + def update_requests(self, new_requests): + """ + Replace the data in the existing ListBox with new request widgets. + """ + new_widgets = self._build_request_widgets(new_requests) + self.body_listbox.body[:] = new_widgets # replace content of the list walker + + def set_header_text(self, new_worker_name=None, new_balance=None): + """ + Update the text in the header bar for worker name and/or balance. 
+ """ + if new_worker_name is not None: + self.worker_name_widget.set_text(new_worker_name) + if new_balance is not None: + self.balance_widget.set_text(new_balance) + + def network_update(self, snapshot: dict): + queue = [ + { + **r, + **(json.loads(r['body'])['params']), + 'workers': [s['worker'] for s in snapshot['requests'][r['id']]] + } + for r in snapshot['queue'] + ] + self.update_requests(queue) + + +# # ----------------------------------------------------------------------------- +# # Example usage +# # ----------------------------------------------------------------------------- +# +# async def main(): +# # Example data +# example_requests = [ +# { +# "id": 12, +# "model": "black-forest-labs/FLUX.1-schnell", +# "prompt": "Generate an answer about quantum entanglement.", +# "user": "alice123", +# "reward": "20.0000 GPU", +# "workers": ["workerA", "workerB"], +# }, +# { +# "id": 5, +# "model": "some-other-model/v2.0", +# "prompt": "A story about dragons.", +# "user": "bobthebuilder", +# "reward": "15.0000 GPU", +# "workers": ["workerX"], +# }, +# { +# "id": 99, +# "model": "cool-model/turbo", +# "prompt": "Classify sentiment in these tweets.", +# "user": "charlie", +# "reward": "25.5000 GPU", +# "workers": ["workerOne", "workerTwo", "workerThree"], +# }, +# ] +# +# ui = WorkerMonitor() +# +# async def progress_task(): +# # Fill from 0% to 100% +# for pct in range(101): +# ui.set_progress( +# current=pct, +# status_str=f"Request #1234 ({pct}%)" +# ) +# await trio.sleep(0.05) +# # Reset to 0 +# ui.set_progress( +# current=0, +# status_str="Starting again..." +# ) +# +# async def update_data_task(): +# await trio.sleep(3) # Wait a bit, then update requests +# new_data = [{ +# "id": 101, +# "model": "new-model/v1.0", +# "prompt": "Say hi to the world.", +# "user": "eve", +# "reward": "50.0000 GPU", +# "workers": ["workerFresh", "workerPower"], +# }] +# ui.update_requests(new_data) +# ui.set_header_text(new_worker_name="NewNodeName", +# new_balance="balance: 12345.6789 GPU") +# +# try: +# async with trio.open_nursery() as nursery: +# # Run the TUI +# nursery.start_soon(ui.run_teadown_on_exit, nursery) +# +# ui.update_requests(example_requests) +# ui.set_header_text( +# new_worker_name="worker1.scd", +# new_balance="balance: 12345.6789 GPU" +# ) +# # Start background tasks +# nursery.start_soon(progress_task) +# nursery.start_soon(update_data_task) +# +# except *KeyboardInterrupt as ex_group: +# ... +# +# +# if __name__ == "__main__": +# trio.run(main) diff --git a/skynet/utils.py b/skynet/utils.py index f29bea2..ce029bd 100755 --- a/skynet/utils.py +++ b/skynet/utils.py @@ -7,8 +7,10 @@ import logging import importlib from typing import Optional +from contextlib import contextmanager import torch +import diffusers import numpy as np from PIL import Image @@ -74,12 +76,27 @@ def convert_from_bytes_and_crop(raw: bytes, max_w: int, max_h: int) -> Image: return crop_image(convert_from_bytes_to_img(raw), max_w, max_h) +class DummyPB: + def update(self): + ... 
+ +@torch.compiler.disable +@contextmanager +def dummy_progress_bar(*args, **kwargs): + yield DummyPB() + + +def monkey_patch_pipeline_disable_progress_bar(pipe): + pipe.progress_bar = dummy_progress_bar + + def pipeline_for( model: str, mode: str, mem_fraction: float = 1.0, cache_dir: str | None = None ) -> DiffusionPipeline: + diffusers.utils.logging.disable_progress_bar() logging.info(f'pipeline_for {model} {mode}') assert torch.cuda.is_available() @@ -105,7 +122,9 @@ def pipeline_for( normalized_shortname = shortname.replace('-', '_') custom_pipeline = importlib.import_module(f'skynet.dgpu.pipes.{normalized_shortname}') assert custom_pipeline.__model['name'] == model - return custom_pipeline.pipeline_for(model, mode, mem_fraction=mem_fraction, cache_dir=cache_dir) + pipe = custom_pipeline.pipeline_for(model, mode, mem_fraction=mem_fraction, cache_dir=cache_dir) + monkey_patch_pipeline_disable_progress_bar(pipe) + return pipe except ImportError: # TODO, uhh why not warn/error log this? @@ -121,7 +140,6 @@ def pipeline_for( logging.warn(f'model requires {req_mem} but card has {mem_gb}, model will run slower..') params = { - 'safety_checker': None, 'torch_dtype': torch.float16, 'cache_dir': cache_dir, 'variant': 'fp16', @@ -130,6 +148,7 @@ def pipeline_for( match shortname: case 'stable': params['revision'] = 'fp16' + params['safety_checker'] = None torch.cuda.set_per_process_memory_fraction(mem_fraction) @@ -167,6 +186,8 @@ def pipeline_for( pipe = pipe.to('cuda') + monkey_patch_pipeline_disable_progress_bar(pipe) + return pipe diff --git a/uv.lock b/uv.lock index aa03f4a..4dbb56d 100644 --- a/uv.lock +++ b/uv.lock @@ -2262,6 +2262,7 @@ cuda = [ { name = "torchvision" }, { name = "transformers" }, { name = "triton" }, + { name = "urwid" }, { name = "xformers" }, ] dev = [ @@ -2313,6 +2314,7 @@ cuda = [ { name = "torchvision", specifier = "==0.20.1+cu121", index = "https://download.pytorch.org/whl/cu121" }, { name = "transformers", specifier = "==4.48.0" }, { name = "triton", specifier = "==3.1.0", index = "https://download.pytorch.org/whl/cu121" }, + { name = "urwid", specifier = ">=2.6.16" }, { name = "xformers", specifier = ">=0.0.29,<0.0.30" }, ] dev = [ @@ -2627,6 +2629,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, ] +[[package]] +name = "urwid" +version = "2.6.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/21/ad23c9e961b2d36d57c63686a6f86768dd945d406323fb58c84f09478530/urwid-2.6.16.tar.gz", hash = "sha256:93ad239939e44c385e64aa00027878b9e5c486d59e855ec8ab5b1e1adcdb32a2", size = 848179 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/cb/271a4f5a1bf4208dbdc96d85b9eae744cf4e5e11ac73eda76dc98c8fd2d7/urwid-2.6.16-py3-none-any.whl", hash = "sha256:de14896c6df9eb759ed1fd93e0384a5279e51e0dde8f621e4083f7a8368c0797", size = 297196 }, +] + +[[package]] +name = "wcwidth" +version = "0.2.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, +] + [[package]] name = "websocket-client" version = "1.8.0" From 12b32a7188bf15b147c039d57c6781b5463d2e09 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Wed, 5 Feb 2025 19:24:21 -0300 Subject: [PATCH 21/35] Refactor ModelMngr to be a context manager + function combo --- skynet/dgpu/__init__.py | 4 +- skynet/dgpu/compute.py | 241 +++++++++++++++++++--------------------- skynet/dgpu/daemon.py | 18 +-- 3 files changed, 124 insertions(+), 139 deletions(-) diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py index 59af61c..f5fcc9e 100755 --- a/skynet/dgpu/__init__.py +++ b/skynet/dgpu/__init__.py @@ -8,7 +8,6 @@ from hypercorn.trio import serve from quart_trio import QuartTrio as Quart from skynet.dgpu.tui import WorkerMonitor -from skynet.dgpu.compute import ModelMngr from skynet.dgpu.daemon import WorkerDaemon from skynet.dgpu.network import NetConnector @@ -48,8 +47,7 @@ async def open_dgpu_node(config: dict) -> None: tui = WorkerMonitor() conn = NetConnector(config, tui=tui) - mm = ModelMngr(config, tui=tui) - daemon = WorkerDaemon(mm, conn, config, tui=tui) + daemon = WorkerDaemon(conn, config, tui=tui) api: Quart|None = None if 'api_bind' in config: diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index d0e8689..ec50054 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -7,6 +7,7 @@ import gc import logging from hashlib import sha256 +from contextlib import contextmanager as cm import trio import torch @@ -66,166 +67,150 @@ def prepare_params_for_diffuse( _params ) +_model_name: str = '' +_model_mode: str = '' +_model = None -class ModelMngr: - ''' - (AI algo) Model manager for loading models, computing outputs, - checking load state, and unloading when no-longer-needed/finished. 
+@cm +def maybe_load_model(name: str, mode: str): + if mode == 'diffuse': + mode = 'txt2img' - ''' - def __init__(self, config: dict, tui: WorkerMonitor | None = None): - self._tui = tui - self.cache_dir = None - if 'hf_home' in config: - self.cache_dir = config['hf_home'] - - self._model_name: str = '' - self._model_mode: str = '' - - def log_debug_info(self): - logging.debug('memory summary:') - logging.debug('\n' + torch.cuda.memory_summary()) - - def is_model_loaded(self, name: str, mode: str): - if (name == self._model_name and - mode == self._model_mode): - return True - - return False - - def unload_model(self) -> None: - if getattr(self, '_model', None): - del self._model + global _model_name, _model_mode, _model + if _model_name != name or _model_mode != mode: + # unload model + _model = None gc.collect() torch.cuda.empty_cache() - self._model_name = '' - self._model_mode = '' + _model_name = _model_mode = '' - def load_model( - self, - name: str, - mode: str - ) -> None: - logging.info(f'loading model {name}...') - self.unload_model() + # load model + if mode == 'upscale': + _model = init_upscaler() - self._model = pipeline_for( - name, mode, cache_dir=self.cache_dir) - self._model_mode = mode - self._model_name = name - logging.info(f'{name} loaded!') - self.log_debug_info() + else: + _model = pipeline_for( + name, mode, cache_dir='hf_home') - def compute_one( - self, - request_id: int, - method: str, - params: dict, - inputs: list[bytes] = [] - ): - total_steps = params['step'] - def inference_step_wakeup(*args, **kwargs): - '''This is a callback function that gets invoked every inference step, - we need to raise an exception here if we need to cancel work - ''' - step = args[0] - # compat with callback_on_step_end - if not isinstance(step, int): - step = args[1] + _model_name = name + _model_mode = mode - if self._tui: - self._tui.set_progress(step, done=total_steps) + logging.debug('memory summary:') + logging.debug('\n' + torch.cuda.memory_summary()) - should_raise = trio.from_thread.run(self._should_cancel, request_id) - if should_raise: - logging.warning(f'CANCELLING work at step {step}') - raise DGPUInferenceCancelled('network cancel') + yield - return {} - if self._tui: - self._tui.set_status(f'Request #{request_id}') +def compute_one( + request_id: int, + method: str, + params: dict, + inputs: list[bytes] = [], + should_cancel = None, + tui: WorkerMonitor | None = None +): + if method == 'diffuse': + method = 'txt2img' - inference_step_wakeup(0) + global _model, _model_name, _model_mode - output_type = 'png' - if 'output_type' in params: - output_type = params['output_type'] + # validate correct model is loaded + assert params['model'] == _model_name + assert method == _model_mode - output = None - output_hash = None - try: - name = params['model'] + total_steps = params['step'] + def inference_step_wakeup(*args, **kwargs): + '''This is a callback function that gets invoked every inference step, + we need to raise an exception here if we need to cancel work + ''' + step = args[0] + # compat with callback_on_step_end + if not isinstance(step, int): + step = args[1] - match method: - case 'diffuse' | 'txt2img' | 'img2img' | 'inpaint': - if not self.is_model_loaded(name, method): - self.load_model(name, method) + if tui: + tui.set_progress(step, done=total_steps) - arguments = prepare_params_for_diffuse( - params, method, inputs) - prompt, guidance, step, seed, upscaler, extra_params = arguments + if should_cancel: + should_raise = trio.from_thread.run(should_cancel, 
request_id) - if 'flux' in name.lower(): - extra_params['callback_on_step_end'] = inference_step_wakeup + if should_raise: + logging.warning(f'CANCELLING work at step {step}') + raise DGPUInferenceCancelled('network cancel') - else: - extra_params['callback'] = inference_step_wakeup - extra_params['callback_steps'] = 1 + return {} - output = self._model( - prompt, - guidance_scale=guidance, - num_inference_steps=step, - generator=seed, - **extra_params - ).images[0] + if tui: + tui.set_status(f'Request #{request_id}') - output_binary = b'' - match output_type: - case 'png': - if upscaler == 'x4': - input_img = output.convert('RGB') - up_img, _ = init_upscaler().enhance( - convert_from_image_to_cv2(input_img), outscale=4) + inference_step_wakeup(0) - output = convert_from_cv2_to_image(up_img) + output_type = 'png' + if 'output_type' in params: + output_type = params['output_type'] - output_binary = convert_from_img_to_bytes(output) + output = None + output_hash = None + try: + name = params['model'] - case _: - raise DGPUComputeError(f'Unsupported output type: {output_type}') + match method: + case 'txt2img' | 'img2img' | 'inpaint': + arguments = prepare_params_for_diffuse( + params, method, inputs) + prompt, guidance, step, seed, upscaler, extra_params = arguments - output_hash = sha256(output_binary).hexdigest() + if 'flux' in name.lower(): + extra_params['callback_on_step_end'] = inference_step_wakeup - case 'upscale': - if self._model_mode != 'upscale': - self.unload_model() - self._model = init_upscaler() - self._model_mode = 'upscale' - self._model_name = 'realesrgan' + else: + extra_params['callback'] = inference_step_wakeup + extra_params['callback_steps'] = 1 - input_img = inputs[0].convert('RGB') - up_img, _ = self._model.enhance( - convert_from_image_to_cv2(input_img), outscale=4) + output = _model( + prompt, + guidance_scale=guidance, + num_inference_steps=step, + generator=seed, + **extra_params + ).images[0] - output = convert_from_cv2_to_image(up_img) + output_binary = b'' + match output_type: + case 'png': + if upscaler == 'x4': + input_img = output.convert('RGB') + up_img, _ = init_upscaler().enhance( + convert_from_image_to_cv2(input_img), outscale=4) - output_binary = convert_from_img_to_bytes(output) - output_hash = sha256(output_binary).hexdigest() + output = convert_from_cv2_to_image(up_img) - case _: - raise DGPUComputeError('Unsupported compute method') + output_binary = convert_from_img_to_bytes(output) - except BaseException as err: - raise DGPUComputeError(str(err)) from err + case _: + raise DGPUComputeError(f'Unsupported output type: {output_type}') - finally: - torch.cuda.empty_cache() + output_hash = sha256(output_binary).hexdigest() - if self._tui: - self._tui.set_status('') + case 'upscale': + input_img = inputs[0].convert('RGB') + up_img, _ = _model.enhance( + convert_from_image_to_cv2(input_img), outscale=4) - return output_hash, output + output = convert_from_cv2_to_image(up_img) + + output_binary = convert_from_img_to_bytes(output) + output_hash = sha256(output_binary).hexdigest() + + case _: + raise DGPUComputeError('Unsupported compute method') + + except BaseException as err: + raise DGPUComputeError(str(err)) from err + + if tui: + tui.set_status('') + + return output_hash, output diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index 98d3eda..4c0bdce 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -18,7 +18,7 @@ from skynet.dgpu.errors import ( DGPUComputeError, ) from skynet.dgpu.tui import WorkerMonitor -from 
skynet.dgpu.compute import ModelMngr +from skynet.dgpu.compute import maybe_load_model, compute_one from skynet.dgpu.network import NetConnector @@ -40,12 +40,10 @@ class WorkerDaemon: ''' def __init__( self, - mm: ModelMngr, conn: NetConnector, config: dict, tui: WorkerMonitor | None = None ): - self.mm: ModelMngr = mm self.conn: NetConnector = conn self._tui = tui self.auto_withdraw = ( @@ -248,14 +246,17 @@ class WorkerDaemon: logging.info(f'calculated request hash: {request_hash}') total_step = body['params']['step'] + model = body['params']['model'] + mode = body['method'] # TODO: validate request resp = await self.conn.begin_work(rid) if not resp or 'code' in resp: logging.info('begin_work error, probably being worked on already... skip.') + return False - else: + with maybe_load_model(model, mode): try: if self._tui: self._tui.set_progress(0, done=total_step) @@ -268,13 +269,14 @@ class WorkerDaemon: output_hash = None match self.backend: case 'sync-on-thread': - self.mm._should_cancel = self.should_cancel_work output_hash, output = await trio.to_thread.run_sync( partial( - self.mm.compute_one, + compute_one, rid, - body['method'], body['params'], - inputs=inputs + mode, body['params'], + inputs=inputs, + should_cancel=self.should_cancel_work, + tui=self._tui ) ) From 5a3a43b3c6880286c7a9ec46d30fb8baac6104ee Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Wed, 5 Feb 2025 19:48:57 -0300 Subject: [PATCH 22/35] Refactoring tui to be functional style --- skynet/dgpu/__init__.py | 32 ++-------- skynet/dgpu/compute.py | 14 ++--- skynet/dgpu/daemon.py | 19 +++--- skynet/dgpu/network.py | 13 ++-- skynet/dgpu/tui.py | 128 ++++++++++++++-------------------------- 5 files changed, 68 insertions(+), 138 deletions(-) diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py index f5fcc9e..96cc303 100755 --- a/skynet/dgpu/__init__.py +++ b/skynet/dgpu/__init__.py @@ -1,36 +1,17 @@ import logging -import warnings import trio +import urwid from hypercorn.config import Config from hypercorn.trio import serve from quart_trio import QuartTrio as Quart -from skynet.dgpu.tui import WorkerMonitor +from skynet.dgpu.tui import init_tui from skynet.dgpu.daemon import WorkerDaemon from skynet.dgpu.network import NetConnector -def setup_logging_for_tui(level): - warnings.filterwarnings("ignore") - - logger = logging.getLogger() - logger.setLevel(level) - - fh = logging.FileHandler('dgpu.log') - fh.setLevel(level) - - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - fh.setFormatter(formatter) - - logger.addHandler(fh) - - for handler in logger.handlers: - if isinstance(handler, logging.StreamHandler): - logger.removeHandler(handler) - - async def open_dgpu_node(config: dict) -> None: ''' Open a top level "GPU mgmt daemon", keep the @@ -43,11 +24,10 @@ async def open_dgpu_node(config: dict) -> None: tui = None if config['tui']: - setup_logging_for_tui(logging.INFO) - tui = WorkerMonitor() + tui = init_tui() - conn = NetConnector(config, tui=tui) - daemon = WorkerDaemon(conn, config, tui=tui) + conn = NetConnector(config) + daemon = WorkerDaemon(conn, config) api: Quart|None = None if 'api_bind' in config: @@ -71,5 +51,5 @@ async def open_dgpu_node(config: dict) -> None: # block until cancelled await daemon.serve_forever() - except *urwid.ExitMainLoop in ex_group: + except *urwid.ExitMainLoop: ... 
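A note on the `except *urwid.ExitMainLoop:` fix just above: this is PEP 654 exception-group syntax (Python 3.11+). `WorkerMonitor.run()` exits by raising `urwid.ExitMainLoop` from a task inside the nursery, so under trio's strict exception groups it surfaces wrapped in an `ExceptionGroup`, which a plain `except urwid.ExitMainLoop:` would not match (and the earlier `except *... in ex_group:` spelling from the TUI patch was a syntax error, which this hunk cleans up). A minimal runnable sketch of the shape, with a toy task standing in for the real TUI/daemon pair:

    import trio
    import urwid

    async def tui_task():
        # stand-in for WorkerMonitor.run(): quitting the TUI raises this
        raise urwid.ExitMainLoop()

    async def main():
        try:
            async with trio.open_nursery() as tn:
                tn.start_soon(tui_task)
                await trio.sleep_forever()  # stand-in for daemon.serve_forever()

        # the nursery re-raises child errors inside an ExceptionGroup,
        # so a bare `except urwid.ExitMainLoop:` would never match here
        except* urwid.ExitMainLoop:
            pass

    trio.run(main)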
diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index ec50054..a027dc8 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -12,7 +12,7 @@ from contextlib import contextmanager as cm import trio import torch -from skynet.dgpu.tui import WorkerMonitor +from skynet.dgpu.tui import maybe_update_tui from skynet.dgpu.errors import ( DGPUComputeError, DGPUInferenceCancelled, @@ -108,8 +108,7 @@ def compute_one( method: str, params: dict, inputs: list[bytes] = [], - should_cancel = None, - tui: WorkerMonitor | None = None + should_cancel = None ): if method == 'diffuse': method = 'txt2img' @@ -130,8 +129,7 @@ def compute_one( if not isinstance(step, int): step = args[1] - if tui: - tui.set_progress(step, done=total_steps) + maybe_update_tui(lambda tui: tui.set_progress(step, done=total_steps)) if should_cancel: should_raise = trio.from_thread.run(should_cancel, request_id) @@ -142,8 +140,7 @@ def compute_one( return {} - if tui: - tui.set_status(f'Request #{request_id}') + maybe_update_tui(lambda tui: tui.set_status(f'Request #{request_id}')) inference_step_wakeup(0) @@ -210,7 +207,6 @@ def compute_one( except BaseException as err: raise DGPUComputeError(str(err)) from err - if tui: - tui.set_status('') + maybe_update_tui(lambda tui: tui.set_status('')) return output_hash, output diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index 4c0bdce..88c0eed 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -17,7 +17,7 @@ from skynet.constants import ( from skynet.dgpu.errors import ( DGPUComputeError, ) -from skynet.dgpu.tui import WorkerMonitor +from skynet.dgpu.tui import maybe_update_tui, maybe_update_tui_async from skynet.dgpu.compute import maybe_load_model, compute_one from skynet.dgpu.network import NetConnector @@ -41,11 +41,9 @@ class WorkerDaemon: def __init__( self, conn: NetConnector, - config: dict, - tui: WorkerMonitor | None = None + config: dict ): self.conn: NetConnector = conn - self._tui = tui self.auto_withdraw = ( config['auto_withdraw'] if 'auto_withdraw' in config else False @@ -152,10 +150,12 @@ class WorkerDaemon: return app async def _update_balance(self): - if self._tui: + async def _fn(tui): # update balance balance = await self.conn.get_worker_balance() - self._tui.set_header_text(new_balance=f'balance: {balance}') + tui.set_header_text(new_balance=f'balance: {balance}') + + await maybe_update_tui_async(_fn) # TODO? this func is kinda big and maybe is better at module # level to reduce indentation? 
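The `maybe_update_tui(lambda tui: ...)` call-sites above lean on a module-level singleton so callers never have to branch on whether a TUI exists; a minimal sketch of the pattern (mirroring the `tui.py` hunk later in this patch, with a stub standing in for `WorkerMonitor` so the sketch is runnable):

    from typing import Callable

    class StubMonitor:
        # stand-in for WorkerMonitor, just enough to exercise the helpers
        def set_status(self, status: str):
            print(f'status: {status!r}')

    _tui: StubMonitor | None = None

    def init_tui() -> StubMonitor:
        global _tui
        assert not _tui  # only ever one monitor per process
        _tui = StubMonitor()
        return _tui

    def maybe_update_tui(fn: Callable):
        # quietly no-op when the TUI was never initialized
        if _tui:
            fn(_tui)

    maybe_update_tui(lambda tui: tui.set_status('Request #42'))  # no-op
    init_tui()
    maybe_update_tui(lambda tui: tui.set_status('Request #42'))  # prints
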
@@ -258,8 +258,7 @@ class WorkerDaemon: with maybe_load_model(model, mode): try: - if self._tui: - self._tui.set_progress(0, done=total_step) + maybe_update_tui(lambda tui: tui.set_progress(0, done=total_step)) output_type = 'png' if 'output_type' in body['params']: @@ -276,7 +275,6 @@ class WorkerDaemon: mode, body['params'], inputs=inputs, should_cancel=self.should_cancel_work, - tui=self._tui ) ) @@ -285,8 +283,7 @@ class WorkerDaemon: f'Unsupported backend {self.backend}' ) - if self._tui: - self._tui.set_progress(total_step) + maybe_update_tui(lambda tui: tui.set_progress(total_step)) self._last_generation_ts: str = datetime.now().isoformat() self._last_benchmark: list[float] = self._benchmark diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index e80a5c3..a20084e 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -13,7 +13,7 @@ import outcome from PIL import Image from leap.cleos import CLEOS from leap.protocol import Asset -from skynet.dgpu.tui import WorkerMonitor +from skynet.dgpu.tui import maybe_update_tui from skynet.constants import ( DEFAULT_IPFS_DOMAIN, GPU_CONTRACT_ABI, @@ -58,7 +58,7 @@ class NetConnector: - CLEOS client ''' - def __init__(self, config: dict, tui: WorkerMonitor | None = None): + def __init__(self, config: dict): # TODO, why these extra instance vars for an (unsynced) # copy of the `config` state? self.account = config['account'] @@ -82,9 +82,8 @@ class NetConnector: self.ipfs_domain = config['ipfs_domain'] self._wip_requests = {} - self._tui = tui - if self._tui: - self._tui.set_header_text(new_worker_name=self.account) + + maybe_update_tui(lambda tui: tui.set_header_text(new_worker_name=self.account)) # blockchain helpers @@ -173,8 +172,8 @@ class NetConnector: n.start_soon( _run_and_save, snap['requests'], req['id'], self.get_status_by_request_id, req['id']) - if self._tui: - self._tui.network_update(snap) + + maybe_update_tui(lambda tui: tui.network_update(snap)) return snap diff --git a/skynet/dgpu/tui.py b/skynet/dgpu/tui.py index 6530b58..7614d1c 100644 --- a/skynet/dgpu/tui.py +++ b/skynet/dgpu/tui.py @@ -1,6 +1,9 @@ -import urwid -import trio import json +import logging +import warnings + +import trio +import urwid class WorkerMonitor: @@ -163,86 +166,41 @@ class WorkerMonitor: self.update_requests(queue) -# # ----------------------------------------------------------------------------- -# # Example usage -# # ----------------------------------------------------------------------------- -# -# async def main(): -# # Example data -# example_requests = [ -# { -# "id": 12, -# "model": "black-forest-labs/FLUX.1-schnell", -# "prompt": "Generate an answer about quantum entanglement.", -# "user": "alice123", -# "reward": "20.0000 GPU", -# "workers": ["workerA", "workerB"], -# }, -# { -# "id": 5, -# "model": "some-other-model/v2.0", -# "prompt": "A story about dragons.", -# "user": "bobthebuilder", -# "reward": "15.0000 GPU", -# "workers": ["workerX"], -# }, -# { -# "id": 99, -# "model": "cool-model/turbo", -# "prompt": "Classify sentiment in these tweets.", -# "user": "charlie", -# "reward": "25.5000 GPU", -# "workers": ["workerOne", "workerTwo", "workerThree"], -# }, -# ] -# -# ui = WorkerMonitor() -# -# async def progress_task(): -# # Fill from 0% to 100% -# for pct in range(101): -# ui.set_progress( -# current=pct, -# status_str=f"Request #1234 ({pct}%)" -# ) -# await trio.sleep(0.05) -# # Reset to 0 -# ui.set_progress( -# current=0, -# status_str="Starting again..." 
-# ) -# -# async def update_data_task(): -# await trio.sleep(3) # Wait a bit, then update requests -# new_data = [{ -# "id": 101, -# "model": "new-model/v1.0", -# "prompt": "Say hi to the world.", -# "user": "eve", -# "reward": "50.0000 GPU", -# "workers": ["workerFresh", "workerPower"], -# }] -# ui.update_requests(new_data) -# ui.set_header_text(new_worker_name="NewNodeName", -# new_balance="balance: 12345.6789 GPU") -# -# try: -# async with trio.open_nursery() as nursery: -# # Run the TUI -# nursery.start_soon(ui.run_teadown_on_exit, nursery) -# -# ui.update_requests(example_requests) -# ui.set_header_text( -# new_worker_name="worker1.scd", -# new_balance="balance: 12345.6789 GPU" -# ) -# # Start background tasks -# nursery.start_soon(progress_task) -# nursery.start_soon(update_data_task) -# -# except *KeyboardInterrupt as ex_group: -# ... -# -# -# if __name__ == "__main__": -# trio.run(main) +def setup_logging_for_tui(level): + warnings.filterwarnings("ignore") + + logger = logging.getLogger() + logger.setLevel(level) + + fh = logging.FileHandler('dgpu.log') + fh.setLevel(level) + + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + fh.setFormatter(formatter) + + logger.addHandler(fh) + + for handler in logger.handlers: + if isinstance(handler, logging.StreamHandler): + logger.removeHandler(handler) + + +_tui = None +def init_tui(): + global _tui + assert not _tui + setup_logging_for_tui(logging.INFO) + _tui = WorkerMonitor() + return _tui + + +def maybe_update_tui(fn): + global _tui + if _tui: + fn(_tui) + + +async def maybe_update_tui_async(fn): + global _tui + if _tui: + await fn(_tui) From ea3b35904ce40775435f50b1648aff9c10e4b7ed Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Fri, 7 Feb 2025 15:07:48 -0300 Subject: [PATCH 23/35] Create msgspec struct for config --- skynet/cli.py | 270 +++++----------------------------------- skynet/config.py | 65 ++++++++-- skynet/dgpu/__init__.py | 13 +- skynet/dgpu/daemon.py | 51 +++----- skynet/dgpu/network.py | 61 ++++----- 5 files changed, 129 insertions(+), 331 deletions(-) diff --git a/skynet/cli.py b/skynet/cli.py index 9de9b98..0ead5d3 100755 --- a/skynet/cli.py +++ b/skynet/cli.py @@ -13,7 +13,6 @@ from leap.protocol import ( from .config import ( load_skynet_toml, - load_key, set_hf_vars, ConfigParsingError, ) @@ -49,9 +48,7 @@ def txt2img(*args, **kwargs): from . import utils # TODO? why here, import cycle? config = load_skynet_toml() - hf_token = load_key(config, 'skynet.dgpu.hf_token') - hf_home = load_key(config, 'skynet.dgpu.hf_home') - set_hf_vars(hf_token, hf_home) + set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) utils.txt2img(hf_token, **kwargs) @@ -75,9 +72,7 @@ def txt2img(*args, **kwargs): def img2img(model, prompt, input, output, strength, guidance, steps, seed): from . import utils config = load_skynet_toml() - hf_token = load_key(config, 'skynet.dgpu.hf_token') - hf_home = load_key(config, 'skynet.dgpu.hf_home') - set_hf_vars(hf_token, hf_home) + set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) utils.img2img( hf_token, model=model, @@ -105,9 +100,7 @@ def img2img(model, prompt, input, output, strength, guidance, steps, seed): def inpaint(model, prompt, input, mask, output, strength, guidance, steps, seed): from . 
import utils config = load_skynet_toml() - hf_token = load_key(config, 'skynet.dgpu.hf_token') - hf_home = load_key(config, 'skynet.dgpu.hf_home') - set_hf_vars(hf_token, hf_home) + set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) utils.inpaint( hf_token, model=model, @@ -137,113 +130,15 @@ def upscale(input, output, model): def download(): from . import utils config = load_skynet_toml() - hf_token = load_key(config, 'skynet.dgpu.hf_token') - hf_home = load_key(config, 'skynet.dgpu.hf_home') - set_hf_vars(hf_token, hf_home) - utils.download_all_models(hf_token, hf_home) + set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) + utils.download_all_models(config.dgpu.hf_token, config.dgpu.hf_home) -@skynet.command() -@click.option( - '--reward', '-r', default='20.0000 GPU') -@click.option('--jobs', '-j', default=1) -@click.option('--model', '-m', default='stabilityai/stable-diffusion-xl-base-1.0') -@click.option( - '--prompt', '-p', default='a red old tractor in a sunny wheat field') -@click.option('--output', '-o', default='output.png') -@click.option('--width', '-w', default=1024) -@click.option('--height', '-h', default=1024) -@click.option('--guidance', '-g', default=10) -@click.option('--step', '-s', default=26) -@click.option('--seed', '-S', default=None) -@click.option('--upscaler', '-U', default='x4') -@click.option('--binary_data', '-b', default='') -@click.option('--strength', '-Z', default=None) -def enqueue( - reward: str, - jobs: int, - **kwargs -): - import trio - from leap.cleos import CLEOS - - config = load_skynet_toml() - - key = load_key(config, 'skynet.user.key') - account = load_key(config, 'skynet.user.account') - permission = load_key(config, 'skynet.user.permission') - node_url = load_key(config, 'skynet.user.node_url') - - cleos = CLEOS(None, None, url=node_url, remote=node_url) - - binary = kwargs['binary_data'] - if not kwargs['strength']: - if binary: - raise ValueError('strength -Z param required if binary data passed') - - del kwargs['strength'] - - else: - kwargs['strength'] = float(kwargs['strength']) - - async def enqueue_n_jobs(): - for i in range(jobs): - if not kwargs['seed']: - kwargs['seed'] = random.randint(0, 10e9) - - req = json.dumps({ - 'method': 'diffuse', - 'params': kwargs - }) - - res = await cleos.a_push_action( - 'gpu.scd', - 'enqueue', - { - 'user': Name(account), - 'request_body': req, - 'binary_data': binary, - 'reward': Asset.from_str(reward), - 'min_verification': 1 - }, - account, key, permission, - ) - print(res) - - trio.run(enqueue_n_jobs) - - -@skynet.command() -@click.option('--loglevel', '-l', default='INFO', help='Logging level') -def clean( - loglevel: str, -): - import trio - from leap.cleos import CLEOS - - config = load_skynet_toml() - key = load_key(config, 'skynet.user.key') - account = load_key(config, 'skynet.user.account') - permission = load_key(config, 'skynet.user.permission') - node_url = load_key(config, 'skynet.user.node_url') - - logging.basicConfig(level=loglevel) - cleos = CLEOS(None, None, url=node_url, remote=node_url) - trio.run( - partial( - cleos.a_push_action, - 'gpu.scd', - 'clean', - {}, - account, key, permission=permission - ) - ) - @skynet.command() def queue(): import requests config = load_skynet_toml() - node_url = load_key(config, 'skynet.user.node_url') + node_url = config.user.node_url resp = requests.post( f'{node_url}/v1/chain/get_table_rows', json={ @@ -260,7 +155,7 @@ def queue(): def status(request_id: int): import requests config = load_skynet_toml() - node_url = load_key(config, 
'skynet.user.node_url') + node_url = config.user.node_url resp = requests.post( f'{node_url}/v1/chain/get_table_rows', json={ @@ -272,101 +167,6 @@ def status(request_id: int): ) print(json.dumps(resp.json(), indent=4)) -@skynet.command() -@click.argument('request-id') -def dequeue(request_id: int): - import trio - from leap.cleos import CLEOS - - config = load_skynet_toml() - key = load_key(config, 'skynet.user.key') - account = load_key(config, 'skynet.user.account') - permission = load_key(config, 'skynet.user.permission') - node_url = load_key(config, 'skynet.user.node_url') - - cleos = CLEOS(None, None, url=node_url, remote=node_url) - res = trio.run( - partial( - cleos.a_push_action, - 'gpu.scd', - 'dequeue', - { - 'user': Name(account), - 'request_id': int(request_id), - }, - account, key, permission=permission - ) - ) - print(res) - - -@skynet.command() -@click.option( - '--token-contract', '-c', default='eosio.token') -@click.option( - '--token-symbol', '-S', default='4,GPU') -def config( - token_contract: str, - token_symbol: str -): - import trio - from leap.cleos import CLEOS - - config = load_skynet_toml() - - key = load_key(config, 'skynet.user.key') - account = load_key(config, 'skynet.user.account') - permission = load_key(config, 'skynet.user.permission') - node_url = load_key(config, 'skynet.user.node_url') - - cleos = CLEOS(None, None, url=node_url, remote=node_url) - res = trio.run( - partial( - cleos.a_push_action, - 'gpu.scd', - 'config', - { - 'token_contract': token_contract, - 'token_symbol': token_symbol, - }, - account, key, permission=permission - ) - ) - print(res) - - -@skynet.command() -@click.argument('quantity') -def deposit(quantity: str): - import trio - from leap.cleos import CLEOS - from leap.sugar import asset_from_str - - config = load_skynet_toml() - - key = load_key(config, 'skynet.user.key') - account = load_key(config, 'skynet.user.account') - permission = load_key(config, 'skynet.user.permission') - node_url = load_key(config, 'skynet.user.node_url') - cleos = CLEOS(None, None, url=node_url, remote=node_url) - - res = trio.run( - partial( - cleos.a_push_action, - 'gpu.scd', - 'transfer', - { - 'sender': Name(account), - 'recipient': Name('gpu.scd'), - 'amount': asset_from_str(quantity), - 'memo': f'{account} transferred {quantity} to gpu.scd' - }, - account, key, permission=permission - ) - ) - print(res) - - @skynet.group() def run(*args, **kwargs): pass @@ -380,13 +180,6 @@ def db(): container, passwd, host = db_params logging.info(('skynet', passwd, host)) -@run.command() -def nodeos(): - from .nodeos import open_nodeos - - logging.basicConfig(filename='skynet-nodeos.log', level=logging.INFO) - with open_nodeos(cleanup=False): - ... 
@run.command() @click.option('--loglevel', '-l', default='INFO', help='Logging level') @@ -405,14 +198,9 @@ def dgpu( logging.basicConfig(level=loglevel) config = load_skynet_toml(file_path=config_path) - hf_token = load_key(config, 'skynet.dgpu.hf_token') - hf_home = load_key(config, 'skynet.dgpu.hf_home') - set_hf_vars(hf_token, hf_home) + set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) - assert 'skynet' in config - assert 'dgpu' in config['skynet'] - - trio.run(open_dgpu_node, config['skynet']['dgpu']) + trio.run(open_dgpu_node, config.dgpu) @run.command() @@ -435,24 +223,24 @@ def telegram( logging.basicConfig(level=loglevel) config = load_skynet_toml() - tg_token = load_key(config, 'skynet.telegram.tg_token') + tg_token = config.telegram.tg_token - key = load_key(config, 'skynet.telegram.key') - account = load_key(config, 'skynet.telegram.account') - permission = load_key(config, 'skynet.telegram.permission') - node_url = load_key(config, 'skynet.telegram.node_url') - hyperion_url = load_key(config, 'skynet.telegram.hyperion_url') + key = config.telegram.key + account = config.telegram.account + permission = config.telegram.permission + node_url = config.telegram.node_url + hyperion_url = config.telegram.hyperion_url - ipfs_url = load_key(config, 'skynet.telegram.ipfs_url') + ipfs_url = config.telegram.ipfs_url try: - explorer_domain = load_key(config, 'skynet.telegram.explorer_domain') + explorer_domain = config.telegram.explorer_domain except ConfigParsingError: explorer_domain = DEFAULT_EXPLORER_DOMAIN try: - ipfs_domain = load_key(config, 'skynet.telegram.ipfs_domain') + ipfs_domain = config.telegram.ipfs_domain except ConfigParsingError: ipfs_domain = DEFAULT_IPFS_DOMAIN @@ -498,24 +286,24 @@ def discord( logging.basicConfig(level=loglevel) config = load_skynet_toml() - dc_token = load_key(config, 'skynet.discord.dc_token') + dc_token = config.discord.dc_token - key = load_key(config, 'skynet.discord.key') - account = load_key(config, 'skynet.discord.account') - permission = load_key(config, 'skynet.discord.permission') - node_url = load_key(config, 'skynet.discord.node_url') - hyperion_url = load_key(config, 'skynet.discord.hyperion_url') + key = config.discord.key + account = config.discord.account + permission = config.discord.permission + node_url = config.discord.node_url + hyperion_url = config.discord.hyperion_url - ipfs_url = load_key(config, 'skynet.discord.ipfs_url') + ipfs_url = config.discord.ipfs_url try: - explorer_domain = load_key(config, 'skynet.discord.explorer_domain') + explorer_domain = config.discord.explorer_domain except ConfigParsingError: explorer_domain = DEFAULT_EXPLORER_DOMAIN try: - ipfs_domain = load_key(config, 'skynet.discord.ipfs_domain') + ipfs_domain = config.discord.ipfs_domain except ConfigParsingError: ipfs_domain = DEFAULT_IPFS_DOMAIN @@ -549,8 +337,8 @@ def pinner(loglevel): from .ipfs.pinner import SkynetPinner config = load_skynet_toml() - hyperion_url = load_key(config, 'skynet.pinner.hyperion_url') - ipfs_url = load_key(config, 'skynet.pinner.ipfs_url') + hyperion_url = config.pinner.hyperion_url + ipfs_url = config.pinner.ipfs_url logging.basicConfig(level=loglevel) ipfs_node = AsyncIPFSHTTP(ipfs_url) diff --git a/skynet/config.py b/skynet/config.py index 9470877..ca7b745 100755 --- a/skynet/config.py +++ b/skynet/config.py @@ -1,27 +1,70 @@ import os import toml -from .constants import DEFAULT_CONFIG_PATH +import msgspec + +from skynet.constants import DEFAULT_CONFIG_PATH, DEFAULT_IPFS_DOMAIN class 
ConfigParsingError(BaseException): ... -def load_skynet_toml(file_path=DEFAULT_CONFIG_PATH) -> dict: - config = toml.load(file_path) - return config +class DgpuConfig(msgspec.Struct): + account: str + permission: str + key: str + node_url: str + hyperion_url: str + ipfs_url: str + hf_token: str + ipfs_domain: str = DEFAULT_IPFS_DOMAIN + hf_home: str = 'hf_home' + non_compete: set[str] = set() + model_whitelist: set[str] = set() + model_blacklist: set[str] = set() + backend: str = 'sync-on-thread' + api_bind: str = False + tui: bool = False +class TelegramConfig(msgspec.Struct): + account: str + permission: str + key: str + node_url: str + hyperion_url: str + ipfs_url: str + token: str -def load_key(config: dict, key: str) -> str: - for skey in key.split('.'): - if skey not in config: - conf_keys = [k for k in config] - raise ConfigParsingError(f'key \"{skey}\" not in {conf_keys}') +class DiscordConfig(msgspec.Struct): + account: str + permission: str + key: str + node_url: str + hyperion_url: str + ipfs_url: str + token: str - config = config[skey] +class PinnerConfig(msgspec.Struct): + hyperion_url: str + ipfs_url: str - return config +class UserConfig(msgspec.Struct): + account: str + permission: str + key: str + node_url: str + +class Config(msgspec.Struct): + dgpu: DgpuConfig | None = None + telegram: TelegramConfig | None = None + discord: DiscordConfig | None = None + pinner: PinnerConfig | None = None + user: UserConfig | None = None + +def load_skynet_toml(file_path=DEFAULT_CONFIG_PATH) -> Config: + with open(file_path, 'r') as file: + return msgspec.toml.decode(file.read(), type=Config) def set_hf_vars(hf_token: str, hf_home: str): diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py index 96cc303..6f7c6f7 100755 --- a/skynet/dgpu/__init__.py +++ b/skynet/dgpu/__init__.py @@ -3,16 +3,17 @@ import logging import trio import urwid -from hypercorn.config import Config +from hypercorn.config import Config as HCConfig from hypercorn.trio import serve from quart_trio import QuartTrio as Quart +from skynet.config import Config from skynet.dgpu.tui import init_tui from skynet.dgpu.daemon import WorkerDaemon from skynet.dgpu.network import NetConnector -async def open_dgpu_node(config: dict) -> None: +async def open_dgpu_node(config: Config) -> None: ''' Open a top level "GPU mgmt daemon", keep the `WorkerDaemon._snap: dict[str, list|dict]` table @@ -23,16 +24,16 @@ async def open_dgpu_node(config: dict) -> None: logging.getLogger("httpx").setLevel(logging.WARNING) tui = None - if config['tui']: + if config.tui: tui = init_tui() conn = NetConnector(config) daemon = WorkerDaemon(conn, config) api: Quart|None = None - if 'api_bind' in config: - api_conf = Config() - api_conf.bind = [config['api_bind']] + if config.api_bind: + api_conf = HCConfig() + api_conf.bind = [config.api_bind] api: Quart = await daemon.generate_api() tn: trio.Nursery diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index 88c0eed..31c3d79 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -10,6 +10,7 @@ import trio from quart import jsonify from quart_trio import QuartTrio as Quart +from skynet.config import DgpuConfig as Config from skynet.constants import ( MODELS, VERSION, @@ -41,31 +42,10 @@ class WorkerDaemon: def __init__( self, conn: NetConnector, - config: dict + config: Config ): + self.config = config self.conn: NetConnector = conn - self.auto_withdraw = ( - config['auto_withdraw'] - if 'auto_withdraw' in config else False - ) - - self.account: str = config['account'] - - 
self.non_compete = set() - if 'non_compete' in config: - self.non_compete = set(config['non_compete']) - - self.model_whitelist = set() - if 'model_whitelist' in config: - self.model_whitelist = set(config['model_whitelist']) - - self.model_blacklist = set() - if 'model_blacklist' in config: - self.model_blacklist = set(config['model_blacklist']) - - self.backend = 'sync-on-thread' - if 'backend' in config: - self.backend = config['backend'] self._snap = { 'queue': [], @@ -107,10 +87,10 @@ class WorkerDaemon: competitors = set([ status['worker'] for status in self._snap['requests'][request_id] - if status['worker'] != self.account + if status['worker'] != self.config.account ]) logging.info(f'competitors: {competitors}') - should_cancel = bool(self.non_compete & competitors) + should_cancel = bool(self.config.non_compete & competitors) logging.info(f'cancel: {should_cancel}') return should_cancel @@ -141,7 +121,7 @@ class WorkerDaemon: @app.route('/') async def health(): return jsonify( - account=self.account, + account=self.config.account, version=VERSION, last_generation_ts=self._last_generation_ts, last_generation_speed=self._get_benchmark_speed() @@ -182,15 +162,19 @@ class WorkerDaemon: # only handle whitelisted models if ( - len(self.model_whitelist) > 0 + len(self.config.model_whitelist) > 0 and - model not in self.model_whitelist + model not in self.config.model_whitelist ): logging.warning('model not whitelisted!, skip...') return False # if blacklist contains model skip - if model in self.model_blacklist: + if ( + len(self.config.model_blacklist) > 0 + and + model in self.config.model_blacklist + ): logging.warning('model not blacklisted!, skip...') return False @@ -205,7 +189,7 @@ class WorkerDaemon: # skip if workers in non_compete already on it competitors = set((status['worker'] for status in statuses)) - if bool(self.non_compete & competitors): + if bool(self.config.non_compete & competitors): logging.info('worker in configured non_compete list already working on request, skip...') return False @@ -266,7 +250,7 @@ class WorkerDaemon: output = None output_hash = None - match self.backend: + match self.config.backend: case 'sync-on-thread': output_hash, output = await trio.to_thread.run_sync( partial( @@ -280,7 +264,7 @@ class WorkerDaemon: case _: raise DGPUComputeError( - f'Unsupported backend {self.backend}' + f'Unsupported backend {self.config.backend}' ) maybe_update_tui(lambda tui: tui.set_progress(total_step)) @@ -316,9 +300,6 @@ class WorkerDaemon: await self._update_balance() try: while True: - if self.auto_withdraw: - await self.conn.maybe_withdraw_all() - queue = self._snap['queue'] random.shuffle(queue) diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index a20084e..9076ee2 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -14,6 +14,7 @@ from PIL import Image from leap.cleos import CLEOS from leap.protocol import Asset from skynet.dgpu.tui import maybe_update_tui +from skynet.config import DgpuConfig as Config from skynet.constants import ( DEFAULT_IPFS_DOMAIN, GPU_CONTRACT_ABI, @@ -58,32 +59,16 @@ class NetConnector: - CLEOS client ''' - def __init__(self, config: dict): - # TODO, why these extra instance vars for an (unsynced) - # copy of the `config` state? - self.account = config['account'] - self.permission = config['permission'] - self.key = config['key'] - - # TODO, neither of these instance vars are used anywhere in - # methods? so why are they set on this type? 
- self.node_url = config['node_url'] - self.hyperion_url = config['hyperion_url'] - - self.cleos = CLEOS(endpoint=self.node_url) + def __init__(self, config: Config): + self.config = config + self.cleos = CLEOS(endpoint=config.node_url) self.cleos.load_abi('gpu.scd', GPU_CONTRACT_ABI) - self.ipfs_url = config['ipfs_url'] - - self.ipfs_client = AsyncIPFSHTTP(self.ipfs_url) - - self.ipfs_domain = DEFAULT_IPFS_DOMAIN - if 'ipfs_domain' in config: - self.ipfs_domain = config['ipfs_domain'] + self.ipfs_client = AsyncIPFSHTTP(config.ipfs_url) self._wip_requests = {} - maybe_update_tui(lambda tui: tui.set_header_text(new_worker_name=self.account)) + maybe_update_tui(lambda tui: tui.set_header_text(new_worker_name=self.config.account)) # blockchain helpers @@ -135,8 +120,8 @@ class NetConnector: 'gpu.scd', 'gpu.scd', 'users', index_position=1, key_type='name', - lower_bound=self.account, - upper_bound=self.account + lower_bound=self.config.account, + upper_bound=self.config.account )) if rows: @@ -190,12 +175,12 @@ class NetConnector: 'gpu.scd', 'workbegin', list({ - 'worker': self.account, + 'worker': self.config.account, 'request_id': request_id, 'max_workers': 2 }.values()), - self.account, self.key, - permission=self.permission + self.config.account, self.config.key, + permission=self.config.permission ) ) @@ -207,12 +192,12 @@ class NetConnector: 'gpu.scd', 'workcancel', list({ - 'worker': self.account, + 'worker': self.config.account, 'request_id': request_id, 'reason': reason }.values()), - self.account, self.key, - permission=self.permission + self.config.account, self.config.key, + permission=self.config.permission ) ) @@ -230,11 +215,11 @@ class NetConnector: 'gpu.scd', 'withdraw', list({ - 'user': self.account, + 'user': self.config.account, 'quantity': Asset.from_str(balance) }.values()), - self.account, self.key, - permission=self.permission + self.config.account, self.config.key, + permission=self.config.permission ) ) @@ -246,8 +231,8 @@ class NetConnector: 'gpu.scd', 'gpu.scd', 'results', index_position=4, key_type='name', - lower_bound=self.account, - upper_bound=self.account + lower_bound=self.config.account, + upper_bound=self.config.account ) ) return rows @@ -266,14 +251,14 @@ class NetConnector: 'gpu.scd', 'submit', list({ - 'worker': self.account, + 'worker': self.config.account, 'request_id': request_id, 'request_hash': request_hash, 'result_hash': result_hash, 'ipfs_hash': ipfs_hash }.values()), - self.account, self.key, - permission=self.permission + self.config.account, self.config.key, + permission=self.config.permission ) ) @@ -310,7 +295,7 @@ class NetConnector: consuming AI model. 
''' - link = f'https://{self.ipfs_domain}/ipfs/{ipfs_hash}' + link = f'https://{self.config.ipfs_domain}/ipfs/{ipfs_hash}' res = await get_ipfs_file(link, timeout=1) if not res or res.status_code != 200: From 149d9f9f33c3f9df57ab8e678f9d0d444f5f20ba Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Fri, 7 Feb 2025 16:41:50 -0300 Subject: [PATCH 24/35] Factor out WorkerDaemon, split into functions, made poller into an async gen and moved it to NetConnector as well as should_cancel --- skynet/cli.py | 4 +- skynet/config.py | 1 + skynet/dgpu/__init__.py | 45 +---- skynet/dgpu/compute.py | 1 + skynet/dgpu/daemon.py | 418 ++++++++++++++-------------------------- skynet/dgpu/network.py | 40 +++- 6 files changed, 199 insertions(+), 310 deletions(-) diff --git a/skynet/cli.py b/skynet/cli.py index 0ead5d3..f835d98 100755 --- a/skynet/cli.py +++ b/skynet/cli.py @@ -193,14 +193,14 @@ def dgpu( config_path: str ): import trio - from .dgpu import open_dgpu_node + from .dgpu import _dgpu_main logging.basicConfig(level=loglevel) config = load_skynet_toml(file_path=config_path) set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) - trio.run(open_dgpu_node, config.dgpu) + trio.run(_dgpu_main, config.dgpu) @run.command() diff --git a/skynet/config.py b/skynet/config.py index ca7b745..7625dab 100755 --- a/skynet/config.py +++ b/skynet/config.py @@ -26,6 +26,7 @@ class DgpuConfig(msgspec.Struct): backend: str = 'sync-on-thread' api_bind: str = False tui: bool = False + poll_time: float = 0.5 class TelegramConfig(msgspec.Struct): account: str diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py index 6f7c6f7..1c1f40c 100755 --- a/skynet/dgpu/__init__.py +++ b/skynet/dgpu/__init__.py @@ -3,23 +3,13 @@ import logging import trio import urwid -from hypercorn.config import Config as HCConfig -from hypercorn.trio import serve -from quart_trio import QuartTrio as Quart - from skynet.config import Config from skynet.dgpu.tui import init_tui -from skynet.dgpu.daemon import WorkerDaemon +from skynet.dgpu.daemon import serve_forever from skynet.dgpu.network import NetConnector -async def open_dgpu_node(config: Config) -> None: - ''' - Open a top level "GPU mgmt daemon", keep the - `WorkerDaemon._snap: dict[str, list|dict]` table - and *maybe* serve a `hypercorn` web API. - - ''' +async def _dgpu_main(config: Config) -> None: # suppress logs from httpx (logs url + status after every query) logging.getLogger("httpx").setLevel(logging.WARNING) @@ -28,29 +18,14 @@ async def open_dgpu_node(config: Config) -> None: tui = init_tui() conn = NetConnector(config) - daemon = WorkerDaemon(conn, config) - api: Quart|None = None - if config.api_bind: - api_conf = HCConfig() - api_conf.bind = [config.api_bind] - api: Quart = await daemon.generate_api() + try: + n: trio.Nursery + async with trio.open_nursery() as n: + if tui: + n.start_soon(tui.run) - tn: trio.Nursery - async with trio.open_nursery() as tn: - tn.start_soon(daemon.snap_updater_task) - if tui: - tn.start_soon(tui.run) + await serve_forever(config, conn) - # TODO, consider a more explicit `as hypercorn_serve` - # to clarify? - if api: - logging.info(f'serving api @ {config["api_bind"]}') - tn.start_soon(serve, api, api_conf) - - try: - # block until cancelled - await daemon.serve_forever() - - except *urwid.ExitMainLoop: - ... + except *urwid.ExitMainLoop: + ... 
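
A minimal standalone sketch of the poller-as-async-gen pattern this patch
moves into `NetConnector` (illustrative only, not part of the diffs;
`fetch_tables` stands in for `NetConnector.get_full_queue_snapshot()`):

    import time
    import trio

    async def iter_poll(fetch_tables, poll_time: float):
        # forever: fetch fresh contract-table state, hand it to the
        # consumer, then sleep off whatever part of `poll_time` the
        # fetch itself didn't already consume (but at least 0.1s).
        while True:
            start = time.time()
            tables = await fetch_tables()
            yield tables
            elapsed = time.time() - start
            await trio.sleep(max(poll_time - elapsed, 0.1))

    async def consumer(conn, poll_time: float = 0.5):
        # each iteration sees the latest table snapshot, mirroring how
        # `serve_forever()` drives `iter_poll_update()` further down.
        async for tables in iter_poll(conn.get_full_queue_snapshot, poll_time):
            ...
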
diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index a027dc8..5e90791 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -20,6 +20,7 @@ from skynet.dgpu.errors import ( from skynet.utils import crop_image, convert_from_cv2_to_image, convert_from_image_to_cv2, convert_from_img_to_bytes, init_upscaler, pipeline_for + def prepare_params_for_diffuse( params: dict, mode: str, diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index 31c3d79..f3f8ef3 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -7,8 +7,6 @@ from functools import partial from hashlib import sha256 import trio -from quart import jsonify -from quart_trio import QuartTrio as Quart from skynet.config import DgpuConfig as Config from skynet.constants import ( @@ -31,291 +29,175 @@ def convert_reward_to_int(reward_str): return int(int_part + decimal_part) -class WorkerDaemon: - ''' - The root "GPU daemon". +async def maybe_update_tui_balance(conn: NetConnector): + async def _fn(tui): + # update balance + balance = await conn.get_worker_balance() + tui.set_header_text(new_balance=f'balance: {balance}') - Contains/manages underlying susystems: - - a GPU connecto + await maybe_update_tui_async(_fn) - ''' - def __init__( - self, - conn: NetConnector, - config: Config + +async def maybe_serve_one( + config: Config, + conn: NetConnector, + req: dict, +): + rid = req['id'] + logging.info(f'maybe serve request #{rid}') + + # parse request + body = json.loads(req['body']) + model = body['params']['model'] + + # if model not known, ignore. + if ( + model != 'RealESRGAN_x4plus' + and + model not in MODELS ): - self.config = config - self.conn: NetConnector = conn + logging.warning(f'unknown model {model}!, skip...') + return - self._snap = { - 'queue': [], - 'requests': {}, - 'results': [] - } + # only handle whitelisted models + if ( + len(config.model_whitelist) > 0 + and + model not in config.model_whitelist + ): + logging.warning('model not whitelisted!, skip...') + return - self._benchmark: list[float] = [] - self._last_benchmark: list[float]|None = None - self._last_generation_ts: str|None = None + # if blacklist contains model skip + if ( + len(config.model_blacklist) > 0 + and + model in config.model_blacklist + ): + logging.warning('model not blacklisted!, skip...') + return - def _get_benchmark_speed(self) -> float: - ''' - Return the (arithmetic) average work-iterations-per-second - fconducted by this compute worker. 
+    results = [res['request_id'] for res in conn._tables['results']]
 
-        '''
-        if not self._last_benchmark:
-            return 0
+    # if worker already produced a result for this request
+    if rid in results:
+        logging.info(f'worker already submitted a result for request #{rid}, skip...')
+        return
 
-        start = self._last_benchmark[0]
-        end = self._last_benchmark[-1]
+    statuses = conn._tables['requests'][rid]
 
-        elapsed = end - start
-        its = len(self._last_benchmark)
-        speed = its / elapsed
+    # skip if workers in non_compete already on it
+    competitors = set((status['worker'] for status in statuses))
+    if bool(config.non_compete & competitors):
+        logging.info('worker in configured non_compete list already working on request, skip...')
+        return
 
-        logging.info(f'{elapsed} s total its: {its}, at {speed} it/s ')
+    # resolve the ipfs hashes into the actual data behind them
+    inputs = []
+    raw_inputs = req['binary_data'].split(',')
+    if raw_inputs:
+        logging.info(f'fetching IPFS inputs: {raw_inputs}')
 
-        return speed
+        retry = 3
+        for _input in req['binary_data'].split(','):
+            if _input:
+                for r in range(retry):
+                    try:
+                        # use `NetConnector` to IO with
+                        # storage layer to seed the compute
+                        # task.
+                        img = await conn.get_input_data(_input)
+                        inputs.append(img)
+                        logging.info(f'retrieved {_input}!')
+                        break
 
-    async def should_cancel_work(self, request_id: int):
-        self._benchmark.append(time.time())
-        logging.info('should cancel work?')
-        if request_id not in self._snap['requests']:
-            logging.info(f'request #{request_id} no longer in queue, likely its been filled by another worker, cancelling work...')
-            return True
+                    except BaseException:
+                        logging.exception(
+                            f'IPFS fetch input error !?! retries left {retry - r - 1}\n'
+                        )
 
-        competitors = set([
-            status['worker']
-            for status in self._snap['requests'][request_id]
-            if status['worker'] != self.config.account
-        ])
-        logging.info(f'competitors: {competitors}')
-        should_cancel = bool(self.config.non_compete & competitors)
-        logging.info(f'cancel: {should_cancel}')
-        return should_cancel
+    # compute unique request hash used on submit
+    hash_str = (
+        str(req['nonce'])
+        +
+        req['body']
+        +
+        req['binary_data']
+    )
+    logging.debug(f'hashing: {hash_str}')
+    request_hash = sha256(hash_str.encode('utf-8')).hexdigest()
+    logging.info(f'calculated request hash: {request_hash}')
+
+    total_step = body['params']['step']
+    model = body['params']['model']
+    mode = body['method']
+
+    # TODO: validate request
+
+    resp = await conn.begin_work(rid)
+    if not resp or 'code' in resp:
+        logging.info('begin_work error, probably being worked on already... 
skip.') + return + + with maybe_load_model(model, mode): + try: + maybe_update_tui(lambda tui: tui.set_progress(0, done=total_step)) + + output_type = 'png' + if 'output_type' in body['params']: + output_type = body['params']['output_type'] + + output = None + output_hash = None + match config.backend: + case 'sync-on-thread': + output_hash, output = await trio.to_thread.run_sync( + partial( + compute_one, + rid, + mode, body['params'], + inputs=inputs, + should_cancel=conn.should_cancel_work, + ) + ) + + case _: + raise DGPUComputeError( + f'Unsupported backend {config.backend}' + ) + + maybe_update_tui(lambda tui: tui.set_progress(total_step)) + + ipfs_hash = await conn.publish_on_ipfs(output, typ=output_type) + + await conn.submit_work(rid, request_hash, output_hash, ipfs_hash) + + await maybe_update_tui_balance(conn) - async def snap_updater_task(self): - ''' - Busy loop update the local `._snap: dict` table from + except BaseException as err: + if 'network cancel' not in str(err): + logging.exception('Failed to serve model request !?\n') - ''' - while True: - self._snap = await self.conn.get_full_queue_snapshot() - await trio.sleep(1) + if rid in conn._tables['requests']: + await conn.cancel_work(rid, 'reason not provided') - # TODO, design suggestion, just make this a lazily accessed - # `@class_property` if we're 3.12+ - # |_ https://docs.python.org/3/library/functools.html#functools.cached_property - async def generate_api(self) -> Quart: - ''' - Gen a `Quart`-compat web API spec which (for now) simply - serves a small monitoring ep that reports, - - iso-time-stamp of the last served model-output - - the worker's average "compute-iterations-per-second" +async def serve_forever(config: Config, conn: NetConnector): + await maybe_update_tui_balance(conn) + try: + async for tables in conn.iter_poll_update(config.poll_time): + queue = tables['queue'] - ''' - app = Quart(__name__) - - @app.route('/') - async def health(): - return jsonify( - account=self.config.account, - version=VERSION, - last_generation_ts=self._last_generation_ts, - last_generation_speed=self._get_benchmark_speed() + random.shuffle(queue) + queue = sorted( + queue, + key=lambda req: convert_reward_to_int(req['reward']), + reverse=True ) - return app + if len(queue) > 0: + await maybe_serve_one(config, conn, queue[0]) - async def _update_balance(self): - async def _fn(tui): - # update balance - balance = await self.conn.get_worker_balance() - tui.set_header_text(new_balance=f'balance: {balance}') - - await maybe_update_tui_async(_fn) - - # TODO? this func is kinda big and maybe is better at module - # level to reduce indentation? - # -[ ] just pass `daemon: WorkerDaemon` vs. `self` - async def maybe_serve_one( - self, - req: dict, - ): - rid = req['id'] - logging.info(f'maybe serve request #{rid}') - - # parse request - body = json.loads(req['body']) - model = body['params']['model'] - - # if model not known, ignore. 
- if ( - model != 'RealESRGAN_x4plus' - and - model not in MODELS - ): - logging.warning(f'unknown model {model}!, skip...') - return False - - # only handle whitelisted models - if ( - len(self.config.model_whitelist) > 0 - and - model not in self.config.model_whitelist - ): - logging.warning('model not whitelisted!, skip...') - return False - - # if blacklist contains model skip - if ( - len(self.config.model_blacklist) > 0 - and - model in self.config.model_blacklist - ): - logging.warning('model not blacklisted!, skip...') - return False - - results = [res['request_id'] for res in self._snap['results']] - - # if worker already produced a result for this request - if rid in results: - logging.info(f'worker already submitted a result for request #{rid}, skip...') - return False - - statuses = self._snap['requests'][rid] - - # skip if workers in non_compete already on it - competitors = set((status['worker'] for status in statuses)) - if bool(self.config.non_compete & competitors): - logging.info('worker in configured non_compete list already working on request, skip...') - return False - - # resolve the ipfs hashes into the actual data behind them - inputs = [] - raw_inputs = req['binary_data'].split(',') - if raw_inputs: - logging.info(f'fetching IPFS inputs: {raw_inputs}') - - retry = 3 - for _input in req['binary_data'].split(','): - if _input: - for r in range(retry): - try: - # user `GPUConnector` to IO with - # storage layer to seed the compute - # task. - img = await self.conn.get_input_data(_input) - inputs.append(img) - logging.info(f'retrieved {_input}!') - break - - except BaseException: - logging.exception( - f'IPFS fetch input error !?! retries left {retry - r - 1}\n' - ) - - # compute unique request hash used on submit - hash_str = ( - str(req['nonce']) - + - req['body'] - + - req['binary_data'] - ) - logging.debug(f'hashing: {hash_str}') - request_hash = sha256(hash_str.encode('utf-8')).hexdigest() - logging.info(f'calculated request hash: {request_hash}') - - total_step = body['params']['step'] - model = body['params']['model'] - mode = body['method'] - - # TODO: validate request - - resp = await self.conn.begin_work(rid) - if not resp or 'code' in resp: - logging.info('begin_work error, probably being worked on already... 
skip.') - return False - - with maybe_load_model(model, mode): - try: - maybe_update_tui(lambda tui: tui.set_progress(0, done=total_step)) - - output_type = 'png' - if 'output_type' in body['params']: - output_type = body['params']['output_type'] - - output = None - output_hash = None - match self.config.backend: - case 'sync-on-thread': - output_hash, output = await trio.to_thread.run_sync( - partial( - compute_one, - rid, - mode, body['params'], - inputs=inputs, - should_cancel=self.should_cancel_work, - ) - ) - - case _: - raise DGPUComputeError( - f'Unsupported backend {self.config.backend}' - ) - - maybe_update_tui(lambda tui: tui.set_progress(total_step)) - - self._last_generation_ts: str = datetime.now().isoformat() - self._last_benchmark: list[float] = self._benchmark - self._benchmark: list[float] = [] - - ipfs_hash = await self.conn.publish_on_ipfs(output, typ=output_type) - - await self.conn.submit_work(rid, request_hash, output_hash, ipfs_hash) - - await self._update_balance() - - - except BaseException as err: - if 'network cancel' not in str(err): - logging.exception('Failed to serve model request !?\n') - - if rid in self._snap['requests']: - await self.conn.cancel_work(rid, 'reason not provided') - - finally: - return True - - # TODO, as per above on `.maybe_serve_one()`, it's likely a bit - # more *trionic* to define this all as a module level task-func - # which operates on a `daemon: WorkerDaemon`? - # - # -[ ] keeps tasks-as-funcs style prominent - # -[ ] avoids so much indentation due to methods - async def serve_forever(self): - await self._update_balance() - try: - while True: - queue = self._snap['queue'] - - random.shuffle(queue) - queue = sorted( - queue, - key=lambda req: convert_reward_to_int(req['reward']), - reverse=True - ) - - for req in queue: - # TODO, as mentioned above just inline this once - # converted to a mod level func. - if (await self.maybe_serve_one(req)): - break - - await trio.sleep(1) - - except KeyboardInterrupt: - ... + except KeyboardInterrupt: + ... diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index 9076ee2..8b45197 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -3,6 +3,7 @@ import json import time import logging from pathlib import Path +from typing import AsyncGenerator from functools import partial import trio @@ -66,7 +67,11 @@ class NetConnector: self.ipfs_client = AsyncIPFSHTTP(config.ipfs_url) - self._wip_requests = {} + self._tables = { + 'queue': [], + 'requests': {}, + 'results': [] + } maybe_update_tui(lambda tui: tui.set_header_text(new_worker_name=self.config.account)) @@ -132,9 +137,6 @@ class NetConnector: logging.info('no balance info found') return None - # TODO, considery making this a NON-method and instead - # handing in the `snap['queue']` output beforehand? - # -> since that call is the only usage of `self`? 
async def get_full_queue_snapshot(self):
         '''
         Keep in-sync with latest (telos chain's smart-contract) table
@@ -162,6 +164,34 @@
 
         return snap
 
+    async def iter_poll_update(self, poll_time: float) -> AsyncGenerator[dict, None]:
+        '''
+        Long running task, polls gpu contract tables and yields latest table rows
+
+        '''
+        while True:
+            start_time = time.time()
+            self._tables = await self.get_full_queue_snapshot()
+            elapsed = time.time() - start_time
+            yield self._tables
+            await trio.sleep(max(poll_time - elapsed, 0.1))
+
+    async def should_cancel_work(self, request_id: int) -> bool:
+        logging.info('should cancel work?')
+        if request_id not in self._tables['requests']:
+            logging.info(f'request #{request_id} no longer in queue, likely its been filled by another worker, cancelling work...')
+            return True
+
+        competitors = set([
+            status['worker']
+            for status in self._tables['requests'][request_id]
+            if status['worker'] != self.config.account
+        ])
+        logging.info(f'competitors: {competitors}')
+        should_cancel = bool(self.config.non_compete & competitors)
+        logging.info(f'cancel: {should_cancel}')
+        return should_cancel
+
     async def begin_work(self, request_id: int):
         '''
         Publish to the bc that the worker is beginning a model-computation
@@ -244,7 +274,7 @@
         result_hash: str,
         ipfs_hash: str
     ):
-        logging.info('submit_work #{request_id}')
+        logging.info(f'submit_work #{request_id}')
         return await failable(
             partial(
                 self.cleos.a_push_action,

From f39859943de1a6630579d17ee43a5ef74a20671d Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Fri, 7 Feb 2025 20:24:31 -0300
Subject: [PATCH 25/35] Update py-leap and use its new APIs in testing, add
 comments on the DgpuConfig struct fields, minor renaming and rework of
 open_worker mechanics, make logging configurable

---
 pyproject.toml          |   4 +-
 skynet/config.py        |  50 +++++++-------
 skynet/dgpu/__init__.py |  15 +++--
 skynet/dgpu/daemon.py   |   2 +-
 skynet/dgpu/tui.py      |  14 ++--
 skynet/nodeos.py        | 143 ----------------------------------
 tests/conftest.py       |  30 +++++++--
 tests/test_chain.py     |   3 +
 tests/test_deploy.py    | 104 -----------------------------
 uv.lock                 |  18 ++++-
 10 files changed, 90 insertions(+), 293 deletions(-)
 delete mode 100755 skynet/nodeos.py
 create mode 100644 tests/test_chain.py
 delete mode 100644 tests/test_deploy.py

diff --git a/pyproject.toml b/pyproject.toml
index 94a9740..0f074db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ frontend = [
 dev = [
     "pdbpp>=0.10.3,<0.11",
     "pytest>=7.4.2,<8",
+    "pytest-dockerctl",
     "pytest-trio>=0.8.0,<0.9",
 ]
 cuda = [
@@ -80,7 +81,8 @@ explicit = true
 torch = { index = "torch" }
 triton = { index = "torch" }
 torchvision = { index = "torch" }
-py-leap = { git = "https://github.com/guilledk/py-leap.git", rev = "v0.1a32" }
+py-leap = { git = "https://github.com/guilledk/py-leap.git", rev = "v0.1a34" }
+pytest-dockerctl = { git = "https://github.com/pikers/pytest-dockerctl.git", branch = "g_update" }
 
 [build-system]
 requires = ["hatchling"]
diff --git a/skynet/config.py b/skynet/config.py
index 7625dab..f319714 100755
--- a/skynet/config.py
+++ b/skynet/config.py
@@ -11,24 +11,25 @@ class ConfigParsingError(BaseException):
 
 
 class DgpuConfig(msgspec.Struct):
-    account: str
-    permission: str
-    key: str
-    node_url: str
-    hyperion_url: str
-    ipfs_url: str
-    hf_token: str
-    ipfs_domain: str = DEFAULT_IPFS_DOMAIN
-    hf_home: str = 'hf_home'
-    non_compete: set[str] = set()
-    model_whitelist: set[str] = set()
-    model_blacklist: set[str] = set()
-    backend: str = 'sync-on-thread'
-    api_bind: str = False
-    tui: bool = False
-    poll_time: float = 0.5
+    account: str  # worker account name
+    permission: str  # account permission name associated with key
+    key: str  # private key
+    node_url: str  # antelope http api endpoint
+    ipfs_url: str  # IPFS node http rpc endpoint
+    hf_token: str  # hugging face token
+    ipfs_domain: str = DEFAULT_IPFS_DOMAIN  # IPFS Gateway domain
+    hf_home: str = 'hf_home'  # hugging face data cache location
+    non_compete: set[str] = set()  # set of worker names to not compete in requests
+    model_whitelist: set[str] = set()  # only run these models
+    model_blacklist: set[str] = set()  # don't run these models
+    backend: str = 'sync-on-thread'  # select inference backend
+    tui: bool = False  # enable TUI monitor
+    poll_time: float = 0.5  # wait time for polling updates from contract
+    log_level: str = 'info'
+    log_file: str = 'dgpu.log'  # log file path (only used when tui = true)
 
-class TelegramConfig(msgspec.Struct):
+
+class FrontendConfig(msgspec.Struct):
     account: str
     permission: str
     key: str
@@ -37,32 +38,27 @@ class TelegramConfig(msgspec.Struct):
     ipfs_url: str
     token: str
 
-class DiscordConfig(msgspec.Struct):
-    account: str
-    permission: str
-    key: str
-    node_url: str
-    hyperion_url: str
-    ipfs_url: str
-    token: str
+
 class PinnerConfig(msgspec.Struct):
     hyperion_url: str
     ipfs_url: str
 
+
 class UserConfig(msgspec.Struct):
     account: str
     permission: str
     key: str
     node_url: str
 
+
 class Config(msgspec.Struct):
     dgpu: DgpuConfig | None = None
-    telegram: TelegramConfig | None = None
-    discord: DiscordConfig | None = None
+    telegram: FrontendConfig | None = None
+    discord: FrontendConfig | None = None
     pinner: PinnerConfig | None = None
     user: UserConfig | None = None
 
+
 def load_skynet_toml(file_path=DEFAULT_CONFIG_PATH) -> Config:
     with open(file_path, 'r') as file:
         return msgspec.toml.decode(file.read(), type=Config)
diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py
index 1c1f40c..24e2f7a 100755
--- a/skynet/dgpu/__init__.py
+++ b/skynet/dgpu/__init__.py
@@ -1,21 +1,23 @@
 import logging
+from contextlib import asynccontextmanager as acm
 
 import trio
 import urwid
 
 from skynet.config import Config
 from skynet.dgpu.tui import init_tui
-from skynet.dgpu.daemon import serve_forever
+from skynet.dgpu.daemon import dgpu_serve_forever
 from skynet.dgpu.network import NetConnector
 
 
-async def _dgpu_main(config: Config) -> None:
+@acm
+async def open_worker(config: Config):
     # suppress logs from httpx (logs url + status after every query)
     logging.getLogger("httpx").setLevel(logging.WARNING)
 
     tui = None
     if config.tui:
-        tui = init_tui()
+        tui = init_tui(config)
 
     conn = NetConnector(config)
 
@@ -25,7 +27,12 @@ async def _dgpu_main(config: Config) -> None:
         if tui:
             n.start_soon(tui.run)
 
-        await serve_forever(config, conn)
+        yield conn
 
     except *urwid.ExitMainLoop:
         ...
+ + +async def _dgpu_main(config: Config): + async with open_worker(config) as conn: + await dgpu_serve_forever(config, conn) diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index f3f8ef3..b94af9b 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -183,7 +183,7 @@ async def maybe_serve_one( await conn.cancel_work(rid, 'reason not provided') -async def serve_forever(config: Config, conn: NetConnector): +async def dgpu_serve_forever(config: Config, conn: NetConnector): await maybe_update_tui_balance(conn) try: async for tables in conn.iter_poll_update(config.poll_time): diff --git a/skynet/dgpu/tui.py b/skynet/dgpu/tui.py index 7614d1c..ed3c1a7 100644 --- a/skynet/dgpu/tui.py +++ b/skynet/dgpu/tui.py @@ -5,6 +5,8 @@ import warnings import trio import urwid +from skynet.config import DgpuConfig as Config + class WorkerMonitor: def __init__(self): @@ -166,13 +168,15 @@ class WorkerMonitor: self.update_requests(queue) -def setup_logging_for_tui(level): +def setup_logging_for_tui(config: Config): warnings.filterwarnings("ignore") + level = getattr(logging, config.log_level.upper(), logging.WARNING) + logger = logging.getLogger() logger.setLevel(level) - fh = logging.FileHandler('dgpu.log') + fh = logging.FileHandler(config.log_file) fh.setLevel(level) formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") @@ -185,11 +189,11 @@ def setup_logging_for_tui(level): logger.removeHandler(handler) -_tui = None -def init_tui(): +_tui: WorkerMonitor | None = None +def init_tui(config: Config): global _tui assert not _tui - setup_logging_for_tui(logging.INFO) + setup_logging_for_tui(config) _tui = WorkerMonitor() return _tui diff --git a/skynet/nodeos.py b/skynet/nodeos.py deleted file mode 100755 index 6b9f756..0000000 --- a/skynet/nodeos.py +++ /dev/null @@ -1,143 +0,0 @@ -import json -import time -import logging - -from contextlib import contextmanager as cm - -import docker - -from leap.cleos import CLEOS -from leap.sugar import get_container, Symbol - - -@cm -def open_nodeos(cleanup: bool = True): - dclient = docker.from_env() - vtestnet = get_container( - dclient, - 'guilledk/skynet:leap-4.0.1', - name='skynet-nodeos', - force_unique=True, - detach=True, - network='host') - - try: - cleos = CLEOS( - dclient, vtestnet, - url='http://127.0.0.1:42000', - remote='http://127.0.0.1:42000' - ) - - cleos.start_keosd() - - priv, pub = cleos.create_key_pair() - logging.info(f'SUDO KEYS: {(priv, pub)}') - - cleos.setup_wallet(priv) - - genesis = json.dumps({ - "initial_timestamp": '2017-08-29T02:14:00.000', - "initial_key": pub, - "initial_configuration": { - "max_block_net_usage": 1048576, - "target_block_net_usage_pct": 1000, - "max_transaction_net_usage": 1048575, - "base_per_transaction_net_usage": 12, - "net_usage_leeway": 500, - "context_free_discount_net_usage_num": 20, - "context_free_discount_net_usage_den": 100, - "max_block_cpu_usage": 200000, - "target_block_cpu_usage_pct": 1000, - "max_transaction_cpu_usage": 150000, - "min_transaction_cpu_usage": 100, - "max_transaction_lifetime": 3600, - "deferred_trx_expiration_window": 600, - "max_transaction_delay": 3888000, - "max_inline_action_size": 4096, - "max_inline_action_depth": 4, - "max_authority_depth": 6 - } - }, indent=4) - - ec, out = cleos.run( - ['bash', '-c', f'echo \'{genesis}\' > /root/skynet.json']) - assert ec == 0 - - place_holder = 'EOS5fLreY5Zq5owBhmNJTgQaLqQ4ufzXSTpStQakEyfxNFuUEgNs1=KEY:5JnvSc6pewpHHuUHwvbJopsew6AKwiGnexwDRc2Pj2tbdw6iML9' - sig_provider = f'{pub}=KEY:{priv}' - 
nodeos_config_ini = '/root/nodeos/config.ini' - ec, out = cleos.run( - ['bash', '-c', f'sed -i -e \'s/{place_holder}/{sig_provider}/g\' {nodeos_config_ini}']) - assert ec == 0 - - cleos.start_nodeos_from_config( - nodeos_config_ini, - data_dir='/root/nodeos/data', - genesis='/root/skynet.json', - state_plugin=True) - - time.sleep(0.5) - cleos.wait_blocks(1) - cleos.boot_sequence(token_sym=Symbol('GPU', 4)) - - priv, pub = cleos.create_key_pair() - cleos.import_key(priv) - cleos.private_keys['telos.gpu'] = priv - logging.info(f'GPU KEYS: {(priv, pub)}') - cleos.new_account('telos.gpu', ram=4200000, key=pub) - - for i in range(1, 4): - priv, pub = cleos.create_key_pair() - cleos.import_key(priv) - cleos.private_keys[f'testworker{i}'] = priv - logging.info(f'testworker{i} KEYS: {(priv, pub)}') - cleos.create_account_staked( - 'eosio', f'testworker{i}', key=pub) - - priv, pub = cleos.create_key_pair() - cleos.import_key(priv) - logging.info(f'TELEGRAM KEYS: {(priv, pub)}') - cleos.create_account_staked( - 'eosio', 'telegram', ram=500000, key=pub) - - cleos.transfer_token( - 'eosio', 'telegram', '1000000.0000 GPU', 'Initial testing funds') - - cleos.deploy_contract_from_host( - 'telos.gpu', - 'tests/contracts/telos.gpu', - verify_hash=False, - create_account=False - ) - - ec, out = cleos.push_action( - 'telos.gpu', - 'config', - ['eosio.token', '4,GPU'], - 'telos.gpu@active' - ) - assert ec == 0 - - ec, out = cleos.transfer_token( - 'telegram', 'telos.gpu', '1000000.0000 GPU', 'Initial testing funds') - assert ec == 0 - - user_row = cleos.get_table( - 'telos.gpu', - 'telos.gpu', - 'users', - index_position=1, - key_type='name', - lower_bound='telegram', - upper_bound='telegram' - ) - assert len(user_row) == 1 - - yield cleos - - finally: - # ec, out = cleos.list_all_keys() - # logging.info(out) - if cleanup: - vtestnet.stop() - vtestnet.remove() diff --git a/tests/conftest.py b/tests/conftest.py index 56c0780..fe8bcf3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,23 +2,43 @@ import pytest from skynet.config import * from skynet.ipfs import AsyncIPFSHTTP -from skynet.nodeos import open_nodeos @pytest.fixture(scope='session') def ipfs_client(): yield AsyncIPFSHTTP('http://127.0.0.1:5001') + @pytest.fixture(scope='session') def postgres_db(): from skynet.db import open_new_database with open_new_database() as db_params: yield db_params -@pytest.fixture(scope='session') -def cleos(): - with open_nodeos() as cli: - yield cli + +@pytest.fixture(scope='module') +def skynet_cleos(cleos_bs): + cleos = cleos_bs + + priv, pub = cleos.create_key_pair() + cleos.import_key('telos.gpu', priv) + cleos.new_account('telos.gpu', ram=4200000, key=pub) + + cleos.deploy_contract_from_path( + 'telos.gpu', + 'tests/contracts/telos.gpu', + create_account=False + ) + + cleos.push_action( + 'telos.gpu', + 'config', + ['eosio.token', '4,GPU'], + 'telos.gpu' + ) + + yield cleos + @pytest.fixture(scope='session') def dgpu(): diff --git a/tests/test_chain.py b/tests/test_chain.py new file mode 100644 index 0000000..243115a --- /dev/null +++ b/tests/test_chain.py @@ -0,0 +1,3 @@ +def test_dev(skynet_cleos): + cleos = skynet_cleos + ... 
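
The new `test_chain.py` above is just a stub; a natural next step is to
re-grow the enqueue coverage removed with `test_deploy.py` below, on top of
the `skynet_cleos` fixture. A sketch under the updated py-leap API (the
`testuser` account setup and the exact `push_action` return shape are
assumptions here, not part of the diffs):

    import json

    def test_enqueue(skynet_cleos):
        cleos = skynet_cleos

        # request body mirroring the old test_deploy.py 'diffuse' payload
        req = json.dumps({
            'method': 'diffuse',
            'params': {
                'prompt': 'skynet terminator dystopic',
                'model': 'prompthero/openjourney',
                'step': 28, 'seed': 420,
                'width': 512, 'height': 512,
                'guidance': 10,
            }
        })

        # enqueue with no binary data and a 20 GPU reward
        cleos.push_action(
            'telos.gpu', 'enqueue',
            ['testuser', req, '', '20.0000 GPU', 1],
            'testuser'
        )

        queue = cleos.get_table('telos.gpu', 'telos.gpu', 'queue')
        assert len(queue) == 1
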
diff --git a/tests/test_deploy.py b/tests/test_deploy.py deleted file mode 100644 index bfd93d9..0000000 --- a/tests/test_deploy.py +++ /dev/null @@ -1,104 +0,0 @@ -import time -import json - -from hashlib import sha256 -from functools import partial - -import trio -import requests -from skynet.constants import DEFAULT_IPFS_REMOTE - -from skynet.dgpu import open_dgpu_node - -from leap.sugar import collect_stdout - - -def test_enqueue_work(cleos): - user = 'telegram' - req = json.dumps({ - 'method': 'diffuse', - 'params': { - 'algo': 'midj', - 'prompt': 'skynet terminator dystopic', - 'width': 512, - 'height': 512, - 'guidance': 10, - 'step': 28, - 'seed': 420, - 'upscaler': 'x4' - } - }) - binary = '' - - ec, out = cleos.push_action( - 'telos.gpu', 'enqueue', [user, req, binary, '20.0000 GPU', 1], f'{user}@active' - ) - - assert ec == 0 - - queue = cleos.get_table('telos.gpu', 'telos.gpu', 'queue') - - assert len(queue) == 1 - - req_on_chain = queue[0] - - assert req_on_chain['user'] == user - assert req_on_chain['body'] == req - assert req_on_chain['binary_data'] == binary - - trio.run( - partial( - open_dgpu_node, - f'testworker1', - 'active', - cleos, - DEFAULT_IPFS_REMOTE, - cleos.private_keys['testworker1'], - initial_algos=['midj'] - ) - ) - - queue = cleos.get_table('telos.gpu', 'telos.gpu', 'queue') - - assert len(queue) == 0 - - -def test_enqueue_dequeue(cleos): - user = 'telegram' - req = json.dumps({ - 'method': 'diffuse', - 'params': { - 'algo': 'midj', - 'prompt': 'skynet terminator dystopic', - 'width': 512, - 'height': 512, - 'guidance': 10, - 'step': 28, - 'seed': 420, - 'upscaler': 'x4' - } - }) - binary = '' - - ec, out = cleos.push_action( - 'telos.gpu', 'enqueue', [user, req, binary, '20.0000 GPU', 1], f'{user}@active' - ) - - assert ec == 0 - - request_id, _ = collect_stdout(out).split(':') - request_id = int(request_id) - - queue = cleos.get_table('telos.gpu', 'telos.gpu', 'queue') - - assert len(queue) == 1 - - ec, out = cleos.push_action( - 'telos.gpu', 'dequeue', [user, request_id], f'{user}@active' - ) - - assert ec == 0 - - queue = cleos.get_table('telos.gpu', 'telos.gpu', 'queue') - - assert len(queue) == 0 diff --git a/uv.lock b/uv.lock index 4dbb56d..23dd35c 100644 --- a/uv.lock +++ b/uv.lock @@ -1730,8 +1730,8 @@ wheels = [ [[package]] name = "py-leap" -version = "0.1a32" -source = { git = "https://github.com/guilledk/py-leap.git?rev=v0.1a32#c8137dec3d0a7d1e883cafe5212d58a8e9db9b84" } +version = "0.1a34" +source = { git = "https://github.com/guilledk/py-leap.git?rev=v0.1a34#6055ca7063c1eb32644e855c6726c29a1d7ac7e9" } dependencies = [ { name = "base58" }, { name = "cryptos" }, @@ -1832,6 +1832,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/51/ff/f6e8b8f39e08547faece4bd80f89d5a8de68a38b2d179cc1c4490ffa3286/pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8", size = 325287 }, ] +[[package]] +name = "pytest-dockerctl" +version = "0.2a0" +source = { git = "https://github.com/pikers/pytest-dockerctl.git?branch=g_update#d58e9317b55954f05f139730a62d55e1acb5f5d1" } +dependencies = [ + { name = "docker" }, + { name = "pytest" }, + { name = "pytest-trio" }, +] + [[package]] name = "pytest-trio" version = "0.8.0" @@ -2268,6 +2278,7 @@ cuda = [ dev = [ { name = "pdbpp" }, { name = "pytest" }, + { name = "pytest-dockerctl" }, { name = "pytest-trio" }, ] frontend = [ @@ -2288,7 +2299,7 @@ requires-dist = [ { name = "outcome", specifier = ">=1.3.0.post0" }, { name = "pillow", specifier = 
">=10.0.1,<11" }, { name = "protobuf", specifier = ">=5.29.3,<6" }, - { name = "py-leap", git = "https://github.com/guilledk/py-leap.git?rev=v0.1a32" }, + { name = "py-leap", git = "https://github.com/guilledk/py-leap.git?rev=v0.1a34" }, { name = "pytz", specifier = "~=2023.3.post1" }, { name = "toml", specifier = ">=0.10.2,<0.11" }, { name = "trio", specifier = ">=0.22.2,<0.23" }, @@ -2320,6 +2331,7 @@ cuda = [ dev = [ { name = "pdbpp", specifier = ">=0.10.3,<0.11" }, { name = "pytest", specifier = ">=7.4.2,<8" }, + { name = "pytest-dockerctl", git = "https://github.com/pikers/pytest-dockerctl.git?branch=g_update" }, { name = "pytest-trio", specifier = ">=0.8.0,<0.9" }, ] frontend = [ From d0e084b867abe5a5e1fc1dc06ad8bc3ea5fdccdd Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Fri, 7 Feb 2025 20:55:01 -0300 Subject: [PATCH 26/35] Update example config --- skynet.toml.example | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/skynet.toml.example b/skynet.toml.example index a075a30..b50b8b2 100644 --- a/skynet.toml.example +++ b/skynet.toml.example @@ -1,21 +1,20 @@ # config sections are optional, depending on which services # you wish to run -[skynet.dgpu] +[dgpu] account = 'testworkerX' permission = 'active' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' node_url = 'https://testnet.skygpu.net' -hyperion_url = 'https://testnet.skygpu.net' -ipfs_gateway_url = '/ip4/169.197.140.154/tcp/4001/p2p/12D3KooWKWogLFNEcNNMKnzU7Snrnuj84RZdMBg3sLiQSQc51oEv' ipfs_url = 'http://127.0.0.1:5001' hf_home = 'hf_home' hf_token = 'hf_XxXxXxXxXxXxXxXxXxXxXxXxXxXxXxXxXx' -auto_withdraw = true non_compete = [] -api_bind = '127.0.0.1:42690' +tui = true +log_file = 'dgpu.log' +log_level = 'info' -[skynet.telegram] +[telegram] account = 'telegram' permission = 'active' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' @@ -24,7 +23,7 @@ hyperion_url = 'https://testnet.skygpu.net' ipfs_url = 'http://127.0.0.1:5001' token = 'XXXXXXXXXX:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' -[skynet.discord] +[discord] account = 'discord' permission = 'active' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' @@ -33,11 +32,11 @@ hyperion_url = 'https://testnet.skygpu.net' ipfs_url = 'http://127.0.0.1:5001' token = 'XXXXXXXXXX:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' -[skynet.pinner] +[pinner] hyperion_url = 'https://testnet.skygpu.net' ipfs_url = 'http://127.0.0.1:5001' -[skynet.user] +[user] account = 'testuser' permission = 'active' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' From cebbd1059eae2aa31799d851831081b14f9bf3e2 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Fri, 7 Feb 2025 22:01:00 -0300 Subject: [PATCH 27/35] Fix Upscaler hardcoding, add mockerpipeline and streamline dgpu tests --- pyproject.toml | 10 +- skynet/cli.py | 6 +- skynet/constants.py | 51 ++++--- skynet/dgpu/compute.py | 23 ++- skynet/dgpu/daemon.py | 9 +- skynet/dgpu/pipes/tester.py | 42 ++++++ skynet/utils.py | 2 +- tests/conftest.py | 24 ++- tests/test_reqs.py | 119 ++------------- uv.lock | 283 +----------------------------------- 10 files changed, 120 insertions(+), 449 deletions(-) create mode 100644 skynet/dgpu/pipes/tester.py diff --git a/pyproject.toml b/pyproject.toml index 0f074db..05bc088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,10 +16,10 @@ dependencies = [ "msgspec>=0.19.0,<0.20", "numpy<2.1", "protobuf>=5.29.3,<6", - "zstandard>=0.23.0,<0.24", "click>=8.1.8,<9", "httpx>=0.28.1,<0.29", "outcome>=1.3.0.post0", + "urwid>=2.6.16", ] [project.scripts] @@ -47,12 +47,9 @@ cuda = [ 
"torch==2.5.1+cu121", "scipy==1.15.1", "numba==0.60.0", - "quart>=0.19.3,<0.20", - "triton==3.1.0", - "xformers>=0.0.29,<0.0.30", - "hypercorn>=0.14.4,<0.15", + # "triton==3.1.0", + # "xformers>=0.0.29,<0.0.30", "diffusers==0.32.1", - "quart-trio>=0.11.0,<0.12", "torchvision==0.20.1+cu121", "accelerate==0.34.0", "transformers==4.48.0", @@ -62,7 +59,6 @@ cuda = [ "basicsr>=1.4.2,<2", "realesrgan>=0.3.0,<0.4", "sentencepiece>=0.2.0", - "urwid>=2.6.16", ] [tool.uv] diff --git a/skynet/cli.py b/skynet/cli.py index f835d98..f78d31d 100755 --- a/skynet/cli.py +++ b/skynet/cli.py @@ -197,10 +197,10 @@ def dgpu( logging.basicConfig(level=loglevel) - config = load_skynet_toml(file_path=config_path) - set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) + config = load_skynet_toml(file_path=config_path).dgpu + set_hf_vars(config.hf_token, config.hf_home) - trio.run(_dgpu_main, config.dgpu) + trio.run(_dgpu_main, config) @run.command() diff --git a/skynet/constants.py b/skynet/constants.py index 7431d28..f0af6ee 100755 --- a/skynet/constants.py +++ b/skynet/constants.py @@ -1,4 +1,6 @@ import msgspec + +from enum import Enum from typing import Literal VERSION = '0.1a12' @@ -6,107 +8,108 @@ VERSION = '0.1a12' DOCKER_RUNTIME_CUDA = 'skynet:runtime-cuda' -class Size(msgspec.Struct): - w: int - h: int - - class ModelDesc(msgspec.Struct): - short: str - mem: float - size: Size - tags: list[Literal['txt2img', 'img2img', 'inpaint']] + short: str # short unique name + mem: float # recomended mem + attrs: dict # additional mode specific attrs + tags: list[Literal['txt2img', 'img2img', 'inpaint', 'upscale']] MODELS: dict[str, ModelDesc] = { + 'RealESRGAN_x4plus': ModelDesc( + short='realesrgan', + mem=4, + attrs={}, + tags=['upscale'] + ), 'runwayml/stable-diffusion-v1-5': ModelDesc( short='stable', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'stabilityai/stable-diffusion-2-1-base': ModelDesc( short='stable2', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'snowkidy/stable-diffusion-xl-base-0.9': ModelDesc( short='stablexl0.9', mem=8.3, - size=Size(w=1024, h=1024), + attrs={'size': {'w': 1024, 'h': 1024}}, tags=['txt2img'] ), 'Linaqruf/anything-v3.0': ModelDesc( short='hdanime', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'hakurei/waifu-diffusion': ModelDesc( short='waifu', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'nitrosocke/Ghibli-Diffusion': ModelDesc( short='ghibli', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'dallinmackay/Van-Gogh-diffusion': ModelDesc( short='van-gogh', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'lambdalabs/sd-pokemon-diffusers': ModelDesc( short='pokemon', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'Envvi/Inkpunk-Diffusion': ModelDesc( short='ink', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'nousr/robo-diffusion': ModelDesc( short='robot', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img'] ), 'black-forest-labs/FLUX.1-schnell': ModelDesc( short='flux', mem=24, - size=Size(w=1024, h=1024), + attrs={'size': {'w': 1024, 'h': 1024}}, tags=['txt2img'] ), 'black-forest-labs/FLUX.1-Fill-dev': ModelDesc( short='flux-inpaint', mem=24, - size=Size(w=1024, 
h=1024), + attrs={'size': {'w': 1024, 'h': 1024}}, tags=['inpaint'] ), 'diffusers/stable-diffusion-xl-1.0-inpainting-0.1': ModelDesc( short='stablexl-inpaint', mem=8.3, - size=Size(w=1024, h=1024), + attrs={'size': {'w': 1024, 'h': 1024}}, tags=['inpaint'] ), 'prompthero/openjourney': ModelDesc( short='midj', mem=6, - size=Size(w=512, h=512), + attrs={'size': {'w': 512, 'h': 512}}, tags=['txt2img', 'img2img'] ), 'stabilityai/stable-diffusion-xl-base-1.0': ModelDesc( short='stablexl', mem=8.3, - size=Size(w=1024, h=1024), + attrs={'size': {'w': 1024, 'h': 1024}}, tags=['txt2img'] ), } diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index 5e90791..23daa63 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -12,6 +12,7 @@ from contextlib import contextmanager as cm import trio import torch +from skynet.config import load_skynet_toml from skynet.dgpu.tui import maybe_update_tui from skynet.dgpu.errors import ( DGPUComputeError, @@ -78,6 +79,7 @@ def maybe_load_model(name: str, mode: str): mode = 'txt2img' global _model_name, _model_mode, _model + config = load_skynet_toml().dgpu if _model_name != name or _model_mode != mode: # unload model @@ -93,7 +95,7 @@ def maybe_load_model(name: str, mode: str): else: _model = pipeline_for( - name, mode, cache_dir='hf_home') + name, mode, cache_dir=config.hf_home) _model_name = name _model_mode = mode @@ -101,25 +103,17 @@ def maybe_load_model(name: str, mode: str): logging.debug('memory summary:') logging.debug('\n' + torch.cuda.memory_summary()) - yield + yield _model def compute_one( + model, request_id: int, method: str, params: dict, inputs: list[bytes] = [], should_cancel = None ): - if method == 'diffuse': - method = 'txt2img' - - global _model, _model_name, _model_mode - - # validate correct model is loaded - assert params['model'] == _model_name - assert method == _model_mode - total_steps = params['step'] def inference_step_wakeup(*args, **kwargs): '''This is a callback function that gets invoked every inference step, @@ -132,6 +126,7 @@ def compute_one( maybe_update_tui(lambda tui: tui.set_progress(step, done=total_steps)) + should_raise = False if should_cancel: should_raise = trio.from_thread.run(should_cancel, request_id) @@ -155,7 +150,7 @@ def compute_one( name = params['model'] match method: - case 'txt2img' | 'img2img' | 'inpaint': + case 'diffuse' | 'txt2img' | 'img2img' | 'inpaint': arguments = prepare_params_for_diffuse( params, method, inputs) prompt, guidance, step, seed, upscaler, extra_params = arguments @@ -167,7 +162,7 @@ def compute_one( extra_params['callback'] = inference_step_wakeup extra_params['callback_steps'] = 1 - output = _model( + output = model( prompt, guidance_scale=guidance, num_inference_steps=step, @@ -194,7 +189,7 @@ def compute_one( case 'upscale': input_img = inputs[0].convert('RGB') - up_img, _ = _model.enhance( + up_img, _ = model.enhance( convert_from_image_to_cv2(input_img), outscale=4) output = convert_from_cv2_to_image(up_img) diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index b94af9b..eb002b5 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -51,11 +51,7 @@ async def maybe_serve_one( model = body['params']['model'] # if model not known, ignore. - if ( - model != 'RealESRGAN_x4plus' - and - model not in MODELS - ): + if model not in MODELS: logging.warning(f'unknown model {model}!, skip...') return @@ -139,7 +135,7 @@ async def maybe_serve_one( logging.info('begin_work error, probably being worked on already... 
skip.')
             return
 
-    with maybe_load_model(model, mode):
+    with maybe_load_model(model, mode) as model:
         try:
             maybe_update_tui(lambda tui: tui.set_progress(0, done=total_step))
@@ -154,6 +150,7 @@
             output_hash, output = await trio.to_thread.run_sync(
                 partial(
                     compute_one,
+                    model,
                     rid,
                     mode, body['params'],
                     inputs=inputs,
diff --git a/skynet/dgpu/pipes/tester.py b/skynet/dgpu/pipes/tester.py
new file mode 100644
index 0000000..7978f22
--- /dev/null
+++ b/skynet/dgpu/pipes/tester.py
@@ -0,0 +1,42 @@
+import time
+
+from PIL import Image
+
+import msgspec
+
+
+__model = {
+    'name': 'skygpu/txt2img-mocker'
+}
+
+class MockPipelineResult(msgspec.Struct):
+    images: list[Image.Image]
+
+class MockPipeline:
+
+    def __call__(
+        self,
+        prompt: str,
+        *args,
+        num_inference_steps: int = 3,
+        callback=None,
+        mock_step_time: float = 0.1,
+        **kwargs
+    ):
+        for i in range(num_inference_steps):
+            time.sleep(mock_step_time)
+            if callback:
+                callback(i+1)
+
+        img = Image.new('RGB', (1, 1), color='green')
+
+        return MockPipelineResult(images=[img])
+
+
+def pipeline_for(
+    model: str,
+    mode: str,
+    mem_fraction: float = 1.0,
+    cache_dir: str | None = None
+):
+    return MockPipeline()
diff --git a/skynet/utils.py b/skynet/utils.py
index ce029bd..0ebf6fb 100755
--- a/skynet/utils.py
+++ b/skynet/utils.py
@@ -99,7 +99,7 @@ def pipeline_for(
     diffusers.utils.logging.disable_progress_bar()
 
     logging.info(f'pipeline_for {model} {mode}')
-    assert torch.cuda.is_available()
+    # assert torch.cuda.is_available()
     torch.cuda.empty_cache()
     torch.backends.cuda.matmul.allow_tf32 = True
     torch.backends.cudnn.allow_tf32 = True
diff --git a/tests/conftest.py b/tests/conftest.py
index fe8bcf3..4c6a406 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,19 +40,15 @@ def skynet_cleos(cleos_bs):
     yield cleos
 
 
-@pytest.fixture(scope='session')
-def dgpu():
-    from skynet.dgpu.network import NetConnector
-    from skynet.dgpu.compute import ModelMngr
-    from skynet.dgpu.daemon import WorkerDaemon
+@pytest.fixture
+def inject_mockers():
+    from skynet.constants import MODELS, ModelDesc
 
-    config = load_skynet_toml(file_path='skynet.toml')
-    hf_token = load_key(config, 'skynet.dgpu.hf_token')
-    hf_home = load_key(config, 'skynet.dgpu.hf_home')
-    set_hf_vars(hf_token, hf_home)
-    config = config['skynet']['dgpu']
-    conn = NetConnector(config)
-    mm = ModelMngr(config)
-    daemon = WorkerDaemon(mm, conn, config)
+    MODELS['skygpu/txt2img-mocker'] = ModelDesc(
+        short='tester',
+        mem=0.01,
+        attrs={},
+        tags=['txt2img']
+    )
 
-    yield conn, mm, daemon
+    yield
diff --git a/tests/test_reqs.py b/tests/test_reqs.py
index 48ca886..a8d8564 100644
--- a/tests/test_reqs.py
+++ b/tests/test_reqs.py
@@ -1,112 +1,21 @@
 import json
 
-from skynet.dgpu.compute import ModelMngr
 from skynet.constants import *
-from skynet.config import *
+from skynet.dgpu.compute import maybe_load_model, compute_one
 
 
-async def test_diffuse(dgpu):
-    conn, mm, daemon = dgpu
-    await conn.cancel_work(0, 'testing')
-
-    daemon._snap['requests'][0] = {}
-    req = {
-        'id': 0,
-        'nonce': 0,
-        'body': json.dumps({
-            "method": "diffuse",
-            "params": {
-                "prompt": "Kronos God Realistic 4k",
-                "model": list(MODELS.keys())[-1],
-                "step": 21,
-                "width": 1024,
-                "height": 1024,
-                "seed": 168402949,
-                "guidance": "7.5"
-            }
-        }),
-        'binary_data': '',
+async def test_diffuse(inject_mockers):
+    model = 'skygpu/txt2img-mocker'
+    mode = 'diffuse'
+    params = {
+        "prompt": "Kronos God Realistic 4k",
+        "model": model,
+        "step": 21,
+        "width": 1024,
+        "height": 1024,
+        "seed": 168402949,
+        
"guidance": "7.5" } - await daemon.maybe_serve_one(req) - - -async def test_txt2img(dgpu): - conn, mm, daemon = dgpu - await conn.cancel_work(0, 'testing') - - daemon._snap['requests'][0] = {} - req = { - 'id': 0, - 'nonce': 0, - 'body': json.dumps({ - "method": "txt2img", - "params": { - "prompt": "Kronos God Realistic 4k", - "model": list(MODELS.keys())[-1], - "step": 21, - "width": 1024, - "height": 1024, - "seed": 168402949, - "guidance": "7.5" - } - }), - 'binary_data': '', - } - - await daemon.maybe_serve_one(req) - - -async def test_img2img(dgpu): - conn, mm, daemon = dgpu - await conn.cancel_work(0, 'testing') - - daemon._snap['requests'][0] = {} - req = { - 'id': 0, - 'nonce': 0, - 'body': json.dumps({ - "method": "img2img", - "params": { - "prompt": "a hindu cat god feline god on a house roof", - "model": list(MODELS.keys())[-2], - "step": 21, - "width": 1024, - "height": 1024, - "seed": 168402949, - "guidance": "7.5", - "strength": "0.5" - } - }), - 'binary_data': 'QmZcGdXXVQfpco1G3tr2CGFBtv8xVsCwcwuq9gnJBWDymi', - } - - await daemon.maybe_serve_one(req) - -async def test_inpaint(dgpu): - conn, mm, daemon = dgpu - await conn.cancel_work(0, 'testing') - - daemon._snap['requests'][0] = {} - req = { - 'id': 0, - 'nonce': 0, - 'body': json.dumps({ - "method": "inpaint", - "params": { - "prompt": "a black panther on a sunny roof", - "model": list(MODELS.keys())[-3], - "step": 21, - "width": 1024, - "height": 1024, - "seed": 168402949, - "guidance": "7.5", - "strength": "0.5" - } - }), - 'binary_data': - 'QmZcGdXXVQfpco1G3tr2CGFBtv8xVsCwcwuq9gnJBWDymi,' + - 'Qmccx1aXNmq5mZDS3YviUhgGHXWhQeHvca3AgA7MDjj2hR' - } - - await daemon.maybe_serve_one(req) + with maybe_load_model(model, mode) as model: + compute_one(model, 0, mode, params) diff --git a/uv.lock b/uv.lock index 23dd35c..29475b7 100644 --- a/uv.lock +++ b/uv.lock @@ -49,15 +49,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/00/b08f23b7d7e1e14ce01419a467b583edbb93c6cdb8654e54a9cc579cd61f/addict-2.4.0-py3-none-any.whl", hash = "sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc", size = 3832 }, ] -[[package]] -name = "aiofiles" -version = "24.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0b/03/a88171e277e8caa88a4c77808c20ebb04ba74cc4681bf1e9416c862de237/aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c", size = 30247 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/45/30bb92d442636f570cb5651bc661f52b610e2eec3f891a5dc3a4c3667db0/aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5", size = 15896 }, -] - [[package]] name = "aiohappyeyeballs" version = "2.4.4" @@ -265,15 +256,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/90/a2bbb9b5f997b9c9aa9c15ee4adf553ee71053bb942f89fd48d920a1aa9d/bitsandbytes-0.45.0-py3-none-win_amd64.whl", hash = "sha256:ebbf96e0ecb466716a65ecdeaef3fa1983575447b9ab66b74e5211892507c6ff", size = 68520043 }, ] -[[package]] -name = "blinker" -version = "1.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458 }, -] - [[package]] name = "certifi" version = "2024.12.14" @@ -288,43 +270,14 @@ name = "cffi" version = "1.17.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pycparser" }, + { name = "pycparser", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621 } wheels = [ - { url = "https://files.pythonhosted.org/packages/90/07/f44ca684db4e4f08a3fdc6eeb9a0d15dc6883efc7b8c90357fdbf74e186c/cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14", size = 182191 }, - { url = "https://files.pythonhosted.org/packages/08/fd/cc2fedbd887223f9f5d170c96e57cbf655df9831a6546c1727ae13fa977a/cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67", size = 178592 }, - { url = "https://files.pythonhosted.org/packages/de/cc/4635c320081c78d6ffc2cab0a76025b691a91204f4aa317d568ff9280a2d/cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382", size = 426024 }, - { url = "https://files.pythonhosted.org/packages/b6/7b/3b2b250f3aab91abe5f8a51ada1b717935fdaec53f790ad4100fe2ec64d1/cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702", size = 448188 }, - { url = "https://files.pythonhosted.org/packages/d3/48/1b9283ebbf0ec065148d8de05d647a986c5f22586b18120020452fff8f5d/cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3", size = 455571 }, - { url = "https://files.pythonhosted.org/packages/40/87/3b8452525437b40f39ca7ff70276679772ee7e8b394934ff60e63b7b090c/cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6", size = 436687 }, - { url = "https://files.pythonhosted.org/packages/8d/fb/4da72871d177d63649ac449aec2e8a29efe0274035880c7af59101ca2232/cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17", size = 446211 }, - { url = "https://files.pythonhosted.org/packages/ab/a0/62f00bcb411332106c02b663b26f3545a9ef136f80d5df746c05878f8c4b/cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8", size = 461325 }, - { url = "https://files.pythonhosted.org/packages/36/83/76127035ed2e7e27b0787604d99da630ac3123bfb02d8e80c633f218a11d/cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e", size = 438784 }, - { url = 
"https://files.pythonhosted.org/packages/21/81/a6cd025db2f08ac88b901b745c163d884641909641f9b826e8cb87645942/cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be", size = 461564 }, { url = "https://files.pythonhosted.org/packages/f8/fe/4d41c2f200c4a457933dbd98d3cf4e911870877bd94d9656cc0fcb390681/cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c", size = 171804 }, { url = "https://files.pythonhosted.org/packages/d1/b6/0b0f5ab93b0df4acc49cae758c81fe4e5ef26c3ae2e10cc69249dfd8b3ab/cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15", size = 181299 }, - { url = "https://files.pythonhosted.org/packages/6b/f4/927e3a8899e52a27fa57a48607ff7dc91a9ebe97399b357b85a0c7892e00/cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401", size = 182264 }, - { url = "https://files.pythonhosted.org/packages/6c/f5/6c3a8efe5f503175aaddcbea6ad0d2c96dad6f5abb205750d1b3df44ef29/cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf", size = 178651 }, - { url = "https://files.pythonhosted.org/packages/94/dd/a3f0118e688d1b1a57553da23b16bdade96d2f9bcda4d32e7d2838047ff7/cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4", size = 445259 }, - { url = "https://files.pythonhosted.org/packages/2e/ea/70ce63780f096e16ce8588efe039d3c4f91deb1dc01e9c73a287939c79a6/cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41", size = 469200 }, - { url = "https://files.pythonhosted.org/packages/1c/a0/a4fa9f4f781bda074c3ddd57a572b060fa0df7655d2a4247bbe277200146/cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1", size = 477235 }, - { url = "https://files.pythonhosted.org/packages/62/12/ce8710b5b8affbcdd5c6e367217c242524ad17a02fe5beec3ee339f69f85/cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6", size = 459721 }, - { url = "https://files.pythonhosted.org/packages/ff/6b/d45873c5e0242196f042d555526f92aa9e0c32355a1be1ff8c27f077fd37/cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d", size = 467242 }, - { url = "https://files.pythonhosted.org/packages/1a/52/d9a0e523a572fbccf2955f5abe883cfa8bcc570d7faeee06336fbd50c9fc/cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6", size = 477999 }, - { url = "https://files.pythonhosted.org/packages/44/74/f2a2460684a1a2d00ca799ad880d54652841a780c4c97b87754f660c7603/cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f", size = 454242 }, - { url = "https://files.pythonhosted.org/packages/f8/4a/34599cac7dfcd888ff54e801afe06a19c17787dfd94495ab0c8d35fe99fb/cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b", size = 478604 }, { url = "https://files.pythonhosted.org/packages/34/33/e1b8a1ba29025adbdcda5fb3a36f94c03d771c1b7b12f726ff7fef2ebe36/cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655", size = 171727 }, { url = "https://files.pythonhosted.org/packages/3d/97/50228be003bb2802627d28ec0627837ac0bf35c90cf769812056f235b2d1/cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0", size = 181400 }, - { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178 }, - { url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840 }, - { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803 }, - { url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 478850 }, - { url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729 }, - { url = "https://files.pythonhosted.org/packages/91/2b/9a1ddfa5c7f13cab007a2c9cc295b70fbbda7cb10a286aa6810338e60ea1/cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99", size = 471256 }, - { url = "https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424 }, - { url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568 }, - { url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736 }, { url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448 }, { url = 
"https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976 }, ] @@ -576,22 +529,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/f6/1d/ac8914360460fafa1990890259b7fa5ef7ba4cd59014e782e4ab3ab144d8/filterpy-1.4.5.zip", hash = "sha256:4f2a4d39e4ea601b9ab42b2db08b5918a9538c168cff1c6895ae26646f3d73b1", size = 177985 } -[[package]] -name = "flask" -version = "3.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "blinker" }, - { name = "click" }, - { name = "itsdangerous" }, - { name = "jinja2" }, - { name = "werkzeug" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/89/50/dff6380f1c7f84135484e176e0cac8690af72fa90e932ad2a0a60e28c69b/flask-3.1.0.tar.gz", hash = "sha256:5f873c5184c897c8d9d1b05df1e3d01b14910ce69607a117bd3277098a5836ac", size = 680824 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/af/47/93213ee66ef8fae3b93b3e29206f6b251e65c97bd91d8e1c5596ef15af0a/flask-3.1.0-py3-none-any.whl", hash = "sha256:d667207822eb83f1c4b50949b1623c8fc8d51f2341d65f72e1a1815397551136", size = 102979 }, -] - [[package]] name = "fonttools" version = "4.55.4" @@ -799,28 +736,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, ] -[[package]] -name = "h2" -version = "4.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "hpack" }, - { name = "hyperframe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2a/32/fec683ddd10629ea4ea46d206752a95a2d8a48c22521edd70b142488efe1/h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb", size = 2145593 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/e5/db6d438da759efbb488c4f3fbdab7764492ff3c3f953132efa6b9f0e9e53/h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d", size = 57488 }, -] - -[[package]] -name = "hpack" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3e/9b/fda93fb4d957db19b0f6b370e79d586b3e8528b20252c729c476a2c02954/hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095", size = 49117 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d5/34/e8b383f35b77c402d28563d2b8f83159319b509bc5f760b15d60b0abf165/hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c", size = 32611 }, -] - [[package]] name = "httpcore" version = "1.0.7" @@ -867,37 +782,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6c/3f/50f6b25fafdcfb1c089187a328c95081abf882309afd86f4053951507cd1/huggingface_hub-0.27.1-py3-none-any.whl", hash = "sha256:1c5155ca7d60b60c2e2fc38cbb3ffb7f7c3adf48f824015b219af9061771daec", size = 450658 }, ] -[[package]] -name = "hypercorn" -version = "0.14.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "h11" }, - { name = "h2" }, - { name = "priority" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "wsproto" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/cc/b0/b546abe2eb39876d2965a0237f5a0133049db36cfc2e62b379b40b698af1/hypercorn-0.14.4.tar.gz", hash = "sha256:3fa504efc46a271640023c9b88c3184fd64993f47a282e8ae1a13ccb285c2f67", size = 41850 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/aa/0632f628205f09ec7daa5875e4142db519bfa7a161d48a6417b0f9ab5e08/hypercorn-0.14.4-py3-none-any.whl", hash = "sha256:f956200dbf8677684e6e976219ffa6691d6cf795281184b41dbb0b135ab37b8d", size = 58264 }, -] - -[package.optional-dependencies] -trio = [ - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "trio" }, -] - -[[package]] -name = "hyperframe" -version = "6.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5a/2a/4747bff0a17f7281abe73e955d60d80aae537a5d203f417fa1c2e7578ebb/hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914", size = 25008 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/de/85a784bcc4a3779d1753a7ec2dee5de90e18c7bcf402e71b51fcf150b129/hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15", size = 12389 }, -] - [[package]] name = "idna" version = "3.10" @@ -956,15 +840,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2b/57/18b5a914f6d7994dd349252873169e946dc824328e9a37fd15ed836deedc/invisible_watermark-0.2.0-py3-none-any.whl", hash = "sha256:644311beed9cfe4a9a5a4a46c740f47800cef184fe2e1297f3f4542e2d992f8b", size = 1633253 }, ] -[[package]] -name = "itsdangerous" -version = "2.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234 }, -] - [[package]] name = "janus" version = "2.0.0" @@ -1589,15 +1464,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, ] -[[package]] -name = "priority" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f5/3c/eb7c35f4dcede96fca1842dac5f4f5d15511aa4b52f3a961219e68ae9204/priority-2.0.0.tar.gz", hash = "sha256:c965d54f1b8d0d0b19479db3924c7c36cf672dbf2aec92d43fbdaf4492ba18c0", size = 24792 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/5f/82c8074f7e84978129347c2c6ec8b6c59f3584ff1a20bc3c940a3e061790/priority-2.0.0-py3-none-any.whl", hash = "sha256:6f8eefce5f3ad59baf2c080a664037bb4725cd0a790d53d59ab4059288faf6aa", size = 8946 }, -] - [[package]] name = "propcache" version = "0.2.1" @@ -1963,41 +1829,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8", size = 156338 }, ] -[[package]] -name = "quart" -version = 
"0.19.9" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiofiles" }, - { name = "blinker" }, - { name = "click" }, - { name = "flask" }, - { name = "hypercorn" }, - { name = "itsdangerous" }, - { name = "jinja2" }, - { name = "markupsafe" }, - { name = "werkzeug" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/07/05/0d7a280a89b05a6b6c2224b4a1b991b7a4c52eed5e14b92c3c63ce29c235/quart-0.19.9.tar.gz", hash = "sha256:30a61a0d7bae1ee13e6e99dc14c929b3c945e372b9445d92d21db053e91e95a5", size = 65653 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/01/e5d6127f6304c7e596e5c8eec5accdec0f698ec65fe342d8474ad5223717/quart-0.19.9-py3-none-any.whl", hash = "sha256:8acb8b299c72b66ee9e506ae141498bbbfcc250b5298fbdb712e97f3d7e4082f", size = 78294 }, -] - -[[package]] -name = "quart-trio" -version = "0.11.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "exceptiongroup" }, - { name = "hypercorn", extra = ["trio"] }, - { name = "quart" }, - { name = "trio" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b6/21/b644db3cd4c0055af99c3e6f4fb066921fe0e21cf55afcef85208c8efa93/quart_trio-0.11.1.tar.gz", hash = "sha256:149c9c65c2faafdf455a4461b600e1983b71e593b6f8c8b91b592bbda36cea98", size = 13190 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/61/9320f40bc363095bdb0c5e0c94b99cd582decfa61c2094f60d068fafe077/quart_trio-0.11.1-py3-none-any.whl", hash = "sha256:d4da1ab7699e44357f7788e1b5a30158680e999cf6b8e9ee762ce22164218bc0", size = 16062 }, -] - [[package]] name = "realesrgan" version = "0.3.0" @@ -2250,7 +2081,7 @@ dependencies = [ { name = "pytz" }, { name = "toml" }, { name = "trio" }, - { name = "zstandard" }, + { name = "urwid" }, ] [package.dev-dependencies] @@ -2260,20 +2091,14 @@ cuda = [ { name = "bitsandbytes" }, { name = "diffusers" }, { name = "huggingface-hub" }, - { name = "hypercorn" }, { name = "invisible-watermark" }, { name = "numba" }, - { name = "quart" }, - { name = "quart-trio" }, { name = "realesrgan" }, { name = "scipy" }, { name = "sentencepiece" }, { name = "torch" }, { name = "torchvision" }, { name = "transformers" }, - { name = "triton" }, - { name = "urwid" }, - { name = "xformers" }, ] dev = [ { name = "pdbpp" }, @@ -2303,7 +2128,7 @@ requires-dist = [ { name = "pytz", specifier = "~=2023.3.post1" }, { name = "toml", specifier = ">=0.10.2,<0.11" }, { name = "trio", specifier = ">=0.22.2,<0.23" }, - { name = "zstandard", specifier = ">=0.23.0,<0.24" }, + { name = "urwid", specifier = ">=2.6.16" }, ] [package.metadata.requires-dev] @@ -2313,20 +2138,14 @@ cuda = [ { name = "bitsandbytes", specifier = ">=0.45.0,<0.46" }, { name = "diffusers", specifier = "==0.32.1" }, { name = "huggingface-hub", specifier = ">=0.27.1,<0.28" }, - { name = "hypercorn", specifier = ">=0.14.4,<0.15" }, { name = "invisible-watermark", specifier = ">=0.2.0,<0.3" }, { name = "numba", specifier = "==0.60.0" }, - { name = "quart", specifier = ">=0.19.3,<0.20" }, - { name = "quart-trio", specifier = ">=0.11.0,<0.12" }, { name = "realesrgan", specifier = ">=0.3.0,<0.4" }, { name = "scipy", specifier = "==1.15.1" }, { name = "sentencepiece", specifier = ">=0.2.0" }, { name = "torch", specifier = "==2.5.1+cu121", index = "https://download.pytorch.org/whl/cu121" }, { name = "torchvision", specifier = "==0.20.1+cu121", index = "https://download.pytorch.org/whl/cu121" }, { name = "transformers", specifier = "==4.48.0" }, - { name = "triton", specifier = "==3.1.0", index = 
"https://download.pytorch.org/whl/cu121" }, - { name = "urwid", specifier = ">=2.6.16" }, - { name = "xformers", specifier = ">=0.0.29,<0.0.30" }, ] dev = [ { name = "pdbpp", specifier = ">=0.10.3,<0.11" }, @@ -2613,14 +2432,14 @@ wheels = [ [[package]] name = "triton" version = "3.1.0" -source = { registry = "https://download.pytorch.org/whl/cu121" } +source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, + { name = "filelock", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8" }, - { url = "https://download.pytorch.org/whl/triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c" }, - { url = "https://download.pytorch.org/whl/triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc" }, + { url = "https://files.pythonhosted.org/packages/98/29/69aa56dc0b2eb2602b553881e34243475ea2afd9699be042316842788ff5/triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8", size = 209460013 }, + { url = "https://files.pythonhosted.org/packages/86/17/d9a5cf4fcf46291856d1e90762e36cbabd2a56c7265da0d1d9508c8e3943/triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c", size = 209506424 }, + { url = "https://files.pythonhosted.org/packages/78/eb/65f5ba83c2a123f6498a3097746607e5b2f16add29e36765305e4ac7fdd8/triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc", size = 209551444 }, ] [[package]] @@ -2696,33 +2515,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/ca/723e3f8185738d7947f14ee7dc663b59415c6dee43bd71575f8c7f5cd6be/wmctrl-0.5-py2.py3-none-any.whl", hash = "sha256:ae695c1863a314c899e7cf113f07c0da02a394b968c4772e1936219d9234ddd7", size = 4268 }, ] -[[package]] -name = "wsproto" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "h11" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c9/4a/44d3c295350d776427904d73c189e10aeae66d7f555bb2feee16d1e4ba5a/wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065", size = 53425 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/58/e860788190eba3bcce367f74d29c4675466ce8dddfba85f7827588416f01/wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736", size = 24226 }, -] - -[[package]] -name = "xformers" -version = "0.0.29.post1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "torch" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/52/e2/50139a8d6fd89397db5594f8587483c5b812b138e02f238de86a2f7795c9/xformers-0.0.29.post1.tar.gz", hash = "sha256:d78c256e4c24ecc00f6f374d5b96afd0b56b3fb197f02d9efff4357fd5a399b4", size = 8457510 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/28/1d/03356b31386a61162bccddf7fba6c792b4fe1159ad2af5f4b7879ce947ad/xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:e213ff8123e20602bd486739ffee4013338b02f9d2e0e4635a2912750854fdbe", size = 15294284 }, - { url = "https://files.pythonhosted.org/packages/c1/a2/e2aca6b07688ee559834f703d587f158a3f5c314bce08c28672f448bf0f2/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a0e9e333856f6cd0eb4293cd107014c98ad74464a9706416f0232520699e1e71", size = 15293313 }, - { url = "https://files.pythonhosted.org/packages/b9/ff/e892c8adae1f3ee73816bbbe599642116a5d42aa9c4dd4db24f80fdd2263/xformers-0.0.29.post1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:cc47ac48ddf975aa37f0b544e20c2764f7bebb9d11cc320109ac205fec09f3f2", size = 15294196 }, -] - [[package]] name = "yapf" version = "0.43.0" @@ -2806,62 +2598,3 @@ sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e wheels = [ { url = "https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, ] - -[[package]] -name = "zstandard" -version = "0.23.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cffi", marker = "platform_python_implementation == 'PyPy'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/f6/2ac0287b442160a89d726b17a9184a4c615bb5237db763791a7fd16d9df1/zstandard-0.23.0.tar.gz", hash = "sha256:b2d8c62d08e7255f68f7a740bae85b3c9b8e5466baa9cbf7f57f1cde0ac6bc09", size = 681701 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2a/55/bd0487e86679db1823fc9ee0d8c9c78ae2413d34c0b461193b5f4c31d22f/zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9", size = 788701 }, - { url = "https://files.pythonhosted.org/packages/e1/8a/ccb516b684f3ad987dfee27570d635822e3038645b1a950c5e8022df1145/zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880", size = 633678 }, - { url = "https://files.pythonhosted.org/packages/12/89/75e633d0611c028e0d9af6df199423bf43f54bea5007e6718ab7132e234c/zstandard-0.23.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77da4c6bfa20dd5ea25cbf12c76f181a8e8cd7ea231c673828d0386b1740b8dc", size = 4941098 }, - { url = "https://files.pythonhosted.org/packages/4a/7a/bd7f6a21802de358b63f1ee636ab823711c25ce043a3e9f043b4fcb5ba32/zstandard-0.23.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2170c7e0367dde86a2647ed5b6f57394ea7f53545746104c6b09fc1f4223573", size = 5308798 }, - { url = "https://files.pythonhosted.org/packages/79/3b/775f851a4a65013e88ca559c8ae42ac1352db6fcd96b028d0df4d7d1d7b4/zstandard-0.23.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c16842b846a8d2a145223f520b7e18b57c8f476924bda92aeee3a88d11cfc391", size = 5341840 }, - { url = "https://files.pythonhosted.org/packages/09/4f/0cc49570141dd72d4d95dd6fcf09328d1b702c47a6ec12fbed3b8aed18a5/zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:157e89ceb4054029a289fb504c98c6a9fe8010f1680de0201b3eb5dc20aa6d9e", size = 5440337 }, - { url = 
"https://files.pythonhosted.org/packages/e7/7c/aaa7cd27148bae2dc095191529c0570d16058c54c4597a7d118de4b21676/zstandard-0.23.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:203d236f4c94cd8379d1ea61db2fce20730b4c38d7f1c34506a31b34edc87bdd", size = 4861182 }, - { url = "https://files.pythonhosted.org/packages/ac/eb/4b58b5c071d177f7dc027129d20bd2a44161faca6592a67f8fcb0b88b3ae/zstandard-0.23.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:dc5d1a49d3f8262be192589a4b72f0d03b72dcf46c51ad5852a4fdc67be7b9e4", size = 4932936 }, - { url = "https://files.pythonhosted.org/packages/44/f9/21a5fb9bb7c9a274b05ad700a82ad22ce82f7ef0f485980a1e98ed6e8c5f/zstandard-0.23.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:752bf8a74412b9892f4e5b58f2f890a039f57037f52c89a740757ebd807f33ea", size = 5464705 }, - { url = "https://files.pythonhosted.org/packages/49/74/b7b3e61db3f88632776b78b1db597af3f44c91ce17d533e14a25ce6a2816/zstandard-0.23.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80080816b4f52a9d886e67f1f96912891074903238fe54f2de8b786f86baded2", size = 4857882 }, - { url = "https://files.pythonhosted.org/packages/4a/7f/d8eb1cb123d8e4c541d4465167080bec88481ab54cd0b31eb4013ba04b95/zstandard-0.23.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:84433dddea68571a6d6bd4fbf8ff398236031149116a7fff6f777ff95cad3df9", size = 4697672 }, - { url = "https://files.pythonhosted.org/packages/5e/05/f7dccdf3d121309b60342da454d3e706453a31073e2c4dac8e1581861e44/zstandard-0.23.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ab19a2d91963ed9e42b4e8d77cd847ae8381576585bad79dbd0a8837a9f6620a", size = 5206043 }, - { url = "https://files.pythonhosted.org/packages/86/9d/3677a02e172dccd8dd3a941307621c0cbd7691d77cb435ac3c75ab6a3105/zstandard-0.23.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:59556bf80a7094d0cfb9f5e50bb2db27fefb75d5138bb16fb052b61b0e0eeeb0", size = 5667390 }, - { url = "https://files.pythonhosted.org/packages/41/7e/0012a02458e74a7ba122cd9cafe491facc602c9a17f590367da369929498/zstandard-0.23.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:27d3ef2252d2e62476389ca8f9b0cf2bbafb082a3b6bfe9d90cbcbb5529ecf7c", size = 5198901 }, - { url = "https://files.pythonhosted.org/packages/65/3a/8f715b97bd7bcfc7342d8adcd99a026cb2fb550e44866a3b6c348e1b0f02/zstandard-0.23.0-cp310-cp310-win32.whl", hash = "sha256:5d41d5e025f1e0bccae4928981e71b2334c60f580bdc8345f824e7c0a4c2a813", size = 430596 }, - { url = "https://files.pythonhosted.org/packages/19/b7/b2b9eca5e5a01111e4fe8a8ffb56bdcdf56b12448a24effe6cfe4a252034/zstandard-0.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:519fbf169dfac1222a76ba8861ef4ac7f0530c35dd79ba5727014613f91613d4", size = 495498 }, - { url = "https://files.pythonhosted.org/packages/9e/40/f67e7d2c25a0e2dc1744dd781110b0b60306657f8696cafb7ad7579469bd/zstandard-0.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:34895a41273ad33347b2fc70e1bff4240556de3c46c6ea430a7ed91f9042aa4e", size = 788699 }, - { url = "https://files.pythonhosted.org/packages/e8/46/66d5b55f4d737dd6ab75851b224abf0afe5774976fe511a54d2eb9063a41/zstandard-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77ea385f7dd5b5676d7fd943292ffa18fbf5c72ba98f7d09fc1fb9e819b34c23", size = 633681 }, - { url = "https://files.pythonhosted.org/packages/63/b6/677e65c095d8e12b66b8f862b069bcf1f1d781b9c9c6f12eb55000d57583/zstandard-0.23.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:983b6efd649723474f29ed42e1467f90a35a74793437d0bc64a5bf482bedfa0a", size = 4944328 }, - { url = "https://files.pythonhosted.org/packages/59/cc/e76acb4c42afa05a9d20827116d1f9287e9c32b7ad58cc3af0721ce2b481/zstandard-0.23.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80a539906390591dd39ebb8d773771dc4db82ace6372c4d41e2d293f8e32b8db", size = 5311955 }, - { url = "https://files.pythonhosted.org/packages/78/e4/644b8075f18fc7f632130c32e8f36f6dc1b93065bf2dd87f03223b187f26/zstandard-0.23.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:445e4cb5048b04e90ce96a79b4b63140e3f4ab5f662321975679b5f6360b90e2", size = 5344944 }, - { url = "https://files.pythonhosted.org/packages/76/3f/dbafccf19cfeca25bbabf6f2dd81796b7218f768ec400f043edc767015a6/zstandard-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd30d9c67d13d891f2360b2a120186729c111238ac63b43dbd37a5a40670b8ca", size = 5442927 }, - { url = "https://files.pythonhosted.org/packages/0c/c3/d24a01a19b6733b9f218e94d1a87c477d523237e07f94899e1c10f6fd06c/zstandard-0.23.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d20fd853fbb5807c8e84c136c278827b6167ded66c72ec6f9a14b863d809211c", size = 4864910 }, - { url = "https://files.pythonhosted.org/packages/1c/a9/cf8f78ead4597264f7618d0875be01f9bc23c9d1d11afb6d225b867cb423/zstandard-0.23.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ed1708dbf4d2e3a1c5c69110ba2b4eb6678262028afd6c6fbcc5a8dac9cda68e", size = 4935544 }, - { url = "https://files.pythonhosted.org/packages/2c/96/8af1e3731b67965fb995a940c04a2c20997a7b3b14826b9d1301cf160879/zstandard-0.23.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:be9b5b8659dff1f913039c2feee1aca499cfbc19e98fa12bc85e037c17ec6ca5", size = 5467094 }, - { url = "https://files.pythonhosted.org/packages/ff/57/43ea9df642c636cb79f88a13ab07d92d88d3bfe3e550b55a25a07a26d878/zstandard-0.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:65308f4b4890aa12d9b6ad9f2844b7ee42c7f7a4fd3390425b242ffc57498f48", size = 4860440 }, - { url = "https://files.pythonhosted.org/packages/46/37/edb78f33c7f44f806525f27baa300341918fd4c4af9472fbc2c3094be2e8/zstandard-0.23.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98da17ce9cbf3bfe4617e836d561e433f871129e3a7ac16d6ef4c680f13a839c", size = 4700091 }, - { url = "https://files.pythonhosted.org/packages/c1/f1/454ac3962671a754f3cb49242472df5c2cced4eb959ae203a377b45b1a3c/zstandard-0.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8ed7d27cb56b3e058d3cf684d7200703bcae623e1dcc06ed1e18ecda39fee003", size = 5208682 }, - { url = "https://files.pythonhosted.org/packages/85/b2/1734b0fff1634390b1b887202d557d2dd542de84a4c155c258cf75da4773/zstandard-0.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:b69bb4f51daf461b15e7b3db033160937d3ff88303a7bc808c67bbc1eaf98c78", size = 5669707 }, - { url = "https://files.pythonhosted.org/packages/52/5a/87d6971f0997c4b9b09c495bf92189fb63de86a83cadc4977dc19735f652/zstandard-0.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:034b88913ecc1b097f528e42b539453fa82c3557e414b3de9d5632c80439a473", size = 5201792 }, - { url = "https://files.pythonhosted.org/packages/79/02/6f6a42cc84459d399bd1a4e1adfc78d4dfe45e56d05b072008d10040e13b/zstandard-0.23.0-cp311-cp311-win32.whl", hash = "sha256:f2d4380bf5f62daabd7b751ea2339c1a21d1c9463f1feb7fc2bdcea2c29c3160", size = 430586 }, - { url = 
"https://files.pythonhosted.org/packages/be/a2/4272175d47c623ff78196f3c10e9dc7045c1b9caf3735bf041e65271eca4/zstandard-0.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:62136da96a973bd2557f06ddd4e8e807f9e13cbb0bfb9cc06cfe6d98ea90dfe0", size = 495420 }, - { url = "https://files.pythonhosted.org/packages/7b/83/f23338c963bd9de687d47bf32efe9fd30164e722ba27fb59df33e6b1719b/zstandard-0.23.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b4567955a6bc1b20e9c31612e615af6b53733491aeaa19a6b3b37f3b65477094", size = 788713 }, - { url = "https://files.pythonhosted.org/packages/5b/b3/1a028f6750fd9227ee0b937a278a434ab7f7fdc3066c3173f64366fe2466/zstandard-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e172f57cd78c20f13a3415cc8dfe24bf388614324d25539146594c16d78fcc8", size = 633459 }, - { url = "https://files.pythonhosted.org/packages/26/af/36d89aae0c1f95a0a98e50711bc5d92c144939efc1f81a2fcd3e78d7f4c1/zstandard-0.23.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0e166f698c5a3e914947388c162be2583e0c638a4703fc6a543e23a88dea3c1", size = 4945707 }, - { url = "https://files.pythonhosted.org/packages/cd/2e/2051f5c772f4dfc0aae3741d5fc72c3dcfe3aaeb461cc231668a4db1ce14/zstandard-0.23.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12a289832e520c6bd4dcaad68e944b86da3bad0d339ef7989fb7e88f92e96072", size = 5306545 }, - { url = "https://files.pythonhosted.org/packages/0a/9e/a11c97b087f89cab030fa71206963090d2fecd8eb83e67bb8f3ffb84c024/zstandard-0.23.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d50d31bfedd53a928fed6707b15a8dbeef011bb6366297cc435accc888b27c20", size = 5337533 }, - { url = "https://files.pythonhosted.org/packages/fc/79/edeb217c57fe1bf16d890aa91a1c2c96b28c07b46afed54a5dcf310c3f6f/zstandard-0.23.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72c68dda124a1a138340fb62fa21b9bf4848437d9ca60bd35db36f2d3345f373", size = 5436510 }, - { url = "https://files.pythonhosted.org/packages/81/4f/c21383d97cb7a422ddf1ae824b53ce4b51063d0eeb2afa757eb40804a8ef/zstandard-0.23.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53dd9d5e3d29f95acd5de6802e909ada8d8d8cfa37a3ac64836f3bc4bc5512db", size = 4859973 }, - { url = "https://files.pythonhosted.org/packages/ab/15/08d22e87753304405ccac8be2493a495f529edd81d39a0870621462276ef/zstandard-0.23.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6a41c120c3dbc0d81a8e8adc73312d668cd34acd7725f036992b1b72d22c1772", size = 4936968 }, - { url = "https://files.pythonhosted.org/packages/eb/fa/f3670a597949fe7dcf38119a39f7da49a8a84a6f0b1a2e46b2f71a0ab83f/zstandard-0.23.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:40b33d93c6eddf02d2c19f5773196068d875c41ca25730e8288e9b672897c105", size = 5467179 }, - { url = "https://files.pythonhosted.org/packages/4e/a9/dad2ab22020211e380adc477a1dbf9f109b1f8d94c614944843e20dc2a99/zstandard-0.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9206649ec587e6b02bd124fb7799b86cddec350f6f6c14bc82a2b70183e708ba", size = 4848577 }, - { url = "https://files.pythonhosted.org/packages/08/03/dd28b4484b0770f1e23478413e01bee476ae8227bbc81561f9c329e12564/zstandard-0.23.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76e79bc28a65f467e0409098fa2c4376931fd3207fbeb6b956c7c476d53746dd", size = 4693899 }, - { url = 
"https://files.pythonhosted.org/packages/2b/64/3da7497eb635d025841e958bcd66a86117ae320c3b14b0ae86e9e8627518/zstandard-0.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:66b689c107857eceabf2cf3d3fc699c3c0fe8ccd18df2219d978c0283e4c508a", size = 5199964 }, - { url = "https://files.pythonhosted.org/packages/43/a4/d82decbab158a0e8a6ebb7fc98bc4d903266bce85b6e9aaedea1d288338c/zstandard-0.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9c236e635582742fee16603042553d276cca506e824fa2e6489db04039521e90", size = 5655398 }, - { url = "https://files.pythonhosted.org/packages/f2/61/ac78a1263bc83a5cf29e7458b77a568eda5a8f81980691bbc6eb6a0d45cc/zstandard-0.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35", size = 5191313 }, - { url = "https://files.pythonhosted.org/packages/e7/54/967c478314e16af5baf849b6ee9d6ea724ae5b100eb506011f045d3d4e16/zstandard-0.23.0-cp312-cp312-win32.whl", hash = "sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d", size = 430877 }, - { url = "https://files.pythonhosted.org/packages/75/37/872d74bd7739639c4553bf94c84af7d54d8211b626b352bc57f0fd8d1e3f/zstandard-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b", size = 495595 }, -] From a652fdd781b0ce5f6202f5b90b61b05b66be087a Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Fri, 7 Feb 2025 22:14:21 -0300 Subject: [PATCH 28/35] Move skynet.utils to skynet.dgpu submodule --- skynet/cli.py | 10 +++++----- skynet/dgpu/compute.py | 2 +- skynet/{ => dgpu}/utils.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) rename skynet/{ => dgpu}/utils.py (99%) diff --git a/skynet/cli.py b/skynet/cli.py index f78d31d..4a5850e 100755 --- a/skynet/cli.py +++ b/skynet/cli.py @@ -45,7 +45,7 @@ def skynet(*args, **kwargs): @click.option('--steps', '-s', default=26) @click.option('--seed', '-S', default=None) def txt2img(*args, **kwargs): - from . import utils # TODO? why here, import cycle? + from skynet.dgpu import utils config = load_skynet_toml() set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) @@ -70,7 +70,7 @@ def txt2img(*args, **kwargs): @click.option('--steps', '-s', default=26) @click.option('--seed', '-S', default=None) def img2img(model, prompt, input, output, strength, guidance, steps, seed): - from . import utils + from skynet.dgpu import utils config = load_skynet_toml() set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) utils.img2img( @@ -98,7 +98,7 @@ def img2img(model, prompt, input, output, strength, guidance, steps, seed): @click.option('--steps', '-s', default=26) @click.option('--seed', '-S', default=None) def inpaint(model, prompt, input, mask, output, strength, guidance, steps, seed): - from . import utils + from skynet.dgpu import utils config = load_skynet_toml() set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home) utils.inpaint( @@ -119,7 +119,7 @@ def inpaint(model, prompt, input, mask, output, strength, guidance, steps, seed) @click.option('--output', '-o', default='output.png') @click.option('--model', '-m', default='weights/RealESRGAN_x4plus.pth') def upscale(input, output, model): - from . import utils + from skynet.dgpu import utils utils.upscale( img_path=input, output=output, @@ -128,7 +128,7 @@ def upscale(input, output, model): @skynet.command() def download(): - from . 
import utils
+    from skynet.dgpu import utils
     config = load_skynet_toml()
     set_hf_vars(config.dgpu.hf_token, config.dgpu.hf_home)
     utils.download_all_models(config.dgpu.hf_token, config.dgpu.hf_home)
diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py
index 23daa63..67bc7ea 100755
--- a/skynet/dgpu/compute.py
+++ b/skynet/dgpu/compute.py
@@ -19,7 +19,7 @@ from skynet.dgpu.errors import (
     DGPUInferenceCancelled,
 )
 
-from skynet.utils import crop_image, convert_from_cv2_to_image, convert_from_image_to_cv2, convert_from_img_to_bytes, init_upscaler, pipeline_for
+from skynet.dgpu.utils import crop_image, convert_from_cv2_to_image, convert_from_image_to_cv2, convert_from_img_to_bytes, init_upscaler, pipeline_for
 
 
 def prepare_params_for_diffuse(
diff --git a/skynet/utils.py b/skynet/dgpu/utils.py
similarity index 99%
rename from skynet/utils.py
rename to skynet/dgpu/utils.py
index 0ebf6fb..31d0797 100755
--- a/skynet/utils.py
+++ b/skynet/dgpu/utils.py
@@ -23,7 +23,7 @@ from diffusers import (
 )
 from huggingface_hub import login
 
-from .constants import MODELS
+from skynet.constants import MODELS
 
 # Hack to fix a changed import in torchvision 0.17+, which otherwise breaks
 # basicsr; see https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13985

From 80633627da4c1c525afab497c4d51dc432ac3fbd Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Fri, 7 Feb 2025 22:15:01 -0300
Subject: [PATCH 29/35] Remove unused util

---
 skynet/dgpu/utils.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/skynet/dgpu/utils.py b/skynet/dgpu/utils.py
index 31d0797..86179a2 100755
--- a/skynet/dgpu/utils.py
+++ b/skynet/dgpu/utils.py
@@ -41,10 +41,6 @@
 from realesrgan import RealESRGANer
 
 
-def time_ms():
-    return int(time.time() * 1000)
-
-
 def convert_from_cv2_to_image(img: np.ndarray) -> Image:
     # return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
     return Image.fromarray(img)

From b365614194f6e39d8227c3ff2d2f0062651a9cd0 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Fri, 7 Feb 2025 22:24:12 -0300
Subject: [PATCH 30/35] Add real compute test and parametrize the mocker test

---
 tests/test_reqs.py | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/tests/test_reqs.py b/tests/test_reqs.py
index a8d8564..03a1cc6 100644
--- a/tests/test_reqs.py
+++ b/tests/test_reqs.py
@@ -1,12 +1,30 @@
-import json
+import pytest
 
-from skynet.constants import *
 from skynet.dgpu.compute import maybe_load_model, compute_one
 
 
-async def test_diffuse(inject_mockers):
+@pytest.mark.parametrize("mode", [
+    'diffuse', 'txt2img'
+])
+async def test_pipeline_mocker(inject_mockers, mode):
     model = 'skygpu/txt2img-mocker'
-    mode = 'diffuse'
+    params = {
+        "prompt": "Kronos God Realistic 4k",
+        "model": model,
+        "step": 21,
+        "width": 1024,
+        "height": 1024,
+        "seed": 168402949,
+        "guidance": "7.5"
+    }
+
+    with maybe_load_model(model, mode) as model:
+        compute_one(model, 0, mode, params)
+
+
+async def test_pipeline():
+    model = 'stabilityai/stable-diffusion-xl-base-1.0'
+    mode = 'txt2img'
     params = {
         "prompt": "Kronos God Realistic 4k",
         "model": model,

From 5edd7f4db44df79946e57d8fdfe9df545cf72040 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Fri, 7 Feb 2025 22:40:19 -0300
Subject: [PATCH 31/35] Log information about non-custom pipelines

---
 skynet/dgpu/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/skynet/dgpu/utils.py b/skynet/dgpu/utils.py
index 86179a2..2b3090e 100755
--- a/skynet/dgpu/utils.py
+++ b/skynet/dgpu/utils.py
@@ -123,8 +123,7 @@ def pipeline_for(
 
         return pipe
 
     except ImportError:
-        # TODO, uhh why not warn/error log this?
-        ...
+        logging.info(f'didn\'t find a custom pipeline file for {shortname}')
 
     req_mem = model_info.mem

From 5d67b3cd60d42f4cc1f5e332c8315936ce2fcd36 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Fri, 7 Feb 2025 22:43:40 -0300
Subject: [PATCH 32/35] Address last of fomo's comments, related to pinner
 logging, clean up ipfs module

---
 skynet/ipfs/__init__.py | 2 +-
 skynet/ipfs/pinner.py   | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/skynet/ipfs/__init__.py b/skynet/ipfs/__init__.py
index 125c225..af77da9 100644
--- a/skynet/ipfs/__init__.py
+++ b/skynet/ipfs/__init__.py
@@ -24,7 +24,7 @@ class AsyncIPFSHTTP:
             raise IPFSClientException(resp.text)
 
         return resp.json()
-#!/usr/bin/python
+
     async def add(self, file_path: Path, **kwargs):
         files = {
             'file': (file_path.name, file_path.open('rb'))
diff --git a/skynet/ipfs/pinner.py b/skynet/ipfs/pinner.py
index dc9fe2d..dd53219 100755
--- a/skynet/ipfs/pinner.py
+++ b/skynet/ipfs/pinner.py
@@ -1,6 +1,4 @@
 import logging
-import traceback
-
 from datetime import datetime, timedelta
 
 import trio
@@ -117,8 +115,7 @@ class SkynetPinner:
                     n.start_soon(self.task_pin, cid)
 
             except OSError:
-                # TODO, use `logging.exception()` here instead ??
-                traceback.print_exc()
+                logging.exception('OSError while trying to pin?')
 
             except KeyboardInterrupt:
                 break

From 30eaa6c19427793847d90c23141f2ad4f230662c Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Fri, 7 Feb 2025 23:12:28 -0300
Subject: [PATCH 33/35] Switch to non-iterator poller, NetConnector has
 wait_data_update() now

---
 skynet/dgpu/__init__.py |  2 ++
 skynet/dgpu/daemon.py   |  6 ++++--
 skynet/dgpu/network.py  | 15 ++++++++++-----
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/skynet/dgpu/__init__.py b/skynet/dgpu/__init__.py
index 24e2f7a..2d834f8 100755
--- a/skynet/dgpu/__init__.py
+++ b/skynet/dgpu/__init__.py
@@ -27,6 +27,8 @@ async def open_worker(config: Config):
             if tui:
                 n.start_soon(tui.run)
 
+            n.start_soon(conn.iter_poll_update, config.poll_time)
+
             yield conn
 
         except *urwid.ExitMainLoop:
diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py
index eb002b5..fabe5a9 100755
--- a/skynet/dgpu/daemon.py
+++ b/skynet/dgpu/daemon.py
@@ -182,9 +182,11 @@ async def maybe_serve_one(
 
 async def dgpu_serve_forever(config: Config, conn: NetConnector):
     await maybe_update_tui_balance(conn)
+
     try:
-        async for tables in conn.iter_poll_update(config.poll_time):
-            queue = tables['queue']
+        while True:
+            await conn.wait_data_update()
+            queue = conn._tables['queue']
 
             random.shuffle(queue)
             queue = sorted(
diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py
index 8b45197..a54c032 100755
--- a/skynet/dgpu/network.py
+++ b/skynet/dgpu/network.py
@@ -72,6 +72,7 @@ class NetConnector:
             'requests': {},
             'results': []
         }
+        self._data_event = trio.Event()
 
         maybe_update_tui(lambda tui: tui.set_header_text(new_worker_name=self.config.account))
 
@@ -139,8 +140,7 @@ class NetConnector:
 
     async def get_full_queue_snapshot(self):
         '''
-        Keep in-sync with latest (telos chain's smart-contract) table
-        state by polling (currently with period 1s).
+ Get a "snapshot" of current contract table state ''' snap = { @@ -164,17 +164,22 @@ class NetConnector: return snap - async def iter_poll_update(self, poll_time: float) -> AsyncGenerator[dict, None]: + async def wait_data_update(self): + await self._data_event.wait() + + async def iter_poll_update(self, poll_time: float): ''' - Long running task, olls gpu contract tables yields latest table rows + Long running task, polls gpu contract tables latest table rows, + awakes any self._data_event waiters ''' while True: start_time = time.time() self._tables = await self.get_full_queue_snapshot() elapsed = time.time() - start_time - yield self._tables + self._data_event.set() await trio.sleep(max(poll_time - elapsed, 0.1)) + self._data_event = trio.Event() async def should_cancel_work(self, request_id: int) -> bool: logging.info('should cancel work?') From 8828fa13fc706586e38be586011a5cd869d1a775 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Fri, 7 Feb 2025 23:17:33 -0300 Subject: [PATCH 34/35] Add table_index system in poller in order for daemon to be aware of stale data --- skynet/dgpu/daemon.py | 7 +++++++ skynet/dgpu/network.py | 3 +++ 2 files changed, 10 insertions(+) diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index fabe5a9..2fc2159 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -183,9 +183,16 @@ async def maybe_serve_one( async def dgpu_serve_forever(config: Config, conn: NetConnector): await maybe_update_tui_balance(conn) + last_poll_idx = -1 try: while True: await conn.wait_data_update() + if conn.poll_index == last_poll_idx: + await trio.sleep(config.poll_time) + continue + + last_poll_idx = conn.poll_index + queue = conn._tables['queue'] random.shuffle(queue) diff --git a/skynet/dgpu/network.py b/skynet/dgpu/network.py index a54c032..e6585d1 100755 --- a/skynet/dgpu/network.py +++ b/skynet/dgpu/network.py @@ -67,6 +67,8 @@ class NetConnector: self.ipfs_client = AsyncIPFSHTTP(config.ipfs_url) + # poll_index is used to detect stale data + self.poll_index = 0 self._tables = { 'queue': [], 'requests': {}, @@ -180,6 +182,7 @@ class NetConnector: self._data_event.set() await trio.sleep(max(poll_time - elapsed, 0.1)) self._data_event = trio.Event() + self.poll_index += 1 async def should_cancel_work(self, request_id: int) -> bool: logging.info('should cancel work?') From a4e40ba6624cc7a27c761a54e02938e1d23f4ca9 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Sat, 8 Feb 2025 00:18:38 -0300 Subject: [PATCH 35/35] Added auto-download through hf for the upscaler --- skynet/dgpu/compute.py | 2 +- skynet/dgpu/daemon.py | 5 +++-- skynet/dgpu/tui.py | 3 ++- skynet/dgpu/utils.py | 18 ++++++++++++------ 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py index 67bc7ea..3a0645a 100755 --- a/skynet/dgpu/compute.py +++ b/skynet/dgpu/compute.py @@ -114,7 +114,7 @@ def compute_one( inputs: list[bytes] = [], should_cancel = None ): - total_steps = params['step'] + total_steps = params['step'] if 'step' in params else 1 def inference_step_wakeup(*args, **kwargs): '''This is a callback function that gets invoked every inference step, we need to raise an exception here if we need to cancel work diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py index 2fc2159..b23c652 100755 --- a/skynet/dgpu/daemon.py +++ b/skynet/dgpu/daemon.py @@ -124,7 +124,8 @@ async def maybe_serve_one( request_hash = sha256(hash_str.encode('utf-8')).hexdigest() logging.info(f'calculated request hash: {request_hash}') 
From a4e40ba6624cc7a27c761a54e02938e1d23f4ca9 Mon Sep 17 00:00:00 2001
From: Guillermo Rodriguez
Date: Sat, 8 Feb 2025 00:18:38 -0300
Subject: [PATCH 35/35] Add auto-download through HF for the upscaler

---
 skynet/dgpu/compute.py |  2 +-
 skynet/dgpu/daemon.py  |  5 +++--
 skynet/dgpu/tui.py     |  3 ++-
 skynet/dgpu/utils.py   | 18 ++++++++++++------
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/skynet/dgpu/compute.py b/skynet/dgpu/compute.py
index 67bc7ea..3a0645a 100755
--- a/skynet/dgpu/compute.py
+++ b/skynet/dgpu/compute.py
@@ -114,7 +114,7 @@ def compute_one(
     inputs: list[bytes] = [],
     should_cancel = None
 ):
-    total_steps = params['step']
+    total_steps = params.get('step', 1)
     def inference_step_wakeup(*args, **kwargs):
         '''This is a callback function that gets invoked every inference step,
         we need to raise an exception here if we need to cancel work
diff --git a/skynet/dgpu/daemon.py b/skynet/dgpu/daemon.py
index 2fc2159..b23c652 100755
--- a/skynet/dgpu/daemon.py
+++ b/skynet/dgpu/daemon.py
@@ -124,7 +124,8 @@ async def maybe_serve_one(
     request_hash = sha256(hash_str.encode('utf-8')).hexdigest()
     logging.info(f'calculated request hash: {request_hash}')
 
-    total_step = body['params']['step']
+    params = body['params']
+    total_step = params.get('step', 1)
     model = body['params']['model']
     mode = body['method']
 
@@ -152,7 +153,7 @@
                     compute_one,
                     model,
                     rid,
-                    mode, body['params'],
+                    mode, params,
                     inputs=inputs,
                     should_cancel=conn.should_cancel_work,
                 )
diff --git a/skynet/dgpu/tui.py b/skynet/dgpu/tui.py
index ed3c1a7..5403025 100644
--- a/skynet/dgpu/tui.py
+++ b/skynet/dgpu/tui.py
@@ -81,10 +81,11 @@ class WorkerMonitor:
 
         for req in requests:
             # Build a columns widget for the request row
+            prompt = req.get('prompt', 'UPSCALE')
             columns = urwid.Columns([
                 ('fixed', 5, urwid.Text(f"#{req['id']}")),      # e.g. "#12"
                 ('weight', 3, urwid.Text(req['model'])),
-                ('weight', 3, urwid.Text(req['prompt'])),
+                ('weight', 3, urwid.Text(prompt)),
                 ('fixed', 13, urwid.Text(req['user'])),
                 ('fixed', 13, urwid.Text(req['reward'])),
             ], dividechars=1)
diff --git a/skynet/dgpu/utils.py b/skynet/dgpu/utils.py
index 2b3090e..a355be9 100755
--- a/skynet/dgpu/utils.py
+++ b/skynet/dgpu/utils.py
@@ -21,8 +21,9 @@ from diffusers import (
     AutoPipelineForInpainting,
     EulerAncestralDiscreteScheduler,
 )
-from huggingface_hub import login
+from huggingface_hub import login, hf_hub_download
 
+from skynet.config import load_skynet_toml
 from skynet.constants import MODELS
 
 # Hack to fix a changed import in torchvision 0.17+, which otherwise breaks
@@ -40,7 +41,6 @@
 from basicsr.archs.rrdbnet_arch import RRDBNet
 from realesrgan import RealESRGANer
 
-
 def convert_from_cv2_to_image(img: np.ndarray) -> Image:
     # return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
     return Image.fromarray(img)
@@ -285,7 +285,14 @@
     image.save(output)
 
 
-def init_upscaler(model_path: str = 'hf_home/RealESRGAN_x4plus.pth'):
+def init_upscaler():
+    config = load_skynet_toml().dgpu
+    model_path = hf_hub_download(
+        'leonelhs/realesrgan',
+        'RealESRGAN_x4plus.pth',
+        token=config.hf_token,
+        cache_dir=config.hf_home
+    )
     return RealESRGANer(
         scale=4,
         model_path=model_path,
@@ -303,12 +310,11 @@
 
 def upscale(
     img_path: str = 'input.png',
-    output: str = 'output.png',
-    model_path: str = 'hf_home/RealESRGAN_x4plus.pth'
+    output: str = 'output.png'
 ):
     input_img = Image.open(img_path).convert('RGB')
 
-    upscaler = init_upscaler(model_path=model_path)
+    upscaler = init_upscaler()
 
     up_img, _ = upscaler.enhance(
         convert_from_image_to_cv2(input_img),
         outscale=4)
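
[editor's note] For reference, the weight-resolution path that
`init_upscaler()` gains above boils down to a single `huggingface_hub` call.
A minimal usage sketch follows (editor's addition; the repo id and filename
are the ones used in the patch, while anonymous access to that repo and the
bare `None` defaults are assumptions here):

from huggingface_hub import hf_hub_download

# Resolve the Real-ESRGAN weights the way init_upscaler() now does.
# hf_hub_download() returns a local path inside the HF cache and is
# idempotent: a second call returns the cached file without re-fetching.
model_path = hf_hub_download(
    'leonelhs/realesrgan',      # repo id used by the patch
    'RealESRGAN_x4plus.pth',    # weights file inside that repo
    token=None,                 # skynet passes config.hf_token here
    cache_dir=None,             # skynet passes config.hf_home here
)
print(model_path)
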