mirror of https://github.com/skygpu/skynet.git
Further improvements to indentation and logic in the daemon's `maybe_serve_one`; also might have fixed a bug related to using `id` instead of `request_id` in the search-existing-results phase, and added way more logging
parent
b62cdd6802
commit
399299c62b
|
@ -1,3 +1,5 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
import trio
|
import trio
|
||||||
|
|
||||||
from hypercorn.config import Config
|
from hypercorn.config import Config
|
||||||
|
@ -16,6 +18,10 @@ async def open_dgpu_node(config: dict) -> None:
|
||||||
and *maybe* serve a `hypercorn` web API.
|
and *maybe* serve a `hypercorn` web API.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
# suppress logs from httpx (logs url + status after every query)
|
||||||
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
|
|
||||||
conn = NetConnector(config)
|
conn = NetConnector(config)
|
||||||
mm = ModelMngr(config)
|
mm = ModelMngr(config)
|
||||||
daemon = WorkerDaemon(mm, conn, config)
|
daemon = WorkerDaemon(mm, conn, config)
|
||||||
|
@ -33,6 +39,7 @@ async def open_dgpu_node(config: dict) -> None:
|
||||||
# TODO, consider a more explicit `as hypercorn_serve`
|
# TODO, consider a more explicit `as hypercorn_serve`
|
||||||
# to clarify?
|
# to clarify?
|
||||||
if api:
|
if api:
|
||||||
|
logging.info(f'serving api @ {config["api_bind"]}')
|
||||||
tn.start_soon(serve, api, api_conf)
|
tn.start_soon(serve, api, api_conf)
|
||||||
|
|
||||||
# block until cancelled
|
# block until cancelled
|
||||||
|
|
|
@ -83,8 +83,8 @@ class ModelMngr:
|
||||||
# self.load_model(DEFAULT_INITAL_MODEL, 'txt2img')
|
# self.load_model(DEFAULT_INITAL_MODEL, 'txt2img')
|
||||||
|
|
||||||
def log_debug_info(self):
|
def log_debug_info(self):
|
||||||
logging.info('memory summary:')
|
logging.debug('memory summary:')
|
||||||
logging.info('\n' + torch.cuda.memory_summary())
|
logging.debug('\n' + torch.cuda.memory_summary())
|
||||||
|
|
||||||
def is_model_loaded(self, name: str, mode: str):
|
def is_model_loaded(self, name: str, mode: str):
|
||||||
if (name == self._model_name and
|
if (name == self._model_name and
|
||||||
|
@ -114,6 +114,8 @@ class ModelMngr:
|
||||||
name, mode, cache_dir=self.cache_dir)
|
name, mode, cache_dir=self.cache_dir)
|
||||||
self._model_mode = mode
|
self._model_mode = mode
|
||||||
self._model_name = name
|
self._model_name = name
|
||||||
|
logging.info('{name} loaded!')
|
||||||
|
self.log_debug_info()
|
||||||
|
|
||||||
def compute_one(
|
def compute_one(
|
||||||
self,
|
self,
|
||||||
|
@ -126,11 +128,7 @@ class ModelMngr:
|
||||||
if self._should_cancel:
|
if self._should_cancel:
|
||||||
should_raise = trio.from_thread.run(self._should_cancel, request_id)
|
should_raise = trio.from_thread.run(self._should_cancel, request_id)
|
||||||
if should_raise:
|
if should_raise:
|
||||||
logging.warn(f'cancelling work at step {step}')
|
logging.warn(f'CANCELLING work at step {step}')
|
||||||
|
|
||||||
# ?TODO, this is never caught, so why is it
|
|
||||||
# raised specially?
|
|
||||||
raise DGPUInferenceCancelled()
|
|
||||||
|
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
@ -206,8 +204,6 @@ class ModelMngr:
|
||||||
raise DGPUComputeError('Unsupported compute method')
|
raise DGPUComputeError('Unsupported compute method')
|
||||||
|
|
||||||
except BaseException as err:
|
except BaseException as err:
|
||||||
logging.error(err)
|
|
||||||
# to see the src exc in tb
|
|
||||||
raise DGPUComputeError(str(err)) from err
|
raise DGPUComputeError(str(err)) from err
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
|
|
@ -105,7 +105,11 @@ class WorkerDaemon:
|
||||||
for status in self._snap['requests'][request_id]
|
for status in self._snap['requests'][request_id]
|
||||||
if status['worker'] != self.account
|
if status['worker'] != self.account
|
||||||
])
|
])
|
||||||
return bool(self.non_compete & competitors)
|
logging.info('should cancel work?')
|
||||||
|
logging.info(f'competitors: {competitors}')
|
||||||
|
should_cancel = bool(self.non_compete & competitors)
|
||||||
|
logging.info(f'cancel: {should_cancel}')
|
||||||
|
return should_cancel
|
||||||
|
|
||||||
|
|
||||||
async def snap_updater_task(self):
|
async def snap_updater_task(self):
|
||||||
|
@ -150,6 +154,7 @@ class WorkerDaemon:
|
||||||
req: dict,
|
req: dict,
|
||||||
):
|
):
|
||||||
rid = req['id']
|
rid = req['id']
|
||||||
|
logging.info(f'maybe serve request #{rid}')
|
||||||
|
|
||||||
# parse request
|
# parse request
|
||||||
body = json.loads(req['body'])
|
body = json.loads(req['body'])
|
||||||
|
@ -161,7 +166,7 @@ class WorkerDaemon:
|
||||||
and
|
and
|
||||||
model not in MODELS
|
model not in MODELS
|
||||||
):
|
):
|
||||||
logging.warning(f'Unknown model {model}')
|
logging.warning(f'unknown model {model}!, skip...')
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# only handle whitelisted models
|
# only handle whitelisted models
|
||||||
|
@ -170,39 +175,54 @@ class WorkerDaemon:
|
||||||
and
|
and
|
||||||
model not in self.model_whitelist
|
model not in self.model_whitelist
|
||||||
):
|
):
|
||||||
|
logging.warning('model not whitelisted!, skip...')
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# if blacklist contains model skip
|
# if blacklist contains model skip
|
||||||
if model in self.model_blacklist:
|
if model in self.model_blacklist:
|
||||||
|
logging.warning('model not blacklisted!, skip...')
|
||||||
return False
|
return False
|
||||||
|
|
||||||
results = [res['id'] for res in self._snap['results']]
|
results = [res['request_id'] for res in self._snap['results']]
|
||||||
|
|
||||||
# if worker is already on that request or
|
# if worker already produced a result for this request
|
||||||
# if worker has a stale status for that request
|
if rid in results:
|
||||||
if rid in results or rid not in self._snap['requests']:
|
logging.info(f'worker already submitted a result for request #{rid}, skip...')
|
||||||
logging.info(f'request {rid} already beign worked on, skip...')
|
return False
|
||||||
return
|
|
||||||
|
|
||||||
statuses = self._snap['requests'][rid]
|
statuses = self._snap['requests'][rid]
|
||||||
if len(statuses) == 0:
|
|
||||||
|
# skip if workers in non_compete already on it
|
||||||
|
competitors = set((status['worker'] for status in statuses))
|
||||||
|
if bool(self.non_compete & competitors):
|
||||||
|
logging.info('worker in configured non_compete list already working on request, skip...')
|
||||||
|
return False
|
||||||
|
|
||||||
|
# resolve the ipfs hashes into the actual data behind them
|
||||||
inputs = []
|
inputs = []
|
||||||
|
raw_inputs = req['binary_data'].split(',')
|
||||||
|
if raw_inputs:
|
||||||
|
logging.info(f'fetching IPFS inputs: {raw_inputs}')
|
||||||
|
|
||||||
|
retry = 3
|
||||||
for _input in req['binary_data'].split(','):
|
for _input in req['binary_data'].split(','):
|
||||||
if _input:
|
if _input:
|
||||||
for _ in range(3):
|
for r in range(retry):
|
||||||
try:
|
try:
|
||||||
# user `GPUConnector` to IO with
|
# user `GPUConnector` to IO with
|
||||||
# storage layer to seed the compute
|
# storage layer to seed the compute
|
||||||
# task.
|
# task.
|
||||||
img = await self.conn.get_input_data(_input)
|
img = await self.conn.get_input_data(_input)
|
||||||
inputs.append(img)
|
inputs.append(img)
|
||||||
|
logging.info(f'retrieved {_input}!')
|
||||||
break
|
break
|
||||||
|
|
||||||
except BaseException:
|
except BaseException:
|
||||||
logging.exception(
|
logging.exception(
|
||||||
'Model input error !?!\n'
|
f'IPFS fetch input error !?! retries left {retry - r - 1}\n'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# compute unique request hash used on submit
|
||||||
hash_str = (
|
hash_str = (
|
||||||
str(req['nonce'])
|
str(req['nonce'])
|
||||||
+
|
+
|
||||||
|
@ -210,17 +230,15 @@ class WorkerDaemon:
|
||||||
+
|
+
|
||||||
req['binary_data']
|
req['binary_data']
|
||||||
)
|
)
|
||||||
logging.info(f'hashing: {hash_str}')
|
logging.debug(f'hashing: {hash_str}')
|
||||||
request_hash = sha256(hash_str.encode('utf-8')).hexdigest()
|
request_hash = sha256(hash_str.encode('utf-8')).hexdigest()
|
||||||
|
logging.info(f'calculated request hash: {request_hash}')
|
||||||
|
|
||||||
# TODO: validate request
|
# TODO: validate request
|
||||||
|
|
||||||
# perform work
|
|
||||||
logging.info(f'working on {body}')
|
|
||||||
|
|
||||||
resp = await self.conn.begin_work(rid)
|
resp = await self.conn.begin_work(rid)
|
||||||
if not resp or 'code' in resp:
|
if not resp or 'code' in resp:
|
||||||
logging.info('probably being worked on already... skip.')
|
logging.info('begin_work error, probably being worked on already... skip.')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
|
@ -257,7 +275,6 @@ class WorkerDaemon:
|
||||||
|
|
||||||
except BaseException as err:
|
except BaseException as err:
|
||||||
logging.exception('Failed to serve model request !?\n')
|
logging.exception('Failed to serve model request !?\n')
|
||||||
# traceback.print_exc() # TODO? <- replaced by above ya?
|
|
||||||
await self.conn.cancel_work(rid, str(err))
|
await self.conn.cancel_work(rid, str(err))
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
|
|
@ -72,9 +72,6 @@ class NetConnector:
|
||||||
self.cleos = CLEOS(endpoint=self.node_url)
|
self.cleos = CLEOS(endpoint=self.node_url)
|
||||||
self.cleos.load_abi('gpu.scd', GPU_CONTRACT_ABI)
|
self.cleos.load_abi('gpu.scd', GPU_CONTRACT_ABI)
|
||||||
|
|
||||||
self.ipfs_gateway_url = None
|
|
||||||
if 'ipfs_gateway_url' in config:
|
|
||||||
self.ipfs_gateway_url = config['ipfs_gateway_url']
|
|
||||||
self.ipfs_url = config['ipfs_url']
|
self.ipfs_url = config['ipfs_url']
|
||||||
|
|
||||||
self.ipfs_client = AsyncIPFSHTTP(self.ipfs_url)
|
self.ipfs_client = AsyncIPFSHTTP(self.ipfs_url)
|
||||||
|
@ -89,7 +86,7 @@ class NetConnector:
|
||||||
|
|
||||||
async def get_work_requests_last_hour(self):
|
async def get_work_requests_last_hour(self):
|
||||||
logging.info('get_work_requests_last_hour')
|
logging.info('get_work_requests_last_hour')
|
||||||
return await failable(
|
rows = await failable(
|
||||||
partial(
|
partial(
|
||||||
self.cleos.aget_table,
|
self.cleos.aget_table,
|
||||||
'gpu.scd', 'gpu.scd', 'queue',
|
'gpu.scd', 'gpu.scd', 'queue',
|
||||||
|
@ -98,13 +95,19 @@ class NetConnector:
|
||||||
lower_bound=int(time.time()) - 3600
|
lower_bound=int(time.time()) - 3600
|
||||||
), ret_fail=[])
|
), ret_fail=[])
|
||||||
|
|
||||||
|
logging.info(f'found {len(rows)} requests on queue')
|
||||||
|
return rows
|
||||||
|
|
||||||
async def get_status_by_request_id(self, request_id: int):
|
async def get_status_by_request_id(self, request_id: int):
|
||||||
logging.info('get_status_by_request_id')
|
logging.info('get_status_by_request_id')
|
||||||
return await failable(
|
rows = await failable(
|
||||||
partial(
|
partial(
|
||||||
self.cleos.aget_table,
|
self.cleos.aget_table,
|
||||||
'gpu.scd', request_id, 'status'), ret_fail=[])
|
'gpu.scd', request_id, 'status'), ret_fail=[])
|
||||||
|
|
||||||
|
logging.info(f'found status for workers: {[r["worker"] for r in rows]}')
|
||||||
|
return rows
|
||||||
|
|
||||||
async def get_global_config(self):
|
async def get_global_config(self):
|
||||||
logging.info('get_global_config')
|
logging.info('get_global_config')
|
||||||
rows = await failable(
|
rows = await failable(
|
||||||
|
@ -113,8 +116,11 @@ class NetConnector:
|
||||||
'gpu.scd', 'gpu.scd', 'config'))
|
'gpu.scd', 'gpu.scd', 'config'))
|
||||||
|
|
||||||
if rows:
|
if rows:
|
||||||
return rows[0]
|
cfg = rows[0]
|
||||||
|
logging.info(f'config found: {cfg}')
|
||||||
|
return cfg
|
||||||
else:
|
else:
|
||||||
|
logging.error('global config not found, is the contract initialized?')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def get_worker_balance(self):
|
async def get_worker_balance(self):
|
||||||
|
@ -130,20 +136,13 @@ class NetConnector:
|
||||||
))
|
))
|
||||||
|
|
||||||
if rows:
|
if rows:
|
||||||
return rows[0]['balance']
|
b = rows[0]['balance']
|
||||||
|
logging.info(f'balance: {b}')
|
||||||
|
return b
|
||||||
else:
|
else:
|
||||||
|
logging.info('no balance info found')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def get_competitors_for_req(self, request_id: int) -> set:
|
|
||||||
competitors = [
|
|
||||||
status['worker']
|
|
||||||
for status in
|
|
||||||
(await self.get_status_by_request_id(request_id))
|
|
||||||
if status['worker'] != self.account
|
|
||||||
]
|
|
||||||
logging.info(f'competitors: {competitors}')
|
|
||||||
return set(competitors)
|
|
||||||
|
|
||||||
# TODO, considery making this a NON-method and instead
|
# TODO, considery making this a NON-method and instead
|
||||||
# handing in the `snap['queue']` output beforehand?
|
# handing in the `snap['queue']` output beforehand?
|
||||||
# -> since that call is the only usage of `self`?
|
# -> since that call is the only usage of `self`?
|
||||||
|
@ -172,7 +171,7 @@ class NetConnector:
|
||||||
step.
|
step.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
logging.info('begin_work')
|
logging.info(f'begin_work on #{request_id}')
|
||||||
return await failable(
|
return await failable(
|
||||||
partial(
|
partial(
|
||||||
self.cleos.a_push_action,
|
self.cleos.a_push_action,
|
||||||
|
@ -189,7 +188,7 @@ class NetConnector:
|
||||||
)
|
)
|
||||||
|
|
||||||
async def cancel_work(self, request_id: int, reason: str):
|
async def cancel_work(self, request_id: int, reason: str):
|
||||||
logging.info('cancel_work')
|
logging.info(f'cancel_work on #{request_id}')
|
||||||
return await failable(
|
return await failable(
|
||||||
partial(
|
partial(
|
||||||
self.cleos.a_push_action,
|
self.cleos.a_push_action,
|
||||||
|
@ -229,7 +228,7 @@ class NetConnector:
|
||||||
|
|
||||||
async def find_results(self):
|
async def find_results(self):
|
||||||
logging.info('find_results')
|
logging.info('find_results')
|
||||||
return await failable(
|
rows = await failable(
|
||||||
partial(
|
partial(
|
||||||
self.cleos.aget_table,
|
self.cleos.aget_table,
|
||||||
'gpu.scd', 'gpu.scd', 'results',
|
'gpu.scd', 'gpu.scd', 'results',
|
||||||
|
@ -239,6 +238,7 @@ class NetConnector:
|
||||||
upper_bound=self.account
|
upper_bound=self.account
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
return rows
|
||||||
|
|
||||||
async def submit_work(
|
async def submit_work(
|
||||||
self,
|
self,
|
||||||
|
@ -247,7 +247,7 @@ class NetConnector:
|
||||||
result_hash: str,
|
result_hash: str,
|
||||||
ipfs_hash: str
|
ipfs_hash: str
|
||||||
):
|
):
|
||||||
logging.info('submit_work')
|
logging.info('submit_work #{request_id}')
|
||||||
return await failable(
|
return await failable(
|
||||||
partial(
|
partial(
|
||||||
self.cleos.a_push_action,
|
self.cleos.a_push_action,
|
||||||
|
@ -280,17 +280,12 @@ class NetConnector:
|
||||||
case _:
|
case _:
|
||||||
raise ValueError(f'Unsupported output type: {typ}')
|
raise ValueError(f'Unsupported output type: {typ}')
|
||||||
|
|
||||||
if self.ipfs_gateway_url:
|
|
||||||
# check peer connections, reconnect to skynet gateway if not
|
|
||||||
gateway_id = Path(self.ipfs_gateway_url).name
|
|
||||||
peers = await self.ipfs_client.peers()
|
|
||||||
if gateway_id not in [p['Peer'] for p in peers]:
|
|
||||||
await self.ipfs_client.connect(self.ipfs_gateway_url)
|
|
||||||
|
|
||||||
file_info = await self.ipfs_client.add(Path(target_file))
|
file_info = await self.ipfs_client.add(Path(target_file))
|
||||||
file_cid = file_info['Hash']
|
file_cid = file_info['Hash']
|
||||||
|
logging.info(f'added file to ipfs, CID: {file_cid}')
|
||||||
|
|
||||||
await self.ipfs_client.pin(file_cid)
|
await self.ipfs_client.pin(file_cid)
|
||||||
|
logging.info(f'pinned {file_cid}')
|
||||||
|
|
||||||
return file_cid
|
return file_cid
|
||||||
|
|
||||||
|
@ -306,11 +301,11 @@ class NetConnector:
|
||||||
link = f'https://{self.ipfs_domain}/ipfs/{ipfs_hash}'
|
link = f'https://{self.ipfs_domain}/ipfs/{ipfs_hash}'
|
||||||
|
|
||||||
res = await get_ipfs_file(link, timeout=1)
|
res = await get_ipfs_file(link, timeout=1)
|
||||||
logging.info(f'got response from {link}')
|
|
||||||
if not res or res.status_code != 200:
|
if not res or res.status_code != 200:
|
||||||
logging.warning(f'couldn\'t get ipfs binary data at {link}!')
|
logging.warning(f'couldn\'t get ipfs binary data at {link}!')
|
||||||
|
|
||||||
# attempt to decode as image
|
# attempt to decode as image
|
||||||
input_data = Image.open(io.BytesIO(res.raw))
|
input_data = Image.open(io.BytesIO(res.raw))
|
||||||
|
logging.info('decoded as image successfully')
|
||||||
|
|
||||||
return input_data
|
return input_data
|
||||||
|
|
Loading…
Reference in New Issue