Rework gpu worker logic to work better in parallel with other workers

add-txt2txt-models
Guillermo Rodriguez 2023-05-29 13:43:03 -03:00
parent 2b18fa376b
commit 25c86b5eaf
No known key found for this signature in database
GPG Key ID: EC3AB66D5D83B392
1 changed files with 34 additions and 37 deletions

View File

@ -218,13 +218,12 @@ async def open_dgpu_node(
def begin_work(request_id: int): def begin_work(request_id: int):
logging.info('begin_work') logging.info('begin_work')
ec, out = cleos.push_action( return cleos.push_action(
'telos.gpu', 'telos.gpu',
'workbegin', 'workbegin',
[account, request_id], [account, request_id],
f'{account}@{permission}' f'{account}@{permission}'
) )
assert ec == 0
def cancel_work(request_id: int, reason: str): def cancel_work(request_id: int, reason: str):
logging.info('cancel_work') logging.info('cancel_work')
@ -234,7 +233,6 @@ async def open_dgpu_node(
[account, request_id, reason], [account, request_id, reason],
f'{account}@{permission}' f'{account}@{permission}'
) )
assert ec == 0
def maybe_withdraw_all(): def maybe_withdraw_all():
logging.info('maybe_withdraw_all') logging.info('maybe_withdraw_all')
@ -251,7 +249,6 @@ async def open_dgpu_node(
f'{account}@{permission}' f'{account}@{permission}'
) )
logging.info(collect_stdout(out)) logging.info(collect_stdout(out))
assert ec == 0
async def find_my_results(): async def find_my_results():
logging.info('find_my_results') logging.info('find_my_results')
@ -289,8 +286,8 @@ async def open_dgpu_node(
f'{account}@{permission}' f'{account}@{permission}'
) )
if ec != 0:
print(collect_stdout(out)) print(collect_stdout(out))
assert ec == 0
async def get_input_data(ipfs_hash: str) -> bytes: async def get_input_data(ipfs_hash: str) -> bytes:
if ipfs_hash == '': if ipfs_hash == '':
@ -317,9 +314,7 @@ async def open_dgpu_node(
rid = req['id'] rid = req['id']
my_results = [res['id'] for res in (await find_my_results())] my_results = [res['id'] for res in (await find_my_results())]
if rid in my_results: if rid not in my_results:
continue
statuses = await get_status_by_request_id(rid) statuses = await get_status_by_request_id(rid)
if len(statuses) < config['verification_amount']: if len(statuses) < config['verification_amount']:
@ -344,8 +339,11 @@ async def open_dgpu_node(
# perform work # perform work
logging.info(f'working on {body}') logging.info(f'working on {body}')
begin_work(rid) ec, _ = begin_work(rid)
if ec != 0:
logging.info(f'probably beign worked on already... skip.')
else:
try: try:
img_sha, raw_img = gpu_compute_one( img_sha, raw_img = gpu_compute_one(
body['method'], body['params'], binext=binary) body['method'], body['params'], binext=binary)
@ -353,15 +351,14 @@ async def open_dgpu_node(
ipfs_hash = publish_on_ipfs(img_sha, raw_img) ipfs_hash = publish_on_ipfs(img_sha, raw_img)
submit_work(rid, request_hash, img_sha, ipfs_hash) submit_work(rid, request_hash, img_sha, ipfs_hash)
break break
except BaseException as e: except BaseException as e:
cancel_work(rid, str(e)) cancel_work(rid, str(e))
break
else: else:
logging.info(f'request {rid} already beign worked on, skip...') logging.info(f'request {rid} already beign worked on, skip...')
continue
await trio.sleep(1) await trio.sleep(1)