Rework gpu worker logic to work better in parallel with other workers

2023-05-29 13:43:03 -03:00 · 2023-05-29 13:43:03 -03:00 · 25c86b5eaf
parent 2b18fa376b
commit 25c86b5eaf
1 changed file with 34 additions and 37 deletions
--- a/skynet/dgpu.py
+++ b/skynet/dgpu.py
@@ -218,13 +218,12 @@ async def open_dgpu_node(
    def begin_work(request_id: int):
        logging.info('begin_work')
-        ec, out = cleos.push_action(
+        return cleos.push_action(
            'telos.gpu',
            'workbegin',
            [account, request_id],
            f'{account}@{permission}'
        )
        assert ec == 0
    def cancel_work(request_id: int, reason: str):
        logging.info('cancel_work')
@@ -234,7 +233,6 @@ async def open_dgpu_node(
            [account, request_id, reason],
            f'{account}@{permission}'
        )
        assert ec == 0
    def maybe_withdraw_all():
        logging.info('maybe_withdraw_all')
@@ -251,7 +249,6 @@ async def open_dgpu_node(
                f'{account}@{permission}'
            )
            logging.info(collect_stdout(out))
            assert ec == 0
    async def find_my_results():
        logging.info('find_my_results')
@@ -289,8 +286,8 @@ async def open_dgpu_node(
            f'{account}@{permission}'
        )
        if ec != 0:
            print(collect_stdout(out))
        assert ec == 0
    async def get_input_data(ipfs_hash: str) -> bytes:
        if ipfs_hash == '':
@@ -317,9 +314,7 @@ async def open_dgpu_node(
                    rid = req['id']
                    my_results = [res['id'] for res in (await find_my_results())]
-                    if rid in my_results:
+                    if rid not in my_results:
                        continue
                        statuses = await get_status_by_request_id(rid)
                        if len(statuses) < config['verification_amount']:
@@ -344,8 +339,11 @@ async def open_dgpu_node(
                            # perform work
                            logging.info(f'working on {body}')
-                        begin_work(rid)
+                            ec, _ = begin_work(rid)
                            if ec != 0:
                                logging.info(f'probably beign worked on already... skip.')
                            else:
                                try:
                                    img_sha, raw_img = gpu_compute_one(
                                        body['method'], body['params'], binext=binary)
@@ -353,15 +351,14 @@ async def open_dgpu_node(
                                    ipfs_hash = publish_on_ipfs(img_sha, raw_img)
                                    submit_work(rid, request_hash, img_sha, ipfs_hash)
                                    break
                                except BaseException as e:
                                    cancel_work(rid, str(e))
                                    break
                    else:
                        logging.info(f'request {rid} already beign worked on, skip...')
                        continue
                await trio.sleep(1)