mirror of https://github.com/skygpu/skynet.git
				
				
				
			Rework gpu worker logic to work better in parallel with other workers
							parent
							
								
									2b18fa376b
								
							
						
					
					
						commit
						25c86b5eaf
					
				| 
						 | 
					@ -218,13 +218,12 @@ async def open_dgpu_node(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def begin_work(request_id: int):
 | 
					    def begin_work(request_id: int):
 | 
				
			||||||
        logging.info('begin_work')
 | 
					        logging.info('begin_work')
 | 
				
			||||||
        ec, out = cleos.push_action(
 | 
					        return cleos.push_action(
 | 
				
			||||||
            'telos.gpu',
 | 
					            'telos.gpu',
 | 
				
			||||||
            'workbegin',
 | 
					            'workbegin',
 | 
				
			||||||
            [account, request_id],
 | 
					            [account, request_id],
 | 
				
			||||||
            f'{account}@{permission}'
 | 
					            f'{account}@{permission}'
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        assert ec == 0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def cancel_work(request_id: int, reason: str):
 | 
					    def cancel_work(request_id: int, reason: str):
 | 
				
			||||||
        logging.info('cancel_work')
 | 
					        logging.info('cancel_work')
 | 
				
			||||||
| 
						 | 
					@ -234,7 +233,6 @@ async def open_dgpu_node(
 | 
				
			||||||
            [account, request_id, reason],
 | 
					            [account, request_id, reason],
 | 
				
			||||||
            f'{account}@{permission}'
 | 
					            f'{account}@{permission}'
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        assert ec == 0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def maybe_withdraw_all():
 | 
					    def maybe_withdraw_all():
 | 
				
			||||||
        logging.info('maybe_withdraw_all')
 | 
					        logging.info('maybe_withdraw_all')
 | 
				
			||||||
| 
						 | 
					@ -251,7 +249,6 @@ async def open_dgpu_node(
 | 
				
			||||||
                f'{account}@{permission}'
 | 
					                f'{account}@{permission}'
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            logging.info(collect_stdout(out))
 | 
					            logging.info(collect_stdout(out))
 | 
				
			||||||
            assert ec == 0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    async def find_my_results():
 | 
					    async def find_my_results():
 | 
				
			||||||
        logging.info('find_my_results')
 | 
					        logging.info('find_my_results')
 | 
				
			||||||
| 
						 | 
					@ -289,8 +286,8 @@ async def open_dgpu_node(
 | 
				
			||||||
            f'{account}@{permission}'
 | 
					            f'{account}@{permission}'
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if ec != 0:
 | 
				
			||||||
            print(collect_stdout(out))
 | 
					            print(collect_stdout(out))
 | 
				
			||||||
        assert ec == 0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    async def get_input_data(ipfs_hash: str) -> bytes:
 | 
					    async def get_input_data(ipfs_hash: str) -> bytes:
 | 
				
			||||||
        if ipfs_hash == '':
 | 
					        if ipfs_hash == '':
 | 
				
			||||||
| 
						 | 
					@ -317,9 +314,7 @@ async def open_dgpu_node(
 | 
				
			||||||
                    rid = req['id']
 | 
					                    rid = req['id']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    my_results = [res['id'] for res in (await find_my_results())]
 | 
					                    my_results = [res['id'] for res in (await find_my_results())]
 | 
				
			||||||
                    if rid in my_results:
 | 
					                    if rid not in my_results:
 | 
				
			||||||
                        continue
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                        statuses = await get_status_by_request_id(rid)
 | 
					                        statuses = await get_status_by_request_id(rid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                        if len(statuses) < config['verification_amount']:
 | 
					                        if len(statuses) < config['verification_amount']:
 | 
				
			||||||
| 
						 | 
					@ -344,8 +339,11 @@ async def open_dgpu_node(
 | 
				
			||||||
                            # perform work
 | 
					                            # perform work
 | 
				
			||||||
                            logging.info(f'working on {body}')
 | 
					                            logging.info(f'working on {body}')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                        begin_work(rid)
 | 
					                            ec, _ = begin_work(rid)
 | 
				
			||||||
 | 
					                            if ec != 0:
 | 
				
			||||||
 | 
					                                logging.info(f'probably beign worked on already... skip.')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                            else:
 | 
				
			||||||
                                try:
 | 
					                                try:
 | 
				
			||||||
                                    img_sha, raw_img = gpu_compute_one(
 | 
					                                    img_sha, raw_img = gpu_compute_one(
 | 
				
			||||||
                                        body['method'], body['params'], binext=binary)
 | 
					                                        body['method'], body['params'], binext=binary)
 | 
				
			||||||
| 
						 | 
					@ -353,15 +351,14 @@ async def open_dgpu_node(
 | 
				
			||||||
                                    ipfs_hash = publish_on_ipfs(img_sha, raw_img)
 | 
					                                    ipfs_hash = publish_on_ipfs(img_sha, raw_img)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                                    submit_work(rid, request_hash, img_sha, ipfs_hash)
 | 
					                                    submit_work(rid, request_hash, img_sha, ipfs_hash)
 | 
				
			||||||
 | 
					 | 
				
			||||||
                                    break
 | 
					                                    break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                                except BaseException as e:
 | 
					                                except BaseException as e:
 | 
				
			||||||
                                    cancel_work(rid, str(e))
 | 
					                                    cancel_work(rid, str(e))
 | 
				
			||||||
 | 
					                                    break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    else:
 | 
					                    else:
 | 
				
			||||||
                        logging.info(f'request {rid} already beign worked on, skip...')
 | 
					                        logging.info(f'request {rid} already beign worked on, skip...')
 | 
				
			||||||
                        continue
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
                await trio.sleep(1)
 | 
					                await trio.sleep(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue