Add new telos.gpu2 data model to worker and tweak config

protocol_v2
Guillermo Rodriguez 2023-10-14 09:57:19 -03:00
parent f4592ae254
commit d18d59a0ab
No known key found for this signature in database
GPG Key ID: EC3AB66D5D83B392
5 changed files with 182 additions and 39 deletions

View File

@ -1,47 +1,41 @@
# config sections are optional, depending on which services # general skynet config
# you wish to run [skynet]
contract = 'telos.gpu2'
node_url = 'https://testnet.skygpu.net'
hyperion_url = 'https://testnet.skygpu.net'
ipfs_url = 'http://127.0.0.1:5001'
# optional set only if local ipfs node != gateway node
ipfs_gateway_url = '/ip4/169.197.140.154/tcp/4001/p2p/12D3KooWKWogLFNEcNNMKnzU7Snrnuj84RZdMBg3sLiQSQc51oEv'
# worker config (optional)
[skynet.dgpu] [skynet.dgpu]
account = 'testworkerX' account = 'testworkerX'
permission = 'active' permission = 'active'
key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
node_url = 'https://testnet.skygpu.net'
hyperion_url = 'https://testnet.skygpu.net'
ipfs_gateway_url = '/ip4/169.197.140.154/tcp/4001/p2p/12D3KooWKWogLFNEcNNMKnzU7Snrnuj84RZdMBg3sLiQSQc51oEv'
ipfs_url = 'http://127.0.0.1:5001'
hf_home = 'hf_home' hf_home = 'hf_home'
hf_token = 'hf_XxXxXxXxXxXxXxXxXxXxXxXxXxXxXxXxXx' hf_token = 'hf_XxXxXxXxXxXxXxXxXxXxXxXxXxXxXxXxXx'
auto_withdraw = true auto_withdraw = true
non_compete = [] non_compete = []
api_bind = '127.0.0.1:42690' api_bind = '127.0.0.1:42690'
# telegram bot config (optional)
[skynet.telegram] [skynet.telegram]
account = 'telegram' account = 'telegram'
permission = 'active' permission = 'active'
key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
node_url = 'https://testnet.skygpu.net'
hyperion_url = 'https://testnet.skygpu.net'
ipfs_gateway_url = '/ip4/169.197.140.154/tcp/4001/p2p/12D3KooWKWogLFNEcNNMKnzU7Snrnuj84RZdMBg3sLiQSQc51oEv'
ipfs_url = 'http://127.0.0.1:5001'
token = 'XXXXXXXXXX:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' token = 'XXXXXXXXXX:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
# discord bot config (optional)
[skynet.discord] [skynet.discord]
account = 'discord' account = 'discord'
permission = 'active' permission = 'active'
key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
node_url = 'https://testnet.skygpu.net'
hyperion_url = 'https://testnet.skygpu.net'
ipfs_gateway_url = '/ip4/169.197.140.154/tcp/4001/p2p/12D3KooWKWogLFNEcNNMKnzU7Snrnuj84RZdMBg3sLiQSQc51oEv'
ipfs_url = 'http://127.0.0.1:5001'
token = 'XXXXXXXXXX:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' token = 'XXXXXXXXXX:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
[skynet.pinner] # cli utils account (optional)
hyperion_url = 'https://testnet.skygpu.net'
ipfs_url = 'http://127.0.0.1:5001'
[skynet.user] [skynet.user]
account = 'testuser' account = 'testuser'
permission = 'active' permission = 'active'
key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' key = '5Xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
node_url = 'https://testnet.skygpu.net'

View File

@ -7,7 +7,7 @@ import logging
from hashlib import sha256 from hashlib import sha256
from typing import Any from typing import Any
import zipfile
from PIL import Image from PIL import Image
from diffusers import DiffusionPipeline from diffusers import DiffusionPipeline
@ -70,6 +70,12 @@ class SkynetMM:
if 'hf_home' in config: if 'hf_home' in config:
self.cache_dir = config['hf_home'] self.cache_dir = config['hf_home']
self.num_gpus = torch.cuda.device_count()
self.gpus = [
torch.cuda.get_device_properties(i)
for i in range(self.num_gpus)
]
self._models = {} self._models = {}
for model in self.initial_models: for model in self.initial_models:
self.load_model(model, False, force=True) self.load_model(model, False, force=True)

View File

@ -1,7 +1,6 @@
#!/usr/bin/python #!/usr/bin/python
import json import json
import random
import logging import logging
import time import time
import traceback import traceback
@ -110,14 +109,14 @@ class SkynetDGPUDaemon:
def find_best_requests(self) -> list[dict]: def find_best_requests(self) -> list[dict]:
queue = self.conn.get_queue() queue = self.conn.get_queue()
for _ in range(3): # for _ in range(3):
random.shuffle(queue) # random.shuffle(queue)
queue = sorted( # queue = sorted(
queue, # queue,
key=lambda req: convert_reward_to_int(req['reward']), # key=lambda req: convert_reward_to_int(req['reward']),
reverse=True # reverse=True
) # )
requests = [] requests = []
for req in queue: for req in queue:
@ -161,7 +160,57 @@ class SkynetDGPUDaemon:
return requests return requests
async def sync_worker_on_chain_data(self, is_online: bool) -> bool:
# check worker is registered
me = self.conn.get_on_chain_worker_info(self.account)
if not me:
ec, out = await self.conn.register_worker()
if ec != 0:
raise DGPUDaemonError(f'Couldn\'t register worker! {out}')
me = self.conn.get_on_chain_worker_info(self.account)
# find if reported on chain gpus match local
found_difference = False
for i in range(self.mm.num_gpus):
chain_gpu = me['cards'][i]
gpu = self.mm.gpus[i]
gpu_v = f'{gpu.major}.{gpu.minor}'
found_difference = gpu.name != chain_gpu['card_name']
found_difference = gpu_v != chain_gpu['version']
found_difference = gpu.total_memory != chain_gpu['total_memory']
found_difference = gpu.multi_processor_count != chain_gpu['mp_count']
if found_difference:
break
# difference found, flush and re-report
if found_difference:
await self.conn.flush_cards()
for i, gpu in enumerate(self.mm.gpus):
ec, _ = await self.conn.add_card(
gpu.name, f'{gpu.major}.{gpu.minor}',
gpu.total_memory, gpu.multi_processor_count,
'',
is_online
)
if ec != 0:
raise DGPUDaemonError(f'error while reporting card {i}')
return found_difference
async def all_gpu_set_online_flag(self, is_online: bool):
for i, chain_gpu in enumerate(me['cards']):
if chain_gpu['is_online'] != is_online:
await self.conn.toggle_card(i)
async def serve_forever(self): async def serve_forever(self):
diff = await self.sync_worker_on_chain_data(True)
if not diff:
await self.all_gpu_set_online_flag(True)
try: try:
while True: while True:
if self.auto_withdraw: if self.auto_withdraw:
@ -234,3 +283,6 @@ class SkynetDGPUDaemon:
except KeyboardInterrupt: except KeyboardInterrupt:
... ...
await self.sync_worker_on_chain_data(False)
await self.all_gpu_set_online_flag(False)

View File

@ -1,5 +1,7 @@
#!/usr/bin/python #!/usr/bin/python
class DGPUDaemonError(BaseException):
...
class DGPUComputeError(BaseException): class DGPUComputeError(BaseException):
... ...

View File

@ -68,6 +68,10 @@ class SkynetGPUConnector:
if 'ipfs_domain' in config: if 'ipfs_domain' in config:
self.ipfs_domain = config['ipfs_domain'] self.ipfs_domain = config['ipfs_domain']
self.worker_url = ''
if 'worker_url' in config:
self.worker_url = config['worker_url']
self._update_delta = 1 self._update_delta = 1
self._cache: dict[str, tuple[float, Any]] = {} self._cache: dict[str, tuple[float, Any]] = {}
@ -90,12 +94,16 @@ class SkynetGPUConnector:
return default return default
async def data_updater_task(self): async def data_updater_task(self):
tasks = (
(self._get_work_requests_last_hour, 'queue'),
(self._find_my_results, 'my_results'),
(self._get_workers, 'workers')
)
while True: while True:
async with trio.open_nursery() as n: async with trio.open_nursery() as n:
n.start_soon( for task in tasks:
self._cache_set, self._get_work_requests_last_hour, 'queue') n.start_soon(self._cache_set, *task)
n.start_soon(
self._cache_set, self._find_my_results, 'my_results')
await trio.sleep(self._update_delta) await trio.sleep(self._update_delta)
@ -105,6 +113,9 @@ class SkynetGPUConnector:
def get_my_results(self): def get_my_results(self):
return self._cache_get('my_results', default=[]) return self._cache_get('my_results', default=[])
def get_workers(self):
return self._cache_get('workers', default=[])
def get_status_for_request(self, request_id: int) -> list[dict] | None: def get_status_for_request(self, request_id: int) -> list[dict] | None:
request: dict | None = next(( request: dict | None = next((
req req
@ -135,8 +146,8 @@ class SkynetGPUConnector:
self.cleos.aget_table, self.cleos.aget_table,
self.contract, self.contract, 'queue', self.contract, self.contract, 'queue',
index_position=2, index_position=2,
key_type='i64', order='asc',
lower_bound=int(time.time()) - 3600 limit=1000
), ret_fail=[]) ), ret_fail=[])
async def _find_my_results(self): async def _find_my_results(self):
@ -152,6 +163,15 @@ class SkynetGPUConnector:
) )
) )
async def _get_workers(self) -> list[dict]:
logging.info('get_workers')
return await failable(
partial(
self.cleos.aget_table,
self.contract, self.contract, 'workers'
)
)
async def get_global_config(self): async def get_global_config(self):
logging.info('get_global_config') logging.info('get_global_config')
rows = await failable( rows = await failable(
@ -169,7 +189,7 @@ class SkynetGPUConnector:
rows = await failable( rows = await failable(
partial( partial(
self.cleos.aget_table, self.cleos.aget_table,
'telos.gpu', 'telos.gpu', 'users', self.contract, self.contract, 'users',
index_position=1, index_position=1,
key_type='name', key_type='name',
lower_bound=self.account, lower_bound=self.account,
@ -181,12 +201,81 @@ class SkynetGPUConnector:
else: else:
return None return None
def get_on_chain_worker_info(self, worker: str):
return next((
w for w in self.get_workers()
if w['account'] == w
), None)
async def register_worker(self):
logging.info(f'registering worker')
return await failable(
partial(
self.cleos.a_push_action,
self.contract,
'regworker',
{
'account': self.account,
'url': self.worker_url
}
)
)
async def add_card(
self,
card_name: str,
version: str,
total_memory: int,
mp_count: int,
extra: str,
is_online: bool
):
logging.info(f'adding card: {card_name} {version}')
return await failable(
partial(
self.cleos.a_push_action,
self.contract,
'addcard',
{
'worker': self.account,
'card_name': card_name,
'version': version,
'total_memory': total_memory,
'mp_count': mp_count,
'extra': extra,
'is_online': is_online
}
)
)
async def toggle_card(self, index: int):
logging.info(f'toggle card {index}')
return await failable(
partial(
self.cleos.a_push_action,
self.contract,
'togglecard',
{'worker': self.account, 'index': index}
)
)
async def flush_cards(self):
logging.info('flushing cards...')
return await failable(
partial(
self.cleos.a_push_action,
self.contract,
'flushcards',
{'worker': self.account}
)
)
async def begin_work(self, request_id: int): async def begin_work(self, request_id: int):
logging.info('begin_work') logging.info('begin_work')
return await failable( return await failable(
partial( partial(
self.cleos.a_push_action, self.cleos.a_push_action,
'telos.gpu', self.contract,
'workbegin', 'workbegin',
{ {
'worker': self.account, 'worker': self.account,
@ -203,7 +292,7 @@ class SkynetGPUConnector:
return await failable( return await failable(
partial( partial(
self.cleos.a_push_action, self.cleos.a_push_action,
'telos.gpu', self.contract,
'workcancel', 'workcancel',
{ {
'worker': self.account, 'worker': self.account,
@ -226,7 +315,7 @@ class SkynetGPUConnector:
await failable( await failable(
partial( partial(
self.cleos.a_push_action, self.cleos.a_push_action,
'telos.gpu', self.contract,
'withdraw', 'withdraw',
{ {
'user': self.account, 'user': self.account,
@ -248,7 +337,7 @@ class SkynetGPUConnector:
return await failable( return await failable(
partial( partial(
self.cleos.a_push_action, self.cleos.a_push_action,
'telos.gpu', self.contract,
'submit', 'submit',
{ {
'worker': self.account, 'worker': self.account,