From 8a5e32d45227284c0f33470c8026501a6eb400f4 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 14 Aug 2023 03:15:45 +0000 Subject: [PATCH] Enable certain memory optmization options on cases where trying to load a large model on a low end card --- Dockerfile-cuda | 36 ++++++ docker/Dockerfile.runtime | 16 --- docker/Dockerfile.runtime+cuda | 29 ----- requirements.cuda.1.txt | 1 - requirements.cuda.2.txt | 2 - ...ements.cuda.0.txt => requirements.cuda.txt | 9 +- skynet/constants.py | 29 ++--- skynet/ipfs/docker.py | 5 +- skynet/utils.py | 29 +++-- wrapdocker | 113 ++++++++++++++++++ 10 files changed, 196 insertions(+), 73 deletions(-) create mode 100644 Dockerfile-cuda delete mode 100644 docker/Dockerfile.runtime delete mode 100644 docker/Dockerfile.runtime+cuda delete mode 100644 requirements.cuda.1.txt delete mode 100644 requirements.cuda.2.txt rename requirements.cuda.0.txt => requirements.cuda.txt (71%) create mode 100755 wrapdocker diff --git a/Dockerfile-cuda b/Dockerfile-cuda new file mode 100644 index 0000000..393f8c2 --- /dev/null +++ b/Dockerfile-cuda @@ -0,0 +1,36 @@ +from pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel +env DEBIAN_FRONTEND=noninteractive + +run apt-get update -qq && apt-get install -qqy \ + apt-transport-https \ + ca-certificates \ + curl \ + git \ + lxc \ + vim \ + ffmpeg \ + libsm6 \ + libxext6 \ + iptables + +run curl -sSL https://get.docker.com/ | sh + +add ./wrapdocker /usr/local/bin/wrapdocker +run chmod +x /usr/local/bin/wrapdocker + +volume /var/lib/docker + +env HF_HOME hf_home + +workdir /root/target + +add ./requirements.cuda.txt requirements.cuda.txt +add ./requirements.txt requirements.txt +add ./setup.py setup.py +add ./skynet skynet + +run pip install -r requirements.cuda.txt +run pip install -r requirements.txt +run pip install -e . 
+ +cmd ["wrapdocker"] diff --git a/docker/Dockerfile.runtime b/docker/Dockerfile.runtime deleted file mode 100644 index 316fdcb..0000000 --- a/docker/Dockerfile.runtime +++ /dev/null @@ -1,16 +0,0 @@ -from python:3.10.0 - -env DEBIAN_FRONTEND=noninteractive - -workdir /skynet - -copy requirements.txt requirements.txt -copy pytest.ini ./ -copy setup.py ./ -copy skynet ./skynet - -run pip install \ - -e . \ - -r requirements.txt - -copy tests ./ diff --git a/docker/Dockerfile.runtime+cuda b/docker/Dockerfile.runtime+cuda deleted file mode 100644 index 6d52960..0000000 --- a/docker/Dockerfile.runtime+cuda +++ /dev/null @@ -1,29 +0,0 @@ -from nvidia/cuda:11.7.0-devel-ubuntu20.04 -from python:3.11 - -env DEBIAN_FRONTEND=noninteractive - -run apt-get update && \ - apt-get install -y ffmpeg libsm6 libxext6 - -workdir /skynet - -copy requirements.cuda* ./ - -run pip install -U pip ninja -run pip install -v -r requirements.cuda.0.txt -run pip install -v -r requirements.cuda.1.txt -run pip install -v -r requirements.cuda.2.txt - -copy requirements.txt requirements.txt -copy pytest.ini pytest.ini -copy setup.py setup.py -copy skynet skynet - -run pip install -e . 
-r requirements.txt - -env PYTORCH_CUDA_ALLOC_CONF max_split_size_mb:128 -env NVIDIA_VISIBLE_DEVICES=all -env HF_HOME /hf_home - -copy tests tests diff --git a/requirements.cuda.1.txt b/requirements.cuda.1.txt deleted file mode 100644 index b9f2703..0000000 --- a/requirements.cuda.1.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/facebookresearch/xformers.git@main#egg=xformers diff --git a/requirements.cuda.2.txt b/requirements.cuda.2.txt deleted file mode 100644 index 4d3fee4..0000000 --- a/requirements.cuda.2.txt +++ /dev/null @@ -1,2 +0,0 @@ -basicsr -realesrgan diff --git a/requirements.cuda.0.txt b/requirements.cuda.txt similarity index 71% rename from requirements.cuda.0.txt rename to requirements.cuda.txt index f796537..8b7341c 100644 --- a/requirements.cuda.0.txt +++ b/requirements.cuda.txt @@ -1,9 +1,14 @@ +torch scipy triton +xformers accelerate transformers huggingface_hub -diffusers[torch]>=0.18.0 +diffusers[torch] invisible-watermark -torch==1.13.0+cu117 + +basicsr +realesrgan + --extra-index-url https://download.pytorch.org/whl/cu117 diff --git a/skynet/constants.py b/skynet/constants.py index 743270f..f7bb5dc 100755 --- a/skynet/constants.py +++ b/skynet/constants.py @@ -5,18 +5,18 @@ VERSION = '0.1a11' DOCKER_RUNTIME_CUDA = 'skynet:runtime-cuda' MODELS = { - 'prompthero/openjourney': { 'short': 'midj'}, - 'runwayml/stable-diffusion-v1-5': { 'short': 'stable'}, - 'stabilityai/stable-diffusion-2-1-base': { 'short': 'stable2'}, - 'snowkidy/stable-diffusion-xl-base-0.9': { 'short': 'stablexl0.9'}, - 'stabilityai/stable-diffusion-xl-base-1.0': { 'short': 'stablexl'}, - 'Linaqruf/anything-v3.0': { 'short': 'hdanime'}, - 'hakurei/waifu-diffusion': { 'short': 'waifu'}, - 'nitrosocke/Ghibli-Diffusion': { 'short': 'ghibli'}, - 'dallinmackay/Van-Gogh-diffusion': { 'short': 'van-gogh'}, - 'lambdalabs/sd-pokemon-diffusers': { 'short': 'pokemon'}, - 'Envvi/Inkpunk-Diffusion': { 'short': 'ink'}, - 'nousr/robo-diffusion': { 'short': 'robot'} + 
'prompthero/openjourney': { 'short': 'midj', 'mem': 8 }, + 'runwayml/stable-diffusion-v1-5': { 'short': 'stable', 'mem': 8 }, + 'stabilityai/stable-diffusion-2-1-base': { 'short': 'stable2', 'mem': 8 }, + 'snowkidy/stable-diffusion-xl-base-0.9': { 'short': 'stablexl0.9', 'mem': 24 }, + 'stabilityai/stable-diffusion-xl-base-1.0': { 'short': 'stablexl', 'mem': 24 }, + 'Linaqruf/anything-v3.0': { 'short': 'hdanime', 'mem': 8 }, + 'hakurei/waifu-diffusion': { 'short': 'waifu', 'mem': 8 }, + 'nitrosocke/Ghibli-Diffusion': { 'short': 'ghibli', 'mem': 8 }, + 'dallinmackay/Van-Gogh-diffusion': { 'short': 'van-gogh', 'mem': 8 }, + 'lambdalabs/sd-pokemon-diffusers': { 'short': 'pokemon', 'mem': 8 }, + 'Envvi/Inkpunk-Diffusion': { 'short': 'ink', 'mem': 8 }, + 'nousr/robo-diffusion': { 'short': 'robot', 'mem': 8 } } SHORT_NAMES = [ @@ -153,10 +153,11 @@ DEFAULT_UPSCALER = None DEFAULT_CONFIG_PATH = 'skynet.ini' DEFAULT_INITAL_MODELS = [ - 'prompthero/openjourney', - 'runwayml/stable-diffusion-v1-5' + 'stabilityai/stable-diffusion-xl-base-1.0' ] +DEFAULT_SINGLE_CARD_MAP = 'cuda:0' + DATE_FORMAT = '%B the %dth %Y, %H:%M:%S' CONFIG_ATTRS = [ diff --git a/skynet/ipfs/docker.py b/skynet/ipfs/docker.py index 851bb12..1f315ba 100755 --- a/skynet/ipfs/docker.py +++ b/skynet/ipfs/docker.py @@ -83,9 +83,10 @@ def open_ipfs_node(name='skynet-ipfs'): remove=True ) + uid = 1000 + gid = 1000 + if sys.platform != 'win32': - uid = os.getuid() - gid = os.getgid() ec, out = container.exec_run(['chown', f'{uid}:{gid}', '-R', export_target]) logging.info(out) assert ec == 0 diff --git a/skynet/utils.py b/skynet/utils.py index 2ec7a6c..1cacd5b 100755 --- a/skynet/utils.py +++ b/skynet/utils.py @@ -4,6 +4,7 @@ import io import os import time import random +import logging from typing import Optional from pathlib import Path @@ -24,7 +25,7 @@ from diffusers import ( from realesrgan import RealESRGANer from huggingface_hub import login -from .constants import MODELS +from .constants import MODELS, 
DEFAULT_SINGLE_CARD_MAP def time_ms(): @@ -74,16 +75,24 @@ def pipeline_for(model: str, mem_fraction: float = 1.0, image=False) -> Diffusio torch.backends.cudnn.benchmark = False torch.use_deterministic_algorithms(True) + model_info = MODELS[model] + + req_mem = model_info['mem'] + mem_gb = torch.cuda.mem_get_info()[1] / (10**9) + over_mem = mem_gb < req_mem + if over_mem: + logging.warn(f'model requires {req_mem} but card has {mem_gb}, model will run slower..') + + shortname = model_info['short'] params = { 'torch_dtype': torch.float16, 'safety_checker': None } - if model == 'runwayml/stable-diffusion-v1-5': + if shortname == 'stable': params['revision'] = 'fp16' - if (model == 'stabilityai/stable-diffusion-xl-base-1.0' or - model == 'snowkidy/stable-diffusion-xl-base-0.9'): + if 'xl' in shortname: if image: pipe_class = StableDiffusionXLImg2ImgPipeline else: @@ -100,10 +109,16 @@ def pipeline_for(model: str, mem_fraction: float = 1.0, image=False) -> Diffusio pipe.scheduler = EulerAncestralDiscreteScheduler.from_config( pipe.scheduler.config) - if not image: - pipe.enable_vae_slicing() + if over_mem: + if not image: + pipe.enable_vae_slicing() + pipe.enable_vae_tiling() - return pipe.to('cuda') + pipe.enable_model_cpu_offload() + + pipe.enable_xformers_memory_efficient_attention() + + return pipe def txt2img( diff --git a/wrapdocker b/wrapdocker new file mode 100755 index 0000000..ac927a3 --- /dev/null +++ b/wrapdocker @@ -0,0 +1,113 @@ +#!/bin/bash + +# Ensure that all nodes in /dev/mapper correspond to mapped devices currently loaded by the device-mapper kernel driver +dmsetup mknodes + +# First, make sure that cgroups are mounted correctly. +CGROUP=/sys/fs/cgroup +: {LOG:=stdio} + +[ -d $CGROUP ] || + mkdir $CGROUP + +mountpoint -q $CGROUP || + mount -n -t tmpfs -o uid=0,gid=0,mode=0755 cgroup $CGROUP || { + echo "Could not make a tmpfs mount. Did you use --privileged?" + exit 1 + } + +if [ -d /sys/kernel/security ] && ! 
mountpoint -q /sys/kernel/security
+then
+    mount -t securityfs none /sys/kernel/security || {
+        echo "Could not mount /sys/kernel/security."
+        echo "AppArmor detection and --privileged mode might break."
+    }
+fi
+
+# Mount the cgroup hierarchies exactly as they are in the parent system.
+for SUBSYS in $(cut -d: -f2 /proc/1/cgroup)
+do
+    [ -d $CGROUP/$SUBSYS ] || mkdir $CGROUP/$SUBSYS
+    mountpoint -q $CGROUP/$SUBSYS ||
+        mount -n -t cgroup -o $SUBSYS cgroup $CGROUP/$SUBSYS
+
+    # The two following sections address a bug which manifests itself
+    # by a cryptic "lxc-start: no ns_cgroup option specified" when
+    # trying to start containers within a container.
+    # The bug seems to appear when the cgroup hierarchies are not
+    # mounted on the exact same directories in the host, and in the
+    # container.
+
+    # Named, control-less cgroups are mounted with "-o name=foo"
+    # (and appear as such under /proc/<pid>/cgroup) but are usually
+    # mounted on a directory named "foo" (without the "name=" prefix).
+    # Systemd and OpenRC (and possibly others) both create such a
+    # cgroup. To avoid the aforementioned bug, we symlink "foo" to
+    # "name=foo". This shouldn't have any adverse effect.
+    echo $SUBSYS | grep -q ^name= && {
+        NAME=$(echo $SUBSYS | sed s/^name=//)
+        ln -s $SUBSYS $CGROUP/$NAME
+    }
+
+    # Likewise, on at least one system, it has been reported that
+    # systemd would mount the CPU and CPU accounting controllers
+    # (respectively "cpu" and "cpuacct") with "-o cpuacct,cpu"
+    # but on a directory called "cpu,cpuacct" (note the inversion
+    # in the order of the groups). This tries to work around it.
+    [ $SUBSYS = cpuacct,cpu ] && ln -s $SUBSYS $CGROUP/cpu,cpuacct
+done
+
+# Note: as I write those lines, the LXC userland tools cannot setup
+# a "sub-container" properly if the "devices" cgroup is not in its
+# own hierarchy. Let's detect this and issue a warning.
+grep -q :devices: /proc/1/cgroup ||
+    echo "WARNING: the 'devices' cgroup should be in its own hierarchy." 
+grep -qw devices /proc/1/cgroup || + echo "WARNING: it looks like the 'devices' cgroup is not mounted." + +# Now, close extraneous file descriptors. +pushd /proc/self/fd >/dev/null +for FD in * +do + case "$FD" in + # Keep stdin/stdout/stderr + [012]) + ;; + # Nuke everything else + *) + eval exec "$FD>&-" + ;; + esac +done +popd >/dev/null + + +# If a pidfile is still around (for example after a container restart), +# delete it so that docker can start. +rm -rf /var/run/docker.pid + +# If we were given a PORT environment variable, start as a simple daemon; +# otherwise, spawn a shell as well +if [ "$PORT" ] +then + exec dockerd -H 0.0.0.0:$PORT -H unix:///var/run/docker.sock \ + $DOCKER_DAEMON_ARGS +else + if [ "$LOG" == "file" ] + then + dockerd $DOCKER_DAEMON_ARGS &>/var/log/docker.log & + else + dockerd $DOCKER_DAEMON_ARGS & + fi + (( timeout = 60 + SECONDS )) + until docker info >/dev/null 2>&1 + do + if (( SECONDS >= timeout )); then + echo 'Timed out trying to connect to internal docker host.' >&2 + break + fi + sleep 1 + done + [[ $1 ]] && exec "$@" + exec bash --login +fi