Enable certain memory optimization options when trying to load a large model on a low-end card

2023-08-14 03:15:45 +00:00 · 2023-08-14 03:15:45 +00:00 · 8a5e32d452
parent ffcf9dc905
commit 8a5e32d452
10 changed files with 196 additions and 73 deletions
--- a/36
+++ b/36
@ -0,0 +1,36 @@
 # Image that bundles the skynet package together with its own Docker daemon;
 # the default command is the wrapdocker launcher script.
 from pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
 # Keep apt from prompting during the build.
 env DEBIAN_FRONTEND=noninteractive
 # System packages installed in a single layer.
 run apt-get update -qq && apt-get install -qqy \
    apt-transport-https \
    ca-certificates \
    curl \
    git \
    lxc \
    vim \
    ffmpeg \
    libsm6 \
    libxext6 \
    iptables
 # Install Docker inside the image using the get.docker.com convenience script.
 run curl -sSL https://get.docker.com/ | sh
 # Launcher script used as the container command; made executable below.
 add ./wrapdocker /usr/local/bin/wrapdocker
 run chmod +x /usr/local/bin/wrapdocker
 # Declare /var/lib/docker as a volume.
 volume /var/lib/docker
 # NOTE(review): HF_HOME is a relative path here; presumably it resolves
 # against the process working directory (/root/target) - confirm.
 env HF_HOME hf_home
 workdir /root/target
 # Copy the requirement lists, setup.py and the package sources into the workdir.
 add ./requirements.cuda.txt  requirements.cuda.txt
 add ./requirements.txt requirements.txt
 add ./setup.py setup.py
 add ./skynet skynet
 # Install the CUDA requirements first, then the general ones, then the
 # package itself in editable mode.
 run pip install -r requirements.cuda.txt
 run pip install -r requirements.txt
 run pip install -e .
 cmd ["wrapdocker"]
--- a/docker/Dockerfile.runtime
+++ b/docker/Dockerfile.runtime
@ -1,16 +0,0 @@
 from python:3.10.0
 env DEBIAN_FRONTEND=noninteractive
 workdir /skynet
 copy requirements.txt requirements.txt
 copy pytest.ini ./
 copy setup.py ./
 copy skynet ./skynet
 run pip install \
    -e . \
    -r requirements.txt
 copy tests ./
--- a/docker/Dockerfile.runtime+cuda
+++ b/docker/Dockerfile.runtime+cuda
@ -1,29 +0,0 @@
 from nvidia/cuda:11.7.0-devel-ubuntu20.04
 from python:3.11
 env DEBIAN_FRONTEND=noninteractive
 run apt-get update && \
    apt-get install -y ffmpeg libsm6 libxext6
 workdir /skynet
 copy requirements.cuda* ./
 run pip install -U pip ninja
 run pip install -v -r requirements.cuda.0.txt
 run pip install -v -r requirements.cuda.1.txt
 run pip install -v -r requirements.cuda.2.txt
 copy requirements.txt requirements.txt
 copy pytest.ini pytest.ini
 copy setup.py setup.py
 copy skynet skynet
 run pip install -e . -r requirements.txt
 env PYTORCH_CUDA_ALLOC_CONF max_split_size_mb:128
 env NVIDIA_VISIBLE_DEVICES=all
 env HF_HOME /hf_home
 copy tests tests
--- a/requirements.cuda.1.txt
+++ b/requirements.cuda.1.txt
@ -1 +0,0 @@
 git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
--- a/requirements.cuda.2.txt
+++ b/requirements.cuda.2.txt
@ -1,2 +0,0 @@
 basicsr
 realesrgan
--- a/requirements.cuda.0.txt
+++ b/requirements.cuda.0.txt
@ -1,9 +1,14 @@
 torch
 scipy
 triton
 xformers
 accelerate
 transformers
 huggingface_hub
-diffusers[torch]>=0.18.0
+diffusers[torch]
 invisible-watermark
-torch==1.13.0+cu117
+
 basicsr
 realesrgan
 --extra-index-url https://download.pytorch.org/whl/cu117
--- a/skynet/constants.py
+++ b/skynet/constants.py
@ -5,18 +5,18 @@ VERSION = '0.1a11'
 DOCKER_RUNTIME_CUDA = 'skynet:runtime-cuda'
 MODELS = {
-    'prompthero/openjourney':                   { 'short': 'midj'},
+    'prompthero/openjourney':                   { 'short': 'midj', 'mem': 8 },
-    'runwayml/stable-diffusion-v1-5':           { 'short': 'stable'},
+    'runwayml/stable-diffusion-v1-5':           { 'short': 'stable', 'mem': 8 },
-    'stabilityai/stable-diffusion-2-1-base':    { 'short': 'stable2'},
+    'stabilityai/stable-diffusion-2-1-base':    { 'short': 'stable2', 'mem': 8 },
-    'snowkidy/stable-diffusion-xl-base-0.9':    { 'short': 'stablexl0.9'},
+    'snowkidy/stable-diffusion-xl-base-0.9':    { 'short': 'stablexl0.9', 'mem': 24 },
-    'stabilityai/stable-diffusion-xl-base-1.0': { 'short': 'stablexl'},
+    'stabilityai/stable-diffusion-xl-base-1.0': { 'short': 'stablexl', 'mem': 24 },
-    'Linaqruf/anything-v3.0':                   { 'short': 'hdanime'},
+    'Linaqruf/anything-v3.0':                   { 'short': 'hdanime', 'mem': 8 },
-    'hakurei/waifu-diffusion':                  { 'short': 'waifu'},
+    'hakurei/waifu-diffusion':                  { 'short': 'waifu', 'mem': 8 },
-    'nitrosocke/Ghibli-Diffusion':              { 'short': 'ghibli'},
+    'nitrosocke/Ghibli-Diffusion':              { 'short': 'ghibli', 'mem': 8 },
-    'dallinmackay/Van-Gogh-diffusion':          { 'short': 'van-gogh'},
+    'dallinmackay/Van-Gogh-diffusion':          { 'short': 'van-gogh', 'mem': 8 },
-    'lambdalabs/sd-pokemon-diffusers':          { 'short': 'pokemon'},
+    'lambdalabs/sd-pokemon-diffusers':          { 'short': 'pokemon', 'mem': 8 },
-    'Envvi/Inkpunk-Diffusion':                  { 'short': 'ink'},
+    'Envvi/Inkpunk-Diffusion':                  { 'short': 'ink', 'mem': 8 },
-    'nousr/robo-diffusion':                     { 'short': 'robot'}
+    'nousr/robo-diffusion':                     { 'short': 'robot', 'mem': 8 }
 }
 SHORT_NAMES = [
@ -153,10 +153,11 @@ DEFAULT_UPSCALER = None
 DEFAULT_CONFIG_PATH = 'skynet.ini'
 DEFAULT_INITAL_MODELS = [
-    'prompthero/openjourney',
+    'stabilityai/stable-diffusion-xl-base-1.0'
    'runwayml/stable-diffusion-v1-5'
 ]
 DEFAULT_SINGLE_CARD_MAP = 'cuda:0'
 DATE_FORMAT = '%B the %dth %Y, %H:%M:%S'
 CONFIG_ATTRS = [
--- a/skynet/ipfs/docker.py
+++ b/skynet/ipfs/docker.py
@ -83,9 +83,10 @@ def open_ipfs_node(name='skynet-ipfs'):
            remove=True
        )
        uid = 1000
        gid = 1000
        if sys.platform != 'win32':
            uid = os.getuid()
            gid = os.getgid()
            ec, out = container.exec_run(['chown', f'{uid}:{gid}', '-R', export_target])
            logging.info(out)
            assert ec == 0
--- a/skynet/utils.py
+++ b/skynet/utils.py
@ -4,6 +4,7 @@ import io
 import os
 import time
 import random
 import logging
 from typing import Optional
 from pathlib import Path
@ -24,7 +25,7 @@ from diffusers import (
 from realesrgan import RealESRGANer
 from huggingface_hub import login
-from .constants import MODELS
+from .constants import MODELS, DEFAULT_SINGLE_CARD_MAP
 def time_ms():
@ -74,16 +75,24 @@ def pipeline_for(model: str, mem_fraction: float = 1.0, image=False) -> Diffusio
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)
    model_info = MODELS[model]
    req_mem = model_info['mem']
    mem_gb = torch.cuda.mem_get_info()[1] / (10**9)
    over_mem = mem_gb < req_mem
    if over_mem:
        logging.warn(f'model requires {req_mem} but card has {mem_gb}, model will run slower..')
    shortname = model_info['short']
    params = {
        'torch_dtype': torch.float16,
        'safety_checker': None
    }
-    if model == 'runwayml/stable-diffusion-v1-5':
+    if shortname == 'stable':
        params['revision'] = 'fp16'
-    if (model == 'stabilityai/stable-diffusion-xl-base-1.0' or
+    if 'xl' in shortname:
        model == 'snowkidy/stable-diffusion-xl-base-0.9'):
        if image:
            pipe_class = StableDiffusionXLImg2ImgPipeline
        else:
@ -100,10 +109,16 @@ def pipeline_for(model: str, mem_fraction: float = 1.0, image=False) -> Diffusio
    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
        pipe.scheduler.config)
    if over_mem:
        if not image:
            pipe.enable_vae_slicing()
            pipe.enable_vae_tiling()
-    return pipe.to('cuda')
+        pipe.enable_model_cpu_offload()
    pipe.enable_xformers_memory_efficient_attention()
    return pipe
 def txt2img(
--- a/113
+++ b/113
@ -0,0 +1,113 @@
 #!/bin/bash
 # Preamble: refresh device-mapper nodes, set defaults, and make sure a tmpfs
 # is mounted at the cgroup root before the individual hierarchies are mounted.
 # Ensure that all nodes in /dev/mapper correspond to mapped devices currently loaded by the device-mapper kernel driver
 dmsetup mknodes
 # First, make sure that cgroups are mounted correctly.
 CGROUP=/sys/fs/cgroup
 # Default LOG to "stdio" when it is unset or empty. The leading ':' is the
 # no-op builtin, used only so the ${VAR:=default} expansion is evaluated.
 # LOG is compared against "file" further down to choose where dockerd logs.
 : ${LOG:=stdio}
 # Create the cgroup root directory if it does not exist yet.
 [ -d $CGROUP ] ||
 	mkdir $CGROUP
 # Mount a tmpfs on the cgroup root unless something is mounted there already;
 # a failed mount aborts the script with exit status 1.
 mountpoint -q $CGROUP ||
 	mount -n -t tmpfs -o uid=0,gid=0,mode=0755 cgroup $CGROUP || {
 		echo "Could not make a tmpfs mount. Did you use --privileged?"
 		exit 1
 	}
 # Mount securityfs when /sys/kernel/security exists and nothing is mounted
 # on it yet. A failed mount is only reported; the script keeps going.
 if [ -d /sys/kernel/security ]
 then
    if ! mountpoint -q /sys/kernel/security
    then
        if ! mount -t securityfs none /sys/kernel/security
        then
            echo "Could not mount /sys/kernel/security."
            echo "AppArmor detection and --privileged mode might break."
        fi
    fi
 fi
 # Mount the cgroup hierarchies exactly as they are in the parent system.
 # The second colon-separated field of every /proc/1/cgroup line is taken as
 # one hierarchy name; each gets a directory under $CGROUP and is mounted there
 # with "-o <name>" unless that directory is already a mountpoint.
 for SUBSYS in $(cut -d: -f2 /proc/1/cgroup)
 do
        [ -d $CGROUP/$SUBSYS ] || mkdir $CGROUP/$SUBSYS
        mountpoint -q $CGROUP/$SUBSYS ||
                mount -n -t cgroup -o $SUBSYS cgroup $CGROUP/$SUBSYS
        # The two following sections address a bug which manifests itself
        # by a cryptic "lxc-start: no ns_cgroup option specified" when
        # trying to start containers within a container.
        # The bug seems to appear when the cgroup hierarchies are not
        # mounted on the exact same directories in the host, and in the
        # container.
        # Named, control-less cgroups are mounted with "-o name=foo"
        # (and appear as such under /proc/<pid>/cgroup) but are usually
        # mounted on a directory named "foo" (without the "name=" prefix).
        # Systemd and OpenRC (and possibly others) both create such a
        # cgroup. To avoid the aforementioned bug, we symlink "foo" to
        # "name=foo". This shouldn't have any adverse effect.
        echo $SUBSYS | grep -q ^name= && {
                # Strip the "name=" prefix to get the conventional directory name.
                NAME=$(echo $SUBSYS | sed s/^name=//)
                ln -s $SUBSYS $CGROUP/$NAME
        }
        # Likewise, on at least one system, it has been reported that
        # systemd would mount the CPU and CPU accounting controllers
        # (respectively "cpu" and "cpuacct") with "-o cpuacct,cpu"
        # but on a directory called "cpu,cpuacct" (note the inversion
        # in the order of the groups). This tries to work around it.
        [ $SUBSYS = cpuacct,cpu ] && ln -s $SUBSYS $CGROUP/cpu,cpuacct
 done
 # Per the original author's note, the LXC userland tools could not set up a
 # "sub-container" properly unless the "devices" cgroup lives in its own
 # hierarchy. Both checks below only print a warning; nothing is aborted.
 # Warn when no /proc/1/cgroup line has "devices" as its sole controller field.
 if ! grep -q :devices: /proc/1/cgroup
 then
 	echo "WARNING: the 'devices' cgroup should be in its own hierarchy."
 fi
 # Warn when the word "devices" does not appear in /proc/1/cgroup at all.
 if ! grep -qw devices /proc/1/cgroup
 then
 	echo "WARNING: it looks like the 'devices' cgroup is not mounted."
 fi
 # Now, close extraneous file descriptors.
 # The loop walks the entries of /proc/self/fd (one per open descriptor of
 # this shell) and closes every one except 0, 1 and 2. pushd/popd restore the
 # previous working directory afterwards.
 pushd /proc/self/fd >/dev/null
 for FD in *
 do
 	case "$FD" in
 	# Keep stdin/stdout/stderr
 	[012])
 		;;
 	# Nuke everything else
 	*)
 		# eval is needed so the descriptor number is substituted before
 		# the "N>&-" redirection is parsed by the shell.
 		eval exec "$FD>&-"
 		;;
 	esac
 done
 popd >/dev/null
 # If a pidfile is still around (for example after a container restart),
 # delete it so that docker can start.
 rm -rf /var/run/docker.pid
 # If we were given a PORT environment variable, start as a simple daemon;
 # otherwise, spawn a shell as well
 if [ "$PORT" ]
 then
 	# Replace this shell with dockerd, listening on TCP port $PORT on all
 	# interfaces and on the local unix socket. $DOCKER_DAEMON_ARGS is left
 	# unquoted so it splits into separate arguments.
 	exec dockerd -H 0.0.0.0:$PORT -H unix:///var/run/docker.sock \
 		$DOCKER_DAEMON_ARGS
 else
 	# Start dockerd in the background; with LOG=file its stdout and stderr
 	# go to /var/log/docker.log, otherwise they stay on this shell's stdio.
 	if [ "$LOG" == "file" ]
 	then
 		dockerd $DOCKER_DAEMON_ARGS &>/var/log/docker.log &
 	else
 		dockerd $DOCKER_DAEMON_ARGS &
 	fi
 	# Poll "docker info" once per second for up to 60 seconds. SECONDS is
 	# bash's elapsed-time counter, so timeout is an absolute deadline.
 	(( timeout = 60 + SECONDS ))
 	until docker info >/dev/null 2>&1
 	do
 		if (( SECONDS >= timeout )); then
 			echo 'Timed out trying to connect to internal docker host.' >&2
 			# Only the wait loop is abandoned; the hand-off below still runs.
 			break
 		fi
 		sleep 1
 	done
 	# Hand control to the command given as arguments, if any; otherwise
 	# replace this shell with an interactive login shell.
 	[[ $1 ]] && exec "$@"
 	exec bash --login
 fi
		`@ -1 +0,0 @@`
			`git+https://github.com/facebookresearch/xformers.git@main#egg=xformers`