From 8a5e32d45227284c0f33470c8026501a6eb400f4 Mon Sep 17 00:00:00 2001 From: Guillermo Rodriguez Date: Mon, 14 Aug 2023 03:15:45 +0000 Subject: [PATCH] Enable certain memory optmization options on cases where trying to load a large model on a low end card --- Dockerfile-cuda | 36 ++++++ docker/Dockerfile.runtime | 16 --- docker/Dockerfile.runtime+cuda | 29 ----- requirements.cuda.1.txt | 1 - requirements.cuda.2.txt | 2 - ...ements.cuda.0.txt => requirements.cuda.txt | 9 +- skynet/constants.py | 29 ++--- skynet/ipfs/docker.py | 5 +- skynet/utils.py | 29 +++-- wrapdocker | 113 ++++++++++++++++++ 10 files changed, 196 insertions(+), 73 deletions(-) create mode 100644 Dockerfile-cuda delete mode 100644 docker/Dockerfile.runtime delete mode 100644 docker/Dockerfile.runtime+cuda delete mode 100644 requirements.cuda.1.txt delete mode 100644 requirements.cuda.2.txt rename requirements.cuda.0.txt => requirements.cuda.txt (71%) create mode 100755 wrapdocker diff --git a/Dockerfile-cuda b/Dockerfile-cuda new file mode 100644 index 0000000..393f8c2 --- /dev/null +++ b/Dockerfile-cuda @@ -0,0 +1,36 @@ +from pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel +env DEBIAN_FRONTEND=noninteractive + +run apt-get update -qq && apt-get install -qqy \ + apt-transport-https \ + ca-certificates \ + curl \ + git \ + lxc \ + vim \ + ffmpeg \ + libsm6 \ + libxext6 \ + iptables + +run curl -sSL https://get.docker.com/ | sh + +add ./wrapdocker /usr/local/bin/wrapdocker +run chmod +x /usr/local/bin/wrapdocker + +volume /var/lib/docker + +env HF_HOME hf_home + +workdir /root/target + +add ./requirements.cuda.txt requirements.cuda.txt +add ./requirements.txt requirements.txt +add ./setup.py setup.py +add ./skynet skynet + +run pip install -r requirements.cuda.txt +run pip install -r requirements.txt +run pip install -e . 
+ +cmd ["wrapdocker"] diff --git a/docker/Dockerfile.runtime b/docker/Dockerfile.runtime deleted file mode 100644 index 316fdcb..0000000 --- a/docker/Dockerfile.runtime +++ /dev/null @@ -1,16 +0,0 @@ -from python:3.10.0 - -env DEBIAN_FRONTEND=noninteractive - -workdir /skynet - -copy requirements.txt requirements.txt -copy pytest.ini ./ -copy setup.py ./ -copy skynet ./skynet - -run pip install \ - -e . \ - -r requirements.txt - -copy tests ./ diff --git a/docker/Dockerfile.runtime+cuda b/docker/Dockerfile.runtime+cuda deleted file mode 100644 index 6d52960..0000000 --- a/docker/Dockerfile.runtime+cuda +++ /dev/null @@ -1,29 +0,0 @@ -from nvidia/cuda:11.7.0-devel-ubuntu20.04 -from python:3.11 - -env DEBIAN_FRONTEND=noninteractive - -run apt-get update && \ - apt-get install -y ffmpeg libsm6 libxext6 - -workdir /skynet - -copy requirements.cuda* ./ - -run pip install -U pip ninja -run pip install -v -r requirements.cuda.0.txt -run pip install -v -r requirements.cuda.1.txt -run pip install -v -r requirements.cuda.2.txt - -copy requirements.txt requirements.txt -copy pytest.ini pytest.ini -copy setup.py setup.py -copy skynet skynet - -run pip install -e . 
-r requirements.txt - -env PYTORCH_CUDA_ALLOC_CONF max_split_size_mb:128 -env NVIDIA_VISIBLE_DEVICES=all -env HF_HOME /hf_home - -copy tests tests diff --git a/requirements.cuda.1.txt b/requirements.cuda.1.txt deleted file mode 100644 index b9f2703..0000000 --- a/requirements.cuda.1.txt +++ /dev/null @@ -1 +0,0 @@ -git+https://github.com/facebookresearch/xformers.git@main#egg=xformers diff --git a/requirements.cuda.2.txt b/requirements.cuda.2.txt deleted file mode 100644 index 4d3fee4..0000000 --- a/requirements.cuda.2.txt +++ /dev/null @@ -1,2 +0,0 @@ -basicsr -realesrgan diff --git a/requirements.cuda.0.txt b/requirements.cuda.txt similarity index 71% rename from requirements.cuda.0.txt rename to requirements.cuda.txt index f796537..8b7341c 100644 --- a/requirements.cuda.0.txt +++ b/requirements.cuda.txt @@ -1,9 +1,14 @@ +torch scipy triton +xformers accelerate transformers huggingface_hub -diffusers[torch]>=0.18.0 +diffusers[torch] invisible-watermark -torch==1.13.0+cu117 + +basicsr +realesrgan + --extra-index-url https://download.pytorch.org/whl/cu117 diff --git a/skynet/constants.py b/skynet/constants.py index 743270f..f7bb5dc 100755 --- a/skynet/constants.py +++ b/skynet/constants.py @@ -5,18 +5,18 @@ VERSION = '0.1a11' DOCKER_RUNTIME_CUDA = 'skynet:runtime-cuda' MODELS = { - 'prompthero/openjourney': { 'short': 'midj'}, - 'runwayml/stable-diffusion-v1-5': { 'short': 'stable'}, - 'stabilityai/stable-diffusion-2-1-base': { 'short': 'stable2'}, - 'snowkidy/stable-diffusion-xl-base-0.9': { 'short': 'stablexl0.9'}, - 'stabilityai/stable-diffusion-xl-base-1.0': { 'short': 'stablexl'}, - 'Linaqruf/anything-v3.0': { 'short': 'hdanime'}, - 'hakurei/waifu-diffusion': { 'short': 'waifu'}, - 'nitrosocke/Ghibli-Diffusion': { 'short': 'ghibli'}, - 'dallinmackay/Van-Gogh-diffusion': { 'short': 'van-gogh'}, - 'lambdalabs/sd-pokemon-diffusers': { 'short': 'pokemon'}, - 'Envvi/Inkpunk-Diffusion': { 'short': 'ink'}, - 'nousr/robo-diffusion': { 'short': 'robot'} + 
'prompthero/openjourney': { 'short': 'midj', 'mem': 8 }, + 'runwayml/stable-diffusion-v1-5': { 'short': 'stable', 'mem': 8 }, + 'stabilityai/stable-diffusion-2-1-base': { 'short': 'stable2', 'mem': 8 }, + 'snowkidy/stable-diffusion-xl-base-0.9': { 'short': 'stablexl0.9', 'mem': 24 }, + 'stabilityai/stable-diffusion-xl-base-1.0': { 'short': 'stablexl', 'mem': 24 }, + 'Linaqruf/anything-v3.0': { 'short': 'hdanime', 'mem': 8 }, + 'hakurei/waifu-diffusion': { 'short': 'waifu', 'mem': 8 }, + 'nitrosocke/Ghibli-Diffusion': { 'short': 'ghibli', 'mem': 8 }, + 'dallinmackay/Van-Gogh-diffusion': { 'short': 'van-gogh', 'mem': 8 }, + 'lambdalabs/sd-pokemon-diffusers': { 'short': 'pokemon', 'mem': 8 }, + 'Envvi/Inkpunk-Diffusion': { 'short': 'ink', 'mem': 8 }, + 'nousr/robo-diffusion': { 'short': 'robot', 'mem': 8 } } SHORT_NAMES = [ @@ -153,10 +153,11 @@ DEFAULT_UPSCALER = None DEFAULT_CONFIG_PATH = 'skynet.ini' DEFAULT_INITAL_MODELS = [ - 'prompthero/openjourney', - 'runwayml/stable-diffusion-v1-5' + 'stabilityai/stable-diffusion-xl-base-1.0' ] +DEFAULT_SINGLE_CARD_MAP = 'cuda:0' + DATE_FORMAT = '%B the %dth %Y, %H:%M:%S' CONFIG_ATTRS = [ diff --git a/skynet/ipfs/docker.py b/skynet/ipfs/docker.py index 851bb12..1f315ba 100755 --- a/skynet/ipfs/docker.py +++ b/skynet/ipfs/docker.py @@ -83,9 +83,10 @@ def open_ipfs_node(name='skynet-ipfs'): remove=True ) + uid = 1000 + gid = 1000 + if sys.platform != 'win32': - uid = os.getuid() - gid = os.getgid() ec, out = container.exec_run(['chown', f'{uid}:{gid}', '-R', export_target]) logging.info(out) assert ec == 0 diff --git a/skynet/utils.py b/skynet/utils.py index 2ec7a6c..1cacd5b 100755 --- a/skynet/utils.py +++ b/skynet/utils.py @@ -4,6 +4,7 @@ import io import os import time import random +import logging from typing import Optional from pathlib import Path @@ -24,7 +25,7 @@ from diffusers import ( from realesrgan import RealESRGANer from huggingface_hub import login -from .constants import MODELS +from .constants import MODELS, 
DEFAULT_SINGLE_CARD_MAP def time_ms(): @@ -74,16 +75,24 @@ def pipeline_for(model: str, mem_fraction: float = 1.0, image=False) -> Diffusio torch.backends.cudnn.benchmark = False torch.use_deterministic_algorithms(True) + model_info = MODELS[model] + + req_mem = model_info['mem'] + mem_gb = torch.cuda.mem_get_info()[1] / (10**9) + over_mem = mem_gb < req_mem + if over_mem: + logging.warn(f'model requires {req_mem} but card has {mem_gb}, model will run slower..') + + shortname = model_info['short'] params = { 'torch_dtype': torch.float16, 'safety_checker': None } - if model == 'runwayml/stable-diffusion-v1-5': + if shortname == 'stable': params['revision'] = 'fp16' - if (model == 'stabilityai/stable-diffusion-xl-base-1.0' or - model == 'snowkidy/stable-diffusion-xl-base-0.9'): + if 'xl' in shortname: if image: pipe_class = StableDiffusionXLImg2ImgPipeline else: @@ -100,10 +109,16 @@ def pipeline_for(model: str, mem_fraction: float = 1.0, image=False) -> Diffusio pipe.scheduler = EulerAncestralDiscreteScheduler.from_config( pipe.scheduler.config) - if not image: - pipe.enable_vae_slicing() + if over_mem: + if not image: + pipe.enable_vae_slicing() + pipe.enable_vae_tiling() - return pipe.to('cuda') + pipe.enable_model_cpu_offload() + + pipe.enable_xformers_memory_efficient_attention() + + return pipe def txt2img( diff --git a/wrapdocker b/wrapdocker new file mode 100755 index 0000000..ac927a3 --- /dev/null +++ b/wrapdocker @@ -0,0 +1,113 @@ +#!/bin/bash + +# Ensure that all nodes in /dev/mapper correspond to mapped devices currently loaded by the device-mapper kernel driver +dmsetup mknodes + +# First, make sure that cgroups are mounted correctly. +CGROUP=/sys/fs/cgroup +: {LOG:=stdio} + +[ -d $CGROUP ] || + mkdir $CGROUP + +mountpoint -q $CGROUP || + mount -n -t tmpfs -o uid=0,gid=0,mode=0755 cgroup $CGROUP || { + echo "Could not make a tmpfs mount. Did you use --privileged?" + exit 1 + } + +if [ -d /sys/kernel/security ] && ! 
mountpoint -q /sys/kernel/security
+then
+    mount -t securityfs none /sys/kernel/security || {
+        echo "Could not mount /sys/kernel/security."
+        echo "AppArmor detection and --privileged mode might break."
+    }
+fi
+
+# Mount the cgroup hierarchies exactly as they are in the parent system.
+for SUBSYS in $(cut -d: -f2 /proc/1/cgroup)
+do
+    [ -d $CGROUP/$SUBSYS ] || mkdir $CGROUP/$SUBSYS
+    mountpoint -q $CGROUP/$SUBSYS ||
+        mount -n -t cgroup -o $SUBSYS cgroup $CGROUP/$SUBSYS
+
+    # The two following sections address a bug which manifests itself
+    # by a cryptic "lxc-start: no ns_cgroup option specified" when
+    # trying to start containers within a container.
+    # The bug seems to appear when the cgroup hierarchies are not
+    # mounted on the exact same directories in the host, and in the
+    # container.
+
+    # Named, control-less cgroups are mounted with "-o name=foo"
+    # (and appear as such under /proc/<pid>/cgroup) but are usually
+    # mounted on a directory named "foo" (without the "name=" prefix).
+    # Systemd and OpenRC (and possibly others) both create such a
+    # cgroup. To avoid the aforementioned bug, we symlink "foo" to
+    # "name=foo". This shouldn't have any adverse effect.
+    echo $SUBSYS | grep -q ^name= && {
+        NAME=$(echo $SUBSYS | sed s/^name=//)
+        ln -s $SUBSYS $CGROUP/$NAME
+    }
+
+    # Likewise, on at least one system, it has been reported that
+    # systemd would mount the CPU and CPU accounting controllers
+    # (respectively "cpu" and "cpuacct") with "-o cpuacct,cpu"
+    # but on a directory called "cpu,cpuacct" (note the inversion
+    # in the order of the groups). This tries to work around it.
+    [ $SUBSYS = cpuacct,cpu ] && ln -s $SUBSYS $CGROUP/cpu,cpuacct
+done
+
+# Note: as I write those lines, the LXC userland tools cannot setup
+# a "sub-container" properly if the "devices" cgroup is not in its
+# own hierarchy. Let's detect this and issue a warning.
+grep -q :devices: /proc/1/cgroup ||
+    echo "WARNING: the 'devices' cgroup should be in its own hierarchy." 
+grep -qw devices /proc/1/cgroup || + echo "WARNING: it looks like the 'devices' cgroup is not mounted." + +# Now, close extraneous file descriptors. +pushd /proc/self/fd >/dev/null +for FD in * +do + case "$FD" in + # Keep stdin/stdout/stderr + [012]) + ;; + # Nuke everything else + *) + eval exec "$FD>&-" + ;; + esac +done +popd >/dev/null + + +# If a pidfile is still around (for example after a container restart), +# delete it so that docker can start. +rm -rf /var/run/docker.pid + +# If we were given a PORT environment variable, start as a simple daemon; +# otherwise, spawn a shell as well +if [ "$PORT" ] +then + exec dockerd -H 0.0.0.0:$PORT -H unix:///var/run/docker.sock \ + $DOCKER_DAEMON_ARGS +else + if [ "$LOG" == "file" ] + then + dockerd $DOCKER_DAEMON_ARGS &>/var/log/docker.log & + else + dockerd $DOCKER_DAEMON_ARGS & + fi + (( timeout = 60 + SECONDS )) + until docker info >/dev/null 2>&1 + do + if (( SECONDS >= timeout )); then + echo 'Timed out trying to connect to internal docker host.' >&2 + break + fi + sleep 1 + done + [[ $1 ]] && exec "$@" + exec bash --login +fi