lixiang6 committed on
Commit
15f87d2
·
verified ·
1 Parent(s): ef98fac

Upload 19 files

Browse files
.gitignore ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compile / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Jupyter Notebook
55
+ .ipynb_checkpoints
56
+
57
+ # pyenv
58
+ .python-version
59
+
60
+ # celery beat schedule file
61
+ celerybeat-schedule
62
+
63
+ # SageMath parsed files
64
+ *.sage.py
65
+
66
+ # Environments
67
+ .env
68
+ .venv
69
+ env/
70
+ venv/
71
+ ENV/
72
+ env.bak/
73
+ venv.bak/
74
+
75
+ # Spyder project settings
76
+ .spyderproject
77
+ .spyproject
78
+
79
+ # Rope project settings
80
+ .ropeproject
81
+
82
+ # mkdocs documentation
83
+ /site
84
+
85
+ # mypy
86
+ .mypy_cache/
87
+ .aider*
88
+
89
+ # ignore ALL .log files
90
+ *.log
91
+ # ignore notebook checkpoints, generated results, local VS Code settings, two font files, and data/
92
+ .ipynb_checkpoints/
93
+ images/.ipynb_checkpoints/
94
+ results/
95
+ .vscode/settings.json
96
+ fonts/agency.ttf
97
+ fonts/calibri.ttf
98
+ data/
Comic_Generation.ipynb ADDED
@@ -0,0 +1,665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "editable": true,
7
+ "slideshow": {
8
+ "slide_type": ""
9
+ },
10
+ "tags": []
11
+ },
12
+ "source": [
13
+ "## StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation \n",
14
+ "[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md-dark.svg)]()\n",
15
+ "[[Paper]()] &emsp; [[Project Page]()] &emsp; <br>"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "metadata": {},
21
+ "source": [
22
+ "### Import Packages"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 1,
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "name": "stderr",
32
+ "output_type": "stream",
33
+ "text": [
34
+ "/home/tjut_lixiang/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
35
+ " from .autonotebook import tqdm as notebook_tqdm\n"
36
+ ]
37
+ }
38
+ ],
39
+ "source": [
40
+ "# %load_ext autoreload\n",
41
+ "# %autoreload 2\n",
42
+ "import gradio as gr\n",
43
+ "import numpy as np\n",
44
+ "import torch\n",
45
+ "import requests\n",
46
+ "import random\n",
47
+ "import os\n",
48
+ "import sys\n",
49
+ "import pickle\n",
50
+ "from PIL import Image\n",
51
+ "from tqdm.auto import tqdm\n",
52
+ "from datetime import datetime\n",
53
+ "from utils.gradio_utils import is_torch2_available\n",
54
+ "if is_torch2_available():\n",
55
+ " from utils.gradio_utils import \\\n",
56
+ " AttnProcessor2_0 as AttnProcessor\n",
57
+ "else:\n",
58
+ " from utils.gradio_utils import AttnProcessor\n",
59
+ "\n",
60
+ "import diffusers\n",
61
+ "from diffusers import StableDiffusionXLPipeline\n",
62
+ "from diffusers import DDIMScheduler\n",
63
+ "import torch.nn.functional as F\n",
64
+ "from utils.gradio_utils import cal_attn_mask_xl\n",
65
+ "import copy\n",
66
+ "import os\n",
67
+ "from diffusers.utils import load_image\n",
68
+ "from utils.utils import get_comic\n",
69
+ "from utils.style_template import styles"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "markdown",
74
+ "metadata": {},
75
+ "source": [
76
+ "### Set Config "
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 2,
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "## Global\n",
86
+ "STYLE_NAMES = list(styles.keys())\n",
87
+ "DEFAULT_STYLE_NAME = \"(No style)\"\n",
88
+ "MAX_SEED = np.iinfo(np.int32).max\n",
89
+ "global models_dict\n",
90
+ "use_va = False\n",
91
+ "models_dict = {\n",
92
+ " \"Juggernaut\":\"RunDiffusion/Juggernaut-XL-v8\",\n",
93
+ " \"RealVision\":\"SG161222/RealVisXL_V4.0\" ,\n",
94
+ " \"SDXL\":\"stabilityai/stable-diffusion-xl-base-1.0\" ,\n",
95
+ " \"Unstable\": \"stablediffusionapi/sdxl-unstable-diffusers-y\"\n",
96
+ "}"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 3,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "data": {
106
+ "text/plain": [
107
+ "True"
108
+ ]
109
+ },
110
+ "execution_count": 3,
111
+ "metadata": {},
112
+ "output_type": "execute_result"
113
+ }
114
+ ],
115
+ "source": [
116
+ "torch.cuda.is_available()"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 4,
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "def setup_seed(seed):\n",
126
+ " torch.manual_seed(seed)\n",
127
+ " torch.cuda.manual_seed_all(seed)\n",
128
+ " np.random.seed(seed)\n",
129
+ " random.seed(seed)\n",
130
+ " torch.backends.cudnn.deterministic = True\n",
131
+ "\n",
132
+ " \n",
133
+ "#################################################\n",
134
+ "########Consistent Self-Attention################\n",
135
+ "#################################################\n",
136
+ "class SpatialAttnProcessor2_0(torch.nn.Module):\n",
137
+ " r\"\"\"\n",
138
+ " Consistent Self-Attention processor for PyTorch 2.0.\n",
139
+ " Args:\n",
140
+ " hidden_size (`int`):\n",
141
+ " The hidden size of the attention layer.\n",
142
+ " cross_attention_dim (`int`):\n",
143
+ " The number of channels in the `encoder_hidden_states`.\n",
144
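+ " id_length (`int`, defaults to 4):\n",
145
+ " Split index for the batch: in write mode the first `id_length` entries of `hidden_states` are cached separately from the rest.\n",
146
+ " device, dtype (default to `cuda:0` and `torch.float16`):\n",
147
+ " Device the cached states are moved to, and device/dtype passed to `cal_attn_mask_xl` for the masks.\n",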
148
+ " \"\"\"\n",
149
+ "\n",
150
+ " def __init__(self, hidden_size = None, cross_attention_dim=None,id_length = 4,device = \"cuda:0\",dtype = torch.float16):\n",
151
+ " super().__init__()\n",
152
+ " if not hasattr(F, \"scaled_dot_product_attention\"):\n",
153
+ " raise ImportError(\"AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.\")\n",
154
+ " self.device = device\n",
155
+ " self.dtype = dtype\n",
156
+ " self.hidden_size = hidden_size\n",
157
+ " self.cross_attention_dim = cross_attention_dim\n",
158
+ " self.total_length = id_length + 1\n",
159
+ " self.id_length = id_length\n",
160
+ " self.id_bank = {}\n",
161
+ "\n",
162
+ " def __call__(\n",
163
+ " self,\n",
164
+ " attn,\n",
165
+ " hidden_states,\n",
166
+ " encoder_hidden_states=None,\n",
167
+ " attention_mask=None,\n",
168
+ " temb=None):\n",
169
+ " global total_count,attn_count,cur_step,mask1024,mask4096\n",
170
+ " global sa32, sa64\n",
171
+ " global write\n",
172
+ " global height,width\n",
173
+ " if write:\n",
174
+ " # print(f\"white:{cur_step}\")\n",
175
+ " self.id_bank[cur_step] = [hidden_states[:self.id_length], hidden_states[self.id_length:]]\n",
176
+ " else:\n",
177
+ " encoder_hidden_states = torch.cat((self.id_bank[cur_step][0].to(self.device),hidden_states[:1],self.id_bank[cur_step][1].to(self.device),hidden_states[1:]))\n",
178
+ " # skip in early step\n",
179
+ " if cur_step <5:\n",
180
+ " hidden_states = self.__call2__(attn, hidden_states,encoder_hidden_states,attention_mask,temb)\n",
181
+ " else: # 256 1024 4096\n",
182
+ " random_number = random.random()\n",
183
+ " if cur_step <20:\n",
184
+ " rand_num = 0.3\n",
185
+ " else:\n",
186
+ " rand_num = 0.1\n",
187
+ " if random_number > rand_num:\n",
188
+ " if not write:\n",
189
+ " if hidden_states.shape[1] == (height//32) * (width//32):\n",
190
+ " attention_mask = mask1024[mask1024.shape[0] // self.total_length * self.id_length:]\n",
191
+ " else:\n",
192
+ " attention_mask = mask4096[mask4096.shape[0] // self.total_length * self.id_length:]\n",
193
+ " else:\n",
194
+ " if hidden_states.shape[1] == (height//32) * (width//32):\n",
195
+ " attention_mask = mask1024[:mask1024.shape[0] // self.total_length * self.id_length,:mask1024.shape[0] // self.total_length * self.id_length]\n",
196
+ " else:\n",
197
+ " attention_mask = mask4096[:mask4096.shape[0] // self.total_length * self.id_length,:mask4096.shape[0] // self.total_length * self.id_length]\n",
198
+ " hidden_states = self.__call1__(attn, hidden_states,encoder_hidden_states,attention_mask,temb)\n",
199
+ " else:\n",
200
+ " hidden_states = self.__call2__(attn, hidden_states,None,attention_mask,temb)\n",
201
+ " attn_count +=1\n",
202
+ " if attn_count == total_count:\n",
203
+ " attn_count = 0\n",
204
+ " cur_step += 1\n",
205
+ " mask1024,mask4096 = cal_attn_mask_xl(self.total_length,self.id_length,sa32,sa64,height,width, device=self.device, dtype= self.dtype)\n",
206
+ "\n",
207
+ " return hidden_states\n",
208
+ " def __call1__(\n",
209
+ " self,\n",
210
+ " attn,\n",
211
+ " hidden_states,\n",
212
+ " encoder_hidden_states=None,\n",
213
+ " attention_mask=None,\n",
214
+ " temb=None,\n",
215
+ " ):\n",
216
+ " residual = hidden_states\n",
217
+ " if attn.spatial_norm is not None:\n",
218
+ " hidden_states = attn.spatial_norm(hidden_states, temb)\n",
219
+ " input_ndim = hidden_states.ndim\n",
220
+ "\n",
221
+ " if input_ndim == 4:\n",
222
+ " total_batch_size, channel, height, width = hidden_states.shape\n",
223
+ " hidden_states = hidden_states.view(total_batch_size, channel, height * width).transpose(1, 2)\n",
224
+ " total_batch_size,nums_token,channel = hidden_states.shape\n",
225
+ " img_nums = total_batch_size//2\n",
226
+ " hidden_states = hidden_states.view(-1,img_nums,nums_token,channel).reshape(-1,img_nums * nums_token,channel)\n",
227
+ "\n",
228
+ " batch_size, sequence_length, _ = hidden_states.shape\n",
229
+ "\n",
230
+ " if attn.group_norm is not None:\n",
231
+ " hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)\n",
232
+ "\n",
233
+ " query = attn.to_q(hidden_states)\n",
234
+ "\n",
235
+ " if encoder_hidden_states is None:\n",
236
+ " encoder_hidden_states = hidden_states # B, N, C\n",
237
+ " else:\n",
238
+ " encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,nums_token,channel).reshape(-1,(self.id_length+1) * nums_token,channel)\n",
239
+ "\n",
240
+ " key = attn.to_k(encoder_hidden_states)\n",
241
+ " value = attn.to_v(encoder_hidden_states)\n",
242
+ "\n",
243
+ "\n",
244
+ " inner_dim = key.shape[-1]\n",
245
+ " head_dim = inner_dim // attn.heads\n",
246
+ "\n",
247
+ " query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)\n",
248
+ "\n",
249
+ " key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)\n",
250
+ " value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)\n",
251
+ " hidden_states = F.scaled_dot_product_attention(\n",
252
+ " query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False\n",
253
+ " )\n",
254
+ "\n",
255
+ " hidden_states = hidden_states.transpose(1, 2).reshape(total_batch_size, -1, attn.heads * head_dim)\n",
256
+ " hidden_states = hidden_states.to(query.dtype)\n",
257
+ "\n",
258
+ "\n",
259
+ "\n",
260
+ " # linear proj\n",
261
+ " hidden_states = attn.to_out[0](hidden_states)\n",
262
+ " # dropout\n",
263
+ " hidden_states = attn.to_out[1](hidden_states)\n",
264
+ "\n",
265
+ "\n",
266
+ " if input_ndim == 4:\n",
267
+ " hidden_states = hidden_states.transpose(-1, -2).reshape(total_batch_size, channel, height, width)\n",
268
+ " if attn.residual_connection:\n",
269
+ " hidden_states = hidden_states + residual\n",
270
+ " hidden_states = hidden_states / attn.rescale_output_factor\n",
271
+ " # print(hidden_states.shape)\n",
272
+ " return hidden_states\n",
273
+ " def __call2__(\n",
274
+ " self,\n",
275
+ " attn,\n",
276
+ " hidden_states,\n",
277
+ " encoder_hidden_states=None,\n",
278
+ " attention_mask=None,\n",
279
+ " temb=None):\n",
280
+ " residual = hidden_states\n",
281
+ "\n",
282
+ " if attn.spatial_norm is not None:\n",
283
+ " hidden_states = attn.spatial_norm(hidden_states, temb)\n",
284
+ "\n",
285
+ " input_ndim = hidden_states.ndim\n",
286
+ "\n",
287
+ " if input_ndim == 4:\n",
288
+ " batch_size, channel, height, width = hidden_states.shape\n",
289
+ " hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)\n",
290
+ "\n",
291
+ " batch_size, sequence_length, channel = (\n",
292
+ " hidden_states.shape\n",
293
+ " )\n",
294
+ " # print(hidden_states.shape)\n",
295
+ " if attention_mask is not None:\n",
296
+ " attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)\n",
297
+ " # scaled_dot_product_attention expects attention_mask shape to be\n",
298
+ " # (batch, heads, source_length, target_length)\n",
299
+ " attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])\n",
300
+ "\n",
301
+ " if attn.group_norm is not None:\n",
302
+ " hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)\n",
303
+ "\n",
304
+ " query = attn.to_q(hidden_states)\n",
305
+ "\n",
306
+ " if encoder_hidden_states is None:\n",
307
+ " encoder_hidden_states = hidden_states # B, N, C\n",
308
+ " else:\n",
309
+ " encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,sequence_length,channel).reshape(-1,(self.id_length+1) * sequence_length,channel)\n",
310
+ "\n",
311
+ " key = attn.to_k(encoder_hidden_states)\n",
312
+ " value = attn.to_v(encoder_hidden_states)\n",
313
+ "\n",
314
+ " inner_dim = key.shape[-1]\n",
315
+ " head_dim = inner_dim // attn.heads\n",
316
+ "\n",
317
+ " query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)\n",
318
+ "\n",
319
+ " key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)\n",
320
+ " value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)\n",
321
+ "\n",
322
+ " hidden_states = F.scaled_dot_product_attention(\n",
323
+ " query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False\n",
324
+ " )\n",
325
+ "\n",
326
+ " hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)\n",
327
+ " hidden_states = hidden_states.to(query.dtype)\n",
328
+ "\n",
329
+ " # linear proj\n",
330
+ " hidden_states = attn.to_out[0](hidden_states)\n",
331
+ " # dropout\n",
332
+ " hidden_states = attn.to_out[1](hidden_states)\n",
333
+ "\n",
334
+ " if input_ndim == 4:\n",
335
+ " hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)\n",
336
+ "\n",
337
+ " if attn.residual_connection:\n",
338
+ " hidden_states = hidden_states + residual\n",
339
+ "\n",
340
+ " hidden_states = hidden_states / attn.rescale_output_factor\n",
341
+ "\n",
342
+ " return hidden_states\n",
343
+ "\n",
344
+ "def set_attention_processor(unet,id_length):\n",
345
+ " attn_procs = {}\n",
346
+ " for name in unet.attn_processors.keys():\n",
347
+ " cross_attention_dim = None if name.endswith(\"attn1.processor\") else unet.config.cross_attention_dim\n",
348
+ " if name.startswith(\"mid_block\"):\n",
349
+ " hidden_size = unet.config.block_out_channels[-1]\n",
350
+ " elif name.startswith(\"up_blocks\"):\n",
351
+ " block_id = int(name[len(\"up_blocks.\")])\n",
352
+ " hidden_size = list(reversed(unet.config.block_out_channels))[block_id]\n",
353
+ " elif name.startswith(\"down_blocks\"):\n",
354
+ " block_id = int(name[len(\"down_blocks.\")])\n",
355
+ " hidden_size = unet.config.block_out_channels[block_id]\n",
356
+ " if cross_attention_dim is None:\n",
357
+ " if name.startswith(\"up_blocks\") :\n",
358
+ " attn_procs[name] = SpatialAttnProcessor2_0(id_length = id_length)\n",
359
+ " else: \n",
360
+ " attn_procs[name] = AttnProcessor()\n",
361
+ " else:\n",
362
+ " attn_procs[name] = AttnProcessor()\n",
363
+ "\n",
364
+ " unet.set_attn_processor(attn_procs)"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "markdown",
369
+ "metadata": {},
370
+ "source": [
371
+ "### Load Pipeline"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 5,
377
+ "metadata": {},
378
+ "outputs": [
379
+ {
380
+ "name": "stderr",
381
+ "output_type": "stream",
382
+ "text": [
383
+ "/home/tjut_lixiang/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
384
+ " warnings.warn(\n",
385
+ "Loading pipeline components...: 100%|██████████| 7/7 [00:49<00:00, 7.13s/it]\n"
386
+ ]
387
+ },
388
+ {
389
+ "name": "stdout",
390
+ "output_type": "stream",
391
+ "text": [
392
+ "successsfully load consistent self-attention\n",
393
+ "number of the processor : 36\n"
394
+ ]
395
+ }
396
+ ],
397
+ "source": [
398
+ "global attn_count, total_count, id_length, total_length,cur_step, cur_model_type\n",
399
+ "global write\n",
400
+ "global sa32, sa64\n",
401
+ "global height,width\n",
402
+ "attn_count = 0\n",
403
+ "total_count = 0\n",
404
+ "cur_step = 0\n",
405
+ "id_length = 4\n",
406
+ "total_length = 5\n",
407
+ "cur_model_type = \"\"\n",
408
+ "device=\"cuda:0\"\n",
409
+ "global attn_procs,unet\n",
410
+ "attn_procs = {}\n",
411
+ "###\n",
412
+ "write = False\n",
413
+ "### strength of consistent self-attention: the larger, the stronger\n",
414
+ "sa32 = 0.5\n",
415
+ "sa64 = 0.5\n",
416
+ "### Resolution of the generated comics. Note: SDXL models may perform worse at low resolutions!\n",
417
+ "height = 768\n",
418
+ "width = 768\n",
419
+ "###\n",
420
+ "global pipe\n",
421
+ "global sd_model_path\n",
422
+ "sd_model_path = models_dict[\"RealVision\"] #\"SG161222/RealVisXL_V4.0\"\n",
423
+ "### LOAD Stable Diffusion Pipeline\n",
424
+ "pipe = StableDiffusionXLPipeline.from_pretrained(sd_model_path, torch_dtype=torch.float16, use_safetensors=False)\n",
425
+ "pipe = pipe.to(device)\n",
426
+ "pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)\n",
427
+ "pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)\n",
428
+ "pipe.scheduler.set_timesteps(50)\n",
429
+ "unet = pipe.unet\n",
430
+ "\n",
431
+ "### Insert PairedAttention\n",
432
+ "for name in unet.attn_processors.keys():\n",
433
+ " cross_attention_dim = None if name.endswith(\"attn1.processor\") else unet.config.cross_attention_dim\n",
434
+ " if name.startswith(\"mid_block\"):\n",
435
+ " hidden_size = unet.config.block_out_channels[-1]\n",
436
+ " elif name.startswith(\"up_blocks\"):\n",
437
+ " block_id = int(name[len(\"up_blocks.\")])\n",
438
+ " hidden_size = list(reversed(unet.config.block_out_channels))[block_id]\n",
439
+ " elif name.startswith(\"down_blocks\"):\n",
440
+ " block_id = int(name[len(\"down_blocks.\")])\n",
441
+ " hidden_size = unet.config.block_out_channels[block_id]\n",
442
+ " if cross_attention_dim is None and (name.startswith(\"up_blocks\") ) :\n",
443
+ " attn_procs[name] = SpatialAttnProcessor2_0(id_length = id_length)\n",
444
+ " total_count +=1\n",
445
+ " else:\n",
446
+ " attn_procs[name] = AttnProcessor()\n",
447
+ "print(\"successsfully load consistent self-attention\")\n",
448
+ "print(f\"number of the processor : {total_count}\")\n",
449
+ "unet.set_attn_processor(copy.deepcopy(attn_procs))\n",
450
+ "global mask1024,mask4096\n",
451
+ "mask1024, mask4096 = cal_attn_mask_xl(total_length,id_length,sa32,sa64,height,width,device=device,dtype= torch.float16)"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "markdown",
456
+ "metadata": {},
457
+ "source": [
458
+ "### Create the text description for the comics\n",
459
+ "Tips: Existing text2image diffusion models may not always generate images that accurately match text descriptions. Our training-free approach can improve the consistency of characters, but it does not enhance the control over the text. Therefore, in some cases, you may need to carefully craft your prompts."
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 6,
465
+ "metadata": {},
466
+ "outputs": [],
467
+ "source": [
468
+ "guidance_scale = 5.0\n",
469
+ "seed = 2047\n",
470
+ "sa32 = 0.5\n",
471
+ "sa64 = 0.5\n",
472
+ "id_length = 4\n",
473
+ "num_steps = 50\n",
474
+ "general_prompt = \"a man with a black suit\"\n",
475
+ "negative_prompt = \"naked, deformed, bad anatomy, disfigured, poorly drawn face, mutation, extra limb, ugly, disgusting, poorly drawn hands, missing limb, floating limbs, disconnected limbs, blurry, watermarks, oversaturated, distorted hands, amputation\"\n",
476
+ "prompt_array = [\"wake up in the bed\",\n",
477
+ " \"have breakfast\",\n",
478
+ " \"is on the road, go to the company\",\n",
479
+ " \"work in the company\",\n",
480
+ " \"running in the playground\",\n",
481
+ " \"reading book in the home\"\n",
482
+ " ]"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": 7,
488
+ "metadata": {},
489
+ "outputs": [
490
+ {
491
+ "name": "stderr",
492
+ "output_type": "stream",
493
+ "text": [
494
+ " 30%|███ | 15/50 [00:12<00:30, 1.16it/s]\n"
495
+ ]
496
+ },
497
+ {
498
+ "ename": "OutOfMemoryError",
499
+ "evalue": "CUDA out of memory. Tried to allocate 3.16 GiB (GPU 0; 23.70 GiB total capacity; 17.71 GiB already allocated; 1.04 GiB free; 21.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
500
+ "output_type": "error",
501
+ "traceback": [
502
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
503
+ "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)",
504
+ "Cell \u001b[0;32mIn[7], line 20\u001b[0m\n\u001b[1;32m 18\u001b[0m attn_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 19\u001b[0m id_prompts, negative_prompt \u001b[38;5;241m=\u001b[39m apply_style(style_name, id_prompts, negative_prompt)\n\u001b[0;32m---> 20\u001b[0m id_images \u001b[38;5;241m=\u001b[39m \u001b[43mpipe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mid_prompts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_inference_steps\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mnum_steps\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mguidance_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mguidance_scale\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheight\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mheight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwidth\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mwidth\u001b[49m\u001b[43m,\u001b[49m\u001b[43mnegative_prompt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mnegative_prompt\u001b[49m\u001b[43m,\u001b[49m\u001b[43mgenerator\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mgenerator\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mimages\n\u001b[1;32m 22\u001b[0m write \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m id_image \u001b[38;5;129;01min\u001b[39;00m id_images:\n",
505
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
506
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py:1216\u001b[0m, in \u001b[0;36mStableDiffusionXLPipeline.__call__\u001b[0;34m(self, prompt, prompt_2, height, width, num_inference_steps, timesteps, denoising_end, guidance_scale, negative_prompt, negative_prompt_2, num_images_per_prompt, eta, generator, latents, prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, ip_adapter_image, output_type, return_dict, cross_attention_kwargs, guidance_rescale, original_size, crops_coords_top_left, target_size, negative_original_size, negative_crops_coords_top_left, negative_target_size, clip_skip, callback_on_step_end, callback_on_step_end_tensor_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ip_adapter_image \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1215\u001b[0m added_cond_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimage_embeds\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m image_embeds\n\u001b[0;32m-> 1216\u001b[0m noise_pred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43munet\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1217\u001b[0m \u001b[43m \u001b[49m\u001b[43mlatent_model_input\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1218\u001b[0m \u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1219\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprompt_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1220\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimestep_cond\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimestep_cond\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1221\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1222\u001b[0m \u001b[43m \u001b[49m\u001b[43madded_cond_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43madded_cond_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1223\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1224\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1226\u001b[0m \u001b[38;5;66;03m# perform guidance\u001b[39;00m\n\u001b[1;32m 1227\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdo_classifier_free_guidance:\n",
507
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
508
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/diffusers/models/unet_2d_condition.py:1177\u001b[0m, in \u001b[0;36mUNet2DConditionModel.forward\u001b[0;34m(self, sample, timestep, encoder_hidden_states, class_labels, timestep_cond, attention_mask, cross_attention_kwargs, added_cond_kwargs, down_block_additional_residuals, mid_block_additional_residual, down_intrablock_additional_residuals, encoder_attention_mask, return_dict)\u001b[0m\n\u001b[1;32m 1174\u001b[0m upsample_size \u001b[38;5;241m=\u001b[39m down_block_res_samples[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m2\u001b[39m:]\n\u001b[1;32m 1176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(upsample_block, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_cross_attention\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m upsample_block\u001b[38;5;241m.\u001b[39mhas_cross_attention:\n\u001b[0;32m-> 1177\u001b[0m sample \u001b[38;5;241m=\u001b[39m \u001b[43mupsample_block\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1178\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1179\u001b[0m \u001b[43m \u001b[49m\u001b[43mtemb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43memb\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1180\u001b[0m \u001b[43m \u001b[49m\u001b[43mres_hidden_states_tuple\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mres_samples\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1181\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1182\u001b[0m \u001b[43m \u001b[49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
1183\u001b[0m \u001b[43m \u001b[49m\u001b[43mupsample_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mupsample_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1184\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1185\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1186\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1188\u001b[0m sample \u001b[38;5;241m=\u001b[39m upsample_block(\n\u001b[1;32m 1189\u001b[0m hidden_states\u001b[38;5;241m=\u001b[39msample,\n\u001b[1;32m 1190\u001b[0m temb\u001b[38;5;241m=\u001b[39memb,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1193\u001b[0m scale\u001b[38;5;241m=\u001b[39mlora_scale,\n\u001b[1;32m 1194\u001b[0m )\n",
509
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
510
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/diffusers/models/unet_2d_blocks.py:2354\u001b[0m, in \u001b[0;36mCrossAttnUpBlock2D.forward\u001b[0;34m(self, hidden_states, res_hidden_states_tuple, temb, encoder_hidden_states, cross_attention_kwargs, upsample_size, attention_mask, encoder_attention_mask)\u001b[0m\n\u001b[1;32m 2352\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2353\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m resnet(hidden_states, temb, scale\u001b[38;5;241m=\u001b[39mlora_scale)\n\u001b[0;32m-> 2354\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[43mattn\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2355\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2356\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2357\u001b[0m \u001b[43m \u001b[49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2358\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2359\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2360\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2361\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 2363\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupsamplers \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 2364\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m upsampler \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mupsamplers:\n",
511
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
512
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/diffusers/models/transformer_2d.py:392\u001b[0m, in \u001b[0;36mTransformer2DModel.forward\u001b[0;34m(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs, class_labels, cross_attention_kwargs, attention_mask, encoder_attention_mask, return_dict)\u001b[0m\n\u001b[1;32m 380\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mcheckpoint\u001b[38;5;241m.\u001b[39mcheckpoint(\n\u001b[1;32m 381\u001b[0m create_custom_forward(block),\n\u001b[1;32m 382\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mckpt_kwargs,\n\u001b[1;32m 390\u001b[0m )\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 392\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[43mblock\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimestep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimestep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[43m \u001b[49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 399\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mclass_labels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclass_labels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 400\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 402\u001b[0m \u001b[38;5;66;03m# 3. Output\u001b[39;00m\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_input_continuous:\n",
513
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
514
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/diffusers/models/attention.py:329\u001b[0m, in \u001b[0;36mBasicTransformerBlock.forward\u001b[0;34m(self, hidden_states, attention_mask, encoder_hidden_states, encoder_attention_mask, timestep, cross_attention_kwargs, class_labels, added_cond_kwargs)\u001b[0m\n\u001b[1;32m 326\u001b[0m cross_attention_kwargs \u001b[38;5;241m=\u001b[39m cross_attention_kwargs\u001b[38;5;241m.\u001b[39mcopy() \u001b[38;5;28;01mif\u001b[39;00m cross_attention_kwargs \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m {}\n\u001b[1;32m 327\u001b[0m gligen_kwargs \u001b[38;5;241m=\u001b[39m cross_attention_kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgligen\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m--> 329\u001b[0m attn_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattn1\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 330\u001b[0m \u001b[43m \u001b[49m\u001b[43mnorm_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 331\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43monly_cross_attention\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 332\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 333\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 334\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 335\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_ada_layer_norm_zero:\n\u001b[1;32m 336\u001b[0m attn_output \u001b[38;5;241m=\u001b[39m gate_msa\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m1\u001b[39m) \u001b[38;5;241m*\u001b[39m attn_output\n",
515
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1499\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1500\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
516
+ "File \u001b[0;32m~/anaconda3/envs/storydiffusion/lib/python3.10/site-packages/diffusers/models/attention_processor.py:527\u001b[0m, in \u001b[0;36mAttention.forward\u001b[0;34m(self, hidden_states, encoder_hidden_states, attention_mask, **cross_attention_kwargs)\u001b[0m\n\u001b[1;32m 508\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 509\u001b[0m \u001b[38;5;124;03mThe forward method of the `Attention` class.\u001b[39;00m\n\u001b[1;32m 510\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;124;03m `torch.Tensor`: The output of the attention layer.\u001b[39;00m\n\u001b[1;32m 523\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;66;03m# The `Attention` class can call different attention processors / attention functions\u001b[39;00m\n\u001b[1;32m 525\u001b[0m \u001b[38;5;66;03m# here we simply pass along all tensors to the selected processor class\u001b[39;00m\n\u001b[1;32m 526\u001b[0m \u001b[38;5;66;03m# For standard processors that are defined here, `**cross_attention_kwargs` is empty\u001b[39;00m\n\u001b[0;32m--> 527\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprocessor\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 528\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 529\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 530\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 531\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 532\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcross_attention_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
517
+ "Cell \u001b[0;32mIn[4], line 74\u001b[0m, in \u001b[0;36mSpatialAttnProcessor2_0.__call__\u001b[0;34m(self, attn, hidden_states, encoder_hidden_states, attention_mask, temb)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 73\u001b[0m attention_mask \u001b[38;5;241m=\u001b[39m mask4096[:mask4096\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtotal_length \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid_length,:mask4096\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtotal_length \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mid_length]\n\u001b[0;32m---> 74\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__call1__\u001b[49m\u001b[43m(\u001b[49m\u001b[43mattn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\u001b[43mencoder_hidden_states\u001b[49m\u001b[43m,\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtemb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 76\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__call2__(attn, hidden_states,\u001b[38;5;28;01mNone\u001b[39;00m,attention_mask,temb)\n",
518
+ "Cell \u001b[0;32mIn[4], line 127\u001b[0m, in \u001b[0;36mSpatialAttnProcessor2_0.__call1__\u001b[0;34m(self, attn, hidden_states, encoder_hidden_states, attention_mask, temb)\u001b[0m\n\u001b[1;32m 125\u001b[0m key \u001b[38;5;241m=\u001b[39m key\u001b[38;5;241m.\u001b[39mview(batch_size, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, attn\u001b[38;5;241m.\u001b[39mheads, head_dim)\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m 126\u001b[0m value \u001b[38;5;241m=\u001b[39m value\u001b[38;5;241m.\u001b[39mview(batch_size, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, attn\u001b[38;5;241m.\u001b[39mheads, head_dim)\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\n\u001b[0;32m--> 127\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscaled_dot_product_attention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattn_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdropout_p\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_causal\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 129\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 131\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m)\u001b[38;5;241m.\u001b[39mreshape(total_batch_size, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, 
attn\u001b[38;5;241m.\u001b[39mheads \u001b[38;5;241m*\u001b[39m head_dim)\n\u001b[1;32m 132\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mto(query\u001b[38;5;241m.\u001b[39mdtype)\n",
519
+ "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 3.16 GiB (GPU 0; 23.70 GiB total capacity; 17.71 GiB already allocated; 1.04 GiB free; 21.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
520
+ ]
521
+ }
522
+ ],
523
+ "source": [
524
+ "##########################################################################################\n",
525
+ "def apply_style_positive(style_name: str, positive: str):\n",
526
+ " p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])\n",
527
+ " return p.replace(\"{prompt}\", positive) \n",
528
+ "def apply_style(style_name: str, positives: list, negative: str = \"\"):\n",
529
+ " p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])\n",
530
+ " return [p.replace(\"{prompt}\", positive) for positive in positives], n + ' ' + negative\n",
531
+ "### Set the generation style\n",
532
+ "style_name = \"Comic book\"\n",
533
+ "setup_seed(seed)\n",
534
+ "generator = torch.Generator(device=\"cuda:0\").manual_seed(seed)\n",
535
+ "prompts = [general_prompt+\",\"+prompt for prompt in prompt_array]\n",
536
+ "id_prompts = prompts[:id_length]\n",
537
+ "real_prompts = prompts[id_length:]\n",
538
+ "torch.cuda.empty_cache()\n",
539
+ "write = True\n",
540
+ "cur_step = 0\n",
541
+ "attn_count = 0\n",
542
+ "id_prompts, negative_prompt = apply_style(style_name, id_prompts, negative_prompt)\n",
543
+ "id_images = pipe(id_prompts, num_inference_steps = num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images\n",
544
+ "\n",
545
+ "write = False\n",
546
+ "for id_image in id_images:\n",
547
+ " display(id_image)\n",
548
+ "real_images = []\n",
549
+ "for real_prompt in real_prompts:\n",
550
+ " cur_step = 0\n",
551
+ " real_prompt = apply_style_positive(style_name, real_prompt)\n",
552
+ " real_images.append(pipe(real_prompt, num_inference_steps=num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images[0])\n",
553
+ "for real_image in real_images:\n",
554
+ " display(real_image) "
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "markdown",
559
+ "metadata": {},
560
+ "source": [
561
+ "### Continued Creation\n",
562
+ "From now on, you can create endless stories about this character without worrying about memory constraints."
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": null,
568
+ "metadata": {},
569
+ "outputs": [],
570
+ "source": [
571
+ "new_prompt_array = [\"sitting on the sofa\",\n",
572
+ " \"on the bed, at night \"]\n",
573
+ "new_prompts = [general_prompt+\",\"+prompt for prompt in new_prompt_array]\n",
574
+ "new_images = []\n",
575
+ "for new_prompt in new_prompts :\n",
576
+ " cur_step = 0\n",
577
+ " new_prompt = apply_style_positive(style_name, new_prompt)\n",
578
+ " new_images.append(pipe(new_prompt, num_inference_steps=num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images[0])\n",
579
+ "for new_image in new_images:\n",
580
+ " display(new_image) "
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "markdown",
585
+ "metadata": {},
586
+ "source": [
587
+ "### Make pictures into comics"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": 222,
593
+ "metadata": {},
594
+ "outputs": [],
595
+ "source": [
596
+ "###\n",
597
+ "total_images = id_images + real_images + new_images\n",
598
+ "from PIL import Image,ImageOps,ImageDraw, ImageFont\n",
599
+ "#### Load the font; you can replace it with any font you have.\n",
600
+ "font = ImageFont.truetype(\"./fonts/Inkfree.ttf\", 30)\n"
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "code",
605
+ "execution_count": 223,
606
+ "metadata": {},
607
+ "outputs": [],
608
+ "source": [
609
+ "# import importlib\n",
610
+ "# import utils.utils\n",
611
+ "# importlib.reload(utils)\n",
612
+ "from utils.utils import get_row_image\n",
613
+ "from utils.utils import get_row_image\n",
614
+ "from utils.utils import get_comic_4panel"
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": null,
620
+ "metadata": {},
621
+ "outputs": [],
622
+ "source": [
623
+ "comics = get_comic_4panel(total_images, captions = prompt_array+ new_prompts,font = font )\n",
624
+ "for comic in comics:\n",
625
+ " display(comic)"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "code",
630
+ "execution_count": null,
631
+ "metadata": {},
632
+ "outputs": [],
633
+ "source": []
634
+ },
635
+ {
636
+ "cell_type": "code",
637
+ "execution_count": null,
638
+ "metadata": {},
639
+ "outputs": [],
640
+ "source": []
641
+ }
642
+ ],
643
+ "metadata": {
644
+ "fileId": "51613593-0d85-430e-8fce-c85e580fc483",
645
+ "kernelspec": {
646
+ "display_name": "storydiffusion",
647
+ "language": "python",
648
+ "name": "python3"
649
+ },
650
+ "language_info": {
651
+ "codemirror_mode": {
652
+ "name": "ipython",
653
+ "version": 3
654
+ },
655
+ "file_extension": ".py",
656
+ "mimetype": "text/x-python",
657
+ "name": "python",
658
+ "nbconvert_exporter": "python",
659
+ "pygments_lexer": "ipython3",
660
+ "version": "3.10.16"
661
+ }
662
+ },
663
+ "nbformat": 4,
664
+ "nbformat_minor": 4
665
+ }
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- title: Demo
3
- emoji: 📉
4
- colorFrom: blue
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 5.31.0
8
- app_file: app.py
9
- pinned: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/f79da6b7-0b3b-4dd7-8dd0-ba0b15306fe6" height=100>
3
+ </p>
4
+
5
+ <div align="center">
6
+
7
+ ## StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation [![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md-dark.svg)]()
8
+
9
+ [[Paper](https://arxiv.org/abs/2405.01434)] &emsp; [[Project Page](https://storydiffusion.github.io/)] &emsp; [[Jittor Version](https://github.com/JittorCV/jittordiffusion/tree/master)]&emsp; [[🤗 Comic Generation Demo ](https://huggingface.co/spaces/YupengZhou/StoryDiffusion)] [![Replicate](https://replicate.com/cjwbw/StoryDiffusion/badge)](https://replicate.com/cjwbw/StoryDiffusion) [![Run Comics Demo in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/HVision-NKU/StoryDiffusion/blob/main/Comic_Generation.ipynb) <br>
10
+ </div>
11
+
12
+
13
  ---
14
+
15
+ Official implementation of **[StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation]()**.
16
+
17
+ ### **Demo Video**
18
+
19
+ https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/d5b80f8f-09b0-48cd-8b10-daff46d422af
20
+
21
+
22
+ ### Update History
23
+
24
+ ***You can view the update history [here](update.md).***
25
+
26
+ ### 🌠 **Key Features:**
27
+ StoryDiffusion can create a magic story by generating consistent images and videos. Our work mainly has two parts:
28
+ 1. Consistent self-attention for character-consistent image generation over long-range sequences. It is hot-pluggable and compatible with all SD1.5 and SDXL-based image diffusion models. For the current implementation, the user needs to provide at least 3 text prompts for the consistent self-attention module. We recommend at least 5 - 6 text prompts for better layout arrangement.
29
+ 2. Motion predictor for long-range video generation, which predicts motion between Condition Images in a compressed image semantic space, achieving larger motion prediction.
30
+
31
+
32
+
33
+ ## 🔥 **Examples**
34
+
35
+
36
+ ### Comics generation
37
+
38
+
39
+ ![1](https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/b3771cbc-b6ca-4e26-bdc5-d944daf9f266)
40
+
41
+
42
+
43
+ ### Image-to-Video generation (Results are HIGHLY compressed for speed)
44
+ Leveraging the images produced through our Consistent Self-Attention mechanism, we can extend the process to create videos by seamlessly transitioning between these images. This can be considered as a two-stage long video generation approach.
45
+
46
+ Note: results are **highly compressed** for speed, you can visit [our website](https://storydiffusion.github.io/) for the high-quality version.
47
+ #### Two-stage Long Videos Generation (New Update)
48
+ Combining the two parts, we can generate very long and high-quality AIGC videos.
49
+ | Video1 | Video2 | Video3 |
50
+ | --- | --- | --- |
51
+ | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/4e7e0f24-5f90-419b-9a1e-cdf36d361b26" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/f509343d-d691-4e2a-b615-7d96381ef7c1" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/4f0f7abb-4ae4-47a6-b692-5bdd8d9c8006" width=224> |
52
+
53
+
54
+ #### Long Video Results using Condition Images
55
+ Our Image-to-Video model can generate a video by providing a sequence of user-input condition images.
56
+ | Video1 | Video2 | Video3 |
57
+ | --- | --- | --- |
58
+ | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/af6f5c50-c773-4ef2-a757-6d7a46393f39" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/d58e4037-d8df-4f90-8c81-ce4b6d2d868e" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/40da15ba-f5c1-48d8-84d6-8d327207d696" width=224> |
59
+
60
+ | Video4 | Video5 | Video6 |
61
+ | --- | --- | --- |
62
+ | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/8f04c9fc-3031-49e3-9de8-83d582b80a1f" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/604107fb-8afe-4052-bda4-362c646a756e" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/b05fa6a0-12e6-4111-abf8-18b8cd84f3ff" width=224> |
63
+
64
+
65
+
66
+
67
+ #### Short Videos
68
+
69
+ | Video1 | Video2 | Video3 |
70
+ | --- | --- | --- |
71
+ | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/5e7f717f-daad-46f6-b3ba-c087bd843158" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/79aa52b2-bf37-4c9c-8555-c7050aec0cdf" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/9fdfd091-10e6-434e-9ce7-6d6e6d8f4b22" width=224> |
72
+
73
+
74
+
75
+ | Video4 | Video5 | Video6 |
76
+ | --- | --- | --- |
77
+ | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/0b219b60-a998-4820-9657-6abe1747cb6b" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/d387aef0-ffc8-41b0-914f-4b0392d9f8c5" width=224> | <img src="https://github.com/HVision-NKU/StoryDiffusion/assets/49511209/3c64958a-1079-4ca0-a9cf-e0486adbc57f" width=224> |
78
+
79
+
80
+
81
+
82
+ ## 🚩 **TODO/Updates**
83
+ - [x] Comic Results of StoryDiffusion.
84
+ - [x] Video Results of StoryDiffusion.
85
+ - [x] Source code of Comic Generation
86
+ - [x] Source code of gradio demo
87
+ - [ ] Source code of Video Generation Model
88
+ - [ ] Pretrained weight of Video Generation Model
89
  ---
90
 
91
+ # 🔧 Dependencies and Installation
92
+
93
+ - Python >= 3.8 (we recommend using [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
94
+ - [PyTorch >= 2.0.0](https://pytorch.org/)
95
+ ```bash
96
+ conda create --name storydiffusion python=3.10
97
+ conda activate storydiffusion
98
+ pip install -U pip
99
+
100
+ # Install requirements
101
+ pip install -r requirements.txt
102
+ ```
103
+ # How to use
104
+
105
+ Currently, we provide two ways for you to generate comics.
106
+
107
+ ## Use the jupyter notebook
108
+
109
+ You can open the `Comic_Generation.ipynb` and run the code.
110
+
111
+ ## Start a local gradio demo
112
+ Run the following command:
113
+
114
+
115
+ **(Recommended)** We provide a low-GPU-memory version. It was tested on a machine with 24 GB of GPU memory (Tesla A10) and 30 GB of RAM, and it is expected to work well with more than 20 GB of GPU memory.
116
+
117
+ ```python
118
+ python gradio_app_sdxl_specific_id_low_vram.py
119
+ ```
120
+
121
+
122
+ ## Contact
123
+ If you have any questions, you are very welcome to email ypzhousdu@gmail.com and zhoudaquan21@gmail.com
124
+
125
+
126
+
127
+
128
+ # Disclaimer
129
+ This project strives to impact the domain of AI-driven image and video generation positively. Users are granted the freedom to create images and videos using this tool, but they are expected to comply with local laws and utilize it responsibly. The developers do not assume any responsibility for potential misuse by users.
130
+
131
+ # Related Resources
132
+ Following are some third-party implementations of StoryDiffusion.
133
+
134
+
135
+ ## API
136
+
137
+ - [runpod.io serverless worker](https://github.com/bes-dev/story-diffusion-runpod-serverless-worker) provided by [BeS](https://github.com/bes-dev).
138
+ - [Replicate worker](https://github.com/camenduru/StoryDiffusion-replicate) provided by [camenduru](https://github.com/camenduru).
139
+
140
+
141
+
142
+
143
+ # BibTeX
144
+ If you find StoryDiffusion useful for your research and applications, please cite using this BibTeX:
145
+
146
+ ```BibTeX
147
+ @article{zhou2024storydiffusion,
148
+ title={StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation},
149
+ author={Zhou, Yupeng and Zhou, Daquan and Cheng, Ming-Ming and Feng, Jiashi and Hou, Qibin},
150
+ journal={NeurIPS 2024},
151
+ year={2024}
152
+ }
app.py ADDED
@@ -0,0 +1,750 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from email.policy import default
2
+ import gradio as gr
3
+ import numpy as np
4
+ import spaces
5
+ import torch
6
+ import requests
7
+ import random
8
+ import os
9
+ import sys
10
+ import pickle
11
+ from PIL import Image
12
+ from tqdm.auto import tqdm
13
+ from datetime import datetime
14
+ from utils.gradio_utils import is_torch2_available
15
+ if is_torch2_available():
16
+ from utils.gradio_utils import \
17
+ AttnProcessor2_0 as AttnProcessor
18
+ # from utils.gradio_utils import SpatialAttnProcessor2_0
19
+ else:
20
+ from utils.gradio_utils import AttnProcessor
21
+
22
+ import diffusers
23
+ from diffusers import StableDiffusionXLPipeline
24
+ from utils import PhotoMakerStableDiffusionXLPipeline
25
+ from diffusers import DDIMScheduler
26
+ import torch.nn.functional as F
27
+ from utils.gradio_utils import cal_attn_mask_xl
28
+ import copy
29
+ import os
30
+ from huggingface_hub import hf_hub_download
31
+ from diffusers.utils import load_image
32
+ from utils.utils import get_comic
33
+ from utils.style_template import styles
34
# Relative paths under ./data/models/ip_adapter (presumably the IP-Adapter
# SDXL image encoder and checkpoint -- confirm against where they are used).
image_encoder_path = "./data/models/ip_adapter/sdxl_models/image_encoder"
ip_ckpt = "./data/models/ip_adapter/sdxl_models/ip-adapter_sdxl_vit-h.bin"
# Keep loopback addresses out of any configured proxy.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
# Style names come from the keys of the imported `styles` mapping.
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "Japanese Anime"
# NOTE: a `global` statement at module scope has no effect.
global models_dict
use_va = True
# Display name -> Hugging Face model id. Only "Unstable" is enabled.
models_dict = {
    # "Juggernaut": "RunDiffusion/Juggernaut-XL-v8",
    # "RealVision": "SG161222/RealVisXL_V4.0" ,
    # "SDXL":"stabilityai/stable-diffusion-xl-base-1.0" ,
    "Unstable": "stablediffusionapi/sdxl-unstable-diffusers-y"
}
# Runs at import time: fetches photomaker-v1.bin from the TencentARC/PhotoMaker
# model repo and keeps the path returned by hf_hub_download.
photomaker_path = hf_hub_download(repo_id="TencentARC/PhotoMaker", filename="photomaker-v1.bin", repo_type="model")
# Largest value representable as a signed 32-bit integer.
MAX_SEED = np.iinfo(np.int32).max
49
def setup_seed(seed):
    """Seed the torch, torch.cuda, NumPy and `random` generators with *seed*.

    Also sets ``torch.backends.cudnn.deterministic`` to ``True``.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
55
def set_text_unfinished():
    """Return a Gradio update that shows the "not finished" status banner."""
    banner = "<h3>(Not Finished) Generating ··· The intermediate results will be shown.</h3>"
    return gr.update(value=banner, visible=True)
57
def set_text_finished():
    """Return a Gradio update that shows the "generation finished" banner."""
    banner = "<h3>Generation Finished</h3>"
    return gr.update(value=banner, visible=True)
59
+ #################################################
60
def get_image_path_list(folder_name):
    """Return the entries of *folder_name*, joined onto it and sorted.

    Every directory entry is included (no filtering by file type); the
    result is ordered by the joined path string.
    """
    paths = [os.path.join(folder_name, entry) for entry in os.listdir(folder_name)]
    paths.sort()
    return paths
64
+
65
+ #################################################
66
class SpatialAttnProcessor2_0(torch.nn.Module):
    r"""
    Self-attention processor that lets the images of a batch attend to each other.

    The processor is driven by module-level globals that must be set before the
    UNet runs: ``write``, ``cur_step``, ``attn_count``, ``total_count``,
    ``mask1024``, ``mask4096``, ``sa32``, ``sa64``, ``height`` and ``width``.

    * When ``write`` is true, the incoming hidden states are stored in
      ``self.id_bank[cur_step]``, split at ``id_length`` along the batch axis.
    * Otherwise the states stored for ``cur_step`` are concatenated with the
      current hidden states and used as the key/value source.

    After every call ``attn_count`` is incremented; once it equals
    ``total_count`` it is reset, ``cur_step`` is advanced and both masks are
    recomputed with ``cal_attn_mask_xl``.

    Args:
        hidden_size (`int`, *optional*):
            Stored on the instance; not read by the methods of this class.
        cross_attention_dim (`int`, *optional*):
            Stored on the instance; not read by the methods of this class.
        id_length (`int`, defaults to 4):
            Number of leading batch entries kept as the first half of the
            bank entry. ``total_length`` is ``id_length + 1``.
        device (defaults to `"cuda"`):
            Device that banked tensors are moved to and that is passed to
            ``cal_attn_mask_xl``.
        dtype (defaults to `torch.float16`):
            dtype passed to ``cal_attn_mask_xl``.
    """
    ################################################################################################################################################################################
    def __init__(self, hidden_size = None, cross_attention_dim=None,id_length = 4,device = "cuda",dtype = torch.float16):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
        self.device = device
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.total_length = id_length + 1
        self.id_length = id_length
        # cur_step -> [hidden_states[:id_length], hidden_states[id_length:]]
        self.id_bank = {}

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None):
        """Run one attention call, choosing between the two attention paths.

        Uses ``__call2__`` for the first five steps; afterwards it picks
        ``__call1__`` (with a mask slice) or ``__call2__`` at random.
        Reads and updates the module-level globals declared below.
        """
        # un_cond_hidden_states, cond_hidden_states = hidden_states.chunk(2)
        # un_cond_hidden_states = self.__call2__(attn, un_cond_hidden_states,encoder_hidden_states,attention_mask,temb)
        global total_count,attn_count,cur_step,mask1024,mask4096
        global sa32, sa64
        global write
        global height,width
        if write:
            # print(f"white:{cur_step}")
            # Write phase: remember this step's hidden states for later read calls.
            self.id_bank[cur_step] = [hidden_states[:self.id_length], hidden_states[self.id_length:]]
        else:
            # Read phase: interleave the banked states with the current batch
            # (bank[0], current[:1], bank[1], current[1:]) as the key/value source.
            encoder_hidden_states = torch.cat((self.id_bank[cur_step][0].to(self.device),hidden_states[:1],self.id_bank[cur_step][1].to(self.device),hidden_states[1:]))
        # The first five steps always take the __call2__ path.
        if cur_step <5:
            hidden_states = self.__call2__(attn, hidden_states,encoder_hidden_states,attention_mask,temb)
        else:   # 256 1024 4096
            # Draw a random number in [0, 1) and compare it with a step-dependent
            # threshold: 0.3 while cur_step < 20, 0.1 afterwards.
            random_number = random.random()
            if cur_step <20:
                rand_num = 0.3
            else:
                rand_num = 0.1
            # print(f"hidden state shape {hidden_states.shape[1]}")
            if random_number > rand_num:
                # print("mask shape",mask1024.shape,mask4096.shape)
                # mask1024 is used when the token count equals
                # (height//32) * (width//32); mask4096 otherwise.
                if not write:
                    # Read phase: keep the mask rows from offset
                    # shape[0] // total_length * id_length to the end.
                    if hidden_states.shape[1] == (height//32) * (width//32):
                        attention_mask = mask1024[mask1024.shape[0] // self.total_length * self.id_length:]
                    else:
                        attention_mask = mask4096[mask4096.shape[0] // self.total_length * self.id_length:]
                else:
                    # print(self.total_length,self.id_length,hidden_states.shape,(height//32) * (width//32))
                    # Write phase: keep the leading square block of the mask
                    # (rows and columns up to the same offset).
                    if hidden_states.shape[1] == (height//32) * (width//32):
                        attention_mask = mask1024[:mask1024.shape[0] // self.total_length * self.id_length,:mask1024.shape[0] // self.total_length * self.id_length]
                    else:
                        attention_mask = mask4096[:mask4096.shape[0] // self.total_length * self.id_length,:mask4096.shape[0] // self.total_length * self.id_length]
                # print(attention_mask.shape)
                # print("before attention",hidden_states.shape,attention_mask.shape,encoder_hidden_states.shape if encoder_hidden_states is not None else "None")
                hidden_states = self.__call1__(attn, hidden_states,encoder_hidden_states,attention_mask,temb)
            else:
                # Fallback: attention over hidden_states only (no banked states).
                hidden_states = self.__call2__(attn, hidden_states,None,attention_mask,temb)
        attn_count +=1
        if attn_count == total_count:
            # attn_count reached total_count: reset it, advance the step
            # counter and recompute both masks.
            attn_count = 0
            cur_step += 1
            mask1024,mask4096 = cal_attn_mask_xl(self.total_length,self.id_length,sa32,sa64,height,width, device=self.device, dtype= self.dtype)

        return hidden_states
    def __call1__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Attention over the tokens of several images merged into one sequence.

        The batch is regrouped so that ``total_batch_size // 2`` images share a
        single token sequence before attention, and the result is reshaped back
        to the original batch size afterwards.
        """
        # print("hidden state shape",hidden_states.shape,self.id_length)
        residual = hidden_states
        # if encoder_hidden_states is not None:
        #     raise Exception("not implement")
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)
        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # Flatten (B, C, H, W) into (B, H*W, C).
            total_batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(total_batch_size, channel, height * width).transpose(1, 2)
        total_batch_size,nums_token,channel = hidden_states.shape
        # NOTE(review): the batch is presumably two stacked halves (e.g. the
        # unconditional and conditional passes) -- confirm against the pipeline.
        img_nums = total_batch_size//2
        # Merge the tokens of img_nums images into one long sequence per group.
        hidden_states = hidden_states.view(-1,img_nums,nums_token,channel).reshape(-1,img_nums * nums_token,channel)

        batch_size, sequence_length, _ = hidden_states.shape

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        else:
            # Merge the (id_length + 1) images of the key/value source the same way.
            encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,nums_token,channel).reshape(-1,(self.id_length+1) * nums_token,channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)


        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split heads: (batch, tokens, heads * head_dim) -> (batch, heads, tokens, head_dim).
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        # print(key.shape,value.shape,query.shape,attention_mask.shape)
        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        #print(query.shape,key.shape,value.shape,attention_mask.shape)
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Undo the image merge: back to total_batch_size separate sequences.
        hidden_states = hidden_states.transpose(1, 2).reshape(total_batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)



        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        # if input_ndim == 4:
        #     tile_hidden_states = tile_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        # if attn.residual_connection:
        #     tile_hidden_states = tile_hidden_states + residual

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(total_batch_size, channel, height, width)
        if attn.residual_connection:
            hidden_states = hidden_states + residual
        hidden_states = hidden_states / attn.rescale_output_factor
        # print(hidden_states.shape)
        return hidden_states
    def __call2__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None):
        """Per-image scaled-dot-product attention without regrouping the batch.

        When ``encoder_hidden_states`` is given, its ``id_length + 1`` images
        are merged into one key/value sequence; otherwise the hidden states
        attend to themselves.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # Flatten (B, C, H, W) into (B, H*W, C).
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, channel = (
            hidden_states.shape
        )
        # print(hidden_states.shape)
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        else:
            # Merge the (id_length + 1) images of the key/value source into one sequence.
            encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,sequence_length,channel).reshape(-1,(self.id_length+1) * sequence_length,channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split heads: (batch, tokens, heads * head_dim) -> (batch, heads, tokens, head_dim).
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
295
+
296
def set_attention_processor(unet,id_length,is_ipadapter = False):
    """Install a processor on every attention layer of *unet*.

    Layers whose ``cross_attention_dim`` resolves to ``None`` and whose name
    starts with ``up_blocks`` get a ``SpatialAttnProcessor2_0``; the global
    ``total_count`` is reset to zero and counts those layers. Layers with a
    cross-attention dimension get an ``IPAttnProcessor2_0`` when
    *is_ipadapter* is true. Every other layer gets a plain ``AttnProcessor``.
    A deep copy of the resulting mapping is passed to
    ``unet.set_attn_processor``.
    """
    global total_count
    total_count = 0
    processors = {}
    for layer_name in unet.attn_processors:
        if layer_name.endswith("attn1.processor"):
            cross_attention_dim = None
        else:
            cross_attention_dim = unet.config.cross_attention_dim

        # hidden_size is only consumed by the IP-Adapter branch below.
        if layer_name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif layer_name.startswith("up_blocks"):
            block_id = int(layer_name[len("up_blocks.")])
            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
        elif layer_name.startswith("down_blocks"):
            block_id = int(layer_name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

        if cross_attention_dim is None and layer_name.startswith("up_blocks"):
            processors[layer_name] = SpatialAttnProcessor2_0(id_length = id_length)
            total_count += 1
        elif cross_attention_dim is not None and is_ipadapter:
            processors[layer_name] = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=1,
                num_tokens=4,
            ).to(unet.device, dtype=torch.float16)
        else:
            processors[layer_name] = AttnProcessor()

    unet.set_attn_processor(copy.deepcopy(processors))
    print("successsfully load paired self-attention")
    print(f"number of the processor : {total_count}")
330
+ #################################################
331
+ #################################################
332
+ canvas_html = "<div id='canvas-root' style='max-width:400px; margin: 0 auto'></div>"
333
+ load_js = """
334
+ async () => {
335
+ const url = "https://huggingface.co/datasets/radames/gradio-components/raw/main/sketch-canvas.js"
336
+ fetch(url)
337
+ .then(res => res.text())
338
+ .then(text => {
339
+ const script = document.createElement('script');
340
+ script.type = "module"
341
+ script.src = URL.createObjectURL(new Blob([text], { type: 'application/javascript' }));
342
+ document.head.appendChild(script);
343
+ });
344
+ }
345
+ """
346
+
347
+ get_js_colors = """
348
+ async (canvasData) => {
349
+ const canvasEl = document.getElementById("canvas-root");
350
+ return [canvasEl._data]
351
+ }
352
+ """
353
+
354
+ css = '''
355
+ #color-bg{display:flex;justify-content: center;align-items: center;}
356
+ .color-bg-item{width: 100%; height: 32px}
357
+ #main_button{width:100%}
358
+ <style>
359
+ '''
360
+
361
+
362
+ #################################################
363
+ title = r"""
364
+ <h1 align="center">StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</h1>
365
+ """
366
+
367
# Usage instructions rendered (as Markdown/HTML) at the top of the demo page.
# Step 4 names the "Generate" button because that is the label of the only
# run button the interface creates.
description = r"""
<b>Official 🤗 Gradio demo</b> for <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'><b>StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</b></a>.<br>
❗️❗️❗️[<b>Important</b>] Personalization steps:<br>
1️⃣ Enter a Textual Description for Character, if you add the Ref-Image, making sure to <b>follow the class word</b> you want to customize with the <b>trigger word</b>: `img`, such as: `man img` or `woman img` or `girl img`.<br>
2️⃣ Enter the prompt array, each line corresponds to one generated image.<br>
3️⃣ Choose your preferred style template.<br>
4️⃣ Click the <b>Generate</b> button to start customizing.
"""
375
+
376
+ article = r"""
377
+
378
+ If StoryDiffusion is helpful, please help to ⭐ the <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'>Github Repo</a>. Thanks!
379
+ [![GitHub Stars](https://img.shields.io/github/stars/HVision-NKU/StoryDiffusion?style=social)](https://github.com/HVision-NKU/StoryDiffusion)
380
+ ---
381
+ 📝 **Citation**
382
+ <br>
383
+ If our work is useful for your research, please consider citing:
384
+
385
+ ```bibtex
386
+ @article{Zhou2024storydiffusion,
387
+ title={StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation},
388
+ author={Zhou, Yupeng and Zhou, Daquan and Cheng, Ming-Ming and Feng, Jiashi and Hou, Qibin},
389
+ year={2024}
390
+ }
391
+ ```
392
+ 📋 **License**
393
+ <br>
394
+ The Contents you create are under Apache-2.0 LICENSE. The Code are under Attribution-NonCommercial 4.0 International.
395
+
396
+ 📧 **Contact**
397
+ <br>
398
+ If you have any questions, please feel free to reach me out at <b>ypzhousdu@gmail.com</b>.
399
+ """
400
+ version = r"""
401
+ <h3 align="center">StoryDiffusion Version 0.01 (test version)</h3>
402
+
403
+ <h5 >1. Support image ref image. (Cartoon Ref image is not support now)</h5>
404
+ <h5 >2. Support Typesetting Style and Captioning.(By default, the prompt is used as the caption for each image. If you need to change the caption, add a # at the end of each line. Only the part after the # will be added as a caption to the image.)</h5>
405
+ <h5 >3. [NC]symbol (The [NC] symbol is used as a flag to indicate that no characters should be present in the generated scene images. If you want do that, prepend the "[NC]" at the beginning of the line. For example, to generate a scene of falling leaves without any character, write: "[NC] The leaves are falling."),Currently, support is only using Textual Description</h5>
406
+ <h5 align="center">Tips: Not Ready Now! Just Test</h5>
407
+ """
408
+ #################################################
409
# ---------------------------------------------------------------------------
# Module-level generation state and model loading.
#
# The names below are plain module globals; functions in this file rebind
# them through their own `global` declarations. The `global` statements
# written here at module scope do not change any binding.
# ---------------------------------------------------------------------------
global attn_count, total_count, id_length, total_length,cur_step, cur_model_type
global write
global sa32, sa64
global height,width
attn_count = 0
total_count = 0
cur_step = 0
id_length = 4
total_length = 5
cur_model_type = ""
device="cuda"
global attn_procs,unet
attn_procs = {}
###
# Starts in "read" mode; process_generation switches it to True while the
# identity images are being generated.
write = False
###
# Initial values only: process_generation overwrites sa32/sa64/height/width
# from the UI inputs on every run.
sa32 = 0.5
sa64 = 0.5
height = 768
width = 768
###
global sd_model_path
sd_model_path = models_dict["Unstable"]#"SG161222/RealVisXL_V4.0"
use_safetensors= False
### LOAD Stable Diffusion Pipeline
# pipe1: text-only SDXL pipeline, loaded in fp16 and moved to the GPU at import time.
pipe1 = StableDiffusionXLPipeline.from_pretrained(sd_model_path, torch_dtype=torch.float16, use_safetensors= use_safetensors)
pipe1 = pipe1.to("cuda")
pipe1.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
# pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe1.scheduler.set_timesteps(50)
###
# pipe2: PhotoMaker pipeline built from the same checkpoint, used when
# reference images are supplied.
pipe2 = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
    sd_model_path, torch_dtype=torch.float16, use_safetensors=use_safetensors)
pipe2 = pipe2.to("cuda")
# `photomaker_path` is a full file path; it is split into directory and file
# name for the adapter loader.
pipe2.load_photomaker_adapter(
    os.path.dirname(photomaker_path),
    subfolder="",
    weight_name=os.path.basename(photomaker_path),
    trigger_word="img"  # define the trigger word
)
# NOTE(review): second move to "cuda"; presumably so the freshly loaded
# adapter weights also end up on the GPU — confirm.
pipe2 = pipe2.to("cuda")
pipe2.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
pipe2.fuse_lora()
452
+
453
+ ######### Gradio Fuction #############
454
+
455
def swap_to_gallery(images):
    """Show `images` in a gallery and swap the upload widget for a clear button.

    Returns three Gradio updates, in order: one that sets `images` as the
    value and makes its target visible, one that makes its target visible,
    and one that hides its target.
    """
    gallery_update = gr.update(value=images, visible=True)
    clear_button_update = gr.update(visible=True)
    uploader_update = gr.update(visible=False)
    return gallery_update, clear_button_update, uploader_update
457
+
458
def upload_example_to_gallery(images, prompt, style, negative_prompt):
    """Show example `images` in a gallery; the other arguments are ignored.

    `prompt`, `style` and `negative_prompt` are accepted but never read.
    Returns the same three Gradio updates as an upload would: show the
    images, show the second target, hide the third.
    """
    gallery_update = gr.update(value=images, visible=True)
    clear_button_update = gr.update(visible=True)
    uploader_update = gr.update(visible=False)
    return gallery_update, clear_button_update, uploader_update
460
+
461
def remove_back_to_files():
    """Undo the gallery view: hide the first two targets and show the third.

    Returns three Gradio updates in the order (hide, hide, show).
    """
    hide_gallery = gr.update(visible=False)
    hide_clear_button = gr.update(visible=False)
    show_uploader = gr.update(visible=True)
    return hide_gallery, hide_clear_button, show_uploader
463
+
464
def remove_tips():
    """Return a Gradio update that hides its target component."""
    hidden = gr.update(visible=False)
    return hidden
466
+
467
def apply_style_positive(style_name: str, positive: str):
    """Insert `positive` into the positive template of the chosen style.

    Unknown style names fall back to `DEFAULT_STYLE_NAME`. Only the positive
    template is used; the negative part of the style entry is discarded.
    """
    fallback_entry = styles[DEFAULT_STYLE_NAME]
    positive_template, _unused_negative = styles.get(style_name, fallback_entry)
    styled_prompt = positive_template.replace("{prompt}", positive)
    return styled_prompt
470
+
471
def apply_style(style_name: str, positives: list, negative: str = ""):
    """Apply a style template to several prompts at once.

    Returns a pair: the list of styled positive prompts (one per entry of
    `positives`) and the style's negative prompt followed by a space and
    `negative`. Unknown style names fall back to `DEFAULT_STYLE_NAME`.
    """
    fallback_entry = styles[DEFAULT_STYLE_NAME]
    positive_template, style_negative = styles.get(style_name, fallback_entry)
    styled_prompts = []
    for text in positives:
        styled_prompts.append(positive_template.replace("{prompt}", text))
    combined_negative = style_negative + ' ' + negative
    return styled_prompts, combined_negative
474
+
475
def change_visiale_by_model_type(_model_type):
    """Return three visibility updates matching the selected character-control mode.

    "Only Using Textual Description" hides all three targets;
    "Using Ref Images" shows the first two and hides the third.

    Raises:
        ValueError: if `_model_type` is neither of the two known modes.
    """
    if _model_type == "Only Using Textual Description":
        visibility_flags = (False, False, False)
    elif _model_type == "Using Ref Images":
        visibility_flags = (True, True, False)
    else:
        raise ValueError("Invalid model type",_model_type)
    first, second, third = visibility_flags
    return gr.update(visible=first), gr.update(visible=second), gr.update(visible=third)
482
+
483
+
484
+ ######### Image Generation ##############
485
@spaces.GPU
def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_name, _Ip_Adapter_Strength ,_style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt,prompt_array,G_height,G_width,_comic_type):
    """Generate the story images for one UI request, yielding partial results.

    This is a generator: it yields the growing `total_results` list after the
    identity images are produced, after every further image, and once more at
    the end. New images are always prepended, so the newest result is first.

    The first `id_length_` lines of `prompt_array` are generated together in
    one pipeline call with the module-level `write` flag set to True; the
    remaining lines are generated one at a time with `write` set to False.
    NOTE(review): `write`, `cur_step` and `attn_count` are presumably read by
    the attention processors installed via `set_attention_processor`
    (defined outside this block) — confirm.

    Prompt line syntax handled here:
        * a line containing "[NC]" is used without `general_prompt` prepended
          and with the "[NC]" tag removed;
        * for a line containing "#", only the text before the last "#" is
          used as the prompt, and the text after the last "#" is used as the
          caption when typesetting is enabled.

    `_Ip_Adapter_Strength` is accepted but not read in this function.

    Raises:
        gr.Error: if reference images are requested but `general_prompt`
            lacks the substring "img", or no images were uploaded.
        NotImplementedError: if the internal model type is neither
            "original" nor "Photomaker".
    """
    # Map the UI label to the internal model-type name.
    _model_type = "Photomaker" if _model_type == "Using Ref Images" else "original"
    # NOTE: this is a plain substring test, so any prompt containing "img" passes.
    if _model_type == "Photomaker" and "img" not in general_prompt:
        raise gr.Error("Please add the triger word \" img \" behind the class word you want to customize, such as: man img or woman img")
    if _upload_images is None and _model_type != "original":
        raise gr.Error(f"Cannot find any input face image!")
    global sa32, sa64,id_length,total_length,attn_procs,unet,cur_model_type,device
    global write
    global cur_step,attn_count
    global height,width
    height = G_height
    width = G_width
    global pipe1,pipe2
    global sd_model_path,models_dict
    # Only the global path is updated; the already-loaded pipe1/pipe2 are used
    # below and no pipeline is reloaded in this function.
    sd_model_path = models_dict[_sd_type]
    # Assigned but never read in this function.
    use_safe_tensor = True
    if _model_type == "original":
        pipe = pipe1
        set_attention_processor(pipe.unet,id_length_,is_ipadapter = False)
    elif _model_type == "Photomaker":
        pipe = pipe2
        set_attention_processor(pipe.unet,id_length_,is_ipadapter = False)
    else:
        raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
    ##### ########################
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
    pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
    cur_model_type = _sd_type+"-"+_model_type+""+str(id_length_)
    if _model_type != "original":
        # Load every uploaded reference image.
        input_id_images = []
        for img in _upload_images:
            print(img)
            input_id_images.append(load_image(img))
    prompts = prompt_array.splitlines()
    # Convert the style-strength percentage into a step index, capped at 30.
    # Only passed to the pipeline in the Photomaker branches below.
    start_merge_step = int(float(_style_strength_ratio) / 100 * _num_steps)
    if start_merge_step > 30:
        start_merge_step = 30
    print(f"start_merge_step:{start_merge_step}")
    generator = torch.Generator(device="cuda").manual_seed(seed_)
    sa32, sa64 = sa32_, sa64_
    id_length = id_length_
    clipped_prompts = prompts[:]
    # "[NC]" lines: drop the tag and do not prepend the character description.
    prompts = [general_prompt + "," + prompt if "[NC]" not in prompt else prompt.replace("[NC]","") for prompt in clipped_prompts]
    # Keep only the text before the last "#" (the part after it is a caption).
    prompts = [prompt.rpartition('#')[0] if "#" in prompt else prompt for prompt in prompts]
    print(prompts)
    # First `id_length` prompts are generated jointly; the rest one by one.
    id_prompts = prompts[:id_length]
    real_prompts = prompts[id_length:]
    torch.cuda.empty_cache()
    # Phase 1: identity images, generated with `write` enabled.
    write = True
    cur_step = 0

    attn_count = 0
    id_prompts, negative_prompt = apply_style(style_name, id_prompts, negative_prompt)
    setup_seed(seed_)
    total_results = []
    if _model_type == "original":
        id_images = pipe(id_prompts, num_inference_steps=_num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
    elif _model_type == "Photomaker":
        id_images = pipe(id_prompts,input_id_images=input_id_images, num_inference_steps=_num_steps, guidance_scale=guidance_scale, start_merge_step = start_merge_step, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
    else:
        raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
    total_results = id_images + total_results
    yield total_results
    # Phase 2: remaining prompts, one image per call, with `write` disabled.
    real_images = []
    write = False
    for real_prompt in real_prompts:
        # Re-seed the global RNGs and restart the step counter for every image.
        setup_seed(seed_)
        cur_step = 0
        real_prompt = apply_style_positive(style_name, real_prompt)
        if _model_type == "original":
            real_images.append(pipe(real_prompt, num_inference_steps=_num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images[0])
        elif _model_type == "Photomaker":
            real_images.append(pipe(real_prompt, input_id_images=input_id_images, num_inference_steps=_num_steps, guidance_scale=guidance_scale, start_merge_step = start_merge_step, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images[0])
        else:
            raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
        # Newest image goes to the front of the gallery list.
        total_results = [real_images[-1]] + total_results
        yield total_results
    if _comic_type != "No typesetting (default)":
        # Captions come from the raw input lines: "[NC]" removed, and for
        # lines with "#" only the text after the last "#" is kept.
        captions= prompt_array.splitlines()
        captions = [caption.replace("[NC]","") for caption in captions]
        captions = [caption.split('#')[-1] if "#" in caption else caption for caption in captions]
        from PIL import ImageFont
        total_results = get_comic(id_images + real_images, _comic_type,captions= captions,font=ImageFont.truetype("./fonts/Inkfree.ttf", int(45))) + total_results
    # Install fresh attention processors once generation is done.
    set_attention_processor(pipe.unet,id_length_,is_ipadapter = False)
    yield total_results
571
+
572
+
573
+
574
def array2string(arr):
    """Join the strings in `arr` into one string, one item per line.

    Items are separated by a single newline and there is no trailing
    newline. An empty input yields an empty string.

    Args:
        arr: iterable of strings.

    Returns:
        The newline-joined string.
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return "\n".join(arr)
583
+
584
+
585
+ #################################################
586
+ #################################################
587
+ ### define the interface
588
+ with gr.Blocks(css=css) as demo:
589
+ binary_matrixes = gr.State([])
590
+ color_layout = gr.State([])
591
+
592
+ # gr.Markdown(logo)
593
+ gr.Markdown(title)
594
+ gr.Markdown(description)
595
+
596
+ with gr.Row():
597
+ with gr.Group(elem_id="main-image"):
598
+ # button_run = gr.Button("generate id images ! 😺", elem_id="main_button", interactive=True)
599
+
600
+ prompts = []
601
+ colors = []
602
+ # with gr.Column(visible=False) as post_sketch:
603
+ # for n in range(MAX_COLORS):
604
+ # if n == 0 :
605
+ # with gr.Row(visible=False) as color_row[n]:
606
+ # colors.append(gr.Image(shape=(100, 100), label="background", type="pil", image_mode="RGB", width=100, height=100))
607
+ # prompts.append(gr.Textbox(label="Prompt for the background (white region)", value=""))
608
+ # else:
609
+ # with gr.Row(visible=False) as color_row[n]:
610
+ # colors.append(gr.Image(shape=(100, 100), label="segment "+str(n), type="pil", image_mode="RGB", width=100, height=100))
611
+ # prompts.append(gr.Textbox(label="Prompt for the segment "+str(n)))
612
+
613
+ # get_genprompt_run = gr.Button("(2) I've finished segment labeling ! 😺", elem_id="prompt_button", interactive=True)
614
+
615
+ with gr.Column(visible=True) as gen_prompt_vis:
616
+ sd_type = gr.Dropdown(choices=list(models_dict.keys()), value = "Unstable",label="sd_type", info="Select pretrained model")
617
+ model_type = gr.Radio(["Only Using Textual Description", "Using Ref Images"], label="model_type", value = "Only Using Textual Description", info="Control type of the Character")
618
+ with gr.Group(visible=False) as control_image_input:
619
+ files = gr.Files(
620
+ label="Drag (Select) 1 or more photos of your face",
621
+ file_types=["image"],
622
+ )
623
+ uploaded_files = gr.Gallery(label="Your images", visible=False, columns=5, rows=1, height=200)
624
+ with gr.Column(visible=False) as clear_button:
625
+ remove_and_reupload = gr.ClearButton(value="Remove and upload new ones", components=files, size="sm")
626
+ general_prompt = gr.Textbox(value='', label="(1) Textual Description for Character", interactive=True)
627
+ negative_prompt = gr.Textbox(value='', label="(2) Negative_prompt", interactive=True)
628
+ style = gr.Dropdown(label="Style template", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
629
+ prompt_array = gr.Textbox(lines = 3,value='', label="(3) Comic Description (each line corresponds to a frame).", interactive=True)
630
+ with gr.Accordion("(4) Tune the hyperparameters", open=True):
631
+ #sa16_ = gr.Slider(label=" (The degree of Paired Attention at 16 x 16 self-attention layers) ", minimum=0, maximum=1., value=0.3, step=0.1)
632
+ sa32_ = gr.Slider(label=" (The degree of Paired Attention at 32 x 32 self-attention layers) ", minimum=0, maximum=1., value=0.7, step=0.1)
633
+ sa64_ = gr.Slider(label=" (The degree of Paired Attention at 64 x 64 self-attention layers) ", minimum=0, maximum=1., value=0.7, step=0.1)
634
+ id_length_ = gr.Slider(label= "Number of id images in total images" , minimum=2, maximum=4, value=2, step=1)
635
+ # total_length_ = gr.Slider(label= "Number of total images", minimum=1, maximum=20, value=1, step=1)
636
+ seed_ = gr.Slider(label="Seed", minimum=-1, maximum=MAX_SEED, value=0, step=1)
637
+ num_steps = gr.Slider(
638
+ label="Number of sample steps",
639
+ minimum=20,
640
+ maximum=100,
641
+ step=1,
642
+ value=50,
643
+ )
644
+ G_height = gr.Slider(
645
+ label="height",
646
+ minimum=256,
647
+ maximum=1024,
648
+ step=32,
649
+ value=768,
650
+ )
651
+ G_width = gr.Slider(
652
+ label="width",
653
+ minimum=256,
654
+ maximum=1024,
655
+ step=32,
656
+ value=768,
657
+ )
658
+ comic_type = gr.Radio(["No typesetting (default)", "Four Pannel", "Classic Comic Style"], value = "Classic Comic Style", label="Typesetting Style", info="Select the typesetting style ")
659
+ guidance_scale = gr.Slider(
660
+ label="Guidance scale",
661
+ minimum=0.1,
662
+ maximum=10.0,
663
+ step=0.1,
664
+ value=5,
665
+ )
666
+ style_strength_ratio = gr.Slider(
667
+ label="Style strength of Ref Image (%)",
668
+ minimum=15,
669
+ maximum=50,
670
+ step=1,
671
+ value=20,
672
+ visible=False
673
+ )
674
+ Ip_Adapter_Strength = gr.Slider(
675
+ label="Ip_Adapter_Strength",
676
+ minimum=0,
677
+ maximum=1,
678
+ step=0.1,
679
+ value=0.5,
680
+ visible=False
681
+ )
682
+ final_run_btn = gr.Button("Generate ! 😺")
683
+
684
+
685
+ with gr.Column():
686
+ out_image = gr.Gallery(label="Result", columns=2, height='auto')
687
+ generated_information = gr.Markdown(label="Generation Details", value="",visible=False)
688
+ gr.Markdown(version)
689
+ model_type.change(fn = change_visiale_by_model_type , inputs = model_type, outputs=[control_image_input,style_strength_ratio,Ip_Adapter_Strength])
690
+ files.upload(fn=swap_to_gallery, inputs=files, outputs=[uploaded_files, clear_button, files])
691
+ remove_and_reupload.click(fn=remove_back_to_files, outputs=[uploaded_files, clear_button, files])
692
+
693
+ final_run_btn.click(fn=set_text_unfinished, outputs = generated_information
694
+ ).then(process_generation, inputs=[sd_type,model_type,files, num_steps,style, Ip_Adapter_Strength,style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt, prompt_array,G_height,G_width,comic_type], outputs=out_image
695
+ ).then(fn=set_text_finished,outputs = generated_information)
696
+
697
+
698
+ gr.Examples(
699
+ examples=[
700
+ [1,0.5,0.5,3,"a woman img, wearing a white T-shirt, blue loose hair",
701
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
702
+ array2string(["wake up in the bed",
703
+ "have breakfast",
704
+ "is on the road, go to company",
705
+ "work in the company",
706
+ "Take a walk next to the company at noon",
707
+ "lying in bed at night"]),
708
+ "Japanese Anime", "Using Ref Images",get_image_path_list('./examples/taylor'),768,768
709
+ ],
710
+ [0,0.5,0.5,2,"a man, wearing black jacket",
711
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
712
+ array2string(["wake up in the bed",
713
+ "have breakfast",
714
+ "is on the road, go to the company, close look",
715
+ "work in the company",
716
+ "laughing happily",
717
+ "lying in bed at night"
718
+ ]),
719
+ "Japanese Anime","Only Using Textual Description",get_image_path_list('./examples/taylor'),768,768
720
+ ],
721
+ [0,0.3,0.5,2,"a girl, wearing white shirt, black skirt, black tie, yellow hair",
722
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
723
+ array2string([
724
+ "at home #at home, began to go to drawing",
725
+ "sitting alone on a park bench.",
726
+ "reading a book on a park bench.",
727
+ "[NC]A squirrel approaches, peeking over the bench. ",
728
+ "look around in the park. # She looks around and enjoys the beauty of nature.",
729
+ "[NC]leaf falls from the tree, landing on the sketchbook.",
730
+ "picks up the leaf, examining its details closely.",
731
+ "starts sketching the leaf with intricate lines.",
732
+ "holds up the sketch drawing of the leaf.",
733
+ "[NC]The brown squirrel appear.",
734
+ "is very happy # She is very happy to see the squirrel again",
735
+ "[NC]The brown squirrel takes the cracker and scampers up a tree. # She gives the squirrel cracker",
736
+ "laughs and tucks the leaf into her book as a keepsake.",
737
+ "ready to leave.",]),
738
+ "Japanese Anime","Only Using Textual Description",get_image_path_list('./examples/taylor'),768,768
739
+ ]
740
+ ],
741
+ inputs=[seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt, prompt_array,style,model_type,files,G_height,G_width],
742
+ # outputs=[post_sketch, binary_matrixes, *color_row, *colors, *prompts, gen_prompt_vis, general_prompt, seed_],
743
+ # run_on_click=True,
744
+ label='😺 Examples 😺',
745
+ )
746
+ gr.Markdown(article)
747
+
748
+ # demo.load(None, None, None, _js=load_js)
749
+
750
# Start the Gradio server bound to 0.0.0.0; sharing is enabled exactly when
# `use_va` is truthy.
share_enabled = bool(use_va)
demo.launch(server_name="0.0.0.0", share=share_enabled)
app1.py ADDED
@@ -0,0 +1,750 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from email.policy import default
2
+ import gradio as gr
3
+ import numpy as np
4
+ import spaces
5
+ import torch
6
+ import requests
7
+ import random
8
+ import os
9
+ import sys
10
+ import pickle
11
+ from PIL import Image
12
+ from tqdm.auto import tqdm
13
+ from datetime import datetime
14
+ from utils.gradio_utils import is_torch2_available
15
+ if is_torch2_available():
16
+ from utils.gradio_utils import \
17
+ AttnProcessor2_0 as AttnProcessor
18
+ # from utils.gradio_utils import SpatialAttnProcessor2_0
19
+ else:
20
+ from utils.gradio_utils import AttnProcessor
21
+
22
+ import diffusers
23
+ from diffusers import StableDiffusionXLPipeline
24
+ from utils import PhotoMakerStableDiffusionXLPipeline
25
+ from diffusers import DDIMScheduler
26
+ import torch.nn.functional as F
27
+ from utils.gradio_utils import cal_attn_mask_xl
28
+ import copy
29
+ import os
30
+ from huggingface_hub import hf_hub_download
31
+ from diffusers.utils import load_image
32
+ from utils.utils import get_comic
33
+ from utils.style_template import styles
34
# Local paths of IP-Adapter assets.
image_encoder_path = "./data/models/ip_adapter/sdxl_models/image_encoder"
ip_ckpt = "./data/models/ip_adapter/sdxl_models/ip-adapter_sdxl_vit-h.bin"
# Exclude loopback addresses from any configured proxy.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
# Style names come from the imported style-template table.
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "Japanese Anime"
# A `global` statement at module scope does not change any binding.
global models_dict
use_va = True
# Display name -> model repository id; only "Unstable" is currently enabled.
models_dict = {
#   "Juggernaut": "RunDiffusion/Juggernaut-XL-v8",
#   "RealVision": "SG161222/RealVisXL_V4.0" ,
#   "SDXL":"stabilityai/stable-diffusion-xl-base-1.0" ,
    "Unstable": "stablediffusionapi/sdxl-unstable-diffusers-y"
}
# Runs at import time: fetches the PhotoMaker weights from the Hugging Face
# Hub and keeps the returned path.
photomaker_path = hf_hub_download(repo_id="TencentARC/PhotoMaker", filename="photomaker-v1.bin", repo_type="model")
# Largest value of a signed 32-bit integer.
MAX_SEED = np.iinfo(np.int32).max
49
def setup_seed(seed):
    """Seed every random number generator this module relies on.

    Seeds PyTorch (CPU and all CUDA devices), NumPy's global RNG and the
    standard-library `random` module, and switches cuDNN to deterministic
    mode.

    Args:
        seed: integer seed. Negative values (e.g. -1) are accepted; for
            NumPy they are wrapped into the range it supports.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # np.random.seed only accepts values in [0, 2**32 - 1] and raises
    # ValueError otherwise. Wrapping with modulo leaves every valid seed
    # unchanged and maps negative seeds into range.
    np.random.seed(seed % (2 ** 32))
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
55
def set_text_unfinished():
    """Return a Gradio update that shows the "still generating" status text."""
    status_html = "<h3>(Not Finished) Generating ···  The intermediate results will be shown.</h3>"
    return gr.update(visible=True, value=status_html)
57
def set_text_finished():
    """Return a Gradio update that shows the "generation finished" status text."""
    status_html = "<h3>Generation Finished</h3>"
    return gr.update(visible=True, value=status_html)
59
+ #################################################
60
def get_image_path_list(folder_name):
    """Return the sorted paths of the files stored directly in `folder_name`.

    Sub-directories (for example a stray `.ipynb_checkpoints` folder) are
    skipped, because the returned paths are meant to be opened as image
    files.

    Args:
        folder_name: directory to list.

    Returns:
        A sorted list of `folder_name/<entry>` paths, one per regular file.

    Raises:
        OSError: if `folder_name` cannot be listed.
    """
    candidate_paths = (
        os.path.join(folder_name, basename) for basename in os.listdir(folder_name)
    )
    return sorted(path for path in candidate_paths if os.path.isfile(path))
64
+
65
+ #################################################
66
class SpatialAttnProcessor2_0(torch.nn.Module):
    r"""
    Attention processor that caches and re-uses hidden states across images.

    Behaviour is driven by module-level globals that the caller must set:
    `write`, `cur_step`, `attn_count`, `total_count`, `sa32`, `sa64`,
    `height`, `width`, and (once available) `mask1024` / `mask4096`.

    * When `write` is true, each call stores the incoming hidden states in
      `self.id_bank`, keyed by `cur_step`.
    * When `write` is false, each call concatenates the states stored for
      `cur_step` with the current hidden states and uses the result as the
      key/value source.

    Args:
        hidden_size (`int`, optional):
            Stored on the instance; not otherwise read in this class.
        cross_attention_dim (`int`, optional):
            Stored on the instance; not otherwise read in this class.
        id_length (`int`, defaults to 4):
            Number of leading batch entries split off when caching; also
            determines `total_length = id_length + 1`.
        device (`str`, defaults to "cuda"):
            Device the cached states are moved to; also passed to
            `cal_attn_mask_xl`.
        dtype (`torch.dtype`, defaults to `torch.float16`):
            Passed to `cal_attn_mask_xl`.
    """

    def __init__(self, hidden_size = None, cross_attention_dim=None,id_length = 4,device = "cuda",dtype = torch.float16):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
        self.device = device
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.total_length = id_length + 1
        self.id_length = id_length
        # cur_step -> [hidden_states[:id_length], hidden_states[id_length:]]
        self.id_bank = {}

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None):
        """Run one attention call and advance the global call/step counters.

        Dispatches to `__call2__` for the first five steps, and afterwards
        randomly to either `__call1__` (with a slice of the global masks) or
        `__call2__` (without encoder states). After `total_count` calls the
        global `cur_step` is incremented and the global masks are recomputed.

        Raises:
            KeyError: in read mode, if nothing was cached for `cur_step`.
        """
        # un_cond_hidden_states, cond_hidden_states = hidden_states.chunk(2)
        # un_cond_hidden_states = self.__call2__(attn, un_cond_hidden_states,encoder_hidden_states,attention_mask,temb)
        global total_count,attn_count,cur_step,mask1024,mask4096
        global sa32, sa64
        global write
        global height,width
        if write:
            # print(f"white:{cur_step}")
            # Write mode: cache this step's hidden states, split at id_length.
            # NOTE(review): presumably the two parts are the unconditional and
            # conditional halves of a classifier-free-guidance batch — confirm.
            self.id_bank[cur_step] = [hidden_states[:self.id_length], hidden_states[self.id_length:]]
        else:
            # Read mode: interleave the cached parts with the current batch
            # (cached[0], current[:1], cached[1], current[1:]).
            encoder_hidden_states = torch.cat((self.id_bank[cur_step][0].to(self.device),hidden_states[:1],self.id_bank[cur_step][1].to(self.device),hidden_states[1:]))
        # The first five steps always take the __call2__ path.
        if cur_step <5:
            hidden_states = self.__call2__(attn, hidden_states,encoder_hidden_states,attention_mask,temb)
        else:   # 256 1024 4096
            # Draw a random number in [0, 1) and compare it with a threshold
            # that depends on the step (0.3 before step 20, 0.1 afterwards).
            random_number = random.random()
            if cur_step <20:
                rand_num = 0.3
            else:
                rand_num = 0.1
            # print(f"hidden state shape {hidden_states.shape[1]}")
            if random_number > rand_num:
                # print("mask shape",mask1024.shape,mask4096.shape)
                # Choose the mask by token count: (height//32)*(width//32)
                # tokens selects mask1024, anything else selects mask4096.
                if not write:
                    # Read mode: keep the mask rows after the first
                    # id_length/total_length fraction.
                    if hidden_states.shape[1] == (height//32) * (width//32):
                        attention_mask = mask1024[mask1024.shape[0] // self.total_length * self.id_length:]
                    else:
                        attention_mask = mask4096[mask4096.shape[0] // self.total_length * self.id_length:]
                else:
                    # print(self.total_length,self.id_length,hidden_states.shape,(height//32) * (width//32))
                    # Write mode: keep the leading square block of the mask.
                    if hidden_states.shape[1] == (height//32) * (width//32):
                        attention_mask = mask1024[:mask1024.shape[0] // self.total_length * self.id_length,:mask1024.shape[0] // self.total_length * self.id_length]
                    else:
                        attention_mask = mask4096[:mask4096.shape[0] // self.total_length * self.id_length,:mask4096.shape[0] // self.total_length * self.id_length]
                # print(attention_mask.shape)
                # print("before attention",hidden_states.shape,attention_mask.shape,encoder_hidden_states.shape if encoder_hidden_states is not None else "None")
                hidden_states = self.__call1__(attn, hidden_states,encoder_hidden_states,attention_mask,temb)
            else:
                # Fallback: attention without the cached encoder states.
                hidden_states = self.__call2__(attn, hidden_states,None,attention_mask,temb)
        attn_count +=1
        # Once `total_count` calls have been made, one denoising step is
        # complete: reset the call counter, advance the step and rebuild the masks.
        if attn_count == total_count:
            attn_count = 0
            cur_step += 1
            mask1024,mask4096 = cal_attn_mask_xl(self.total_length,self.id_length,sa32,sa64,height,width, device=self.device, dtype= self.dtype)

        return hidden_states

    def __call1__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Attention over tokens merged across images.

        The batch is regrouped so that `total_batch_size // 2` images share
        one token sequence before query/key/value projection; the output is
        reshaped back to the original batch layout.
        NOTE(review): the division by 2 presumably reflects the two
        classifier-free-guidance halves of the batch — confirm.
        """
        # print("hidden state shape",hidden_states.shape,self.id_length)
        residual = hidden_states
        # if encoder_hidden_states is not None:
        #     raise Exception("not implement")
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)
        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # Flatten the spatial dimensions into a token axis.
            total_batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(total_batch_size, channel, height * width).transpose(1, 2)
        total_batch_size,nums_token,channel = hidden_states.shape
        img_nums = total_batch_size//2
        # Concatenate the tokens of `img_nums` images into one long sequence.
        hidden_states = hidden_states.view(-1,img_nums,nums_token,channel).reshape(-1,img_nums * nums_token,channel)

        batch_size, sequence_length, _ = hidden_states.shape

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        else:
            # Merge the (id_length + 1) images' tokens into one key/value sequence.
            encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,nums_token,channel).reshape(-1,(self.id_length+1) * nums_token,channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)


        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        # print(key.shape,value.shape,query.shape,attention_mask.shape)
        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        #print(query.shape,key.shape,value.shape,attention_mask.shape)
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge heads and restore the original per-image batch dimension.
        hidden_states = hidden_states.transpose(1, 2).reshape(total_batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)



        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        # if input_ndim == 4:
        #     tile_hidden_states = tile_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        # if attn.residual_connection:
        #     tile_hidden_states = tile_hidden_states + residual

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(total_batch_size, channel, height, width)
        if attn.residual_connection:
            hidden_states = hidden_states + residual
        hidden_states = hidden_states / attn.rescale_output_factor
        # print(hidden_states.shape)
        return hidden_states

    def __call2__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None):
        """Per-image attention; optionally attends to merged encoder states.

        Unlike `__call1__`, the query batch is left as-is. When
        `encoder_hidden_states` is given, it is regrouped so that
        (id_length + 1) images form one key/value sequence.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # Flatten the spatial dimensions into a token axis.
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, channel = (
            hidden_states.shape
        )
        # print(hidden_states.shape)
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        else:
            # Merge the (id_length + 1) images' tokens into one key/value sequence.
            encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,sequence_length,channel).reshape(-1,(self.id_length+1) * sequence_length,channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge the attention heads back into the channel dimension.
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
295
+
296
def set_attention_processor(unet, id_length, is_ipadapter=False):
    """Install a fresh attention processor on every attention layer of ``unet``.

    Self-attention layers (names ending in ``attn1.processor``, or any layer
    when ``unet.config.cross_attention_dim`` is ``None``) inside ``up_blocks``
    receive a ``SpatialAttnProcessor2_0``; all other self-attention layers get
    the plain ``AttnProcessor``. Cross-attention layers get an
    ``IPAttnProcessor2_0`` when ``is_ipadapter`` is true, otherwise the plain
    ``AttnProcessor``.

    Side effects: resets the module-level ``total_count`` and leaves it equal
    to the number of ``SpatialAttnProcessor2_0`` instances created, replaces
    the processors on ``unet`` with a deep copy of the new mapping, and prints
    two status lines.

    Args:
        unet: UNet exposing ``attn_processors``, ``config``, ``device`` and
            ``set_attn_processor``.
        id_length: forwarded to ``SpatialAttnProcessor2_0``.
        is_ipadapter: select the IP-Adapter processor for cross-attention.
    """
    global total_count
    total_count = 0
    processors = {}

    for layer_name in unet.attn_processors.keys():
        if layer_name.endswith("attn1.processor"):
            cross_attention_dim = None
        else:
            cross_attention_dim = unet.config.cross_attention_dim

        in_up_block = layer_name.startswith("up_blocks")

        # The block index is the single character right after the prefix.
        if layer_name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif in_up_block:
            block_id = int(layer_name[len("up_blocks.")])
            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
        elif layer_name.startswith("down_blocks"):
            block_id = int(layer_name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

        if cross_attention_dim is None:
            if in_up_block:
                processors[layer_name] = SpatialAttnProcessor2_0(id_length=id_length)
                total_count += 1
            else:
                processors[layer_name] = AttnProcessor()
        elif is_ipadapter:
            ip_processor = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=1,
                num_tokens=4,
            )
            processors[layer_name] = ip_processor.to(unet.device, dtype=torch.float16)
        else:
            processors[layer_name] = AttnProcessor()

    unet.set_attn_processor(copy.deepcopy(processors))
    print("successsfully load paired self-attention")
    print(f"number of the processor : {total_count}")
330
+ #################################################
331
+ #################################################
332
+ canvas_html = "<div id='canvas-root' style='max-width:400px; margin: 0 auto'></div>"
333
+ load_js = """
334
+ async () => {
335
+ const url = "https://huggingface.co/datasets/radames/gradio-components/raw/main/sketch-canvas.js"
336
+ fetch(url)
337
+ .then(res => res.text())
338
+ .then(text => {
339
+ const script = document.createElement('script');
340
+ script.type = "module"
341
+ script.src = URL.createObjectURL(new Blob([text], { type: 'application/javascript' }));
342
+ document.head.appendChild(script);
343
+ });
344
+ }
345
+ """
346
+
347
+ get_js_colors = """
348
+ async (canvasData) => {
349
+ const canvasEl = document.getElementById("canvas-root");
350
+ return [canvasEl._data]
351
+ }
352
+ """
353
+
354
+ css = '''
355
+ #color-bg{display:flex;justify-content: center;align-items: center;}
356
+ .color-bg-item{width: 100%; height: 32px}
357
+ #main_button{width:100%}
358
+ <style>
359
+ '''
360
+
361
+
362
+ #################################################
363
+ title = r"""
364
+ <h1 align="center">StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</h1>
365
+ """
366
+
367
+ description = r"""
368
+ <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'><b>StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</b></a>.<br>
369
+ ❗️❗️❗️[<b>Important</b>] Personalization steps:<br>
370
+ 1️⃣ Enter a Textual Description for Character, if you add the Ref-Image, making sure to <b>follow the class word</b> you want to customize with the <b>trigger word</b>: `img`, such as: `man img` or `woman img` or `girl img`.<br>
371
+ 2️⃣ Enter the prompt array, each line corrsponds to one generated image.<br>
372
+ 3️⃣ Choose your preferred style template.<br>
373
+ 4️⃣ Click the <b>Submit</b> button to start customizing.
374
+ """
375
+
376
+ article = r"""
377
+
378
+ If StoryDiffusion is helpful, please help to ⭐ the <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'>Github Repo</a>. Thanks!
379
+ [![GitHub Stars](https://img.shields.io/github/stars/HVision-NKU/StoryDiffusion?style=social)](https://github.com/HVision-NKU/StoryDiffusion)
380
+ ---
381
+ 📝 **Citation**
382
+ <br>
383
+ If our work is useful for your research, please consider citing:
384
+
385
+ ```bibtex
386
+ @article{Zhou2024storydiffusion,
387
+ title={StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation},
388
+ author={Zhou, Yupeng and Zhou, Daquan and Cheng, Ming-Ming and Feng, Jiashi and Hou, Qibin},
389
+ year={2024}
390
+ }
391
+ ```
392
+ 📋 **License**
393
+ <br>
394
+ The Contents you create are under Apache-2.0 LICENSE. The Code are under Attribution-NonCommercial 4.0 International.
395
+
396
+ 📧 **Contact**
397
+ <br>
398
+ If you have any questions, please feel free to reach me out at <b>ypzhousdu@gmail.com</b>.
399
+ """
400
+ version = r"""
401
+ <h3 align="center">StoryDiffusion Version 0.01 (test version)</h3>
402
+
403
+ <h5 >1. Support image ref image. (Cartoon Ref image is not support now)</h5>
404
+ <h5 >2. Support Typesetting Style and Captioning.(By default, the prompt is used as the caption for each image. If you need to change the caption, add a # at the end of each line. Only the part after the # will be added as a caption to the image.)</h5>
405
+ <h5 >3. [NC]symbol (The [NC] symbol is used as a flag to indicate that no characters should be present in the generated scene images. If you want do that, prepend the "[NC]" at the beginning of the line. For example, to generate a scene of falling leaves without any character, write: "[NC] The leaves are falling."),Currently, support is only using Textual Description</h5>
406
+ <h5 align="center">Tips: Not Ready Now! Just Test</h5>
407
+ """
408
+ #################################################
409
+ global attn_count, total_count, id_length, total_length,cur_step, cur_model_type
410
+ global write
411
+ global sa32, sa64
412
+ global height,width
413
+ attn_count = 0
414
+ total_count = 0
415
+ cur_step = 0
416
+ id_length = 4
417
+ total_length = 5
418
+ cur_model_type = ""
419
+ device="cuda"
420
+ global attn_procs,unet
421
+ attn_procs = {}
422
+ ###
423
+ write = False
424
+ ###
425
+ sa32 = 0.5
426
+ sa64 = 0.5
427
+ height = 768
428
+ width = 768
429
+ ###
430
+ global sd_model_path
431
+ sd_model_path = models_dict["Unstable"]#"SG161222/RealVisXL_V4.0"
432
+ use_safetensors= False
433
+ ### LOAD Stable Diffusion Pipeline
434
+ pipe1 = StableDiffusionXLPipeline.from_pretrained(sd_model_path, torch_dtype=torch.float16, use_safetensors= use_safetensors)
435
+ pipe1 = pipe1.to("cuda")
436
+ pipe1.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
437
+ # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
438
+ pipe1.scheduler.set_timesteps(50)
439
+ ###
440
+ pipe2 = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
441
+ sd_model_path, torch_dtype=torch.float16, use_safetensors=use_safetensors)
442
+ pipe2 = pipe2.to("cuda")
443
+ pipe2.load_photomaker_adapter(
444
+ os.path.dirname(photomaker_path),
445
+ subfolder="",
446
+ weight_name=os.path.basename(photomaker_path),
447
+ trigger_word="img" # define the trigger word
448
+ )
449
+ pipe2 = pipe2.to("cuda")
450
+ pipe2.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
451
+ pipe2.fuse_lora()
452
+
453
+ ######### Gradio Function #############
454
+
455
def swap_to_gallery(images):
    """Build the three updates applied after files are uploaded.

    In order: show a component with ``images`` as its value, show a second
    component, and hide a third. The ``files.upload`` wiring maps these to the
    uploaded-files gallery, the clear-button column and the file picker.
    """
    show_gallery = gr.update(value=images, visible=True)
    show_clear_button = gr.update(visible=True)
    hide_file_picker = gr.update(visible=False)
    return show_gallery, show_clear_button, hide_file_picker
457
+
458
def upload_example_to_gallery(images, prompt, style, negative_prompt):
    """Build the same three updates as an upload, ignoring the text arguments.

    Only ``images`` is used; ``prompt``, ``style`` and ``negative_prompt`` are
    accepted but never read.
    """
    show_with_images = gr.update(value=images, visible=True)
    show_second = gr.update(visible=True)
    hide_third = gr.update(visible=False)
    return show_with_images, show_second, hide_third
460
+
461
def remove_back_to_files():
    """Build the three updates that undo an upload.

    In order: hide, hide, show. The ``remove_and_reupload.click`` wiring maps
    these to the uploaded-files gallery, the clear-button column and the file
    picker.
    """
    hide_gallery = gr.update(visible=False)
    hide_clear_button = gr.update(visible=False)
    show_file_picker = gr.update(visible=True)
    return hide_gallery, hide_clear_button, show_file_picker
463
+
464
def remove_tips():
    """Return a single Gradio update that hides its target component."""
    hidden = gr.update(visible=False)
    return hidden
466
+
467
def apply_style_positive(style_name: str, positive: str):
    """Insert ``positive`` into the positive template of a style.

    Unknown style names fall back to ``styles[DEFAULT_STYLE_NAME]``. The
    negative half of the style pair is discarded.
    """
    fallback_pair = styles[DEFAULT_STYLE_NAME]
    positive_template, _negative_template = styles.get(style_name, fallback_pair)
    return positive_template.replace("{prompt}", positive)
470
+
471
def apply_style(style_name: str, positives: list, negative: str = ""):
    """Apply a style to several positive prompts and to one negative prompt.

    Unknown style names fall back to ``styles[DEFAULT_STYLE_NAME]``.

    Returns:
        A pair ``(styled_positives, styled_negative)``: each positive prompt is
        substituted for ``{prompt}`` in the style's positive template, and the
        negative prompt is appended to the style's negative text after a space.
    """
    fallback_pair = styles[DEFAULT_STYLE_NAME]
    positive_template, negative_template = styles.get(style_name, fallback_pair)

    styled_positives = []
    for positive in positives:
        styled_positives.append(positive_template.replace("{prompt}", positive))

    styled_negative = negative_template + ' ' + negative
    return styled_positives, styled_negative
474
+
475
def change_visiale_by_model_type(_model_type):
    """Build visibility updates for the reference-image controls.

    Returns three updates: the first two are visible only for
    "Using Ref Images", and the third is always hidden. The
    ``model_type.change`` wiring maps them to the image-input group, the
    style-strength slider and the IP-Adapter-strength slider.

    Raises:
        ValueError: if ``_model_type`` is neither of the two known labels.
    """
    if _model_type == "Using Ref Images":
        show_ref_controls = True
    elif _model_type == "Only Using Textual Description":
        show_ref_controls = False
    else:
        raise ValueError("Invalid model type",_model_type)

    first = gr.update(visible=show_ref_controls)
    second = gr.update(visible=show_ref_controls)
    third = gr.update(visible=False)
    return first, second, third
482
+
483
+
484
+ ######### Image Generation ##############
485
@spaces.GPU
def process_generation(_sd_type,_model_type,_upload_images, _num_steps,style_name, _Ip_Adapter_Strength ,_style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt,prompt_array,G_height,G_width,_comic_type):
    """Generate the story images and yield the growing result list after each step.

    This is a generator: it first renders the identity prompts as one batch,
    then renders each remaining prompt on its own, yielding ``total_results``
    after every pipeline call, and finally (optionally) prepends the typeset
    comic pages.

    NOTE(review): the nesting below was reconstructed from a flattened diff
    view; confirm the indentation against the original file.

    Args:
        _sd_type: key into ``models_dict``.
        _model_type: UI label; "Using Ref Images" selects the PhotoMaker
            pipeline, anything else selects the plain SDXL pipeline.
        _upload_images: iterable of reference images passed to ``load_image``;
            required in PhotoMaker mode.
        _num_steps: number of inference steps.
        style_name: style template name passed to ``apply_style``.
        _Ip_Adapter_Strength: accepted but never read in this function.
        _style_strength_ratio: percentage used to derive ``start_merge_step``.
        guidance_scale: forwarded to the pipeline.
        seed_: seed for ``setup_seed`` and the CUDA ``torch.Generator``.
        sa32_, sa64_: copied into the module-level ``sa32``/``sa64``.
        id_length_: number of leading prompts treated as identity prompts.
        general_prompt: character description prefixed to each prompt line.
        negative_prompt: negative prompt, combined with the style's own.
        prompt_array: newline-separated prompts, one per image.
        G_height, G_width: output size, copied into the module-level
            ``height``/``width``.
        _comic_type: typesetting style; "No typesetting (default)" skips it.

    Yields:
        ``total_results``, a list with the newest images first.

    Raises:
        gr.Error: if PhotoMaker mode is chosen without "img" in
            ``general_prompt`` or without uploaded images.
    """
    # Collapse the UI label into the two internal mode names.
    _model_type = "Photomaker" if _model_type == "Using Ref Images" else "original"
    if _model_type == "Photomaker" and "img" not in general_prompt:
        raise gr.Error("Please add the triger word \" img \" behind the class word you want to customize, such as: man img or woman img")
    if _upload_images is None and _model_type != "original":
        raise gr.Error(f"Cannot find any input face image!")
    global sa32, sa64,id_length,total_length,attn_procs,unet,cur_model_type,device
    global write
    global cur_step,attn_count
    global height,width
    height = G_height
    width = G_width
    global pipe1,pipe2
    global sd_model_path,models_dict
    sd_model_path = models_dict[_sd_type]
    # NOTE(review): never read below; the pipelines are not reloaded here.
    use_safe_tensor = True
    if _model_type == "original":
        pipe = pipe1
        set_attention_processor(pipe.unet,id_length_,is_ipadapter = False)
    elif _model_type == "Photomaker":
        pipe = pipe2
        set_attention_processor(pipe.unet,id_length_,is_ipadapter = False)
    else:
        raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
    ##### ########################
    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
    pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
    cur_model_type = _sd_type+"-"+_model_type+""+str(id_length_)
    if _model_type != "original":
        input_id_images = []
        for img in _upload_images:
            print(img)
            input_id_images.append(load_image(img))
    prompts = prompt_array.splitlines()
    # Convert the percentage into a step index, capped at 30.
    start_merge_step = int(float(_style_strength_ratio) / 100 * _num_steps)
    if start_merge_step > 30:
        start_merge_step = 30
    print(f"start_merge_step:{start_merge_step}")
    generator = torch.Generator(device="cuda").manual_seed(seed_)
    sa32, sa64 = sa32_, sa64_
    id_length = id_length_
    clipped_prompts = prompts[:]
    # Lines containing "[NC]" drop the character description and the marker itself.
    prompts = [general_prompt + "," + prompt if "[NC]" not in prompt else prompt.replace("[NC]","") for prompt in clipped_prompts]
    # Everything after the last '#' is a caption, not part of the prompt.
    prompts = [prompt.rpartition('#')[0] if "#" in prompt else prompt for prompt in prompts]
    print(prompts)
    id_prompts = prompts[:id_length]
    real_prompts = prompts[id_length:]
    torch.cuda.empty_cache()
    # `write`, `cur_step` and `attn_count` are module-level state; presumably
    # the attention processor reads them during the pipeline calls -- confirm.
    write = True
    cur_step = 0

    attn_count = 0
    id_prompts, negative_prompt = apply_style(style_name, id_prompts, negative_prompt)
    setup_seed(seed_)
    total_results = []
    # First pass: all identity prompts in one batched call.
    if _model_type == "original":
        id_images = pipe(id_prompts, num_inference_steps=_num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
    elif _model_type == "Photomaker":
        id_images = pipe(id_prompts,input_id_images=input_id_images, num_inference_steps=_num_steps, guidance_scale=guidance_scale, start_merge_step = start_merge_step, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images
    else:
        raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
    total_results = id_images + total_results
    yield total_results
    real_images = []
    write = False
    # Second pass: one pipeline call per remaining prompt, reseeding each time.
    for real_prompt in real_prompts:
        setup_seed(seed_)
        cur_step = 0
        real_prompt = apply_style_positive(style_name, real_prompt)
        if _model_type == "original":
            real_images.append(pipe(real_prompt, num_inference_steps=_num_steps, guidance_scale=guidance_scale, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images[0])
        elif _model_type == "Photomaker":
            real_images.append(pipe(real_prompt, input_id_images=input_id_images, num_inference_steps=_num_steps, guidance_scale=guidance_scale, start_merge_step = start_merge_step, height = height, width = width,negative_prompt = negative_prompt,generator = generator).images[0])
        else:
            raise NotImplementedError("You should choice between original and Photomaker!",f"But you choice {_model_type}")
        # Newest image goes to the front of the gallery.
        total_results = [real_images[-1]] + total_results
        yield total_results
    if _comic_type != "No typesetting (default)":
        # Captions are the text after the last '#', or the whole line when there is none.
        captions= prompt_array.splitlines()
        captions = [caption.replace("[NC]","") for caption in captions]
        captions = [caption.split('#')[-1] if "#" in caption else caption for caption in captions]
        from PIL import ImageFont
        total_results = get_comic(id_images + real_images, _comic_type,captions= captions,font=ImageFont.truetype("./fonts/Inkfree.ttf", int(45))) + total_results
    # NOTE(review): assumed to run at function level (after the optional
    # typesetting), reinstalling fresh processors on the UNet -- confirm.
    set_attention_processor(pipe.unet,id_length_,is_ipadapter = False)
    yield total_results
571
+
572
+
573
+
574
def array2string(arr):
    """Join the strings in ``arr`` with newlines, without a trailing newline.

    Args:
        arr: sequence of strings.

    Returns:
        The items separated by ``"\\n"``; an empty string for an empty input.
    """
    # str.join replaces the manual index-checked concatenation loop.
    return "\n".join(arr)
583
+
584
+
585
+ #################################################
586
+ #################################################
587
+ ### define the interface
588
+ with gr.Blocks(css=css) as demo:
589
+ binary_matrixes = gr.State([])
590
+ color_layout = gr.State([])
591
+
592
+ # gr.Markdown(logo)
593
+ gr.Markdown(title)
594
+ gr.Markdown(description)
595
+
596
+ with gr.Row():
597
+ with gr.Group(elem_id="main-image"):
598
+ # button_run = gr.Button("generate id images ! 😺", elem_id="main_button", interactive=True)
599
+
600
+ prompts = []
601
+ colors = []
602
+ # with gr.Column(visible=False) as post_sketch:
603
+ # for n in range(MAX_COLORS):
604
+ # if n == 0 :
605
+ # with gr.Row(visible=False) as color_row[n]:
606
+ # colors.append(gr.Image(shape=(100, 100), label="background", type="pil", image_mode="RGB", width=100, height=100))
607
+ # prompts.append(gr.Textbox(label="Prompt for the background (white region)", value=""))
608
+ # else:
609
+ # with gr.Row(visible=False) as color_row[n]:
610
+ # colors.append(gr.Image(shape=(100, 100), label="segment "+str(n), type="pil", image_mode="RGB", width=100, height=100))
611
+ # prompts.append(gr.Textbox(label="Prompt for the segment "+str(n)))
612
+
613
+ # get_genprompt_run = gr.Button("(2) I've finished segment labeling ! 😺", elem_id="prompt_button", interactive=True)
614
+
615
+ with gr.Column(visible=True) as gen_prompt_vis:
616
+ sd_type = gr.Dropdown(choices=list(models_dict.keys()), value = "Unstable",label="sd_type", info="Select pretrained model")
617
+ model_type = gr.Radio(["Only Using Textual Description", "Using Ref Images"], label="model_type", value = "Only Using Textual Description", info="Control type of the Character")
618
+ with gr.Group(visible=False) as control_image_input:
619
+ files = gr.Files(
620
+ label="Drag (Select) 1 or more photos of your face",
621
+ file_types=["image"],
622
+ )
623
+ uploaded_files = gr.Gallery(label="Your images", visible=False, columns=5, rows=1, height=200)
624
+ with gr.Column(visible=False) as clear_button:
625
+ remove_and_reupload = gr.ClearButton(value="Remove and upload new ones", components=files, size="sm")
626
+ general_prompt = gr.Textbox(value='', label="(1) Textual Description for Character", interactive=True)
627
+ negative_prompt = gr.Textbox(value='', label="(2) Negative_prompt", interactive=True)
628
+ style = gr.Dropdown(label="Style template", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
629
+ prompt_array = gr.Textbox(lines = 3,value='', label="(3) Comic Description (each line corresponds to a frame).", interactive=True)
630
+ with gr.Accordion("(4) Tune the hyperparameters", open=True):
631
+ #sa16_ = gr.Slider(label=" (The degree of Paired Attention at 16 x 16 self-attention layers) ", minimum=0, maximum=1., value=0.3, step=0.1)
632
+ sa32_ = gr.Slider(label=" (The degree of Paired Attention at 32 x 32 self-attention layers) ", minimum=0, maximum=1., value=0.7, step=0.1)
633
+ sa64_ = gr.Slider(label=" (The degree of Paired Attention at 64 x 64 self-attention layers) ", minimum=0, maximum=1., value=0.7, step=0.1)
634
+ id_length_ = gr.Slider(label= "Number of id images in total images" , minimum=2, maximum=4, value=2, step=1)
635
+ # total_length_ = gr.Slider(label= "Number of total images", minimum=1, maximum=20, value=1, step=1)
636
+ seed_ = gr.Slider(label="Seed", minimum=-1, maximum=MAX_SEED, value=0, step=1)
637
+ num_steps = gr.Slider(
638
+ label="Number of sample steps",
639
+ minimum=20,
640
+ maximum=100,
641
+ step=1,
642
+ value=50,
643
+ )
644
+ G_height = gr.Slider(
645
+ label="height",
646
+ minimum=256,
647
+ maximum=1024,
648
+ step=32,
649
+ value=768,
650
+ )
651
+ G_width = gr.Slider(
652
+ label="width",
653
+ minimum=256,
654
+ maximum=1024,
655
+ step=32,
656
+ value=768,
657
+ )
658
+ comic_type = gr.Radio(["No typesetting (default)", "Four Pannel", "Classic Comic Style"], value = "Classic Comic Style", label="Typesetting Style", info="Select the typesetting style ")
659
+ guidance_scale = gr.Slider(
660
+ label="Guidance scale",
661
+ minimum=0.1,
662
+ maximum=10.0,
663
+ step=0.1,
664
+ value=5,
665
+ )
666
+ style_strength_ratio = gr.Slider(
667
+ label="Style strength of Ref Image (%)",
668
+ minimum=15,
669
+ maximum=50,
670
+ step=1,
671
+ value=20,
672
+ visible=False
673
+ )
674
+ Ip_Adapter_Strength = gr.Slider(
675
+ label="Ip_Adapter_Strength",
676
+ minimum=0,
677
+ maximum=1,
678
+ step=0.1,
679
+ value=0.5,
680
+ visible=False
681
+ )
682
+ final_run_btn = gr.Button("Generate ! 😺")
683
+
684
+
685
+ with gr.Column():
686
+ out_image = gr.Gallery(label="Result", columns=2, height='auto')
687
+ generated_information = gr.Markdown(label="Generation Details", value="",visible=False)
688
+ gr.Markdown(version)
689
+ model_type.change(fn = change_visiale_by_model_type , inputs = model_type, outputs=[control_image_input,style_strength_ratio,Ip_Adapter_Strength])
690
+ files.upload(fn=swap_to_gallery, inputs=files, outputs=[uploaded_files, clear_button, files])
691
+ remove_and_reupload.click(fn=remove_back_to_files, outputs=[uploaded_files, clear_button, files])
692
+
693
+ final_run_btn.click(fn=set_text_unfinished, outputs = generated_information
694
+ ).then(process_generation, inputs=[sd_type,model_type,files, num_steps,style, Ip_Adapter_Strength,style_strength_ratio, guidance_scale, seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt, prompt_array,G_height,G_width,comic_type], outputs=out_image
695
+ ).then(fn=set_text_finished,outputs = generated_information)
696
+
697
+
698
+ gr.Examples(
699
+ examples=[
700
+ [1,0.5,0.5,3,"a woman img, wearing a white T-shirt, blue loose hair",
701
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
702
+ array2string(["wake up in the bed",
703
+ "have breakfast",
704
+ "is on the road, go to company",
705
+ "work in the company",
706
+ "Take a walk next to the company at noon",
707
+ "lying in bed at night"]),
708
+ "Japanese Anime", "Using Ref Images",get_image_path_list('./examples/taylor'),768,768
709
+ ],
710
+ [0,0.5,0.5,2,"a man, wearing black jacket",
711
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
712
+ array2string(["wake up in the bed",
713
+ "have breakfast",
714
+ "is on the road, go to the company, close look",
715
+ "work in the company",
716
+ "laughing happily",
717
+ "lying in bed at night"
718
+ ]),
719
+ "Japanese Anime","Only Using Textual Description",get_image_path_list('./examples/taylor'),768,768
720
+ ],
721
+ [0,0.3,0.5,2,"a girl, wearing white shirt, black skirt, black tie, yellow hair",
722
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
723
+ array2string([
724
+ "at home #at home, began to go to drawing",
725
+ "sitting alone on a park bench.",
726
+ "reading a book on a park bench.",
727
+ "[NC]A squirrel approaches, peeking over the bench. ",
728
+ "look around in the park. # She looks around and enjoys the beauty of nature.",
729
+ "[NC]leaf falls from the tree, landing on the sketchbook.",
730
+ "picks up the leaf, examining its details closely.",
731
+ "starts sketching the leaf with intricate lines.",
732
+ "holds up the sketch drawing of the leaf.",
733
+ "[NC]The brown squirrel appear.",
734
+ "is very happy # She is very happy to see the squirrel again",
735
+ "[NC]The brown squirrel takes the cracker and scampers up a tree. # She gives the squirrel cracker",
736
+ "laughs and tucks the leaf into her book as a keepsake.",
737
+ "ready to leave.",]),
738
+ "Japanese Anime","Only Using Textual Description",get_image_path_list('./examples/taylor'),768,768
739
+ ]
740
+ ],
741
+ inputs=[seed_, sa32_, sa64_, id_length_, general_prompt, negative_prompt, prompt_array,style,model_type,files,G_height,G_width],
742
+ # outputs=[post_sketch, binary_matrixes, *color_row, *colors, *prompts, gen_prompt_vis, general_prompt, seed_],
743
+ # run_on_click=True,
744
+ label='😺 Examples 😺',
745
+ )
746
+ gr.Markdown(article)
747
+
748
+ # demo.load(None, None, None, _js=load_js)
749
+
750
+ demo.launch(server_name="0.0.0.0", share = True if use_va else False)
cog.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for Cog ⚙️
2
+ # Reference: https://cog.run/yaml
3
+
4
+ build:
5
+ gpu: true
6
+ system_packages:
7
+ - "libgl1-mesa-glx"
8
+ - "libglib2.0-0"
9
+ python_version: "3.11"
10
+ python_packages:
11
+ - xformers==0.0.20
12
+ - torch==2.0.1
13
+ - torchvision==0.15.2
14
+ - diffusers==0.25.0
15
+ - transformers==4.36.2
16
+ - gradio==3.48.0
17
+ - accelerate
18
+ - safetensors
19
+ - peft
20
+ - Pillow==9.5.0
21
+ run:
22
+ - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
23
+ predict: "predict.py:Predictor"
gradio_app_sdxl_specific_id_low_vram copy.py ADDED
@@ -0,0 +1,1453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from this import d
2
+ import gradio as gr
3
+ import numpy as np
4
+ import torch
5
+ import gc
6
+ import copy
7
+ import os
8
+ import random
9
+ import datetime
10
+ from PIL import ImageFont
11
+ from utils.gradio_utils import (
12
+ character_to_dict,
13
+ process_original_prompt,
14
+ get_ref_character,
15
+ cal_attn_mask_xl,
16
+ cal_attn_indice_xl_effcient_memory,
17
+ is_torch2_available,
18
+ )
19
+
20
+ import os
21
+ os.environ['GPU_PLATFORM_ID'] = '0'
22
+ os.environ['GPU_DEVICE_ID'] = '0'
23
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
24
+
25
+
26
+ import os
27
+ os.environ['HF_ENDPOINT']= 'https://hf-mirror.com'
28
+ torch.backends.cudnn.enabled = True
29
+
30
+ if is_torch2_available():
31
+ from utils.gradio_utils import AttnProcessor2_0 as AttnProcessor
32
+ else:
33
+ from utils.gradio_utils import AttnProcessor
34
+ from huggingface_hub import hf_hub_download
35
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
36
+ StableDiffusionXLPipeline,
37
+ )
38
+ from diffusers.schedulers.scheduling_ddim import DDIMScheduler
39
+ import torch.nn.functional as F
40
+ from diffusers.utils.loading_utils import load_image
41
+ from utils.utils import get_comic
42
+ from utils.style_template import styles
43
+ from utils.load_models_utils import get_models_dict, load_models
44
+
45
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '5'
46
+
47
+ STYLE_NAMES = list(styles.keys())
48
+ DEFAULT_STYLE_NAME = "日本漫画风"
49
+ global models_dict
50
+
51
+ models_dict = get_models_dict()
52
+
53
+ # Automatically select the device
54
+ device = (
55
+ "cuda:0"
56
+ if torch.cuda.is_available()
57
+ else "mps" if torch.backends.mps.is_available() else "cpu"
58
+ )
59
+
60
+ # device = "cpu"
61
+
62
+ # torch.cuda.set_device(5)
63
+ print(f"@@device:{device}")
64
+
65
+
66
+ # check if the file exists locally at a specified path before downloading it.
67
+ # if the file doesn't exist, it uses `hf_hub_download` to download the file
68
+ # and optionally move it to a specific directory. If the file already
69
+ # exists, it simply uses the local path.
70
+ local_dir = "data/"
71
+ photomaker_local_path = f"{local_dir}photomaker-v1.bin"
72
+ if not os.path.exists(photomaker_local_path):
73
+ photomaker_path = hf_hub_download(
74
+ repo_id="TencentARC/PhotoMaker",
75
+ filename="photomaker-v1.bin",
76
+ repo_type="model",
77
+ local_dir=local_dir,
78
+ )
79
+ else:
80
+ photomaker_path = photomaker_local_path
81
+
82
+ MAX_SEED = np.iinfo(np.int32).max
83
+
84
+
85
def setup_seed(seed):
    """Seed the PyTorch, NumPy and Python RNGs and make cuDNN deterministic.

    Args:
        seed: integer seed applied to every generator.
    """
    torch.manual_seed(seed)
    # The module-level device string is "cuda:0" when CUDA is present, so the
    # previous `device == "cuda"` comparison never matched; ask torch directly.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
92
+
93
+
94
def set_text_unfinished():
    """Return a Gradio update that shows the "generation in progress" banner."""
    banner_html = "<h3>正在生成中......</h3>"
    return gr.update(value=banner_html, visible=True)
99
+
100
+
101
def set_text_finished():
    """Return a Gradio update that shows the "generation complete" banner."""
    banner_html = "<h3>生成完成!</h3>"
    return gr.update(value=banner_html, visible=True)
103
+
104
+
105
+ #################################################
106
def get_image_path_list(folder_name):
    """Return the sorted paths of every entry directly inside ``folder_name``.

    Each entry name from ``os.listdir`` is joined onto ``folder_name``; no
    filtering by file type is done.
    """
    return sorted(
        os.path.join(folder_name, entry) for entry in os.listdir(folder_name)
    )
112
+
113
+
114
+ #################################################
115
class SpatialAttnProcessor2_0(torch.nn.Module):
    r"""
    Self-attention processor that banks image tokens per character and denoising
    step, and later appends them to the key/value tokens of other generations
    (requires PyTorch 2.0 `F.scaled_dot_product_attention`).

    The processor is driven by module-level globals rather than by call
    arguments: `write`, `cur_step`, `attn_count`, `total_count`,
    `cur_character`, `sa32`, `sa64`, `height` and `width` are read in
    `__call__`, and `attn_count`, `cur_step`, `indices1024` and `indices4096`
    are updated there.

    Args:
        hidden_size (`int`, *optional*):
            Stored on the instance; not read anywhere else in this class.
        cross_attention_dim (`int`, *optional*):
            Stored on the instance; not read anywhere else in this class.
        id_length (`int`, defaults to 4):
            Stored as `self.id_length`; `self.total_length` is set to
            `id_length + 1`. Both are passed to
            `cal_attn_indice_xl_effcient_memory`.
        device:
            Device that banked tensors are moved to before reuse; also passed
            to `cal_attn_indice_xl_effcient_memory`. Defaults to the
            module-level `device`.
        dtype (`torch.dtype`, defaults to `torch.float16`):
            Passed to `cal_attn_indice_xl_effcient_memory`.
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
        id_length=4,
        device=device,
        dtype=torch.float16,
    ):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )
        self.device = device
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.total_length = id_length + 1
        self.id_length = id_length
        # Maps character -> {denoising step -> list of banked token tensors}.
        self.id_bank = {}

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Run one self-attention call, banking or re-injecting shared tokens.

        Args:
            attn: Attention module whose projections are used by `__call2__`.
            hidden_states: 3-D tensor unpacked as
                `(total_batch_size, nums_token, channel)`.
            encoder_hidden_states: Accepted for interface compatibility; the
                value passed in is never read here.
            attention_mask: Forwarded only on the plain self-attention paths.
            temb: Forwarded to `__call2__`.

        Returns:
            The attention output, reshaped back to
            `(total_batch_size, nums_token, channel)`.
        """
        global total_count, attn_count, cur_step, indices1024, indices4096
        global sa32, sa64
        global write
        global height, width
        global character_dict, character_index_dict, invert_character_index_dict, cur_character, ref_indexs_dict, ref_totals, cur_character
        # First attention call of the first step: compute the token indices.
        if attn_count == 0 and cur_step == 0:
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )
        if write:
            # Write mode banks tokens for exactly one character at a time.
            assert len(cur_character) == 1
            # Pick the index set matching this layer's token count.
            if hidden_states.shape[1] == (height // 32) * (width // 32):
                indices = indices1024
            else:
                indices = indices4096
            total_batch_size, nums_token, channel = hidden_states.shape
            # The batch is treated as two stacked halves of `img_nums` images
            # each — presumably the unconditional/conditional halves of
            # classifier-free guidance; TODO confirm against the pipeline.
            img_nums = total_batch_size // 2
            hidden_states = hidden_states.reshape(-1, img_nums, nums_token, channel)
            if cur_character[0] not in self.id_bank:
                self.id_bank[cur_character[0]] = {}
            # Bank a copy of the selected tokens of every image for this step.
            self.id_bank[cur_character[0]][cur_step] = [
                hidden_states[:, img_ind, indices[img_ind], :]
                .reshape(2, -1, channel)
                .clone()
                for img_ind in range(img_nums)
            ]
            hidden_states = hidden_states.reshape(-1, nums_token, channel)
        else:
            # Read mode: collect the banked tokens of every referenced
            # character for the current step.
            # TODO: ADD Multipersion Control
            encoder_arr = []
            for character in cur_character:
                encoder_arr = encoder_arr + [
                    tensor.to(self.device)
                    for tensor in self.id_bank[character][cur_step]
                ]
        # Step 0 always uses plain self-attention.
        if cur_step < 1:
            hidden_states = self.__call2__(
                attn, hidden_states, None, attention_mask, temb
            )
        else:  # 256 1024 4096
            # Later steps use the shared tokens only when a random draw
            # exceeds the threshold (0.3 before step 20, 0.1 afterwards).
            random_number = random.random()
            if cur_step < 20:
                rand_num = 0.3
            else:
                rand_num = 0.1
            if random_number > rand_num:
                if hidden_states.shape[1] == (height // 32) * (width // 32):
                    indices = indices1024
                else:
                    indices = indices4096
                if write:
                    total_batch_size, nums_token, channel = hidden_states.shape
                    img_nums = total_batch_size // 2
                    hidden_states = hidden_states.reshape(
                        -1, img_nums, nums_token, channel
                    )
                    # Selected tokens of each image in the current batch
                    # (taken from the live tensor, not from the bank).
                    encoder_arr = [
                        hidden_states[:, img_ind, indices[img_ind], :].reshape(
                            2, -1, channel
                        )
                        for img_ind in range(img_nums)
                    ]
                    for img_ind in range(img_nums):
                        # Every image attends to its own tokens plus the
                        # selected tokens of all *other* images in the batch.
                        img_ind_list = [i for i in range(img_nums)]
                        img_ind_list.remove(img_ind)
                        encoder_hidden_states_tmp = torch.cat(
                            [encoder_arr[img_ind] for img_ind in img_ind_list]
                            + [hidden_states[:, img_ind, :, :]],
                            dim=1,
                        )

                        hidden_states[:, img_ind, :, :] = self.__call2__(
                            attn,
                            hidden_states[:, img_ind, :, :],
                            encoder_hidden_states_tmp,
                            None,
                            temb,
                        )
                else:
                    _, nums_token, channel = hidden_states.shape
                    hidden_states = hidden_states.reshape(2, -1, nums_token, channel)
                    # Only image 0 of each half is processed here; it attends
                    # to the banked tokens followed by its own tokens.
                    encoder_hidden_states_tmp = torch.cat(
                        encoder_arr + [hidden_states[:, 0, :, :]], dim=1
                    )
                    hidden_states[:, 0, :, :] = self.__call2__(
                        attn,
                        hidden_states[:, 0, :, :],
                        encoder_hidden_states_tmp,
                        None,
                        temb,
                    )
                hidden_states = hidden_states.reshape(-1, nums_token, channel)
            else:
                hidden_states = self.__call2__(
                    attn, hidden_states, None, attention_mask, temb
                )
        # Once `total_count` calls have been seen, advance the global step
        # counter and draw fresh token indices for the next step.
        attn_count += 1
        if attn_count == total_count:
            attn_count = 0
            cur_step += 1
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )

        return hidden_states

    def __call2__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Scaled-dot-product attention using the projections of `attn`.

        Args:
            attn: Attention module providing `to_q`, `to_k`, `to_v`, `to_out`,
                `heads`, the optional norms and the residual/rescale settings.
            hidden_states: Query source; either 3-D `(batch, tokens, channel)`
                or 4-D `(batch, channel, height, width)`.
            encoder_hidden_states: Key/value source; when `None` the query
                source is used.
            attention_mask: Optional mask, prepared via
                `attn.prepare_attention_mask`.
            temb: Passed to `attn.spatial_norm` when that norm is set.

        Returns:
            The attention output in the same layout as the input.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        # Flatten a 4-D feature map to a token sequence (local names shadow
        # the module-level `height` / `width` inside this method only).
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        batch_size, sequence_length, channel = hidden_states.shape
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(
                attention_mask, sequence_length, batch_size
            )
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(
                batch_size, attn.heads, -1, attention_mask.shape[-1]
            )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split the channel dimension into heads: (batch, heads, tokens, head_dim).
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge the heads back into the channel dimension.
        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
383
+
384
+
385
def set_attention_processor(unet, id_length, is_ipadapter=False):
    """Install a fresh set of attention processors on ``unet``.

    The module-level ``attn_procs`` dict is rebuilt from scratch and a deep
    copy of it is handed to ``unet.set_attn_processor``.

    Args:
        unet: Model exposing ``attn_processors``, ``config``, ``device`` and
            ``set_attn_processor``.
        id_length: Forwarded to every ``SpatialAttnProcessor2_0`` created.
        is_ipadapter: When true, layers that have a cross-attention dimension
            get an ``IPAttnProcessor2_0`` instead of a plain ``AttnProcessor``.
    """
    global attn_procs
    attn_procs = {}
    for proc_name in unet.attn_processors.keys():
        # "attn1" layers are self-attention and carry no cross-attention dim.
        if proc_name.endswith("attn1.processor"):
            cross_attention_dim = None
        else:
            cross_attention_dim = unet.config.cross_attention_dim

        # Channel width of the block this processor lives in.
        if proc_name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif proc_name.startswith("up_blocks"):
            stage = int(proc_name[len("up_blocks.")])
            hidden_size = unet.config.block_out_channels[::-1][stage]
        elif proc_name.startswith("down_blocks"):
            stage = int(proc_name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[stage]

        if cross_attention_dim is None and proc_name.startswith("up_blocks"):
            # Only self-attention layers of the up blocks share tokens.
            processor = SpatialAttnProcessor2_0(id_length=id_length)
        elif cross_attention_dim is not None and is_ipadapter:
            processor = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=1,
                num_tokens=4,
            ).to(unet.device, dtype=torch.float16)
        else:
            processor = AttnProcessor()
        attn_procs[proc_name] = processor

    unet.set_attn_processor(copy.deepcopy(attn_procs))
419
+
420
+
421
+ #################################################
422
+ #################################################
423
# HTML placeholder with id "canvas-root"; the JavaScript below looks the
# element up by that id.
canvas_html = "<div id='canvas-root' style='max-width:400px; margin: 0 auto'></div>"
# JavaScript snippet: downloads the sketch-canvas script as text and injects
# it into the page head as a module <script> backed by a Blob URL.
load_js = """
async () => {
const url = "https://huggingface.co/datasets/radames/gradio-components/raw/main/sketch-canvas.js"
fetch(url)
.then(res => res.text())
.then(text => {
const script = document.createElement('script');
script.type = "module"
script.src = URL.createObjectURL(new Blob([text], { type: 'application/javascript' }));
document.head.appendChild(script);
});
}
"""

# JavaScript snippet: returns the `_data` property of the "canvas-root"
# element wrapped in a one-element array.
get_js_colors = """
async (canvasData) => {
const canvasEl = document.getElementById("canvas-root");
return [canvasEl._data]
}
"""
444
+
445
# Custom stylesheet passed to ``gr.Blocks(css=css)``. It must contain CSS
# rules only; the stray ``<style>`` tag that used to end this string is HTML,
# not CSS, and has been removed.
css = """
#color-bg{display:flex;justify-content: center;align-items: center;}
.color-bg-item{width: 100%; height: 32px}
#main_button{width:100%}
"""
451
+
452
+
453
def save_single_character_weights(unet, character, description, filepath):
    """Write one character's banked attention tokens to ``filepath``.

    The saved object is a dict holding ``"description"``, ``"character"`` and,
    for every ``SpatialAttnProcessor2_0`` on ``unet``, an entry keyed by the
    processor name that maps each step key to a list of CPU tensors.

    Args:
        unet: Model whose ``attn_processors`` are inspected.
        character: Key into each processor's ``id_bank``.
        description: Stored verbatim under ``"description"``.
        filepath: Destination handed to ``torch.save``.
    """
    payload = {"description": description, "character": character}
    for proc_name, processor in unet.attn_processors.items():
        if not isinstance(processor, SpatialAttnProcessor2_0):
            continue
        # Move every banked tensor to the CPU before serialising it.
        payload[proc_name] = {
            step: [banked.cpu() for banked in tensors]
            for step, tensors in processor.id_bank[character].items()
        }
    torch.save(payload, filepath)
474
+
475
+
476
def load_single_character_weights(unet, filepath):
    """Load one character's banked attention tokens from ``filepath``.

    The file is expected to have the layout written by
    ``save_single_character_weights``: a dict with a ``"character"`` entry
    plus one entry per ``SpatialAttnProcessor2_0`` name, each mapping a step
    key to a list of tensors. For every such processor on ``unet`` the
    character's ``id_bank`` entry is replaced by the loaded tensors, moved to
    ``unet.device``.

    Args:
        unet: Model whose ``SpatialAttnProcessor2_0`` processors are filled.
        filepath: Path of the file to read.

    Raises:
        KeyError: If the file lacks ``"character"`` or an entry for one of the
            model's ``SpatialAttnProcessor2_0`` processors.
    """
    # NOTE(security): torch.load unpickles the file, and the path can come
    # from user input. Only load character files from trusted sources
    # (consider ``weights_only=True`` where the installed PyTorch supports it).
    weights_to_load = torch.load(filepath, map_location=torch.device("cpu"))
    character = weights_to_load["character"]
    for attn_name, attn_processor in unet.attn_processors.items():
        if isinstance(attn_processor, SpatialAttnProcessor2_0):
            attn_processor.id_bank[character] = {
                step_key: [tensor.to(unet.device) for tensor in tensors]
                for step_key, tensors in weights_to_load[attn_name].items()
            }
496
+
497
+
498
def save_results(unet, img_list):
    """Save generated images under ``results/<timestamp>/``.

    A ``results/<YYYYmmdd-HHMMSS>/weights`` directory is created (including
    its parents) and every image is written next to it as ``image_<idx>.png``.

    Args:
        unet: Currently unused; per-character weight export is disabled.
        img_list: Iterable of objects exposing ``save(path)``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    folder_name = f"results/{timestamp}"
    weight_folder_name = f"{folder_name}/weights"
    # Creating the nested folder also creates its parent. ``exist_ok`` avoids
    # the check-then-create race and guarantees ``weights`` exists even when
    # the timestamp folder was already there (two runs in the same second).
    os.makedirs(weight_folder_name, exist_ok=True)

    for idx, img in enumerate(img_list):
        file_path = os.path.join(folder_name, f"image_{idx}.png")
        img.save(file_path)
515
+
516
+
517
+ #################################################
518
# Page heading; rendered with gr.Markdown(title) when the UI is built.
# (A plain, unstyled <h1> variant of this heading was used previously.)
title = r"""
<h1 align="center" style="font-family: 'Comic Sans MS', 'Orbitron', sans-serif; color: #00ccff; text-shadow: 2px 2px 4px #000000; font-size: 48px;">
🧠✨ 我的AI研学旅记 🚀🤖
</h1>
"""


# Text shown below the heading via gr.Markdown(description). It is currently
# a single newline; the upstream StoryDiffusion text is kept below for
# reference.
description = r"""
"""
# <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'><b>StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</b></a>.<br>
# ❗️❗️❗️[<b>Important</b>] Personalization steps:<br>
# 1️⃣ Enter a Textual Description for Character, if you add the Ref-Image, making sure to <b>follow the class word</b> you want to customize with the <b>trigger word</b>: `img`, such as: `man img` or `woman img` or `girl img`.<br>
# 2️⃣ Enter the prompt array, each line corresponds to one generated image.<br>
# 3️⃣ Choose your preferred style template.<br>
# 4️⃣ Click the <b>Submit</b> button to start customizing.


# Footer text; currently a single newline. The upstream StoryDiffusion footer
# is kept below for reference.
article = r"""
"""
# If StoryDiffusion is helpful, please help to ⭐ the <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'>Github Repo</a>. Thanks!
# [![GitHub Stars](https://img.shields.io/github/stars/HVision-NKU/StoryDiffusion?style=social)](https://github.com/HVision-NKU/StoryDiffusion)
# ---
# 📝 **Citation**
# <br>
# If our work is useful for your research, please consider citing:

# ```bibtex
# @article{Zhou2024storydiffusion,
# title={StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation},
# author={Zhou, Yupeng and Zhou, Daquan and Cheng, Ming-Ming and Feng, Jiashi and Hou, Qibin},
# year={2024}
# }
# ```
# 📋 **License**
# <br>
# Apache-2.0 LICENSE.

# 📧 **Contact**
# <br>
# If you have any questions, please feel free to reach out to me at <b>ypzhousdu@gmail.com</b>.

# Version notes; currently a single newline. The upstream StoryDiffusion
# notes are kept below for reference.
version = r"""
"""
# <h3 align="center">StoryDiffusion Version 0.02 (test version)</h3>

# <h5 >1. Support image ref image. (Cartoon ref images are not supported yet)</h5>
# <h5 >2. Support Typesetting Style and Captioning.(By default, the prompt is used as the caption for each image. If you need to change the caption, add a # at the end of each line. Only the part after the # will be added as a caption to the image.)</h5>
# <h5 >3. [NC]symbol (The [NC] symbol is used as a flag to indicate that no characters should be present in the generated scene images. If you want to do that, prepend the "[NC]" at the beginning of the line. For example, to generate a scene of falling leaves without any character, write: "[NC] The leaves are falling.")</h5>
# <h5 align="center">Tips: </h4>
571
+
572
+
573
+
574
+
575
+
576
+
577
+ #################################################
578
# ---------------------------------------------------------------------------
# Import-time initialisation: shared mutable state, the default SDXL
# pipeline and the paired self-attention processors.
#
# A `global` statement at module level does not change name binding; the
# statements below only list the names that functions in this file share.
# ---------------------------------------------------------------------------
global attn_count, total_count, id_length, total_length, cur_step, cur_model_type
global write
global sa32, sa64
global height, width
# Counters read and updated by SpatialAttnProcessor2_0.__call__:
# `attn_count` counts processor calls, `cur_step` advances each time
# `attn_count` reaches `total_count`.
attn_count = 0
total_count = 0
cur_step = 0
id_length = 4
total_length = 5
cur_model_type = ""
global attn_procs, unet
attn_procs = {}
# `write` selects whether the processors bank tokens (True) or reuse banked
# tokens (False).
write = False
# Values forwarded to cal_attn_indice_xl_effcient_memory / cal_attn_mask_xl,
# together with the image size.
sa32 = 0.5
sa64 = 0.5
height = 768
width = 768
###
global pipe
global sd_model_path
pipe = None
sd_model_path = models_dict["Unstable"]["path"]  # "SG161222/RealVisXL_V4.0"
single_files = models_dict["Unstable"]["single_files"]
# Load the Stable Diffusion XL pipeline, either from a single checkpoint
# file or from a pretrained model directory / hub id.
if single_files:
    pipe = StableDiffusionXLPipeline.from_single_file(
        sd_model_path, torch_dtype=torch.float16
    )
else:
    pipe = StableDiffusionXLPipeline.from_pretrained(
        sd_model_path, torch_dtype=torch.float16, use_safetensors=False
    )
print("pipE.device = ", device)
pipe = pipe.to(device)
pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
# pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.scheduler.set_timesteps(50)
pipe.enable_vae_slicing()
if device != "mps":
    pipe.enable_model_cpu_offload()
unet = pipe.unet
cur_model_type = "Unstable" + "-" + "original"
# Insert paired self-attention: self-attention ("attn1") layers inside the up
# blocks get a SpatialAttnProcessor2_0, every other layer a plain
# AttnProcessor. `total_count` ends up as the number of paired processors.
for name in unet.attn_processors.keys():
    cross_attention_dim = (
        None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
    )
    # NOTE(review): `hidden_size` is computed but not used in this loop.
    if name.startswith("mid_block"):
        hidden_size = unet.config.block_out_channels[-1]
    elif name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    elif name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        hidden_size = unet.config.block_out_channels[block_id]
    if cross_attention_dim is None and (name.startswith("up_blocks")):
        attn_procs[name] = SpatialAttnProcessor2_0(id_length=id_length)
        total_count += 1
    else:
        attn_procs[name] = AttnProcessor()
print("successsfully load paired self-attention")
print(f"number of the processor : {total_count}")
# The model receives a deep copy, so the processors in `attn_procs` are not
# the instances that actually run.
unet.set_attn_processor(copy.deepcopy(attn_procs))
global mask1024, mask4096
mask1024, mask4096 = cal_attn_mask_xl(
    total_length,
    id_length,
    sa32,
    sa64,
    height,
    width,
    device=device,
    dtype=torch.float16,
)
654
+
655
+ ######### Gradio Function #############
656
+
657
+
658
+
659
+
660
def swap_to_gallery(images):
    """Build the three Gradio updates that switch the UI to the gallery view.

    Args:
        images: Value assigned to the first output component.

    Returns:
        A 3-tuple of ``gr.update`` results: the first shows its component with
        ``images`` as value, the second shows its component, the third hides
        its component.
    """
    gallery_update = gr.update(value=images, visible=True)
    second_update = gr.update(visible=True)
    third_update = gr.update(visible=False)
    return gallery_update, second_update, third_update
666
+
667
+
668
def upload_example_to_gallery(images, prompt, style, negative_prompt):
    """Build the Gradio updates that show example images in the gallery.

    Args:
        images: Value assigned to the first output component.
        prompt: Unused; accepted so the signature matches the example inputs.
        style: Unused.
        negative_prompt: Unused.

    Returns:
        A 3-tuple of ``gr.update`` results: the first shows its component with
        ``images`` as value, the second shows its component, the third hides
        its component.
    """
    gallery_update = gr.update(value=images, visible=True)
    second_update = gr.update(visible=True)
    third_update = gr.update(visible=False)
    return gallery_update, second_update, third_update
674
+
675
+
676
def remove_back_to_files():
    """Build the Gradio updates that switch the UI back to the file picker.

    Returns:
        A 3-tuple of ``gr.update`` results: the first two hide their
        components, the third shows its component.
    """
    first_hidden = gr.update(visible=False)
    second_hidden = gr.update(visible=False)
    shown = gr.update(visible=True)
    return first_hidden, second_hidden, shown
678
+
679
+
680
def remove_tips():
    """Return the Gradio update that hides its target component."""
    hide_update = gr.update(visible=False)
    return hide_update
682
+
683
+
684
def apply_style_positive(style_name: str, positive: str):
    """Insert ``positive`` into the positive template of a style.

    Args:
        style_name: Key into ``styles``; unknown names fall back to
            ``DEFAULT_STYLE_NAME``.
        positive: Text substituted for every ``{prompt}`` placeholder.

    Returns:
        The positive template with the placeholder replaced.
    """
    fallback_style = styles[DEFAULT_STYLE_NAME]
    positive_template, _negative_template = styles.get(style_name, fallback_style)
    return positive_template.replace("{prompt}", positive)
687
+
688
+
689
def apply_style(style_name: str, positives: list, negative: str = ""):
    """Apply a style template to several positive prompts and one negative.

    Args:
        style_name: Key into ``styles``; unknown names fall back to
            ``DEFAULT_STYLE_NAME``.
        positives: Prompts substituted for the ``{prompt}`` placeholder.
        negative: Text appended, after a single space, to the style's
            negative prompt.

    Returns:
        A ``(styled_positives, styled_negative)`` tuple.
    """
    fallback_style = styles[DEFAULT_STYLE_NAME]
    positive_template, negative_template = styles.get(style_name, fallback_style)
    styled_positives = []
    for prompt_text in positives:
        styled_positives.append(positive_template.replace("{prompt}", prompt_text))
    styled_negative = negative_template + " " + negative
    return styled_positives, styled_negative
694
+
695
+
696
def change_visiale_by_model_type(_model_type):
    """Return visibility updates for three components based on the control mode.

    The ``model_type`` radio in this file offers the Chinese labels
    "仅使用文本描述" / "使用参考图像"; the original English labels are still
    accepted so existing callers keep working.

    Args:
        _model_type: Label of the selected control mode.

    Returns:
        A 3-tuple of ``gr.update`` results. Text-only mode hides all three
        target components; reference-image mode shows the first two and hides
        the third.

    Raises:
        ValueError: If ``_model_type`` is not a known label.
    """
    text_only_labels = ("Only Using Textual Description", "仅使用文本描述")
    ref_image_labels = ("Using Ref Images", "使用参考图像")
    if _model_type in text_only_labels:
        return (
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )
    if _model_type in ref_image_labels:
        return (
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=False),
        )
    raise ValueError("Invalid model type", _model_type)
711
+
712
+
713
def load_character_files(character_files: str):
    """Read character files and return their descriptions as prompt text.

    Args:
        character_files: Newline-separated file paths. Blank lines and
            surrounding whitespace are ignored.

    Returns:
        One line per file, each the file's ``"character"`` value immediately
        followed by its ``"description"`` value, joined with ``"\\n"``.

    Raises:
        gr.Error: If no path is given.
    """
    # Ignore blank lines so a trailing newline or an empty row in the textbox
    # is not handed to torch.load as an (invalid) empty path.
    paths = [line.strip() for line in character_files.splitlines() if line.strip()]
    if not paths:
        raise gr.Error("Please set a character file!")
    primarytext = []
    for character_file_name in paths:
        # NOTE(security): torch.load unpickles the file and the path comes
        # from user input. Only load character files from trusted sources.
        character_file = torch.load(
            character_file_name, map_location=torch.device("cpu")
        )
        primarytext.append(character_file["character"] + character_file["description"])
    return "\n".join(primarytext)
724
+
725
+
726
def load_character_files_on_running(unet, character_files: str):
    """Load banked character weights into ``unet`` before generation.

    Args:
        unet: Model passed to ``load_single_character_weights``.
        character_files: Newline-separated file paths. Blank lines and
            surrounding whitespace are ignored.

    Returns:
        ``False`` when no path is given (nothing is loaded), ``True`` after
        every listed file has been loaded.
    """
    # An input holding only whitespace or newlines counts as "no files"
    # rather than being passed on as an empty path.
    paths = [line.strip() for line in character_files.splitlines() if line.strip()]
    if not paths:
        return False
    for character_file in paths:
        load_single_character_weights(unet, character_file)
    return True
733
+
734
+
735
+ ######### Image Generation ##############
736
def process_generation(
    _sd_type,
    _model_type,
    _upload_images,
    _num_steps,
    style_name,
    _Ip_Adapter_Strength,
    _style_strength_ratio,
    guidance_scale,
    seed_,
    sa32_,
    sa64_,
    id_length_,
    general_prompt,
    negative_prompt,
    prompt_array,
    G_height,
    G_width,
    _comic_type,
    font_choice,
    _char_files,
):
    """Generate the images for a story and yield intermediate results.

    This is a generator. Unless character files are loaded, it first runs a
    "write" pass per character that banks attention tokens, then renders the
    remaining prompts one by one reusing those tokens. After every pipeline
    call it yields the images produced so far. Finally it yields the full
    result list, preceded by the typeset comic pages when typesetting is
    enabled.

    Args:
        _sd_type: Key into ``models_dict`` selecting the base model.
        _model_type: Control-mode label from the UI. The reference-image
            labels ("Using Ref Images" / "使用参考图像") select Photomaker;
            anything else selects the plain text-driven pipeline.
        _upload_images: Reference images, one per character (Photomaker only).
        _num_steps: Number of inference steps.
        style_name: Style template name passed to the ``apply_style*`` helpers.
        _Ip_Adapter_Strength: Unused.
        _style_strength_ratio: Percentage of ``_num_steps`` used as
            ``start_merge_step`` (capped at 30).
        guidance_scale: Guidance scale passed to the pipeline.
        seed_: Seed for ``setup_seed`` and the torch generator.
        sa32_: Stored in the module-level ``sa32``.
        sa64_: Stored in the module-level ``sa64``.
        id_length_: Number of identity prompts; stored in the module-level
            ``id_length`` and on every ``SpatialAttnProcessor2_0``.
        general_prompt: Character descriptions, one per line (at most five).
        negative_prompt: User negative prompt.
        prompt_array: One prompt per line. ``[NC]`` marks a line, and text
            after the last ``#`` is used as the caption.
        G_height: Image height.
        G_width: Image width.
        _comic_type: Typesetting style label. "No typesetting (default)" and
            "默认" disable typesetting.
        font_choice: File name of the caption font inside ``fonts/``.
        _char_files: Newline-separated character weight files; when given,
            the write pass is skipped.

    Yields:
        Lists of generated images.

    Raises:
        gr.Error: On invalid input (too many characters, missing trigger
            word, missing or mismatched reference images, ``[NC]`` inside the
            identity prompts, several characters in one Photomaker prompt).
        NotImplementedError: If the resolved model type is unknown.
    """
    global sa32, sa64, id_length, total_length, attn_procs, unet, cur_model_type
    global write
    global cur_step, attn_count
    global height, width
    global pipe
    global sd_model_path, models_dict
    global character_dict, character_index_dict, invert_character_index_dict
    global ref_indexs_dict, ref_totals
    global cur_character

    # ------------------------------------------------------------------
    # Input validation
    # ------------------------------------------------------------------
    if len(general_prompt.splitlines()) > 5:
        # The message now states the limit that is actually enforced.
        raise gr.Error(
            "Support for more than five characters is temporarily unavailable due to VRAM limitations, but this issue will be resolved soon."
        )
    # The UI radio uses Chinese labels; the English label is kept for
    # backward compatibility.
    ref_image_labels = ("Using Ref Images", "使用参考图像")
    _model_type = "Photomaker" if _model_type in ref_image_labels else "original"
    if _model_type == "Photomaker" and "img" not in general_prompt:
        raise gr.Error(
            'Please add the triger word " img " behind the class word you want to customize, such as: man img or woman img'
        )
    if _upload_images is None and _model_type != "original":
        raise gr.Error("Cannot find any input face image!")

    height = G_height
    width = G_width
    sd_model_path = models_dict[_sd_type]
    # Publish the requested id length *before* resetting the processors so
    # they do not keep the value of the previous run.
    id_length = id_length_

    # ------------------------------------------------------------------
    # Reset the token banks of the currently installed processors
    # ------------------------------------------------------------------
    for attn_processor in pipe.unet.attn_processors.values():
        if isinstance(attn_processor, SpatialAttnProcessor2_0):
            attn_processor.id_bank = {}
            attn_processor.id_length = id_length
            attn_processor.total_length = id_length + 1
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    # ------------------------------------------------------------------
    # (Re)load the pipeline when the model or control mode changed
    # ------------------------------------------------------------------
    if cur_model_type != _sd_type + "-" + _model_type:
        del pipe
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
        model_info = models_dict[_sd_type]
        model_info["model_type"] = _model_type
        print("device = ", device)
        pipe = load_models(model_info, device=device, photomaker_path=photomaker_path)
        # NOTE(review): `total_count` is not recomputed here; this assumes the
        # reloaded model has as many paired processors as the initial one.
        set_attention_processor(pipe.unet, id_length_, is_ipadapter=False)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
        cur_model_type = _sd_type + "-" + _model_type
        pipe.enable_vae_slicing()
        if device != "mps":
            pipe.enable_model_cpu_offload()
    # Always point the shared `unet` at the active pipeline; after a reload
    # the old reference would otherwise belong to the deleted pipeline.
    unet = pipe.unet

    load_chars = load_character_files_on_running(unet, character_files=_char_files)

    # ------------------------------------------------------------------
    # Prompt parsing
    # ------------------------------------------------------------------
    prompts = prompt_array.splitlines()
    character_dict, character_list = character_to_dict(general_prompt)

    start_merge_step = int(float(_style_strength_ratio) / 100 * _num_steps)
    if start_merge_step > 30:
        start_merge_step = 30
    print(f"start_merge_step:{start_merge_step}")
    generator = torch.Generator(device=device).manual_seed(seed_)
    sa32, sa64 = sa32_, sa64_
    clipped_prompts = prompts[:]
    # Indices of prompts flagged with "[NC]"; the identity prompts (the first
    # `id_length` rows) must not carry the flag.
    nc_indexs = []
    for ind, prompt in enumerate(clipped_prompts):
        if "[NC]" in prompt:
            nc_indexs.append(ind)
            if ind < id_length:
                raise gr.Error(
                    f"The first {id_length} row is id prompts, cannot use [NC]!"
                )
    prompts = [
        prompt if "[NC]" not in prompt else prompt.replace("[NC]", "")
        for prompt in clipped_prompts
    ]

    # Everything after the last "#" is a caption, not part of the prompt.
    prompts = [
        prompt.rpartition("#")[0] if "#" in prompt else prompt for prompt in prompts
    ]
    print(prompts)
    (
        character_index_dict,
        invert_character_index_dict,
        replace_prompts,
        ref_indexs_dict,
        ref_totals,
    ) = process_original_prompt(character_dict, prompts.copy(), id_length)
    if _model_type != "original":
        input_id_images_dict = {}
        if len(_upload_images) != len(character_dict.keys()):
            raise gr.Error(
                f"You upload images({len(_upload_images)}) is not equal to the number of characters({len(character_dict.keys())})!"
            )
        for ind, img in enumerate(_upload_images):
            input_id_images_dict[character_list[ind]] = [load_image(img)]
    print(character_dict)
    print(character_index_dict)
    print(invert_character_index_dict)
    if device == "cuda":
        torch.cuda.empty_cache()

    # ------------------------------------------------------------------
    # Write pass: bank the identity tokens of every character
    # ------------------------------------------------------------------
    write = True
    cur_step = 0

    attn_count = 0
    setup_seed(seed_)
    total_results = []
    id_images = []
    results_dict = {}
    # Keep the user's negative prompt so the style's negative text is
    # prepended exactly once instead of once more per character.
    base_negative_prompt = negative_prompt

    if not load_chars:
        for character_key in character_dict.keys():
            cur_character = [character_key]
            ref_indexs = ref_indexs_dict[character_key]
            print(character_key, ref_indexs)
            current_prompts = [replace_prompts[ref_ind] for ref_ind in ref_indexs]
            print(current_prompts)
            setup_seed(seed_)
            generator = torch.Generator(device=device).manual_seed(seed_)
            cur_step = 0
            cur_positive_prompts, negative_prompt = apply_style(
                style_name, current_prompts, base_negative_prompt
            )
            if _model_type == "original":
                id_images = pipe(
                    cur_positive_prompts,
                    num_inference_steps=_num_steps,
                    guidance_scale=guidance_scale,
                    height=height,
                    width=width,
                    negative_prompt=negative_prompt,
                    generator=generator,
                ).images
            elif _model_type == "Photomaker":
                id_images = pipe(
                    cur_positive_prompts,
                    input_id_images=input_id_images_dict[character_key],
                    num_inference_steps=_num_steps,
                    guidance_scale=guidance_scale,
                    start_merge_step=start_merge_step,
                    height=height,
                    width=width,
                    negative_prompt=negative_prompt,
                    generator=generator,
                ).images
            else:
                raise NotImplementedError(
                    "You should choice between original and Photomaker!",
                    f"But you choice {_model_type}",
                )

            print(id_images)
            for ind, img in enumerate(id_images):
                print(ref_indexs[ind])
                results_dict[ref_indexs[ind]] = img
            yield list(results_dict.values())

    # ------------------------------------------------------------------
    # Read pass: render the remaining prompts using the banked tokens
    # ------------------------------------------------------------------
    write = False
    if not load_chars:
        real_prompts_inds = [
            ind for ind in range(len(prompts)) if ind not in ref_totals
        ]
    else:
        real_prompts_inds = list(range(len(prompts)))
    print(real_prompts_inds)

    for real_prompts_ind in real_prompts_inds:
        real_prompt = replace_prompts[real_prompts_ind]
        cur_character = get_ref_character(prompts[real_prompts_ind], character_dict)
        print(cur_character, real_prompt)
        setup_seed(seed_)
        if len(cur_character) > 1 and _model_type == "Photomaker":
            raise gr.Error(
                "Temporarily Not Support Multiple character in Ref Image Mode!"
            )
        generator = torch.Generator(device=device).manual_seed(seed_)
        cur_step = 0
        real_prompt = apply_style_positive(style_name, real_prompt)
        if _model_type == "original":
            results_dict[real_prompts_ind] = pipe(
                real_prompt,
                num_inference_steps=_num_steps,
                guidance_scale=guidance_scale,
                height=height,
                width=width,
                negative_prompt=negative_prompt,
                generator=generator,
            ).images[0]
        elif _model_type == "Photomaker":
            results_dict[real_prompts_ind] = pipe(
                real_prompt,
                # "[NC]" prompts pass the first character's reference image
                # together with nc_flag=True.
                input_id_images=(
                    input_id_images_dict[cur_character[0]]
                    if real_prompts_ind not in nc_indexs
                    else input_id_images_dict[character_list[0]]
                ),
                num_inference_steps=_num_steps,
                guidance_scale=guidance_scale,
                start_merge_step=start_merge_step,
                height=height,
                width=width,
                negative_prompt=negative_prompt,
                generator=generator,
                nc_flag=real_prompts_ind in nc_indexs,
            ).images[0]
        else:
            raise NotImplementedError(
                "You should choice between original and Photomaker!",
                f"But you choice {_model_type}",
            )
        yield list(results_dict.values())

    # ------------------------------------------------------------------
    # Optional typesetting and saving
    # ------------------------------------------------------------------
    total_results = [results_dict[ind] for ind in range(len(prompts))]
    # The UI offers "默认" as the no-typesetting choice; the English label is
    # kept for backward compatibility.
    # NOTE(review): the other UI labels are passed to get_comic unchanged;
    # confirm that get_comic understands the Chinese style names.
    no_typesetting_labels = ("No typesetting (default)", "默认")
    if _comic_type not in no_typesetting_labels:
        captions = prompt_array.splitlines()
        captions = [caption.replace("[NC]", "") for caption in captions]
        captions = [
            caption.split("#")[-1] if "#" in caption else caption
            for caption in captions
        ]
        font_path = os.path.join("fonts", font_choice)
        font = ImageFont.truetype(font_path, int(45))
        total_results = (
            get_comic(total_results, _comic_type, captions=captions, font=font)
            + total_results
        )
    save_results(pipe.unet, total_results)

    yield total_results
996
+
997
+
998
def array2string(arr):
    """Join the strings in ``arr`` with newlines.

    Args:
        arr: Iterable of strings.

    Returns:
        The items separated by ``"\\n"`` with no trailing newline; an empty
        string for an empty input.
    """
    return "\n".join(arr)
1007
+
1008
+
1009
+ #################################################
1010
+ #################################################
1011
+ ### define the interface
1012
+
1013
+ with gr.Blocks(css=css) as demo:
1014
+ binary_matrixes = gr.State([])
1015
+ color_layout = gr.State([])
1016
+
1017
+ # gr.Markdown(logo)
1018
+ gr.Markdown(title)
1019
+ gr.Markdown(description)
1020
+
1021
+ with gr.Row():
1022
+ with gr.Group(elem_id="main-image"):
1023
+
1024
+ prompts = []
1025
+ colors = []
1026
+
1027
+ with gr.Column(visible=True) as gen_prompt_vis:
1028
+ with gr.Group(visible=False):
1029
+ sd_type = gr.Dropdown(
1030
+ choices=list(models_dict.keys()),
1031
+ value="Unstable",
1032
+ label="模型类型",
1033
+ info="选择预训练模型",
1034
+ )
1035
+ model_type = gr.Radio(
1036
+ ["仅使用文本描述", "使用参考图像"],
1037
+ label="控制模式",
1038
+ value="仅使用文本描述",
1039
+ info="角色控制方式",
1040
+ )
1041
+ with gr.Group(visible=True) as control_image_input:
1042
+ files = gr.Files(
1043
+ label="拖放或选择1张或多张面部照片",
1044
+ file_types=["image"],
1045
+ )
1046
+ uploaded_files = gr.Gallery(
1047
+ label="已上传图片",
1048
+ visible=False,
1049
+ columns=5,
1050
+ rows=1,
1051
+ height=200,
1052
+ )
1053
+ with gr.Column(visible=False) as clear_button:
1054
+ remove_and_reupload = gr.ClearButton(
1055
+ value="清除并重新上传",
1056
+ components=files,
1057
+ size="sm",
1058
+ )
1059
+
1060
+ general_prompt = gr.Textbox(
1061
+ value="",
1062
+ lines=2,
1063
+ visible=False,
1064
+ label="(1) 角色文本描述",
1065
+ interactive=True,
1066
+ )
1067
+ negative_prompt = gr.Textbox(
1068
+ value="",
1069
+ label="(2) 负面提示词",
1070
+ visible=False,
1071
+ interactive=True
1072
+ )
1073
+ style = gr.Dropdown(
1074
+ label="风格模板",
1075
+ choices=STYLE_NAMES,
1076
+ value=DEFAULT_STYLE_NAME,
1077
+ )
1078
+ prompt_array = gr.Textbox(
1079
+ lines=1,
1080
+ value="",
1081
+ visible=False,
1082
+ label="(3) 漫画描述(每行对应一个画格)",
1083
+ interactive=True,
1084
+ )
1085
+ char_path = gr.Textbox(
1086
+ lines=2,
1087
+ value="",
1088
+ visible=False,
1089
+ label="(可选) 角色文件路径",
1090
+ interactive=True,
1091
+ )
1092
+ char_btn = gr.Button("加载角色文件", visible=False)
1093
+
1094
+ with gr.Group(visible=False):
1095
+ font_choice = gr.Dropdown(
1096
+ label="选择字体",
1097
+ choices=[
1098
+ f for f in os.listdir("fonts") if f.endswith(".ttf")
1099
+ ],
1100
+ value="Inkfree.ttf",
1101
+ info="选择最终幻灯片的字体",
1102
+ interactive=True,
1103
+ )
1104
+ sa32_ = gr.Slider(
1105
+ label="32x32自注意力层配对注意力强度",
1106
+ minimum=0,
1107
+ maximum=1.0,
1108
+ value=0.5,
1109
+ step=0.1,
1110
+ )
1111
+ sa64_ = gr.Slider(
1112
+ label="64x64自注意力层配对注意力强度",
1113
+ minimum=0,
1114
+ maximum=1.0,
1115
+ value=0.5,
1116
+ step=0.1,
1117
+ )
1118
+ id_length_ = gr.Slider(
1119
+ label="总图像中包含ID图像的数量",
1120
+ minimum=1,
1121
+ maximum=4,
1122
+ value=1,
1123
+ step=1,
1124
+ )
1125
+ with gr.Row():
1126
+ seed_ = gr.Slider(
1127
+ label="随机种子", minimum=-1, maximum=MAX_SEED, value=0, step=1
1128
+ )
1129
+ randomize_seed_btn = gr.Button("🎲", size="sm")
1130
+ num_steps = gr.Slider(
1131
+ label="采样步数",
1132
+ minimum=20,
1133
+ maximum=100,
1134
+ step=1,
1135
+ value=35,
1136
+ )
1137
+ G_height = gr.Slider(
1138
+ label="图像高度",
1139
+ minimum=256,
1140
+ maximum=1024,
1141
+ step=32,
1142
+ value=768,
1143
+ )
1144
+ G_width = gr.Slider(
1145
+ label="图像宽度",
1146
+ minimum=256,
1147
+ maximum=1024,
1148
+ step=32,
1149
+ value=768,
1150
+ )
1151
+ comic_type = gr.Radio(
1152
+ [
1153
+ "默认",
1154
+ "四格漫画",
1155
+ "经典漫画风格",
1156
+ ],
1157
+ value="四格漫画",###########################################################################
1158
+ label="排版风格",
1159
+ info="选择漫画排版风格",
1160
+ )
1161
+ with gr.Group(visible=False):
1162
+ guidance_scale = gr.Slider(
1163
+ label="引导尺度",
1164
+ minimum=0.1,
1165
+ maximum=10.0,
1166
+ step=0.1,
1167
+ value=5,
1168
+ )
1169
+ style_strength_ratio = gr.Slider(
1170
+ label="参考图像风格强度 (%)",
1171
+ minimum=15,
1172
+ maximum=50,
1173
+ step=1,
1174
+ value=20,
1175
+ visible=False,
1176
+ )
1177
+ Ip_Adapter_Strength = gr.Slider(
1178
+ label="IP适配器强度",
1179
+ minimum=0,
1180
+ maximum=1,
1181
+ step=0.1,
1182
+ value=0.5,
1183
+ visible=False,
1184
+ )
1185
+ final_run_btn = gr.Button("😺开始生成!😺")
1186
+
1187
+ with gr.Column():
1188
+ out_image = gr.Gallery(label="生成结果", columns=2, height="auto")
1189
+ # print(out_image,"#########################################################################################################")
1190
+ generated_information = gr.Markdown(
1191
+ label="生成详情", value="", visible=False
1192
+ )
1193
+ gr.Markdown(version)
1194
+ model_type.change(
1195
+ fn=change_visiale_by_model_type,
1196
+ inputs=model_type,
1197
+ outputs=[control_image_input, style_strength_ratio, Ip_Adapter_Strength],
1198
+ )
1199
+ files.upload(
1200
+ fn=swap_to_gallery, inputs=files, outputs=[uploaded_files, clear_button, files]
1201
+ )
1202
+ remove_and_reupload.click(
1203
+ fn=remove_back_to_files, outputs=[uploaded_files, clear_button, files]
1204
+ )
1205
+ char_btn.click(fn=load_character_files, inputs=char_path, outputs=[general_prompt])
1206
+
1207
+ randomize_seed_btn.click(
1208
+ fn=lambda: random.randint(-1, MAX_SEED),
1209
+ inputs=[],
1210
+ outputs=seed_,
1211
+ )
1212
+
1213
+ final_run_btn.click(fn=set_text_unfinished, outputs=generated_information).then(
1214
+ process_generation,
1215
+ inputs=[
1216
+ sd_type,
1217
+ model_type,
1218
+ files,
1219
+ num_steps,
1220
+ style,
1221
+ Ip_Adapter_Strength,
1222
+ style_strength_ratio,
1223
+ guidance_scale,
1224
+ seed_,
1225
+ sa32_,
1226
+ sa64_,
1227
+ id_length_,
1228
+ general_prompt,
1229
+ negative_prompt,
1230
+ prompt_array,
1231
+ G_height,
1232
+ G_width,
1233
+ comic_type,
1234
+ font_choice,
1235
+ char_path,
1236
+ ],
1237
+ outputs=out_image,
1238
+ ).then(fn=set_text_finished, outputs=generated_information)
1239
+ with gr.Accordion("😺 点击选择内容 😺", open=False):
1240
+
1241
+ gr.Markdown("### 👦 男生视角")
1242
+ gr.Examples(
1243
+ examples=[
1244
+ [
1245
+ 0,
1246
+ 0.3,
1247
+ 0.5,
1248
+ 1,
1249
+ "[Alice]a man img\n[d]Chinese Rose img\n[b]a tree img\n[c]a sun flower img\n[e]a peach blossom img",
1250
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1251
+ array2string(
1252
+ [
1253
+ "[Alice]a portrait of a man standing alone, realistic style, natural lighting",
1254
+ "[b] is a tall tree in a meadow",
1255
+ "[Alice] stands next to the [b]",
1256
+ "[b] is a tall tree in a meadow, photorealistic,with # [Alice] standing beside it",
1257
+ "[c] a vibrant sunflower in a sunny field, close-up, golden petals, ultra-detailed",
1258
+ "[d] a blooming Chinese rose in a garden, macro shot, pink petals, soft focus",
1259
+ "[e] a peach blossom on a branch in spring, shallow depth of field, morning light",
1260
+ ]
1261
+ ),
1262
+ "日本漫画风",
1263
+ "Using Ref Images",
1264
+ [],# get_image_path_list("examples/taylor"),
1265
+ 768,
1266
+ 768
1267
+ ],
1268
+
1269
+ [
1270
+ 0,
1271
+ 0.3,
1272
+ 0.5,
1273
+ 1,
1274
+ "[Bob] a man img, \n[a]a flower img\n[b]a camel img\n[c] a bridge img\n[d] a gatehouse img",
1275
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1276
+ array2string(
1277
+ [
1278
+ "[Bob] a portrait of a man standing alone, realistic style, natural lighting",
1279
+ "[a] a peach blossom flower",
1280
+ "[b] a Bactrian camel with a desert background",
1281
+ "[c] a Chinese bridge",
1282
+ "[d] a traditional Chinese gatehouse (ornamental archway with a plaque)",
1283
+ "[Bob] stands next to a blooming peach blossom flower, its petals glowing softly under the sunlight",
1284
+ "[Bob] stands next to the camel",
1285
+ "[Bob] stands on an ancient Chinese stone arch bridge over a quiet river",
1286
+ "[Bob] stands beneath a traditional Chinese gatehouse with red lanterns and wooden carvings",
1287
+ ]
1288
+ ),
1289
+ "日本漫画风",
1290
+ "Using Ref Images",
1291
+ [],# get_image_path_list("examples/taylor"),
1292
+ 768,
1293
+ 768
1294
+ ],
1295
+ [
1296
+ 0,
1297
+ 0.3,
1298
+ 0.5,
1299
+ 1,
1300
+ "[Bob] a man img, \n[a]a sunflower img\n[b]a horse img\n[c]a riverside house img\n[d]a Chinese pavilion img",
1301
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1302
+ array2string(
1303
+ [
1304
+ "[Bob] a portrait of a man standing alone, realistic style, natural lighting",
1305
+ "[a] a sunflower with large golden petals facing the sun",
1306
+ "[b] a brown horse standing on a grassy field",
1307
+ "[c] a traditional riverside house with white walls and black-tiled roof",
1308
+ "[d] an ancient Chinese multi-tiered pavilion with curved eaves",
1309
+ "[Bob] stands next to a sunflower, its bright petals glowing in the sunlight",
1310
+ "[Bob] stands next to the horse",
1311
+ "[Bob] stands beside a riverside house, reflected softly in the calm water",
1312
+ "[Bob] stands under a traditional Chinese pavilion with curved roofs and wooden columns",
1313
+ ]
1314
+ ),
1315
+ "日本漫画风",
1316
+ "Using Ref Images",
1317
+ [], # get_image_path_list("examples/taylor"),
1318
+ 768,
1319
+ 768
1320
+ ]
1321
+ ],
1322
+ example_labels=[
1323
+ "与花朵合影",
1324
+ "示例1:桃花,骆驼,桥,门楼",
1325
+ "示例2:向日葵,马,水乡,亭子",
1326
+ ],
1327
+ inputs=[
1328
+ seed_,
1329
+ sa32_,
1330
+ sa64_,
1331
+ id_length_,
1332
+ general_prompt,
1333
+ negative_prompt,
1334
+ prompt_array,
1335
+ style,
1336
+ model_type,
1337
+ files,
1338
+ G_height,
1339
+ G_width,
1340
+ ],
1341
+ )
1342
+
1343
+ gr.Markdown("### 👧 女生视角")
1344
+ gr.Examples(
1345
+ examples=[
1346
+
1347
+ [
1348
+ 0,
1349
+ 0.3,
1350
+ 0.5,
1351
+ 1,
1352
+ "[Alice] a woman img, \n[a]a flower img\n[b]a camel img\n[c] a bridge img\n[d] a gatehouse img",
1353
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1354
+ array2string(
1355
+ [
1356
+ "[Alice] a portrait of a woman standing alone, realistic style, natural lighting",
1357
+ "[a] a peach blossom flower",
1358
+ "[b] a Bactrian camel with a desert background",
1359
+ "[c] a Chinese bridge",
1360
+ "[d] a traditional Chinese gatehouse (ornamental archway with a plaque)",
1361
+ "[Alice] stands next to a blooming peach blossom flower, its petals glowing softly under the sunlight",
1362
+ "[Alice] stands beside a camel",
1363
+ "[Alice] stands on an ancient Chinese stone arch bridge over a quiet river",
1364
+ "[Alice] stands beneath a traditional Chinese gatehouse with red lanterns and wooden carvings",
1365
+ ]
1366
+ ),
1367
+ "日本漫画风",
1368
+ "Using Ref Images",
1369
+ [], # get_image_path_list("examples/taylor"),
1370
+ 768,
1371
+ 768
1372
+ ],
1373
+ [
1374
+ 0,
1375
+ 0.3,
1376
+ 0.5,
1377
+ 1,
1378
+ "[Alice] a woman img, \n[a]a tree img\n[b]an alpaca img\n[c] a bridge img\n[d] a gatehouse img",
1379
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1380
+ array2string(
1381
+ [
1382
+ "[Alice] a portrait of a woman standing alone, realistic style, natural lighting",
1383
+ "[a] a tree ",
1384
+ "[b] an alpaca with fluffy white wool standing on grassland",
1385
+ "[c] a Chinese bridge",
1386
+ "[d] a traditional Chinese gatehouse (ornamental archway with a plaque)",
1387
+ "[Alice] stands next to a tree, under the sunlight",
1388
+ "[Alice] stands next to the alpaca",
1389
+ "[Alice] stands on an ancient Chinese stone arch bridge over a quiet river",
1390
+ "[Alice] stands beneath a traditional Chinese gatehouse with red lanterns and wooden carvings",
1391
+ ]
1392
+ ),
1393
+ "日本漫画风",
1394
+ "Using Ref Images",
1395
+ [], # get_image_path_list("examples/taylor"),
1396
+ 768,
1397
+ 768
1398
+ ],
1399
+ [
1400
+ 0,
1401
+ 0.3,
1402
+ 0.5,
1403
+ 1,
1404
+ "[Alice] a woman img, \n[a]a sunflower img\n[b]a white goose img\n[c]a riverside house img\n[d]a Chinese pavilion img",
1405
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1406
+ array2string(
1407
+ [
1408
+ "[Alice] a portrait of a woman standing alone, realistic style, natural lighting",
1409
+ "[a] a sunflower with large golden petals facing the sun",
1410
+ "[b] a white goose with smooth feathers standing near the water",
1411
+ "[c] a traditional riverside house with white walls and black-tiled roof",
1412
+ "[d] an ancient Chinese multi-tiered pavilion with curved eaves",
1413
+ "[Alice] stands next to a sunflower, its bright petals glowing in the sunlight",
1414
+ "[Alice] stands next to the white goose, its feathers clean and shining in the sun",
1415
+ "[Alice] stands beside a riverside house, its reflection shimmering in the gentle stream",
1416
+ "[Alice] stands under a Chinese pavilion with layered roofs and carved wooden pillars",
1417
+ ]
1418
+ ),
1419
+ "日本漫画风",
1420
+ "Using Ref Images",
1421
+ [], # get_image_path_list("examples/taylor"),
1422
+ 768,
1423
+ 768
1424
+ ]
1425
+ ],
1426
+ example_labels=[
1427
+ "示例1:桃花,骆驼,桥,门楼",
1428
+ "示例2:树,羊驼,桥,门楼",
1429
+ "示例3:向日葵,鹅,水乡,亭子",
1430
+ ],
1431
+ inputs=[
1432
+ seed_,
1433
+ sa32_,
1434
+ sa64_,
1435
+ id_length_,
1436
+ general_prompt,
1437
+ negative_prompt,
1438
+ prompt_array,
1439
+ style,
1440
+ model_type,
1441
+ files,
1442
+ G_height,
1443
+ G_width,
1444
+ ],
1445
+ # outputs=[post_sketch, binary_matrixes, *color_row, *colors, *prompts, gen_prompt_vis, general_prompt, seed_],
1446
+ # run_on_click=True,
1447
+ label="😺 请选择: 😺",
1448
+ )
1449
+
1450
+ gr.Markdown(article)
1451
+
1452
+
1453
+ demo.launch(server_name="0.0.0.0", share=True)
gradio_app_sdxl_specific_id_low_vram.py ADDED
@@ -0,0 +1,1454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from this import d
2
+ import gradio as gr
3
+ import numpy as np
4
+ import torch
5
+ import gc
6
+ import copy
7
+ import os
8
+ import random
9
+ import datetime
10
+ from PIL import ImageFont
11
+ from utils.gradio_utils import (
12
+ character_to_dict,
13
+ process_original_prompt,
14
+ get_ref_character,
15
+ cal_attn_mask_xl,
16
+ cal_attn_indice_xl_effcient_memory,
17
+ is_torch2_available,
18
+ )
19
+
20
+ import os
21
+ os.environ['GPU_PLATFORM_ID'] = '0'
22
+ os.environ['GPU_DEVICE_ID'] = '0'
23
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
24
+
25
+
26
+ import os
27
+ os.environ['HF_ENDPOINT']= 'https://hf-mirror.com'
28
+ torch.backends.cudnn.enabled = True
29
+
30
+ if is_torch2_available():
31
+ from utils.gradio_utils import AttnProcessor2_0 as AttnProcessor
32
+ else:
33
+ from utils.gradio_utils import AttnProcessor
34
+ from huggingface_hub import hf_hub_download
35
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
36
+ StableDiffusionXLPipeline,
37
+ )
38
+ from diffusers.schedulers.scheduling_ddim import DDIMScheduler
39
+ import torch.nn.functional as F
40
+ from diffusers.utils.loading_utils import load_image
41
+ from utils.utils import get_comic
42
+ from utils.style_template import styles
43
+ from utils.load_models_utils import get_models_dict, load_models
44
+
45
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '5'
46
+
47
# Names of the available style templates (keys of the imported `styles` table).
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "日本漫画风"
# NOTE(review): a `global` statement at module scope has no effect.
global models_dict

models_dict = get_models_dict()

# Automatically select the device: CUDA device 0 when available, otherwise
# Apple MPS, otherwise CPU.
device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# device = "cpu"

# torch.cuda.set_device(5)
print(f"@@device:{device}")


# Check whether the PhotoMaker checkpoint already exists locally before
# downloading it. If it does not, `hf_hub_download` fetches it into
# `local_dir`; if it does, the local path is used directly.
local_dir = "data/"
photomaker_local_path = f"{local_dir}photomaker-v1.bin"
if not os.path.exists(photomaker_local_path):
    photomaker_path = hf_hub_download(
        repo_id="TencentARC/PhotoMaker",
        filename="photomaker-v1.bin",
        repo_type="model",
        local_dir=local_dir,
    )
else:
    photomaker_path = photomaker_local_path

# Largest value representable by a signed 32-bit integer.
MAX_SEED = np.iinfo(np.int32).max
84
+
85
+ def setup_seed(seed):
86
+ torch.manual_seed(seed)
87
+ if device == "cuda":
88
+ torch.cuda.manual_seed_all(seed)
89
+ np.random.seed(seed)
90
+ random.seed(seed)
91
+ torch.backends.cudnn.deterministic = True
92
+
93
+
94
+ def set_text_unfinished():
95
+ return gr.update(
96
+ visible=True,
97
+ value="<h3>正在生成中......</h3>",
98
+ )
99
+
100
+
101
+ def set_text_finished():
102
+ return gr.update(visible=True, value="<h3>生成完成!</h3>")
103
+
104
+
105
+ #################################################
106
+ def get_image_path_list(folder_name):
107
+ image_basename_list = os.listdir(folder_name)
108
+ image_path_list = sorted(
109
+ [os.path.join(folder_name, basename) for basename in image_basename_list]
110
+ )
111
+ return image_path_list
112
+
113
+
114
+ #################################################
115
class SpatialAttnProcessor2_0(torch.nn.Module):
    r"""
    Self-attention processor that stores sampled tokens per character and
    attends over them on later calls. Requires PyTorch 2.0
    (``F.scaled_dot_product_attention``).

    The processor is driven by module-level globals (``write``, ``cur_step``,
    ``attn_count``, ``total_count``, ``cur_character``, ``sa32``, ``sa64``,
    ``height``, ``width``) that must be set before it is called.

    Args:
        hidden_size (`int`):
            Stored on the instance; not read by the methods of this class.
        cross_attention_dim (`int`):
            Stored on the instance; not read by the methods of this class.
        id_length (`int`, defaults to 4):
            Passed, together with ``id_length + 1`` (``total_length``), to
            `cal_attn_indice_xl_effcient_memory`.
        device:
            Device handed to the index helper and used for tensors read back
            from the bank.
        dtype (defaults to `torch.float16`):
            dtype handed to the index helper.
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
        id_length=4,
        device=device,
        dtype=torch.float16,
    ):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )
        self.device = device
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.total_length = id_length + 1
        self.id_length = id_length
        # Maps character -> step -> list of stored token tensors.
        self.id_bank = {}

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Run one attention call, writing to or reading from the bank.

        With the global ``write`` flag set, the tokens selected by the
        sampled indices are cloned into
        ``self.id_bank[cur_character[0]][cur_step]``. Otherwise the tokens
        stored at ``cur_step`` for every character in ``cur_character`` are
        gathered. At step 0, and at random on later steps, plain
        self-attention is used; in the remaining cases each image attends
        over its own tokens concatenated with the shared tokens.

        ``encoder_hidden_states`` is accepted but not read by this method.

        Returns:
            The updated hidden states, reshaped back to three dimensions.
        """
        # un_cond_hidden_states, cond_hidden_states = hidden_states.chunk(2)
        # un_cond_hidden_states = self.__call2__(attn, un_cond_hidden_states,encoder_hidden_states,attention_mask,temb)
        # (A random number in [0, 1) is drawn further below.)
        global total_count, attn_count, cur_step, indices1024, indices4096
        global sa32, sa64
        global write
        global height, width
        global character_dict, character_index_dict, invert_character_index_dict, cur_character, ref_indexs_dict, ref_totals, cur_character
        # First attention call of the first step: sample the token indices.
        if attn_count == 0 and cur_step == 0:
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )
        if write:
            # Write mode handles exactly one character at a time.
            assert len(cur_character) == 1
            # Choose the index set that matches this layer's token count.
            if hidden_states.shape[1] == (height // 32) * (width // 32):
                indices = indices1024
            else:
                indices = indices4096
            # print(f"white:{cur_step}")
            total_batch_size, nums_token, channel = hidden_states.shape
            # NOTE(review): presumably the batch holds two halves
            # (unconditional / conditional) of img_nums images each — confirm.
            img_nums = total_batch_size // 2
            hidden_states = hidden_states.reshape(-1, img_nums, nums_token, channel)
            # print(img_nums,len(indices),hidden_states.shape,self.total_length)
            if cur_character[0] not in self.id_bank:
                self.id_bank[cur_character[0]] = {}
            # Store a clone of each image's sampled tokens for this step.
            self.id_bank[cur_character[0]][cur_step] = [
                hidden_states[:, img_ind, indices[img_ind], :]
                .reshape(2, -1, channel)
                .clone()
                for img_ind in range(img_nums)
            ]
            hidden_states = hidden_states.reshape(-1, nums_token, channel)
            # self.id_bank[cur_step] = [hidden_states[:self.id_length].clone(), hidden_states[self.id_length:].clone()]
        else:
            # encoder_hidden_states = torch.cat((self.id_bank[cur_step][0].to(self.device),self.id_bank[cur_step][1].to(self.device)))
            # TODO: ADD Multipersion Control
            # Read mode: gather the tokens stored at this step for every
            # active character.
            encoder_arr = []
            for character in cur_character:
                encoder_arr = encoder_arr + [
                    tensor.to(self.device)
                    for tensor in self.id_bank[character][cur_step]
                ]
        # Step 0 always uses plain self-attention; later steps compare a
        # random draw against a step-dependent threshold.
        if cur_step < 1:
            hidden_states = self.__call2__(
                attn, hidden_states, None, attention_mask, temb
            )
        else:  # 256 1024 4096
            random_number = random.random()
            # Shared attention is skipped when the draw is <= 0.3 before
            # step 20 and when it is <= 0.1 afterwards.
            if cur_step < 20:
                rand_num = 0.3
            else:
                rand_num = 0.1
            # print(f"hidden state shape {hidden_states.shape[1]}")
            if random_number > rand_num:
                if hidden_states.shape[1] == (height // 32) * (width // 32):
                    indices = indices1024
                else:
                    indices = indices4096
                # print("before attention",hidden_states.shape,attention_mask.shape,encoder_hidden_states.shape if encoder_hidden_states is not None else "None")
                if write:
                    total_batch_size, nums_token, channel = hidden_states.shape
                    img_nums = total_batch_size // 2
                    hidden_states = hidden_states.reshape(
                        -1, img_nums, nums_token, channel
                    )
                    encoder_arr = [
                        hidden_states[:, img_ind, indices[img_ind], :].reshape(
                            2, -1, channel
                        )
                        for img_ind in range(img_nums)
                    ]
                    for img_ind in range(img_nums):
                        # print(img_nums)
                        # assert img_nums != 1
                        img_ind_list = [i for i in range(img_nums)]
                        # print(img_ind_list,img_ind)
                        img_ind_list.remove(img_ind)
                        # print(img_ind,invert_character_index_dict[img_ind])
                        # print(character_index_dict[invert_character_index_dict[img_ind]])
                        # print(img_ind_list)
                        # print(img_ind,img_ind_list)
                        # Keys/values: the other images' sampled tokens
                        # followed by this image's full token sequence.
                        encoder_hidden_states_tmp = torch.cat(
                            [encoder_arr[img_ind] for img_ind in img_ind_list]
                            + [hidden_states[:, img_ind, :, :]],
                            dim=1,
                        )

                        hidden_states[:, img_ind, :, :] = self.__call2__(
                            attn,
                            hidden_states[:, img_ind, :, :],
                            encoder_hidden_states_tmp,
                            None,
                            temb,
                        )
                else:
                    _, nums_token, channel = hidden_states.shape
                    # img_nums = total_batch_size // 2
                    # encoder_hidden_states = encoder_hidden_states.reshape(-1,img_nums,nums_token,channel)
                    hidden_states = hidden_states.reshape(2, -1, nums_token, channel)
                    # print(len(indices))
                    # encoder_arr = [encoder_hidden_states[:,img_ind,indices[img_ind],:].reshape(2,-1,channel) for img_ind in range(img_nums)]
                    # Keys/values: the stored character tokens followed by
                    # the tokens at index 0 of the second dimension.
                    encoder_hidden_states_tmp = torch.cat(
                        encoder_arr + [hidden_states[:, 0, :, :]], dim=1
                    )
                    # print(len(encoder_arr),encoder_hidden_states_tmp.shape)
                    # Only index 0 of the second dimension is updated here.
                    hidden_states[:, 0, :, :] = self.__call2__(
                        attn,
                        hidden_states[:, 0, :, :],
                        encoder_hidden_states_tmp,
                        None,
                        temb,
                    )
                hidden_states = hidden_states.reshape(-1, nums_token, channel)
            else:
                hidden_states = self.__call2__(
                    attn, hidden_states, None, attention_mask, temb
                )
        # Count this call; once total_count calls have run, advance the
        # step counter and resample the token indices.
        attn_count += 1
        if attn_count == total_count:
            attn_count = 0
            cur_step += 1
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )

        return hidden_states

    def __call2__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Apply scaled dot-product attention using the projections of *attn*.

        Queries come from ``hidden_states``; keys and values come from
        ``encoder_hidden_states``, or from ``hidden_states`` when that
        argument is ``None``. The output projection, optional residual
        connection and output rescaling of *attn* are applied before
        returning.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        # 4-D input is flattened to (batch, height * width, channel).
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        batch_size, sequence_length, channel = hidden_states.shape
        # print(hidden_states.shape)
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(
                attention_mask, sequence_length, batch_size
            )
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(
                batch_size, attn.heads, -1, attention_mask.shape[-1]
            )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        # else:
        #     encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,sequence_length,channel).reshape(-1,(self.id_length+1) * sequence_length,channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split the channel dimension into heads: (batch, heads, tokens, head_dim).
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge the heads back into a single channel dimension.
        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
383
+
384
+
385
def set_attention_processor(unet, id_length, is_ipadapter=False):
    """Install attention processors on every attention layer of *unet*.

    Self-attention layers (names ending in ``attn1.processor``, or any layer
    when the UNet config has no cross-attention dim) inside ``up_blocks``
    receive a ``SpatialAttnProcessor2_0``; all other layers receive the
    default ``AttnProcessor``, unless *is_ipadapter* is true, in which case
    cross-attention layers receive an ``IPAttnProcessor2_0``.

    The mapping is also published as the module-level ``attn_procs`` and a
    deep copy of it is handed to ``unet.set_attn_processor``.

    Args:
        unet: UNet exposing ``attn_processors``, ``config``, ``device`` and
            ``set_attn_processor``.
        id_length: Forwarded to ``SpatialAttnProcessor2_0``.
        is_ipadapter: Use IP-Adapter processors for cross-attention layers.
    """
    global attn_procs
    processors = {}
    attn_procs = processors
    for name in unet.attn_processors:
        if name.endswith("attn1.processor"):
            cross_attention_dim = None
        else:
            cross_attention_dim = unet.config.cross_attention_dim

        in_up_block = name.startswith("up_blocks")
        if name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif in_up_block:
            block_id = int(name[len("up_blocks.")])
            hidden_size = list(unet.config.block_out_channels)[::-1][block_id]
        elif name.startswith("down_blocks"):
            block_id = int(name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

        if cross_attention_dim is None and in_up_block:
            processors[name] = SpatialAttnProcessor2_0(id_length=id_length)
        elif cross_attention_dim is not None and is_ipadapter:
            processors[name] = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=1,
                num_tokens=4,
            ).to(unet.device, dtype=torch.float16)
        else:
            processors[name] = AttnProcessor()

    unet.set_attn_processor(copy.deepcopy(processors))
419
+
420
+
421
+ #################################################
422
+ #################################################
423
+ canvas_html = "<div id='canvas-root' style='max-width:400px; margin: 0 auto'></div>"
424
+ load_js = """
425
+ async () => {
426
+ const url = "https://huggingface.co/datasets/radames/gradio-components/raw/main/sketch-canvas.js"
427
+ fetch(url)
428
+ .then(res => res.text())
429
+ .then(text => {
430
+ const script = document.createElement('script');
431
+ script.type = "module"
432
+ script.src = URL.createObjectURL(new Blob([text], { type: 'application/javascript' }));
433
+ document.head.appendChild(script);
434
+ });
435
+ }
436
+ """
437
+
438
+ get_js_colors = """
439
+ async (canvasData) => {
440
+ const canvasEl = document.getElementById("canvas-root");
441
+ return [canvasEl._data]
442
+ }
443
+ """
444
+
445
+ css = """
446
+ #color-bg{display:flex;justify-content: center;align-items: center;}
447
+ .color-bg-item{width: 100%; height: 32px}
448
+ #main_button{width:100%}
449
+ <style>
450
+ """
451
+
452
+
453
def save_single_character_weights(unet, character, description, filepath):
    """Save one character's ``id_bank`` tensors from *unet* to *filepath*.

    Args:
        unet: Model whose ``attn_processors`` mapping is scanned for
            ``SpatialAttnProcessor2_0`` instances.
        character: Key of the character inside each processor's ``id_bank``.
        description: Text stored alongside the tensors under ``"description"``.
        filepath: Destination path passed to ``torch.save``.
    """
    payload = {"description": description, "character": character}
    for proc_name, processor in unet.attn_processors.items():
        if not isinstance(processor, SpatialAttnProcessor2_0):
            continue
        per_step = processor.id_bank[character]
        # Move every tensor to the CPU so the file can be serialized.
        payload[proc_name] = {
            step: [stored.cpu() for stored in per_step[step]] for step in per_step
        }
    # Persist the collected tensors with torch.save.
    torch.save(payload, filepath)
474
+
475
+
476
def load_single_character_weights(unet, filepath):
    """Load a saved character file into the ``id_bank`` of *unet*'s processors.

    The file is expected to be a dict holding a ``"character"`` key plus one
    entry per attention-processor name, each mapping a step key to a list of
    tensors. Any previously stored entry for that character is replaced.

    Args:
        unet: Model whose ``attn_processors`` mapping is scanned for
            ``SpatialAttnProcessor2_0`` instances; tensors are moved to
            ``unet.device``.
        filepath: Path of the file to read with ``torch.load``.

    Raises:
        KeyError: If the file has no ``"character"`` key or lacks an entry
            for one of the spatial attention processors.
    """
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only load character files from trusted sources.
    weights_to_load = torch.load(filepath, map_location=torch.device("cpu"))
    character = weights_to_load["character"]
    for attn_name, attn_processor in unet.attn_processors.items():
        if isinstance(attn_processor, SpatialAttnProcessor2_0):
            # Build the whole per-step mapping first so the bank is only
            # updated once every tensor has been moved to the UNet's device.
            attn_processor.id_bank[character] = {
                step_key: [tensor.to(unet.device) for tensor in tensors]
                for step_key, tensors in weights_to_load[attn_name].items()
            }
496
+
497
+
498
def save_results(unet, img_list):
    """Write the generated images into a fresh timestamped results folder.

    Creates ``results/<YYYYmmdd-HHMMSS>/weights`` (and its parents) and saves
    each image as ``image_<idx>.png`` in ``results/<YYYYmmdd-HHMMSS>``.

    Args:
        unet: Currently unused; per-character weight export is disabled, so
            the ``weights`` folder is created but left empty.
        img_list: Iterable of objects exposing ``save(path)``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    folder_name = f"results/{timestamp}"
    weight_folder_name = f"{folder_name}/weights"
    # exist_ok avoids the check-then-create race and still creates the
    # weights folder when the parent already exists (two runs in one second).
    os.makedirs(weight_folder_name, exist_ok=True)

    for idx, img in enumerate(img_list):
        img.save(os.path.join(folder_name, f"image_{idx}.png"))
515
+
516
+
517
+ #################################################
518
+ title = r"""
519
+ <h1 align="center" style="font-family: 'Comic Sans MS', 'Orbitron', sans-serif; color: #00ccff; text-shadow: 2px 2px 4px #000000; font-size: 48px;">
520
+ 🧠✨ 我的AI研学旅记 🚀🤖
521
+ </h1>
522
+ """
523
+
524
+
525
+ # title = r"""
526
+ # <h1 align="center">我的AI研学旅记</h1>
527
+ # """
528
+
529
+ description = r"""
530
+ """
531
+ # <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'><b>StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</b></a>.<br>
532
+ # ❗️❗️❗️[<b>Important</b>] Personalization steps:<br>
533
+ # 1️⃣ Enter a Textual Description for Character, if you add the Ref-Image, making sure to <b>follow the class word</b> you want to customize with the <b>trigger word</b>: `img`, such as: `man img` or `woman img` or `girl img`.<br>
534
+ # 2️⃣ Enter the prompt array, each line corresponds to one generated image.<br>
535
+ # 3️⃣ Choose your preferred style template.<br>
536
+ # 4️⃣ Click the <b>Submit</b> button to start customizing.
537
+
538
+
539
+ article = r"""
540
+ """
541
+ # If StoryDiffusion is helpful, please help to ⭐ the <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'>Github Repo</a>. Thanks!
542
+ # [![GitHub Stars](https://img.shields.io/github/stars/HVision-NKU/StoryDiffusion?style=social)](https://github.com/HVision-NKU/StoryDiffusion)
543
+ # ---
544
+ # 📝 **Citation**
545
+ # <br>
546
+ # If our work is useful for your research, please consider citing:
547
+
548
+ # ```bibtex
549
+ # @article{Zhou2024storydiffusion,
550
+ # title={StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation},
551
+ # author={Zhou, Yupeng and Zhou, Daquan and Cheng, Ming-Ming and Feng, Jiashi and Hou, Qibin},
552
+ # year={2024}
553
+ # }
554
+ # ```
555
+ # 📋 **License**
556
+ # <br>
557
+ # Apache-2.0 LICENSE.
558
+
559
+ # 📧 **Contact**
560
+ # <br>
561
+ # If you have any questions, please feel free to reach me out at <b>ypzhousdu@gmail.com</b>.
562
+
563
+ version = r"""
564
+ """
565
+ # <h3 align="center">StoryDiffusion Version 0.02 (test version)</h3>
566
+
567
+ # <h5 >1. Support image ref image. (Cartoon Ref image is not support now)</h5>
568
+ # <h5 >2. Support Typesetting Style and Captioning.(By default, the prompt is used as the caption for each image. If you need to change the caption, add a # at the end of each line. Only the part after the # will be added as a caption to the image.)</h5>
569
+ # <h5 >3. [NC]symbol (The [NC] symbol is used as a flag to indicate that no characters should be present in the generated scene images. If you want do that, prepend the "[NC]" at the beginning of the line. For example, to generate a scene of falling leaves without any character, write: "[NC] The leaves are falling.")</h5>
570
+ # <h5 align="center">Tips: </h4>
571
+
572
+
573
+
574
+
575
+
576
+
577
+ #################################################
578
+ global attn_count, total_count, id_length, total_length, cur_step, cur_model_type
579
+ global write
580
+ global sa32, sa64
581
+ global height, width
582
+ attn_count = 0
583
+ total_count = 0
584
+ cur_step = 0
585
+ id_length = 4
586
+ total_length = 5
587
+ cur_model_type = ""
588
+ global attn_procs, unet
589
+ attn_procs = {}
590
+ ###
591
+ write = False
592
+ ###
593
+ sa32 = 0.5
594
+ sa64 = 0.5
595
+ height = 768
596
+ width = 768
597
+ ###
598
+ global pipe
599
+ global sd_model_path
600
+ pipe = None
601
+ sd_model_path = models_dict["Unstable"]["path"] # "SG161222/RealVisXL_V4.0"
602
+ single_files = models_dict["Unstable"]["single_files"]
603
+ ### LOAD Stable Diffusion Pipeline
604
+ if single_files:
605
+ pipe = StableDiffusionXLPipeline.from_single_file(
606
+ sd_model_path, torch_dtype=torch.float16
607
+ )
608
+ else:
609
+ pipe = StableDiffusionXLPipeline.from_pretrained(
610
+ sd_model_path, torch_dtype=torch.float16, use_safetensors=False
611
+ )
612
+ print("pipE.device = ", device)
613
+ pipe = pipe.to(device)
614
+ pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
615
+ # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
616
+ pipe.scheduler.set_timesteps(50)
617
+ pipe.enable_vae_slicing()
618
+ if device != "mps":
619
+ pipe.enable_model_cpu_offload()
620
+ unet = pipe.unet
621
+ cur_model_type = "Unstable" + "-" + "original"
622
+ ### Insert PairedAttention
623
+ for name in unet.attn_processors.keys():
624
+ cross_attention_dim = (
625
+ None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
626
+ )
627
+ if name.startswith("mid_block"):
628
+ hidden_size = unet.config.block_out_channels[-1]
629
+ elif name.startswith("up_blocks"):
630
+ block_id = int(name[len("up_blocks.")])
631
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
632
+ elif name.startswith("down_blocks"):
633
+ block_id = int(name[len("down_blocks.")])
634
+ hidden_size = unet.config.block_out_channels[block_id]
635
+ if cross_attention_dim is None and (name.startswith("up_blocks")):
636
+ attn_procs[name] = SpatialAttnProcessor2_0(id_length=id_length)
637
+ total_count += 1
638
+ else:
639
+ attn_procs[name] = AttnProcessor()
640
+ print("successsfully load paired self-attention")
641
+ print(f"number of the processor : {total_count}")
642
+ unet.set_attn_processor(copy.deepcopy(attn_procs))
643
+ global mask1024, mask4096
644
+ mask1024, mask4096 = cal_attn_mask_xl(
645
+ total_length,
646
+ id_length,
647
+ sa32,
648
+ sa64,
649
+ height,
650
+ width,
651
+ device=device,
652
+ dtype=torch.float16,
653
+ )
654
+
655
+ ######### Gradio Function #############
656
+
657
+
658
def swap_to_gallery(images):
    """Return three component updates that reveal the uploaded images.

    The first update carries ``images`` and is made visible, the second is
    made visible, and the third is hidden. The upload handler wires them to
    the uploaded-files gallery, the clear-button column and the file picker,
    in that order.
    """
    gallery_update = gr.update(value=images, visible=True)
    clear_button_update = gr.update(visible=True)
    picker_update = gr.update(visible=False)
    return gallery_update, clear_button_update, picker_update
664
+
665
+
666
def upload_example_to_gallery(images, prompt, style, negative_prompt):
    """Return three component updates for an example's images.

    Only ``images`` is used; ``prompt``, ``style`` and ``negative_prompt`` are
    accepted but ignored. The first update carries the images and is visible,
    the second is visible, the third is hidden.
    """
    shown_with_images = gr.update(value=images, visible=True)
    shown = gr.update(visible=True)
    hidden = gr.update(visible=False)
    return shown_with_images, shown, hidden
672
+
673
+
674
def remove_back_to_files():
    """Return updates that hide the first two outputs and show the third.

    The clear-button handler wires them to the uploaded-files gallery, the
    clear-button column and the file picker, in that order.
    """
    gallery_update = gr.update(visible=False)
    clear_button_update = gr.update(visible=False)
    picker_update = gr.update(visible=True)
    return gallery_update, clear_button_update, picker_update
676
+
677
+
678
def remove_tips():
    """Return a single component update that hides its target."""
    hidden = gr.update(visible=False)
    return hidden
680
+
681
+
682
def apply_style_positive(style_name: str, positive: str):
    """Insert ``positive`` into the positive template of a style.

    Looks ``style_name`` up in ``styles``, falling back to the entry for
    ``DEFAULT_STYLE_NAME``, and substitutes ``positive`` for the
    ``{prompt}`` placeholder. The style's negative half is ignored.
    """
    fallback = styles[DEFAULT_STYLE_NAME]
    positive_template, _unused_negative = styles.get(style_name, fallback)
    return positive_template.replace("{prompt}", positive)
685
+
686
+
687
def apply_style(style_name: str, positives: list, negative: str = ""):
    """Apply a style template to several prompts and to a negative prompt.

    Looks ``style_name`` up in ``styles``, falling back to the entry for
    ``DEFAULT_STYLE_NAME``.

    Returns:
        A ``(styled_positives, combined_negative)`` tuple: every item of
        ``positives`` substituted for ``{prompt}`` in the positive template,
        and the style's negative text followed by a space and ``negative``.
    """
    fallback = styles[DEFAULT_STYLE_NAME]
    positive_template, style_negative = styles.get(style_name, fallback)
    styled_positives = []
    for text in positives:
        styled_positives.append(positive_template.replace("{prompt}", text))
    combined_negative = style_negative + " " + negative
    return styled_positives, combined_negative
692
+
693
+
694
def change_visiale_by_model_type(_model_type):
    """Return visibility updates for the reference-image controls.

    The control-mode radio in this file offers Chinese labels, while the
    examples still supply the original English ones, so both spellings of
    each mode are accepted.

    Args:
        _model_type: Selected control-mode label.

    Returns:
        Three component updates. The first two are visible only in
        reference-image mode; the third is always hidden. The change handler
        wires them to the image-input group, the style-strength slider and
        the IP-Adapter-strength slider, in that order.

    Raises:
        ValueError: If ``_model_type`` is not one of the known labels.
    """
    text_only_labels = ("Only Using Textual Description", "仅使用文本描述")
    ref_image_labels = ("Using Ref Images", "使用参考图像")

    if _model_type in text_only_labels:
        show_ref_controls = False
    elif _model_type in ref_image_labels:
        show_ref_controls = True
    else:
        raise ValueError("Invalid model type", _model_type)

    return (
        gr.update(visible=show_ref_controls),
        gr.update(visible=show_ref_controls),
        gr.update(visible=False),
    )
709
+
710
+
711
def load_character_files(character_files: str):
    """Build the character-description text from saved character files.

    Args:
        character_files: Newline-separated file paths. Blank lines and
            surrounding whitespace are ignored.

    Returns:
        One line per file, each the stored ``"character"`` value immediately
        followed by the stored ``"description"`` value, joined with newlines.

    Raises:
        gr.Error: If no path is supplied.
    """
    paths = [line.strip() for line in character_files.splitlines() if line.strip()]
    if not paths:
        raise gr.Error("Please set a character file!")

    primarytext = []
    for path in paths:
        # SECURITY: torch.load unpickles the file, which can execute
        # arbitrary code. Only load character files from a trusted source.
        saved = torch.load(path, map_location=torch.device("cpu"))
        primarytext.append(saved["character"] + saved["description"])
    return "\n".join(primarytext)
722
+
723
+
724
def load_character_files_on_running(unet, character_files: str):
    """Load every listed character file into ``unet``.

    Args:
        unet: Passed through to ``load_single_character_weights``.
        character_files: Newline-separated file paths. Blank lines and
            surrounding whitespace are ignored.

    Returns:
        True if at least one file was loaded, False if no path was given.
    """
    paths = [line.strip() for line in character_files.splitlines() if line.strip()]
    for path in paths:
        load_single_character_weights(unet, path)
    return bool(paths)
731
+
732
+
733
+ ######### Image Generation ##############
734
def process_generation(
    _sd_type,
    _model_type,
    _upload_images,
    _num_steps,
    style_name,
    _Ip_Adapter_Strength,
    _style_strength_ratio,
    guidance_scale,
    seed_,
    sa32_,
    sa64_,
    id_length_,
    general_prompt,
    negative_prompt,
    prompt_array,
    G_height,
    G_width,
    _comic_type,
    font_choice,
    _char_files,
):
    """Generate the story images and yield the gallery as it grows.

    This is a generator: it yields the list of images produced so far after
    each character's identity batch and after each remaining prompt, then
    yields the final list (with typeset comic pages prepended when a
    typesetting style is selected). It also writes that final list to disk
    through ``save_results``.

    Args:
        _sd_type: Key into ``models_dict`` selecting the base model.
        _model_type: Control-mode label. The English label
            ``"Using Ref Images"`` and the Chinese label offered by the UI
            radio both select Photomaker mode; anything else selects
            text-only ("original") mode.
        _upload_images: Reference images, one per character (Photomaker mode).
        _num_steps: Number of inference steps.
        style_name: Style template name for ``apply_style``.
        _Ip_Adapter_Strength: Accepted for interface compatibility; unused.
        _style_strength_ratio: Percentage of the steps after which merging
            starts; the resulting step is capped at 30.
        guidance_scale: Guidance scale forwarded to the pipeline.
        seed_: Seed used for ``setup_seed`` and for every generator.
        sa32_, sa64_: Stored into the module-level ``sa32`` / ``sa64``.
        id_length_: Stored into the module-level ``id_length``.
        general_prompt: Character descriptions, one character per line.
        negative_prompt: User negative prompt; the style's negative text is
            prepended to it once.
        prompt_array: Panel prompts, one per line. ``[NC]`` marks a panel
            without characters; text after the last ``#`` is the caption.
        G_height, G_width: Output size, stored into ``height`` / ``width``.
        _comic_type: Typesetting label; the English and Chinese "default"
            labels both disable typesetting.
        font_choice: Font file name inside ``fonts/`` used for captions.
        _char_files: Newline-separated character weight files. When any are
            loaded, the identity-image phase is skipped.

    Raises:
        gr.Error: On more than five character lines, a missing ``img``
            trigger word or missing reference images in Photomaker mode,
            ``[NC]`` inside the identity prompts, an image/character count
            mismatch, or several characters in one Photomaker prompt.
    """
    global sa32, sa64, id_length, unet, cur_model_type
    global write
    global cur_step, attn_count
    global height, width
    global pipe
    global sd_model_path
    global character_dict, character_index_dict, invert_character_index_dict, ref_indexs_dict, ref_totals
    global cur_character

    if len(general_prompt.splitlines()) > 5:
        raise gr.Error(
            "Support for more than five characters is temporarily unavailable due to VRAM limitations, but this issue will be resolved soon."
        )
    # The UI radio emits the Chinese label while the examples still carry the
    # English one; both must select Photomaker mode.
    ref_image_labels = ("Using Ref Images", "使用参考图像")
    _model_type = "Photomaker" if _model_type in ref_image_labels else "original"
    use_photomaker = _model_type == "Photomaker"
    if use_photomaker and "img" not in general_prompt:
        raise gr.Error(
            'Please add the triger word " img " behind the class word you want to customize, such as: man img or woman img'
        )
    if _upload_images is None and use_photomaker:
        raise gr.Error("Cannot find any input face image!")

    height = G_height
    width = G_width
    model_info = models_dict[_sd_type]
    # Keep the global a path string, as it is at module level.
    sd_model_path = model_info["path"]

    # Drop identity tensors cached by the previous run.
    for attn_processor in pipe.unet.attn_processors.values():
        if isinstance(attn_processor, SpatialAttnProcessor2_0):
            attn_processor.id_bank = {}
            # NOTE(review): this reads the module-level id_length left by the
            # previous run, not id_length_ -- confirm that is intended.
            attn_processor.id_length = id_length
            attn_processor.total_length = id_length + 1
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()

    requested_model_type = _sd_type + "-" + _model_type
    if cur_model_type != requested_model_type:
        # Release the old pipeline before loading the new one.
        del pipe
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
        model_info["model_type"] = _model_type
        print("device = ", device)
        pipe = load_models(model_info, device=device, photomaker_path=photomaker_path)
        set_attention_processor(pipe.unet, id_length_, is_ipadapter=False)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
        cur_model_type = requested_model_type
        pipe.enable_vae_slicing()
        if device != "mps":
            pipe.enable_model_cpu_offload()
    # Always rebind: after a reload the old global would otherwise still
    # point at the discarded pipeline's UNet.
    unet = pipe.unet

    load_chars = load_character_files_on_running(unet, character_files=_char_files)

    prompts = prompt_array.splitlines()
    character_dict, character_list = character_to_dict(general_prompt)

    start_merge_step = min(int(float(_style_strength_ratio) / 100 * _num_steps), 30)
    print(f"start_merge_step:{start_merge_step}")
    sa32, sa64 = sa32_, sa64_
    id_length = id_length_

    nc_indexs = []
    for ind, prompt in enumerate(prompts):
        if "[NC]" in prompt:
            nc_indexs.append(ind)
            if ind < id_length:
                raise gr.Error(
                    f"The first {id_length} row is id prompts, cannot use [NC]!"
                )
    # Strip the [NC] flag and any trailing "#caption" from the prompts.
    prompts = [prompt.replace("[NC]", "") for prompt in prompts]
    prompts = [
        prompt.rpartition("#")[0] if "#" in prompt else prompt for prompt in prompts
    ]
    print(prompts)
    (
        character_index_dict,
        invert_character_index_dict,
        replace_prompts,
        ref_indexs_dict,
        ref_totals,
    ) = process_original_prompt(character_dict, prompts.copy(), id_length)
    if use_photomaker:
        input_id_images_dict = {}
        if len(_upload_images) != len(character_dict.keys()):
            raise gr.Error(
                f"You upload images({len(_upload_images)}) is not equal to the number of characters({len(character_dict.keys())})!"
            )
        for ind, img in enumerate(_upload_images):
            input_id_images_dict[character_list[ind]] = [load_image(img)]
    print(character_dict)
    print(character_index_dict)
    print(invert_character_index_dict)
    if device == "cuda":
        torch.cuda.empty_cache()
    write = True
    cur_step = 0
    attn_count = 0
    setup_seed(seed_)
    results_dict = {}

    # Combine the style's negative text with the user's exactly once, so it is
    # not re-appended for every character.
    _, styled_negative_prompt = apply_style(style_name, [], negative_prompt)
    shared_kwargs = {
        "num_inference_steps": _num_steps,
        "guidance_scale": guidance_scale,
        "height": height,
        "width": width,
        "negative_prompt": styled_negative_prompt,
    }

    # Phase 1: identity images per character (skipped when weights were loaded).
    if not load_chars:
        for character_key in character_dict.keys():
            cur_character = [character_key]
            ref_indexs = ref_indexs_dict[character_key]
            print(character_key, ref_indexs)
            current_prompts = [replace_prompts[ref_ind] for ref_ind in ref_indexs]
            print(current_prompts)
            setup_seed(seed_)
            generator = torch.Generator(device=device).manual_seed(seed_)
            cur_step = 0
            cur_positive_prompts = [
                apply_style_positive(style_name, prompt) for prompt in current_prompts
            ]
            call_kwargs = dict(shared_kwargs, generator=generator)
            if use_photomaker:
                call_kwargs.update(
                    input_id_images=input_id_images_dict[character_key],
                    start_merge_step=start_merge_step,
                )
            id_images = pipe(cur_positive_prompts, **call_kwargs).images

            print(id_images)
            for ind, img in enumerate(id_images):
                print(ref_indexs[ind])
                results_dict[ref_indexs[ind]] = img
            yield list(results_dict.values())
    write = False

    # Phase 2: the remaining prompts, one image each.
    if load_chars:
        real_prompts_inds = list(range(len(prompts)))
    else:
        real_prompts_inds = [
            ind for ind in range(len(prompts)) if ind not in ref_totals
        ]
    print(real_prompts_inds)

    for real_prompts_ind in real_prompts_inds:
        real_prompt = replace_prompts[real_prompts_ind]
        cur_character = get_ref_character(prompts[real_prompts_ind], character_dict)
        print(cur_character, real_prompt)
        setup_seed(seed_)
        if len(cur_character) > 1 and use_photomaker:
            raise gr.Error(
                "Temporarily Not Support Multiple character in Ref Image Mode!"
            )
        generator = torch.Generator(device=device).manual_seed(seed_)
        cur_step = 0
        real_prompt = apply_style_positive(style_name, real_prompt)
        call_kwargs = dict(shared_kwargs, generator=generator)
        if use_photomaker:
            is_nc = real_prompts_ind in nc_indexs
            # Character-free panels still need reference images; the first
            # character's are passed together with nc_flag.
            ref_character = character_list[0] if is_nc else cur_character[0]
            call_kwargs.update(
                input_id_images=input_id_images_dict[ref_character],
                start_merge_step=start_merge_step,
                nc_flag=is_nc,
            )
        results_dict[real_prompts_ind] = pipe(real_prompt, **call_kwargs).images[0]
        yield list(results_dict.values())

    total_results = [results_dict[ind] for ind in range(len(prompts))]
    # The UI radio offers the Chinese "default" label; treat it like the
    # English one so that "default" really means no typesetting.
    no_typesetting_labels = ("No typesetting (default)", "默认")
    if _comic_type not in no_typesetting_labels:
        captions = [
            caption.replace("[NC]", "").split("#")[-1]
            for caption in prompt_array.splitlines()
        ]
        font_path = os.path.join("fonts", font_choice)
        font = ImageFont.truetype(font_path, int(45))
        total_results = (
            get_comic(total_results, _comic_type, captions=captions, font=font)
            + total_results
        )
    save_results(pipe.unet, total_results)

    yield total_results
994
+
995
+
996
def array2string(arr):
    """Join the strings in ``arr`` with newlines, without a trailing newline.

    Args:
        arr: Iterable of strings.

    Returns:
        The items separated by ``"\\n"``; an empty string for empty input.
    """
    return "\n".join(arr)
1005
+
1006
+
1007
+ #################################################
1008
+ #################################################
1009
+ ### define the interface
1010
+
1011
+ with gr.Blocks(css=css) as demo:
1012
+ binary_matrixes = gr.State([])
1013
+ color_layout = gr.State([])
1014
+
1015
+ # gr.Markdown(logo)
1016
+ gr.Markdown(title)
1017
+ gr.Markdown(description)
1018
+
1019
+ with gr.Row():
1020
+ with gr.Group(elem_id="main-image"):
1021
+
1022
+ prompts = []
1023
+ colors = []
1024
+
1025
+ with gr.Column(visible=True) as gen_prompt_vis:
1026
+ with gr.Group(visible=False):
1027
+ sd_type = gr.Dropdown(
1028
+ choices=list(models_dict.keys()),
1029
+ value="Unstable",
1030
+ label="模型类型",
1031
+ info="选择预训练模型",
1032
+ )
1033
+ model_type = gr.Radio(
1034
+ ["仅使用文本描述", "使用参考图像"],
1035
+ label="控制模式",
1036
+ value="仅使用文本描述",
1037
+ info="角色控制方式",
1038
+ )
1039
+ with gr.Group(visible=True) as control_image_input:
1040
+ files = gr.Files(
1041
+ label="拖放或选择1张或多张面部照片",
1042
+ file_types=["image"],
1043
+ )
1044
+ uploaded_files = gr.Gallery(
1045
+ label="已上传图片",
1046
+ visible=False,
1047
+ columns=5,
1048
+ rows=1,
1049
+ height=200,
1050
+ )
1051
+ with gr.Column(visible=False) as clear_button:
1052
+ remove_and_reupload = gr.ClearButton(
1053
+ value="清除并重新上传",
1054
+ components=files,
1055
+ size="sm",
1056
+ )
1057
+
1058
+ general_prompt = gr.Textbox(
1059
+ value="",
1060
+ lines=2,
1061
+ visible=False,
1062
+ label="(1) 角色文本描述",
1063
+ interactive=True,
1064
+ )
1065
+ negative_prompt = gr.Textbox(
1066
+ value="",
1067
+ label="(2) 负面提示词",
1068
+ visible=False,
1069
+ interactive=True
1070
+ )
1071
+ style = gr.Dropdown(
1072
+ label="风格模板",
1073
+ choices=STYLE_NAMES,
1074
+ value=DEFAULT_STYLE_NAME,
1075
+ )
1076
+ prompt_array = gr.Textbox(
1077
+ lines=1,
1078
+ value="",
1079
+ visible=False,
1080
+ label="(3) 漫画描述(每行对应一个画格)",
1081
+ interactive=True,
1082
+ )
1083
+ char_path = gr.Textbox(
1084
+ lines=2,
1085
+ value="",
1086
+ visible=False,
1087
+ label="(可选) 角色文件路径",
1088
+ interactive=True,
1089
+ )
1090
+ char_btn = gr.Button("加载角色文件", visible=False)
1091
+
1092
+ with gr.Group(visible=False):
1093
+ font_choice = gr.Dropdown(
1094
+ label="选择字体",
1095
+ choices=[
1096
+ f for f in os.listdir("fonts") if f.endswith(".ttf")
1097
+ ],
1098
+ value="Inkfree.ttf",
1099
+ info="选择最终幻灯片的字体",
1100
+ interactive=True,
1101
+ )
1102
+ sa32_ = gr.Slider(
1103
+ label="32x32自注意力层配对注意力强度",
1104
+ minimum=0,
1105
+ maximum=1.0,
1106
+ value=0.5,
1107
+ step=0.1,
1108
+ )
1109
+ sa64_ = gr.Slider(
1110
+ label="64x64自注意力层配对注意力强度",
1111
+ minimum=0,
1112
+ maximum=1.0,
1113
+ value=0.5,
1114
+ step=0.1,
1115
+ )
1116
+ id_length_ = gr.Slider(
1117
+ label="总图像中包含ID图像的数量",
1118
+ minimum=1,
1119
+ maximum=4,
1120
+ value=1,
1121
+ step=1,
1122
+ )
1123
+ with gr.Row():
1124
+ seed_ = gr.Slider(
1125
+ label="随机种子", minimum=-1, maximum=MAX_SEED, value=0, step=1
1126
+ )
1127
+ randomize_seed_btn = gr.Button("🎲", size="sm")
1128
+ num_steps = gr.Slider(
1129
+ label="采样步数",
1130
+ minimum=20,
1131
+ maximum=100,
1132
+ step=1,
1133
+ value=35,
1134
+ )
1135
+ G_height = gr.Slider(
1136
+ label="图像高度",
1137
+ minimum=256,
1138
+ maximum=1024,
1139
+ step=32,
1140
+ value=768,
1141
+ )
1142
+ G_width = gr.Slider(
1143
+ label="图像宽度",
1144
+ minimum=256,
1145
+ maximum=1024,
1146
+ step=32,
1147
+ value=768,
1148
+ )
1149
+ comic_type = gr.Radio(
1150
+ [
1151
+ "默认",
1152
+ "四格漫画",
1153
+ "经典漫画风格",
1154
+ ],
1155
+ value="四格漫画",###########################################################################
1156
+ label="排版风格",
1157
+ info="选择漫画排版风格",
1158
+ )
1159
+ with gr.Group(visible=False):
1160
+ guidance_scale = gr.Slider(
1161
+ label="引导尺度",
1162
+ minimum=0.1,
1163
+ maximum=10.0,
1164
+ step=0.1,
1165
+ value=5,
1166
+ )
1167
+ style_strength_ratio = gr.Slider(
1168
+ label="参考图像风格强度 (%)",
1169
+ minimum=15,
1170
+ maximum=50,
1171
+ step=1,
1172
+ value=20,
1173
+ visible=False,
1174
+ )
1175
+ Ip_Adapter_Strength = gr.Slider(
1176
+ label="IP适配器强度",
1177
+ minimum=0,
1178
+ maximum=1,
1179
+ step=0.1,
1180
+ value=0.5,
1181
+ visible=False,
1182
+ )
1183
+ final_run_btn = gr.Button("开始生成!😺")
1184
+
1185
+ with gr.Column():
1186
+ out_image = gr.Gallery(label="生成结果", columns=2, height="auto")
1187
+ # print(out_image,"#########################################################################################################")
1188
+ generated_information = gr.Markdown(
1189
+ label="生成详情", value="", visible=False
1190
+ )
1191
+ gr.Markdown(version)
1192
+ model_type.change(
1193
+ fn=change_visiale_by_model_type,
1194
+ inputs=model_type,
1195
+ outputs=[control_image_input, style_strength_ratio, Ip_Adapter_Strength],
1196
+ )
1197
+ files.upload(
1198
+ fn=swap_to_gallery, inputs=files, outputs=[uploaded_files, clear_button, files]
1199
+ )
1200
+ remove_and_reupload.click(
1201
+ fn=remove_back_to_files, outputs=[uploaded_files, clear_button, files]
1202
+ )
1203
+ char_btn.click(fn=load_character_files, inputs=char_path, outputs=[general_prompt])
1204
+
1205
+ randomize_seed_btn.click(
1206
+ fn=lambda: random.randint(-1, MAX_SEED),
1207
+ inputs=[],
1208
+ outputs=seed_,
1209
+ )
1210
+
1211
+ final_run_btn.click(fn=set_text_unfinished, outputs=generated_information).then(
1212
+ process_generation,
1213
+ inputs=[
1214
+ sd_type,
1215
+ model_type,
1216
+ files,
1217
+ num_steps,
1218
+ style,
1219
+ Ip_Adapter_Strength,
1220
+ style_strength_ratio,
1221
+ guidance_scale,
1222
+ seed_,
1223
+ sa32_,
1224
+ sa64_,
1225
+ id_length_,
1226
+ general_prompt,
1227
+ negative_prompt,
1228
+ prompt_array,
1229
+ G_height,
1230
+ G_width,
1231
+ comic_type,
1232
+ font_choice,
1233
+ char_path,
1234
+ ],
1235
+ outputs=out_image,
1236
+ ).then(fn=set_text_finished, outputs=generated_information)
1237
+
1238
+
1239
+ with gr.Accordion("😺 点击选择内容 😺", open=False, elem_id="my_accordion"):
1240
+
1241
+ gr.Markdown("### 👦 男生视角")
1242
+ gr.Examples(
1243
+ examples=[
1244
+ [
1245
+ 0,
1246
+ 0.3,
1247
+ 0.5,
1248
+ 1,
1249
+ "[Bob] a man img, \n[a]a flower img\n[b]a camel img\n[c] a bridge img\n[d] a gatehouse img",
1250
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1251
+ array2string(
1252
+ [
1253
+ "[Bob] a portrait of a man standing alone, realistic style, natural lighting",
1254
+ "[a] a peach blossom flower",
1255
+ "[b] a Bactrian camel with a desert background",
1256
+ "[c] a Chinese bridge",
1257
+ "[d] a traditional Chinese gatehouse (ornamental archway with a plaque)",
1258
+ "[Bob] stands next to a blooming peach blossom flower, its petals glowing softly under the sunlight",
1259
+ "[Bob] stands next to the camel",
1260
+ "[Bob] stands on an ancient Chinese stone arch bridge over a quiet river",
1261
+ "[Bob] stands beneath a traditional Chinese gatehouse with red lanterns and wooden carvings",
1262
+ ]
1263
+ ),
1264
+ "日本漫画风",
1265
+ "Using Ref Images",
1266
+ # [],# get_image_path_list("examples/taylor"),
1267
+ 768,
1268
+ 768
1269
+ ],
1270
+ [
1271
+ 0,
1272
+ 0.3,
1273
+ 0.5,
1274
+ 1,
1275
+ "[Bob] a man img, \n[a]a sunflower img\n[b]a horse img\n[c]a riverside house img\n[d]a Chinese pavilion img",
1276
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1277
+ array2string(
1278
+ [
1279
+ "[Bob] a portrait of a man standing alone, realistic style, natural lighting",
1280
+ "[a] a sunflower with large golden petals facing the sun",
1281
+ "[b] a brown horse standing on a grassy field",
1282
+ "[c] a traditional riverside house with white walls and black-tiled roof",
1283
+ "[d] an ancient Chinese multi-tiered pavilion with curved eaves",
1284
+ "[Bob] stands next to a sunflower, its bright petals glowing in the sunlight",
1285
+ "[Bob] stands next to the horse",
1286
+ "[Bob] stands beside a riverside house, reflected softly in the calm water",
1287
+ "[Bob] stands under a traditional Chinese pavilion with curved roofs and wooden columns",
1288
+ ]
1289
+ ),
1290
+ "日本漫画风",
1291
+ "Using Ref Images",
1292
+ # [], # get_image_path_list("examples/taylor"),
1293
+ 768,
1294
+ 768
1295
+ ],
1296
+ [
1297
+ 0,
1298
+ 0.3,
1299
+ 0.5,
1300
+ 1,
1301
+ "[Bob] a man img, \n[a]a tree img\n[b]an alpaca img\n[c] a bridge img\n[d] a riverside house img",
1302
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1303
+ array2string(
1304
+ [
1305
+ "[Bob] a portrait of a man standing alone, realistic style, natural lighting",
1306
+ "[a] a tree ",
1307
+ "[b] an alpaca with fluffy white wool standing on grassland",
1308
+ "[c] a Chinese bridge",
1309
+ "[d] a traditional riverside house with white walls and black-tiled roof",
1310
+ "[Bob] stands next to a tree, under the sunlight",
1311
+ "[Bob] stands next to the alpaca",
1312
+ "[Bob] stands on an ancient Chinese stone arch bridge over a quiet river",
1313
+ "[Bob] stands beside a riverside house, its reflection shimmering in the gentle stream",
1314
+ ]
1315
+ ),
1316
+ "日本漫画风",
1317
+ "Using Ref Images",
1318
+ # [], # get_image_path_list("examples/taylor"),
1319
+ 768,
1320
+ 768
1321
+ ]
1322
+ ],
1323
+ example_labels=[
1324
+ "示例1:桃花,骆驼,桥,门楼",
1325
+ "示例2:向日葵,马,水乡,亭子",
1326
+ "示例3:树,羊驼,桥,水乡",
1327
+ ],
1328
+ inputs=[
1329
+ seed_,
1330
+ sa32_,
1331
+ sa64_,
1332
+ id_length_,
1333
+ general_prompt,
1334
+ negative_prompt,
1335
+ prompt_array,
1336
+ style,
1337
+ model_type,
1338
+ # files,
1339
+ G_height,
1340
+ G_width,
1341
+ ],
1342
+ )
1343
+
1344
+ gr.Markdown("### 👧 女生视角")
1345
+ gr.Examples(
1346
+ examples=[
1347
+
1348
+ [
1349
+ 0,
1350
+ 0.3,
1351
+ 0.5,
1352
+ 1,
1353
+ "[Alice] a woman img, \n[a]a flower img\n[b]a camel img\n[c] a bridge img\n[d] a gatehouse img",
1354
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1355
+ array2string(
1356
+ [
1357
+ "[Alice] a portrait of a woman standing alone, realistic style, natural lighting",
1358
+ "[a] a peach blossom flower",
1359
+ "[b] a Bactrian camel with a desert background",
1360
+ "[c] a Chinese bridge",
1361
+ "[d] a traditional Chinese gatehouse (ornamental archway with a plaque)",
1362
+ "[Alice] stands next to a blooming peach blossom flower, its petals glowing softly under the sunlight",
1363
+ "[Alice] stands beside a camel",
1364
+ "[Alice] stands on an ancient Chinese stone arch bridge over a quiet river",
1365
+ "[Alice] stands beneath a traditional Chinese gatehouse with red lanterns and wooden carvings",
1366
+ ]
1367
+ ),
1368
+ "日本漫画风",
1369
+ "Using Ref Images",
1370
+ # [], # get_image_path_list("examples/taylor"),
1371
+ 768,
1372
+ 768
1373
+ ],
1374
+ [
1375
+ 0,
1376
+ 0.3,
1377
+ 0.5,
1378
+ 1,
1379
+ "[Alice] a woman img, \n[a]a tree img\n[b]an alpaca img\n[c] a bridge img\n[d] a gatehouse img",
1380
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1381
+ array2string(
1382
+ [
1383
+ "[Alice] a portrait of a woman standing alone, realistic style, natural lighting",
1384
+ "[a] a tree ",
1385
+ "[b] an alpaca with fluffy white wool standing on grassland",
1386
+ "[c] a Chinese bridge",
1387
+ "[d] a traditional Chinese gatehouse (ornamental archway with a plaque)",
1388
+ "[Alice] stands next to a tree, under the sunlight",
1389
+ "[Alice] stands next to the alpaca",
1390
+ "[Alice] stands on an ancient Chinese stone arch bridge over a quiet river",
1391
+ "[Alice] stands beneath a traditional Chinese gatehouse with red lanterns and wooden carvings",
1392
+ ]
1393
+ ),
1394
+ "日本漫画风",
1395
+ "Using Ref Images",
1396
+ # [], # get_image_path_list("examples/taylor"),
1397
+ 768,
1398
+ 768
1399
+ ],
1400
+ [
1401
+ 0,
1402
+ 0.3,
1403
+ 0.5,
1404
+ 1,
1405
+ "[Alice] a woman img, \n[a]a sunflower img\n[b]a white goose img\n[c]a riverside house img\n[d]a Chinese pavilion img",
1406
+ "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
1407
+ array2string(
1408
+ [
1409
+ "[Alice] a portrait of a woman standing alone, realistic style, natural lighting",
1410
+ "[a] a sunflower with large golden petals facing the sun",
1411
+ "[b] a white goose with smooth feathers standing near the water",
1412
+ "[c] a traditional riverside house with white walls and black-tiled roof",
1413
+ "[d] an ancient Chinese multi-tiered pavilion with curved eaves",
1414
+ "[Alice] stands next to a sunflower, its bright petals glowing in the sunlight",
1415
+ "[Alice] stands next to the white goose, its feathers clean and shining in the sun",
1416
+ "[Alice] stands beside a riverside house, its reflection shimmering in the gentle stream",
1417
+ "[Alice] stands under a Chinese pavilion with layered roofs and carved wooden pillars",
1418
+ ]
1419
+ ),
1420
+ "日本漫画风",
1421
+ "Using Ref Images",
1422
+ # [], # get_image_path_list("examples/taylor"),
1423
+ 768,
1424
+ 768
1425
+ ]
1426
+ ],
1427
+ example_labels=[
1428
+ "示例1:桃花,骆驼,桥,门楼",
1429
+ "示例2:树,羊驼,桥,门楼",
1430
+ "示例3:向日葵,鹅,水乡,亭子",
1431
+ ],
1432
+ inputs=[
1433
+ seed_,
1434
+ sa32_,
1435
+ sa64_,
1436
+ id_length_,
1437
+ general_prompt,
1438
+ negative_prompt,
1439
+ prompt_array,
1440
+ style,
1441
+ model_type,
1442
+ # files,
1443
+ G_height,
1444
+ G_width,
1445
+ ],
1446
+ # outputs=[post_sketch, binary_matrixes, *color_row, *colors, *prompts, gen_prompt_vis, general_prompt, seed_],
1447
+ # run_on_click=True,
1448
+ label="😺 请选择: 😺",
1449
+ )
1450
+
1451
+ gr.Markdown(article)
1452
+
1453
+
1454
+ demo.launch(server_name="0.0.0.0", share=True)
gradio_app_sdxl_specific_id_low_vram1.py ADDED
@@ -0,0 +1,1365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from this import d
2
+ import gradio as gr
3
+ import numpy as np
4
+ import torch
5
+ import gc
6
+ import copy
7
+ import os
8
+ import random
9
+ import datetime
10
+ from PIL import ImageFont
11
+ from utils.gradio_utils import (
12
+ character_to_dict,
13
+ process_original_prompt,
14
+ get_ref_character,
15
+ cal_attn_mask_xl,
16
+ cal_attn_indice_xl_effcient_memory,
17
+ is_torch2_available,
18
+ )
19
+
20
+ import os
21
+ os.environ['GPU_PLATFORM_ID'] = '0'
22
+ os.environ['GPU_DEVICE_ID'] = '0'
23
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
24
+
25
+
26
+ import os
27
+ os.environ['HF_ENDPOINT']= 'https://hf-mirror.com'
28
+ torch.backends.cudnn.enabled = True
29
+
30
+ if is_torch2_available():
31
+ from utils.gradio_utils import AttnProcessor2_0 as AttnProcessor
32
+ else:
33
+ from utils.gradio_utils import AttnProcessor
34
+ from huggingface_hub import hf_hub_download
35
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
36
+ StableDiffusionXLPipeline,
37
+ )
38
+ from diffusers.schedulers.scheduling_ddim import DDIMScheduler
39
+ import torch.nn.functional as F
40
+ from diffusers.utils.loading_utils import load_image
41
+ from utils.utils import get_comic
42
+ from utils.style_template1 import styles
43
+ from utils.load_models_utils import get_models_dict, load_models
44
+
45
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '5'
46
+
47
# Style template names offered by the UI, plus the fallback used when an
# unknown style is requested.
STYLE_NAMES = list(styles)
DEFAULT_STYLE_NAME = "Japanese Anime"

# Model registry; entries are looked up by name further down in this file.
models_dict = get_models_dict()

# Automatically select the device: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"@@device:{device}")


# Reuse the PhotoMaker checkpoint if it is already present under `local_dir`;
# otherwise download it from the Hugging Face Hub into that directory.
local_dir = "data/"
photomaker_local_path = f"{local_dir}photomaker-v1.bin"
if os.path.exists(photomaker_local_path):
    photomaker_path = photomaker_local_path
else:
    photomaker_path = hf_hub_download(
        repo_id="TencentARC/PhotoMaker",
        filename="photomaker-v1.bin",
        repo_type="model",
        local_dir=local_dir,
    )

# Largest value representable by a signed 32-bit integer.
MAX_SEED = np.iinfo(np.int32).max
83
+
84
+
85
def setup_seed(seed):
    """Seed every random number generator the demo relies on.

    Seeds PyTorch (CPU and, when present, all CUDA devices), NumPy and
    Python's ``random`` module with the same value, and switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    torch.manual_seed(seed)
    # The module-level ``device`` string is "cuda:0" on GPU machines, never
    # the bare "cuda", so comparing it with "cuda" skipped this call. Asking
    # torch directly does not depend on the exact device string.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
92
+
93
+
94
def set_text_unfinished():
    """Return a Gradio update that shows the "generation in progress" banner."""
    banner = "<h3>(Not Finished) Generating ··· The intermediate results will be shown.</h3>"
    return gr.update(visible=True, value=banner)
99
+
100
+
101
def set_text_finished():
    """Return a Gradio update that shows the "generation finished" banner."""
    banner = "<h3>Generation Finished</h3>"
    return gr.update(visible=True, value=banner)
103
+
104
+
105
+ #################################################
106
def get_image_path_list(folder_name):
    """Return the sorted paths of every entry directly inside *folder_name*.

    Each directory entry name is joined onto ``folder_name``; no filtering by
    file type is performed.
    """
    return sorted(
        os.path.join(folder_name, entry) for entry in os.listdir(folder_name)
    )
112
+
113
+
114
+ #################################################
115
class SpatialAttnProcessor2_0(torch.nn.Module):
    r"""
    Self-attention processor that shares per-character token features
    between images.

    The processor is driven by module-level globals (``write``, ``cur_step``,
    ``attn_count``, ``total_count``, ``cur_character``, ``sa32``, ``sa64``,
    ``height``, ``width``). While ``write`` is true, each call stores a subset
    of the current hidden-state tokens in ``self.id_bank`` (keyed by character
    and then by ``cur_step``). While ``write`` is false, the tokens stored for
    the current characters are appended to the keys/values of the attention
    call.

    Args:
        hidden_size (`int`, *optional*):
            Stored on the instance; not read by the methods of this class.
        cross_attention_dim (`int`, *optional*):
            Stored on the instance; not read by the methods of this class.
        id_length (`int`, defaults to 4):
            Forwarded to `cal_attn_indice_xl_effcient_memory`;
            ``total_length`` is set to ``id_length + 1``.
        device:
            Forwarded to `cal_attn_indice_xl_effcient_memory` and used as the
            target of ``tensor.to(...)`` for banked tensors. Defaults to the
            module-level ``device``.
        dtype (defaults to `torch.float16`):
            Forwarded to `cal_attn_indice_xl_effcient_memory`.

    Raises:
        ImportError: if ``torch.nn.functional`` has no
            ``scaled_dot_product_attention``.
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
        id_length=4,
        device=device,
        dtype=torch.float16,
    ):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )
        self.device = device
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.total_length = id_length + 1
        self.id_length = id_length
        # character -> {cur_step -> list of per-image token tensors}
        self.id_bank = {}

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Run one attention call and advance the global call/step counters.

        ``encoder_hidden_states`` is accepted for interface compatibility but
        is not read; the keys/values are always built from ``hidden_states``
        (optionally extended with banked tokens).

        Returns:
            The attended hidden states, reshaped back to three dimensions.
        """
        # All generation state is shared through module-level globals.
        global total_count, attn_count, cur_step, indices1024, indices4096
        global sa32, sa64
        global write
        global height, width
        global character_dict, character_index_dict, invert_character_index_dict, cur_character, ref_indexs_dict, ref_totals, cur_character
        # First processor call of the first step: compute the token indices.
        if attn_count == 0 and cur_step == 0:
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )
        if write:
            # Write mode handles exactly one character at a time.
            assert len(cur_character) == 1
            # Pick the index set matching this layer's token count.
            if hidden_states.shape[1] == (height // 32) * (width // 32):
                indices = indices1024
            else:
                indices = indices4096
            total_batch_size, nums_token, channel = hidden_states.shape
            # presumably the batch is [unconditional half, conditional half]
            # (classifier-free guidance) — TODO confirm
            img_nums = total_batch_size // 2
            hidden_states = hidden_states.reshape(-1, img_nums, nums_token, channel)
            if cur_character[0] not in self.id_bank:
                self.id_bank[cur_character[0]] = {}
            # Store, per image, the tokens selected by indices[img_ind].
            self.id_bank[cur_character[0]][cur_step] = [
                hidden_states[:, img_ind, indices[img_ind], :]
                .reshape(2, -1, channel)
                .clone()
                for img_ind in range(img_nums)
            ]
            hidden_states = hidden_states.reshape(-1, nums_token, channel)
        else:
            # TODO: ADD Multipersion Control
            # Read mode: collect the banked tokens of every current character
            # for this step and move them to this processor's device.
            encoder_arr = []
            for character in cur_character:
                encoder_arr = encoder_arr + [
                    tensor.to(self.device)
                    for tensor in self.id_bank[character][cur_step]
                ]
        if cur_step < 1:
            # Step 0 always uses plain self-attention.
            hidden_states = self.__call2__(
                attn, hidden_states, None, attention_mask, temb
            )
        else:  # 256 1024 4096
            # Shared attention is applied when a uniform random draw exceeds
            # the threshold: 0.3 before step 20, 0.1 afterwards.
            random_number = random.random()
            if cur_step < 20:
                rand_num = 0.3
            else:
                rand_num = 0.1
            if random_number > rand_num:
                if hidden_states.shape[1] == (height // 32) * (width // 32):
                    indices = indices1024
                else:
                    indices = indices4096
                if write:
                    total_batch_size, nums_token, channel = hidden_states.shape
                    img_nums = total_batch_size // 2
                    hidden_states = hidden_states.reshape(
                        -1, img_nums, nums_token, channel
                    )
                    # Selected tokens of every image in the batch.
                    encoder_arr = [
                        hidden_states[:, img_ind, indices[img_ind], :].reshape(
                            2, -1, channel
                        )
                        for img_ind in range(img_nums)
                    ]
                    for img_ind in range(img_nums):
                        # Keys/values for this image: the selected tokens of
                        # all *other* images followed by its own full tokens.
                        img_ind_list = [i for i in range(img_nums)]
                        img_ind_list.remove(img_ind)
                        encoder_hidden_states_tmp = torch.cat(
                            [encoder_arr[img_ind] for img_ind in img_ind_list]
                            + [hidden_states[:, img_ind, :, :]],
                            dim=1,
                        )

                        hidden_states[:, img_ind, :, :] = self.__call2__(
                            attn,
                            hidden_states[:, img_ind, :, :],
                            encoder_hidden_states_tmp,
                            None,
                            temb,
                        )
                else:
                    _, nums_token, channel = hidden_states.shape
                    hidden_states = hidden_states.reshape(2, -1, nums_token, channel)
                    # Keys/values: banked character tokens followed by the
                    # image's own tokens.
                    # NOTE(review): only index 0 along the second axis is
                    # updated here; presumably the batch holds a single image
                    # in read mode — confirm against the caller.
                    encoder_hidden_states_tmp = torch.cat(
                        encoder_arr + [hidden_states[:, 0, :, :]], dim=1
                    )
                    hidden_states[:, 0, :, :] = self.__call2__(
                        attn,
                        hidden_states[:, 0, :, :],
                        encoder_hidden_states_tmp,
                        None,
                        temb,
                    )
                hidden_states = hidden_states.reshape(-1, nums_token, channel)
            else:
                hidden_states = self.__call2__(
                    attn, hidden_states, None, attention_mask, temb
                )
        # Count this call; once `total_count` calls have run, reset the call
        # counter, advance `cur_step` and recompute the token indices.
        attn_count += 1
        if attn_count == total_count:
            attn_count = 0
            cur_step += 1
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )

        return hidden_states

    def __call2__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Scaled-dot-product attention using the projections of ``attn``.

        Queries come from ``hidden_states``; keys and values come from
        ``encoder_hidden_states``, or from ``hidden_states`` when it is None.
        Returns a tensor with the same number of dimensions as the input.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        # Flatten a 4-D input to (batch, height * width, channel).
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        batch_size, sequence_length, channel = hidden_states.shape
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(
                attention_mask, sequence_length, batch_size
            )
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(
                batch_size, attn.heads, -1, attention_mask.shape[-1]
            )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split the projected features into heads: (batch, heads, seq, head_dim).
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge the heads back into one feature dimension.
        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        # Restore the original 4-D layout if the input was 4-D.
        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
383
+
384
+
385
def set_attention_processor(unet, id_length, is_ipadapter=False):
    """Build a processor for every attention layer of *unet* and install them.

    Self-attention layers (names ending in ``attn1.processor``) inside
    ``up_blocks`` receive a ``SpatialAttnProcessor2_0``; every other layer
    receives a plain ``AttnProcessor`` unless ``is_ipadapter`` is set, in
    which case cross-attention layers receive an ``IPAttnProcessor2_0``.
    The mapping is stored in the module-level ``attn_procs`` and a deep copy
    of it is passed to ``unet.set_attn_processor``.

    Args:
        unet: UNet whose ``attn_processors`` are replaced.
        id_length: Forwarded to ``SpatialAttnProcessor2_0``.
        is_ipadapter: Select ``IPAttnProcessor2_0`` for cross-attention layers.
    """
    global attn_procs
    attn_procs = {}
    for name in unet.attn_processors:
        if name.endswith("attn1.processor"):
            cross_attention_dim = None
        else:
            cross_attention_dim = unet.config.cross_attention_dim

        # Channel width of the block this layer lives in (only consumed by
        # the IP-Adapter branch below).
        if name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif name.startswith("up_blocks"):
            block_id = int(name[len("up_blocks.")])
            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
        elif name.startswith("down_blocks"):
            block_id = int(name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

        if cross_attention_dim is None and name.startswith("up_blocks"):
            processor = SpatialAttnProcessor2_0(id_length=id_length)
        elif cross_attention_dim is not None and is_ipadapter:
            # NOTE(review): IPAttnProcessor2_0 is not among the imports
            # visible at the top of this file — confirm it is defined before
            # calling with is_ipadapter=True.
            processor = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=1,
                num_tokens=4,
            ).to(unet.device, dtype=torch.float16)
        else:
            processor = AttnProcessor()
        attn_procs[name] = processor

    unet.set_attn_processor(copy.deepcopy(attn_procs))
419
+
420
+
421
+ #################################################
422
+ #################################################
423
# Placeholder element the sketch-canvas script attaches to.
canvas_html = "<div id='canvas-root' style='max-width:400px; margin: 0 auto'></div>"

# JavaScript snippet: fetches the sketch-canvas component and injects it into
# the page as a module script.
load_js = """
async () => {
  const url = "https://huggingface.co/datasets/radames/gradio-components/raw/main/sketch-canvas.js"
  fetch(url)
    .then(res => res.text())
    .then(text => {
      const script = document.createElement('script');
      script.type = "module"
      script.src = URL.createObjectURL(new Blob([text], { type: 'application/javascript' }));
      document.head.appendChild(script);
    });
}
"""

# JavaScript snippet: returns the data object stored on the canvas root.
get_js_colors = """
async (canvasData) => {
  const canvasEl = document.getElementById("canvas-root");
  return [canvasEl._data]
}
"""

# Custom CSS for the demo. The previous text ended with a stray "<style>"
# token, which is not valid CSS and has been dropped.
css = """
#color-bg{display:flex;justify-content: center;align-items: center;}
.color-bg-item{width: 100%; height: 32px}
#main_button{width:100%}
"""
451
+
452
+
453
def save_single_character_weights(unet, character, description, filepath):
    """Save one character's banked attention tensors to *filepath*.

    The saved dictionary holds the ``description`` and ``character`` entries
    plus, for every ``SpatialAttnProcessor2_0`` in ``unet.attn_processors``,
    a ``{step_key: [tensor, ...]}`` mapping copied from that processor's
    ``id_bank[character]`` with every tensor moved to the CPU.

    Args:
        unet: Model whose attention processors hold the ``id_bank`` data.
        character: Key of the character inside each ``id_bank``.
        description: Stored alongside the tensors under ``"description"``.
        filepath: Destination passed to ``torch.save``.
    """
    payload = {"description": description, "character": character}
    for attn_name, processor in unet.attn_processors.items():
        if not isinstance(processor, SpatialAttnProcessor2_0):
            continue
        # Move each tensor to the CPU so the file can be written without
        # holding device tensors.
        payload[attn_name] = {
            step_key: [tensor.cpu() for tensor in tensors]
            for step_key, tensors in processor.id_bank[character].items()
        }
    torch.save(payload, filepath)
474
+
475
+
476
def load_single_character_weights(unet, filepath):
    """Load a saved character file into every spatial attention processor.

    The file is read with ``torch.load`` onto the CPU; for each
    ``SpatialAttnProcessor2_0`` in ``unet.attn_processors`` the processor's
    ``id_bank[character]`` is replaced by the stored tensors moved to
    ``unet.device``.

    NOTE(review): ``torch.load`` unpickles the file, so only load character
    files from trusted sources.

    Args:
        unet: Model whose attention processors receive the tensors.
        filepath: Path of a file written by ``save_single_character_weights``.
    """
    stored = torch.load(filepath, map_location=torch.device("cpu"))
    character = stored["character"]
    # Not used below; the lookup is kept so a file without this key still
    # fails here.
    description = stored["description"]
    for attn_name, processor in unet.attn_processors.items():
        if not isinstance(processor, SpatialAttnProcessor2_0):
            continue
        bank = {}
        processor.id_bank[character] = bank
        for step_key, tensors in stored[attn_name].items():
            bank[step_key] = [tensor.to(unet.device) for tensor in tensors]
496
+
497
+
498
def save_results(unet, img_list):
    """Write the generated images to a timestamped folder under ``results/``.

    Creates ``results/<YYYYmmdd-HHMMSS>/`` together with its ``weights``
    sub-folder and saves each image as ``image_<index>.png``.

    Args:
        unet: Currently unused; kept for the per-character weight export that
            is disabled below.
        img_list: Iterable of objects exposing ``save(path)`` (e.g. PIL images).
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    folder_name = f"results/{timestamp}"
    weight_folder_name = f"{folder_name}/weights"
    # Creating the nested path creates both folders. exist_ok avoids the
    # check-then-create race and also creates "weights" when the run folder
    # already exists (e.g. two calls within the same second).
    os.makedirs(weight_folder_name, exist_ok=True)

    for idx, img in enumerate(img_list):
        img.save(os.path.join(folder_name, f"image_{idx}.png"))

    # NOTE: exporting per-character weights is currently disabled:
    # for char in character_dict:
    #     description = character_dict[char]
    #     save_single_character_weights(unet,char,description,os.path.join(weight_folder_name, f'{char}.pt'))
515
+
516
+
517
+ #################################################
518
+ title = r"""
519
+ <h1 align="center">StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</h1>
520
+ """
521
+
522
+ description = r"""
523
+ <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'><b>StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</b></a>.<br>
524
+ ❗️❗️❗️[<b>Important</b>] Personalization steps:<br>
525
+ 1️⃣ Enter a Textual Description for Character, if you add the Ref-Image, making sure to <b>follow the class word</b> you want to customize with the <b>trigger word</b>: `img`, such as: `man img` or `woman img` or `girl img`.<br>
526
+ 2️⃣ Enter the prompt array, each line corresponds to one generated image.<br>
527
+ 3️⃣ Choose your preferred style template.<br>
528
+ 4️⃣ Click the <b>Submit</b> button to start customizing.
529
+ """
530
+
531
+ article = r"""
532
+
533
+ If StoryDiffusion is helpful, please help to ⭐ the <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'>Github Repo</a>. Thanks!
534
+ [![GitHub Stars](https://img.shields.io/github/stars/HVision-NKU/StoryDiffusion?style=social)](https://github.com/HVision-NKU/StoryDiffusion)
535
+ ---
536
+ 📝 **Citation**
537
+ <br>
538
+ If our work is useful for your research, please consider citing:
539
+
540
+ ```bibtex
541
+ @article{Zhou2024storydiffusion,
542
+ title={StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation},
543
+ author={Zhou, Yupeng and Zhou, Daquan and Cheng, Ming-Ming and Feng, Jiashi and Hou, Qibin},
544
+ year={2024}
545
+ }
546
+ ```
547
+ 📋 **License**
548
+ <br>
549
+ Apache-2.0 LICENSE.
550
+
551
+ 📧 **Contact**
552
+ <br>
553
+ If you have any questions, please feel free to reach me out at <b>ypzhousdu@gmail.com</b>.
554
+ """
555
+ version = r"""
556
+ <h3 align="center">StoryDiffusion Version 0.02 (test version)</h3>
557
+
558
+ <h5 >1. Support image ref image. (Cartoon Ref image is not support now)</h5>
559
+ <h5 >2. Support Typesetting Style and Captioning.(By default, the prompt is used as the caption for each image. If you need to change the caption, add a # at the end of each line. Only the part after the # will be added as a caption to the image.)</h5>
560
+ <h5 >3. [NC]symbol (The [NC] symbol is used as a flag to indicate that no characters should be present in the generated scene images. If you want do that, prepend the "[NC]" at the beginning of the line. For example, to generate a scene of falling leaves without any character, write: "[NC] The leaves are falling.")</h5>
561
+ <h5 align="center">Tips: </h5>
562
+ """
563
+ #################################################
564
# ---- Shared generation state -------------------------------------------
# These module-level names are read and rebound (via `global`) by the
# attention processors and by the Gradio callbacks.
attn_count = 0  # attention calls seen so far in the current step
total_count = 0  # number of SpatialAttnProcessor2_0 instances installed below
cur_step = 0
id_length = 4
total_length = 5
cur_model_type = ""
attn_procs = {}
# When true the spatial processors store tokens in their id_bank.
write = False
sa32 = 0.5
sa64 = 0.5
height = 768
width = 768

# ---- Load the default Stable Diffusion XL pipeline ----------------------
pipe = None
sd_model_path = models_dict["Unstable"]["path"]  # "SG161222/RealVisXL_V4.0"
single_files = models_dict["Unstable"]["single_files"]
if not single_files:
    pipe = StableDiffusionXLPipeline.from_pretrained(
        sd_model_path, torch_dtype=torch.float16, use_safetensors=False
    )
else:
    pipe = StableDiffusionXLPipeline.from_single_file(
        sd_model_path, torch_dtype=torch.float16
    )
print("pipE.device = ", device)
pipe = pipe.to(device)
pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
pipe.scheduler.set_timesteps(50)
pipe.enable_vae_slicing()
if device != "mps":
    pipe.enable_model_cpu_offload()
unet = pipe.unet
cur_model_type = "Unstable-original"

# ---- Install the attention processors -----------------------------------
# Self-attention layers inside up_blocks get the spatial processor; every
# other layer keeps the plain one.
for name in unet.attn_processors:
    if name.endswith("attn1.processor"):
        cross_attention_dim = None
    else:
        cross_attention_dim = unet.config.cross_attention_dim
    if name.startswith("mid_block"):
        hidden_size = unet.config.block_out_channels[-1]
    elif name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    elif name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        hidden_size = unet.config.block_out_channels[block_id]
    if cross_attention_dim is not None or not name.startswith("up_blocks"):
        attn_procs[name] = AttnProcessor()
    else:
        attn_procs[name] = SpatialAttnProcessor2_0(id_length=id_length)
        total_count += 1
print("successsfully load paired self-attention")
print(f"number of the processor : {total_count}")
unet.set_attn_processor(copy.deepcopy(attn_procs))

# Attention masks for the default resolution and sampling rates.
mask1024, mask4096 = cal_attn_mask_xl(
    total_length,
    id_length,
    sa32,
    sa64,
    height,
    width,
    device=device,
    dtype=torch.float16,
)
640
+
641
+ ######### Gradio Fuction #############
642
+
643
+
644
def swap_to_gallery(images):
    """Return three Gradio updates that switch the UI to the gallery view.

    The first update sets ``images`` as the value and makes its component
    visible, the second makes its component visible, and the third hides its
    component.
    """
    gallery_update = gr.update(value=images, visible=True)
    show_update = gr.update(visible=True)
    hide_update = gr.update(visible=False)
    return gallery_update, show_update, hide_update
650
+
651
+
652
def upload_example_to_gallery(images, prompt, style, negative_prompt):
    """Return the gallery-view updates for an example selection.

    Only ``images`` is used; ``prompt``, ``style`` and ``negative_prompt``
    are accepted but ignored. The returned tuple is: an update setting
    ``images`` as a visible value, an update showing a component, and an
    update hiding a component.
    """
    gallery_update = gr.update(value=images, visible=True)
    show_update = gr.update(visible=True)
    hide_update = gr.update(visible=False)
    return gallery_update, show_update, hide_update
658
+
659
+
660
def remove_back_to_files():
    """Return three Gradio updates: hide, hide, show (in that order)."""
    first_hidden = gr.update(visible=False)
    second_hidden = gr.update(visible=False)
    shown = gr.update(visible=True)
    return first_hidden, second_hidden, shown
662
+
663
+
664
def remove_tips():
    """Return a Gradio update that hides its target component."""
    hidden = gr.update(visible=False)
    return hidden
666
+
667
+
668
def apply_style_positive(style_name: str, positive: str):
    """Insert *positive* into the positive template of the chosen style.

    Unknown style names fall back to ``DEFAULT_STYLE_NAME``. The style's
    negative template is ignored.
    """
    fallback = styles[DEFAULT_STYLE_NAME]
    template, _unused_negative = styles.get(style_name, fallback)
    return template.replace("{prompt}", positive)
671
+
672
+
673
def apply_style(style_name: str, positives: list, negative: str = ""):
    """Apply a style template to several prompts at once.

    Unknown style names fall back to ``DEFAULT_STYLE_NAME``.

    Returns:
        A tuple ``(styled_prompts, negative_prompt)`` where each styled prompt
        is the style's positive template with ``{prompt}`` replaced, and the
        negative prompt is the style's negative text, a space, and *negative*.
    """
    fallback = styles[DEFAULT_STYLE_NAME]
    template, style_negative = styles.get(style_name, fallback)
    styled_prompts = []
    for positive in positives:
        styled_prompts.append(template.replace("{prompt}", positive))
    combined_negative = style_negative + " " + negative
    return styled_prompts, combined_negative
678
+
679
+
680
def change_visiale_by_model_type(_model_type):
    """Return three visibility updates matching the selected model type.

    For "Using Ref Images" the first two updates show their components; for
    "Only Using Textual Description" they hide them. The third update always
    hides its component.

    Raises:
        ValueError: if ``_model_type`` is neither of the two known options.
    """
    if _model_type == "Only Using Textual Description":
        show_ref_inputs = False
    elif _model_type == "Using Ref Images":
        show_ref_inputs = True
    else:
        raise ValueError("Invalid model type", _model_type)
    return (
        gr.update(visible=show_ref_inputs),
        gr.update(visible=show_ref_inputs),
        gr.update(visible=False),
    )
695
+
696
+
697
def load_character_files(character_files: str):
    """Build the character prompt text from saved character files.

    Each line of *character_files* is treated as a path, loaded with
    ``torch.load`` onto the CPU, and contributes its ``"character"`` and
    ``"description"`` entries concatenated. The resulting list is passed to
    ``array2string``.

    NOTE(review): ``torch.load`` unpickles the files, so only load character
    files from trusted sources.

    Raises:
        gr.Error: if *character_files* is an empty string.
    """
    if character_files == "":
        raise gr.Error("Please set a character file!")
    loaded = (
        torch.load(path, map_location=torch.device("cpu"))
        for path in character_files.splitlines()
    )
    primarytext = [record["character"] + record["description"] for record in loaded]
    return array2string(primarytext)
708
+
709
+
710
def load_character_files_on_running(unet, character_files: str):
    """Load every listed character file into *unet*'s attention processors.

    Each line of *character_files* is passed to
    ``load_single_character_weights``.

    Returns:
        ``False`` when *character_files* is an empty string (nothing loaded),
        otherwise ``True``.
    """
    if character_files != "":
        for path in character_files.splitlines():
            load_single_character_weights(unet, path)
        return True
    return False
717
+
718
+
719
+ ######### Image Generation ##############
720
+ def process_generation(
721
+ _sd_type,
722
+ _model_type,
723
+ _upload_images,
724
+ _num_steps,
725
+ style_name,
726
+ _Ip_Adapter_Strength,
727
+ _style_strength_ratio,
728
+ guidance_scale,
729
+ seed_,
730
+ sa32_,
731
+ sa64_,
732
+ id_length_,
733
+ general_prompt,
734
+ negative_prompt,
735
+ prompt_array,
736
+ G_height,
737
+ G_width,
738
+ _comic_type,
739
+ font_choice,
740
+ _char_files,
741
+ ): # Corrected font_choice usage
742
+ if len(general_prompt.splitlines()) > 5:
743
+ raise gr.Error(
744
+ "Support for more than three characters is temporarily unavailable due to VRAM limitations, but this issue will be resolved soon."
745
+ )
746
+ _model_type = "Photomaker" if _model_type == "Using Ref Images" else "original"
747
+ if _model_type == "Photomaker" and "img" not in general_prompt:
748
+ raise gr.Error(
749
+ 'Please add the triger word " img " behind the class word you want to customize, such as: man img or woman img'
750
+ )
751
+ if _upload_images is None and _model_type != "original":
752
+ raise gr.Error(f"Cannot find any input face image!")
753
+ global sa32, sa64, id_length, total_length, attn_procs, unet, cur_model_type
754
+ global write
755
+ global cur_step, attn_count
756
+ global height, width
757
+ height = G_height
758
+ width = G_width
759
+ global pipe
760
+ global sd_model_path, models_dict
761
+ sd_model_path = models_dict[_sd_type]
762
+ use_safe_tensor = True
763
+ for attn_processor in pipe.unet.attn_processors.values():
764
+ if isinstance(attn_processor, SpatialAttnProcessor2_0):
765
+ for values in attn_processor.id_bank.values():
766
+ del values
767
+ attn_processor.id_bank = {}
768
+ attn_processor.id_length = id_length
769
+ attn_processor.total_length = id_length + 1
770
+ gc.collect()
771
+ torch.cuda.empty_cache()
772
+ if cur_model_type != _sd_type + "-" + _model_type:
773
+ # apply the style template
774
+ ##### load pipe
775
+ del pipe
776
+ gc.collect()
777
+ if device == "cuda":
778
+ torch.cuda.empty_cache()
779
+ model_info = models_dict[_sd_type]
780
+ model_info["model_type"] = _model_type
781
+ print("device = ", device)
782
+ pipe = load_models(model_info, device=device, photomaker_path=photomaker_path)
783
+ set_attention_processor(pipe.unet, id_length_, is_ipadapter=False)
784
+ ##### ########################
785
+ pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
786
+ pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
787
+ cur_model_type = _sd_type + "-" + _model_type
788
+ pipe.enable_vae_slicing()
789
+ if device != "mps":
790
+ pipe.enable_model_cpu_offload()
791
+ else:
792
+ unet = pipe.unet
793
+ # unet.set_attn_processor(copy.deepcopy(attn_procs))
794
+
795
+ load_chars = load_character_files_on_running(unet, character_files=_char_files)
796
+
797
+ prompts = prompt_array.splitlines()
798
+ global character_dict, character_index_dict, invert_character_index_dict, ref_indexs_dict, ref_totals
799
+ character_dict, character_list = character_to_dict(general_prompt)
800
+
801
+ start_merge_step = int(float(_style_strength_ratio) / 100 * _num_steps)
802
+ if start_merge_step > 30:
803
+ start_merge_step = 30
804
+ print(f"start_merge_step:{start_merge_step}")
805
+ generator = torch.Generator(device=device).manual_seed(seed_)
806
+ sa32, sa64 = sa32_, sa64_
807
+ id_length = id_length_
808
+ clipped_prompts = prompts[:]
809
+ nc_indexs = []
810
+ for ind, prompt in enumerate(clipped_prompts):
811
+ if "[NC]" in prompt:
812
+ nc_indexs.append(ind)
813
+ if ind < id_length:
814
+ raise gr.Error(
815
+ f"The first {id_length} row is id prompts, cannot use [NC]!"
816
+ )
817
+ prompts = [
818
+ prompt if "[NC]" not in prompt else prompt.replace("[NC]", "")
819
+ for prompt in clipped_prompts
820
+ ]
821
+
822
+ prompts = [
823
+ prompt.rpartition("#")[0] if "#" in prompt else prompt for prompt in prompts
824
+ ]
825
+ print(prompts)
826
+ # id_prompts = prompts[:id_length]
827
+ (
828
+ character_index_dict,
829
+ invert_character_index_dict,
830
+ replace_prompts,
831
+ ref_indexs_dict,
832
+ ref_totals,
833
+ ) = process_original_prompt(character_dict, prompts.copy(), id_length)
834
+ if _model_type != "original":
835
+ input_id_images_dict = {}
836
+ if len(_upload_images) != len(character_dict.keys()):
837
+ raise gr.Error(
838
+ f"You upload images({len(_upload_images)}) is not equal to the number of characters({len(character_dict.keys())})!"
839
+ )
840
+ for ind, img in enumerate(_upload_images):
841
+ input_id_images_dict[character_list[ind]] = [load_image(img)]
842
+ print(character_dict)
843
+ print(character_index_dict)
844
+ print(invert_character_index_dict)
845
+ # real_prompts = prompts[id_length:]
846
+ if device == "cuda":
847
+ torch.cuda.empty_cache()
848
+ write = True
849
+ cur_step = 0
850
+
851
+ attn_count = 0
852
+ # id_prompts, negative_prompt = apply_style(style_name, id_prompts, negative_prompt)
853
+ # print(id_prompts)
854
+ setup_seed(seed_)
855
+ total_results = []
856
+ id_images = []
857
+ results_dict = {}
858
+ global cur_character
859
+
860
+ if not load_chars:
861
+ for character_key in character_dict.keys():
862
+ cur_character = [character_key]
863
+ ref_indexs = ref_indexs_dict[character_key]
864
+ print(character_key, ref_indexs)
865
+ current_prompts = [replace_prompts[ref_ind] for ref_ind in ref_indexs]
866
+ print(current_prompts)
867
+ setup_seed(seed_)
868
+ generator = torch.Generator(device=device).manual_seed(seed_)
869
+ cur_step = 0
870
+ cur_positive_prompts, negative_prompt = apply_style(
871
+ style_name, current_prompts, negative_prompt
872
+ )
873
+ if _model_type == "original":
874
+ id_images = pipe(
875
+ cur_positive_prompts,
876
+ num_inference_steps=_num_steps,
877
+ guidance_scale=guidance_scale,
878
+ height=height,
879
+ width=width,
880
+ negative_prompt=negative_prompt,
881
+ generator=generator,
882
+ ).images
883
+ elif _model_type == "Photomaker":
884
+ id_images = pipe(
885
+ cur_positive_prompts,
886
+ input_id_images=input_id_images_dict[character_key],
887
+ num_inference_steps=_num_steps,
888
+ guidance_scale=guidance_scale,
889
+ start_merge_step=start_merge_step,
890
+ height=height,
891
+ width=width,
892
+ negative_prompt=negative_prompt,
893
+ generator=generator,
894
+ ).images
895
+ else:
896
+ raise NotImplementedError(
897
+ "You should choice between original and Photomaker!",
898
+ f"But you choice {_model_type}",
899
+ )
900
+
901
+ # total_results = id_images + total_results
902
+ # yield total_results
903
+ print(id_images)
904
+ for ind, img in enumerate(id_images):
905
+ print(ref_indexs[ind])
906
+ results_dict[ref_indexs[ind]] = img
907
+ # real_images = []
908
+ yield [results_dict[ind] for ind in results_dict.keys()]
909
+ write = False
910
+ if not load_chars:
911
+ real_prompts_inds = [
912
+ ind for ind in range(len(prompts)) if ind not in ref_totals
913
+ ]
914
+ else:
915
+ real_prompts_inds = [ind for ind in range(len(prompts))]
916
+ print(real_prompts_inds)
917
+
918
+ for real_prompts_ind in real_prompts_inds:
919
+ real_prompt = replace_prompts[real_prompts_ind]
920
+ cur_character = get_ref_character(prompts[real_prompts_ind], character_dict)
921
+ print(cur_character, real_prompt)
922
+ setup_seed(seed_)
923
+ if len(cur_character) > 2 and _model_type == "Photomaker":
924
+ raise gr.Error(
925
+ "Temporarily Not Support Multiple character in Ref Image Mode!"
926
+ )
927
+ generator = torch.Generator(device=device).manual_seed(seed_)
928
+ cur_step = 0
929
+ real_prompt = apply_style_positive(style_name, real_prompt)
930
+ if _model_type == "original":
931
+ results_dict[real_prompts_ind] = pipe(
932
+ real_prompt,
933
+ num_inference_steps=_num_steps,
934
+ guidance_scale=guidance_scale,
935
+ height=height,
936
+ width=width,
937
+ negative_prompt=negative_prompt,
938
+ generator=generator,
939
+ ).images[0]
940
+ elif _model_type == "Photomaker":
941
+ results_dict[real_prompts_ind] = pipe(
942
+ real_prompt,
943
+ input_id_images=(
944
+ input_id_images_dict[cur_character[0]]
945
+ if real_prompts_ind not in nc_indexs
946
+ else input_id_images_dict[character_list[0]]
947
+ ),
948
+ num_inference_steps=_num_steps,
949
+ guidance_scale=guidance_scale,
950
+ start_merge_step=start_merge_step,
951
+ height=height,
952
+ width=width,
953
+ negative_prompt=negative_prompt,
954
+ generator=generator,
955
+ nc_flag=True if real_prompts_ind in nc_indexs else False,
956
+ ).images[0]
957
+ else:
958
+ raise NotImplementedError(
959
+ "You should choice between original and Photomaker!",
960
+ f"But you choice {_model_type}",
961
+ )
962
+ yield [results_dict[ind] for ind in results_dict.keys()]
963
+ total_results = [results_dict[ind] for ind in range(len(prompts))]
964
+ if _comic_type != "No typesetting (default)":
965
+ captions = prompt_array.splitlines()
966
+ captions = [caption.replace("[NC]", "") for caption in captions]
967
+ captions = [
968
+ caption.split("#")[-1] if "#" in caption else caption
969
+ for caption in captions
970
+ ]
971
+ font_path = os.path.join("fonts", font_choice)
972
+ font = ImageFont.truetype(font_path, int(45))
973
+ total_results = (
974
+ get_comic(total_results, _comic_type, captions=captions, font=font)
975
+ + total_results
976
+ )
977
+ save_results(pipe.unet, total_results)
978
+
979
+ yield total_results
980
+
981
+
982
def array2string(arr):
    """Join the strings in ``arr`` into a single newline-separated string.

    Args:
        arr: Iterable of strings, one entry per output line.

    Returns:
        All entries joined by a newline character, without a trailing
        newline. An empty input yields an empty string.
    """
    # str.join replaces the manual "+=" loop: same result, one allocation.
    return "\n".join(arr)
991
+
992
+
993
+ #################################################
994
+ #################################################
995
+ ### define the interface
996
+
997
# Build the Gradio UI: input controls on the left, result gallery on the right,
# followed by the event wiring and the clickable example presets.
with gr.Blocks(css=css) as demo:
    # Per-session state holders; they are not referenced again in this block.
    binary_matrixes = gr.State([])
    color_layout = gr.State([])

    # gr.Markdown(logo)
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Group(elem_id="main-image"):

            # Plain lists (not components); they are not referenced again in this block.
            prompts = []
            colors = []

            with gr.Column(visible=True) as gen_prompt_vis:
                # Base model selection; choices come from models_dict.
                # NOTE(review): "Unstable" is assumed to be a key of models_dict - confirm.
                sd_type = gr.Dropdown(
                    choices=list(models_dict.keys()),
                    value="Unstable",
                    label="sd_type",
                    info="Select pretrained model",
                )
                # Text-only characters vs. characters driven by reference photos.
                model_type = gr.Radio(
                    ["Only Using Textual Description", "Using Ref Images"],
                    label="model_type",
                    value="Only Using Textual Description",
                    info="Control type of the Character",
                )
                # Reference-image upload widgets; hidden until model_type.change
                # (wired below) makes them visible.
                with gr.Group(visible=False) as control_image_input:
                    files = gr.Files(
                        label="Drag (Select) 1 or more photos of your face",
                        file_types=["image"],
                    )
                    uploaded_files = gr.Gallery(
                        label="Your images",
                        visible=False,
                        columns=5,
                        rows=1,
                        height=200,
                    )
                    with gr.Column(visible=False) as clear_button:
                        remove_and_reupload = gr.ClearButton(
                            value="Remove and upload new ones",
                            components=files,
                            size="sm",
                        )
                # (1) Character definitions, e.g. "[Bob] A man, wearing a black suit".
                general_prompt = gr.Textbox(
                    value="",
                    lines=2,
                    label="(1) Textual Description for Character",
                    interactive=True,
                )
                negative_prompt = gr.Textbox(
                    value="", label="(2) Negative_prompt", interactive=True
                )
                style = gr.Dropdown(
                    label="Style template",
                    choices=STYLE_NAMES,
                    value=DEFAULT_STYLE_NAME,
                )
                # (3) One comic frame per line.
                prompt_array = gr.Textbox(
                    lines=1,
                    value="",
                    label="(3) Comic Description (each line corresponds to a frame).",
                    interactive=True,
                )
                # Character-file loading widgets are created hidden.
                char_path = gr.Textbox(
                    lines=2,
                    value="",
                    visible=False,
                    label="(Optional) Character files",
                    interactive=True,
                )
                char_btn = gr.Button("Load Character files", visible=False)
                with gr.Accordion("(4) Tune the hyperparameters", open=True):
                    # Font files are discovered from the local "fonts" directory
                    # when the UI is built.
                    font_choice = gr.Dropdown(
                        label="Select Font",
                        choices=[
                            f for f in os.listdir("fonts") if f.endswith(".ttf")
                        ],
                        value="Inkfree.ttf",
                        info="Select font for the final slide.",
                        interactive=True,
                    )
                    sa32_ = gr.Slider(
                        label=" (The degree of Paired Attention at 32 x 32 self-attention layers) ",
                        minimum=0,
                        maximum=1.0,
                        value=0.5,
                        step=0.1,
                    )
                    sa64_ = gr.Slider(
                        label=" (The degree of Paired Attention at 64 x 64 self-attention layers) ",
                        minimum=0,
                        maximum=1.0,
                        value=0.5,
                        step=0.1,
                    )
                    id_length_ = gr.Slider(
                        label="Number of id images in total images",
                        minimum=1,
                        maximum=4,
                        value=1,
                        step=1,
                    )
                    with gr.Row():
                        # The slider allows -1 and the dice button below can also
                        # produce it.
                        seed_ = gr.Slider(
                            label="Seed", minimum=-1, maximum=MAX_SEED, value=0, step=1
                        )
                        randomize_seed_btn = gr.Button("🎲", size="sm")
                    num_steps = gr.Slider(
                        label="Number of sample steps",
                        minimum=20,
                        maximum=100,
                        step=1,
                        value=35,
                    )
                    G_height = gr.Slider(
                        label="height",
                        minimum=256,
                        maximum=1024,
                        step=32,
                        value=768,
                    )
                    G_width = gr.Slider(
                        label="width",
                        minimum=256,
                        maximum=1024,
                        step=32,
                        value=768,
                    )
                    comic_type = gr.Radio(
                        [
                            "No typesetting (default)",
                            "Four Pannel",
                            "Classic Comic Style",
                        ],
                        value="Classic Comic Style",
                        label="Typesetting Style",
                        info="Select the typesetting style ",
                    )
                    guidance_scale = gr.Slider(
                        label="Guidance scale",
                        minimum=0.1,
                        maximum=10.0,
                        step=0.1,
                        value=5,
                    )
                    # The next two sliders start hidden; model_type.change
                    # (wired below) lists them among its outputs.
                    style_strength_ratio = gr.Slider(
                        label="Style strength of Ref Image (%)",
                        minimum=15,
                        maximum=50,
                        step=1,
                        value=20,
                        visible=False,
                    )
                    Ip_Adapter_Strength = gr.Slider(
                        label="Ip_Adapter_Strength",
                        minimum=0,
                        maximum=1,
                        step=0.1,
                        value=0.5,
                        visible=False,
                    )
                final_run_btn = gr.Button("Generate ! 😺")

        # Output side: result gallery plus a status line that starts hidden.
        with gr.Column():
            out_image = gr.Gallery(label="Result", columns=2, height="auto")
            generated_information = gr.Markdown(
                label="Generation Details", value="", visible=False
            )
            gr.Markdown(version)

    # ---- Event wiring ----
    # Changing the character mode updates the reference-image group and the two
    # ref-image sliders.
    model_type.change(
        fn=change_visiale_by_model_type,
        inputs=model_type,
        outputs=[control_image_input, style_strength_ratio, Ip_Adapter_Strength],
    )
    files.upload(
        fn=swap_to_gallery, inputs=files, outputs=[uploaded_files, clear_button, files]
    )
    remove_and_reupload.click(
        fn=remove_back_to_files, outputs=[uploaded_files, clear_button, files]
    )
    char_btn.click(fn=load_character_files, inputs=char_path, outputs=[general_prompt])

    # Draw a new seed in [-1, MAX_SEED] (random.randint is inclusive on both ends).
    randomize_seed_btn.click(
        fn=lambda: random.randint(-1, MAX_SEED),
        inputs=[],
        outputs=seed_,
    )

    # Generate: show the "not finished" status, run process_generation with its
    # results sent to the gallery, then show the "finished" status.
    # The order of `inputs` is positional and must match the parameter order of
    # process_generation.
    final_run_btn.click(fn=set_text_unfinished, outputs=generated_information).then(
        process_generation,
        inputs=[
            sd_type,
            model_type,
            files,
            num_steps,
            style,
            Ip_Adapter_Strength,
            style_strength_ratio,
            guidance_scale,
            seed_,
            sa32_,
            sa64_,
            id_length_,
            general_prompt,
            negative_prompt,
            prompt_array,
            G_height,
            G_width,
            comic_type,
            font_choice,
            char_path,
        ],
        outputs=out_image,
    ).then(fn=set_text_finished, outputs=generated_information)

    # ---- Example presets ----
    # Each row supplies values in the order of `inputs` below: seed, sa32, sa64,
    # id_length, character description, negative prompt, frame prompts, style,
    # model type, reference images, height, width.
    # In the frame prompts, "[NC]" marks a frame without a character and the text
    # after "#" is used as the caption (see process_generation).
    gr.Examples(
        examples=[
            [
                0,
                0.5,
                0.5,
                2,
                "[Bob] A man, wearing a black suit\n[Alice]a woman, wearing a white shirt",
                "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
                array2string(
                    [
                        "[Bob] at home, read new paper #at home, The newspaper says there is a treasure house in the forest.",
                        "[Bob] on the road, near the forest",
                        "[Alice] is make a call at home # [Bob] invited [Alice] to join him on an adventure.",
                        "[NC]A tiger appeared in the forest, at night ",
                        "[NC] The car on the road, near the forest #They drives to the forest in search of treasure.",
                        "[Bob] very frightened, open mouth, in the forest, at night",
                        "[Alice] very frightened, open mouth, in the forest, at night",
                        "[Bob] and [Alice] running very fast, in the forest, at night",
                        "[NC] A house in the forest, at night #Suddenly, They discovers the treasure house!",
                        "[Bob] and [Alice] in the house filled with treasure, laughing, at night #He is overjoyed inside the house.",
                    ]
                ),
                "Comic book",
                "Only Using Textual Description",
                get_image_path_list("examples/taylor"),
                768,
                768,
            ],
            [
                0,
                0.5,
                0.5,
                2,
                "[Bob] A man img, wearing a black suit\n[Alice]a woman img, wearing a white shirt",
                "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
                array2string(
                    [
                        "[Bob] at home, read new paper #at home, The newspaper says there is a treasure house in the forest.",
                        "[Bob] on the road, near the forest",
                        "[Alice] is make a call at home # [Bob] invited [Alice] to join him on an adventure.",
                        "[NC] The car on the road, near the forest #They drives to the forest in search of treasure.",
                        "[NC]A tiger appeared in the forest, at night ",
                        "[Bob] very frightened, open mouth, in the forest, at night",
                        "[Alice] very frightened, open mouth, in the forest, at night",
                        "[Bob] running very fast, in the forest, at night",
                        "[NC] A house in the forest, at night #Suddenly, They discovers the treasure house!",
                        "[Bob] in the house filled with treasure, laughing, at night #They are overjoyed inside the house.",
                    ]
                ),
                "Comic book",
                "Using Ref Images",
                get_image_path_list("examples/twoperson"),
                1024,
                1024,
            ],
            [
                1,
                0.5,
                0.5,
                3,
                "[Taylor]a woman img, wearing a white T-shirt, blue loose hair",
                "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
                array2string(
                    [
                        "[Taylor]wake up in the bed",
                        "[Taylor]have breakfast",
                        "[Taylor]is on the road, go to company",
                        "[Taylor]work in the company",
                        "[Taylor]Take a walk next to the company at noon",
                        "[Taylor]lying in bed at night",
                    ]
                ),
                "Japanese Anime",
                "Using Ref Images",
                get_image_path_list("examples/taylor"),
                768,
                768,
            ],
            [
                0,
                0.5,
                0.5,
                3,
                "[Bob]a man, wearing black jacket",
                "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
                array2string(
                    [
                        "[Bob]wake up in the bed",
                        "[Bob]have breakfast",
                        "[Bob]is on the road, go to the company, close look",
                        "[Bob]work in the company",
                        "[Bob]laughing happily",
                        "[Bob]lying in bed at night",
                    ]
                ),
                "Japanese Anime",
                "Only Using Textual Description",
                get_image_path_list("examples/taylor"),
                768,
                768,
            ],
            [
                0,
                0.3,
                0.5,
                3,
                "[Kitty]a girl, wearing white shirt, black skirt, black tie, yellow hair",
                "bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
                array2string(
                    [
                        "[Kitty]at home #at home, began to go to drawing",
                        "[Kitty]sitting alone on a park bench.",
                        "[Kitty]reading a book on a park bench.",
                        "[NC]A squirrel approaches, peeking over the bench. ",
                        "[Kitty]look around in the park. # She looks around and enjoys the beauty of nature.",
                        "[NC]leaf falls from the tree, landing on the sketchbook.",
                        "[Kitty]picks up the leaf, examining its details closely.",
                        "[NC]The brown squirrel appear.",
                        "[Kitty]is very happy # She is very happy to see the squirrel again",
                        "[NC]The brown squirrel takes the cracker and scampers up a tree. # She gives the squirrel cracker",
                    ]
                ),
                "Japanese Anime",
                "Only Using Textual Description",
                get_image_path_list("examples/taylor"),
                768,
                768,
            ],
        ],
        inputs=[
            seed_,
            sa32_,
            sa64_,
            id_length_,
            general_prompt,
            negative_prompt,
            prompt_array,
            style,
            model_type,
            files,
            G_height,
            G_width,
        ],
        # outputs=[post_sketch, binary_matrixes, *color_row, *colors, *prompts, gen_prompt_vis, general_prompt, seed_],
        # run_on_click=True,
        label="😺 Examples 😺",
    )
    gr.Markdown(article)
1363
+
1364
+
1365
+ demo.launch(server_name="0.0.0.0", share=True)
gradio_app_sdxl_specific_id_low_vram1231231.py ADDED
@@ -0,0 +1,1187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from this import d
2
+ import gradio as gr
3
+ import numpy as np
4
+ import torch
5
+ import gc
6
+ import copy
7
+ import os
8
+ import random
9
+ import datetime
10
+ from PIL import ImageFont
11
+ from utils.gradio_utils import (
12
+ character_to_dict,
13
+ process_original_prompt,
14
+ get_ref_character,
15
+ cal_attn_mask_xl,
16
+ cal_attn_indice_xl_effcient_memory,
17
+ is_torch2_available,
18
+ )
19
+
20
# Pin the process to GPU 0 through environment variables.
import os
os.environ['GPU_PLATFORM_ID'] = '0'
os.environ['GPU_DEVICE_ID'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# Point Hugging Face downloads at a mirror endpoint.
# NOTE(review): gradio is imported above this line; if that import has already
# loaded huggingface_hub, the endpoint may have been read before this
# assignment - confirm the mirror is actually used.
import os
os.environ['HF_ENDPOINT']= 'https://hf-mirror.com'
torch.backends.cudnn.enabled = True
29
+
30
+ if is_torch2_available():
31
+ from utils.gradio_utils import AttnProcessor2_0 as AttnProcessor
32
+ else:
33
+ from utils.gradio_utils import AttnProcessor
34
+ from huggingface_hub import hf_hub_download
35
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
36
+ StableDiffusionXLPipeline,
37
+ )
38
+ from diffusers.schedulers.scheduling_ddim import DDIMScheduler
39
+ import torch.nn.functional as F
40
+ from diffusers.utils.loading_utils import load_image
41
+ from utils.utils import get_comic
42
+ from utils.style_template import styles
43
+ from utils.load_models_utils import get_models_dict, load_models
44
+
45
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '5'
46
+
47
# Style presets offered in the UI and the one selected by default.
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "Japanese Anime"
# A `global` statement at module level has no effect; models_dict is already a
# module global.
global models_dict

models_dict = get_models_dict()

# Automatically select the device
# NOTE: the CUDA device string is "cuda:0", so code that compares
# `device == "cuda"` exactly will not match it.
device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# device = "cpu"

# torch.cuda.set_device(5)
print(f"@@device:{device}")


# check if the file exists locally at a specified path before downloading it.
# if the file doesn't exist, it uses `hf_hub_download` to download the file
# and optionally move it to a specific directory. If the file already
# exists, it simply uses the local path.
local_dir = "data/"
photomaker_local_path = f"{local_dir}photomaker-v1.bin"
if not os.path.exists(photomaker_local_path):
    # Download happens at import time and requires network access.
    photomaker_path = hf_hub_download(
        repo_id="TencentARC/PhotoMaker",
        filename="photomaker-v1.bin",
        repo_type="model",
        local_dir=local_dir,
    )
else:
    photomaker_path = photomaker_local_path

# Largest value of a signed 32-bit integer; upper bound of the seed slider.
MAX_SEED = np.iinfo(np.int32).max
83
+
84
+
85
def setup_seed(seed):
    """Seed every random number generator the app relies on.

    Seeds PyTorch (and all CUDA devices when running on CUDA), NumPy and the
    standard-library ``random`` module, then switches cuDNN to deterministic
    mode.

    Args:
        seed: Integer seed. Negative values (the UI allows ``-1``) are
            accepted.
    """
    torch.manual_seed(seed)
    # The module-level ``device`` is "cuda:0" in this file, so an exact
    # comparison with "cuda" never matched; accept any CUDA device string.
    if str(device).startswith("cuda"):
        torch.cuda.manual_seed_all(seed)
    # NumPy only accepts seeds in [0, 2**32 - 1] and raises ValueError for -1;
    # wrap into that range. Seeds already inside the range are unchanged.
    np.random.seed(seed % 2**32)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
92
+
93
+
94
def set_text_unfinished():
    """Return a Gradio update that reveals the "generation in progress" banner."""
    banner_html = "<h3>(Not Finished) Generating ··· The intermediate results will be shown.</h3>"
    return gr.update(value=banner_html, visible=True)
99
+
100
+
101
def set_text_finished():
    """Return a Gradio update that reveals the "generation finished" banner."""
    banner_html = "<h3>Generation Finished</h3>"
    return gr.update(value=banner_html, visible=True)
103
+
104
+
105
+ #################################################
106
def get_image_path_list(folder_name):
    """Return the sorted paths of every entry directly inside ``folder_name``.

    Each entry name reported by ``os.listdir`` is joined onto ``folder_name``;
    entries are not filtered by type or extension.
    """
    entry_paths = []
    for entry_name in os.listdir(folder_name):
        entry_paths.append(os.path.join(folder_name, entry_name))
    entry_paths.sort()
    return entry_paths
112
+
113
+
114
+ #################################################
115
class SpatialAttnProcessor2_0(torch.nn.Module):
    r"""
    Self-attention processor built on PyTorch 2.0 `scaled_dot_product_attention`
    that keeps a per-character bank of image tokens and concatenates those
    tokens into the keys/values of later attention calls.

    The processor is driven by module-level globals (`write`, `cur_step`,
    `attn_count`, `total_count`, `cur_character`, `sa32`, `sa64`, `height`,
    `width`) that must be set by the caller before the pipeline runs.

    Args:
        hidden_size (`int`):
            Stored on the instance; not otherwise used inside this class.
        cross_attention_dim (`int`):
            Stored on the instance; not otherwise used inside this class.
        id_length (`int`, defaults to 4):
            Number of identity images; `total_length` is `id_length + 1`.
        device:
            Device that banked tensors are moved to and that is passed to the
            index computation. Defaults to the module-level `device`.
        dtype (defaults to `torch.float16`):
            dtype passed to the index computation.
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
        id_length=4,
        device=device,
        dtype=torch.float16,
    ):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )
        self.device = device
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.total_length = id_length + 1
        self.id_length = id_length
        # character name -> {denoising step -> list of banked token tensors}
        self.id_bank = {}

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Run one attention call, writing to or reading from the token bank.

        When the global `write` is true, the tokens selected by the current
        index sets are stored in `self.id_bank` for the single current
        character; otherwise the tokens stored for every character in
        `cur_character` at the current step are fetched. At step 0 plain
        self-attention is used; at later steps the extended attention is
        applied with probability 0.7 (steps below 20) or 0.9 (later steps).

        `encoder_hidden_states` is accepted for interface compatibility but is
        not read by this method.
        """
        # un_cond_hidden_states, cond_hidden_states = hidden_states.chunk(2)
        # un_cond_hidden_states = self.__call2__(attn, un_cond_hidden_states,encoder_hidden_states,attention_mask,temb)
        # generate a random number between 0 and 1
        global total_count, attn_count, cur_step, indices1024, indices4096
        global sa32, sa64
        global write
        global height, width
        global character_dict, character_index_dict, invert_character_index_dict, cur_character, ref_indexs_dict, ref_totals, cur_character
        # First attention call of the first step: compute the token index sets.
        if attn_count == 0 and cur_step == 0:
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )
        if write:
            # Write mode handles exactly one character at a time.
            assert len(cur_character) == 1
            # Pick the index set that matches this layer's token count.
            if hidden_states.shape[1] == (height // 32) * (width // 32):
                indices = indices1024
            else:
                indices = indices4096
            # print(f"white:{cur_step}")
            total_batch_size, nums_token, channel = hidden_states.shape
            # NOTE(review): the batch is split in two halves here - presumably the
            # unconditional/conditional halves of classifier-free guidance; confirm.
            img_nums = total_batch_size // 2
            hidden_states = hidden_states.reshape(-1, img_nums, nums_token, channel)
            # print(img_nums,len(indices),hidden_states.shape,self.total_length)
            if cur_character[0] not in self.id_bank:
                self.id_bank[cur_character[0]] = {}
            # Store a copy of the selected tokens of every image for this step.
            self.id_bank[cur_character[0]][cur_step] = [
                hidden_states[:, img_ind, indices[img_ind], :]
                .reshape(2, -1, channel)
                .clone()
                for img_ind in range(img_nums)
            ]
            hidden_states = hidden_states.reshape(-1, nums_token, channel)
            # self.id_bank[cur_step] = [hidden_states[:self.id_length].clone(), hidden_states[self.id_length:].clone()]
        else:
            # encoder_hidden_states = torch.cat((self.id_bank[cur_step][0].to(self.device),self.id_bank[cur_step][1].to(self.device)))
            # TODO: ADD Multipersion Control
            # Read mode: gather the banked tokens of every referenced character
            # for the current step.
            encoder_arr = []
            for character in cur_character:
                encoder_arr = encoder_arr + [
                    tensor.to(self.device)
                    for tensor in self.id_bank[character][cur_step]
                ]
        # check whether the random number is greater than 0.5
        # (stale note: the thresholds actually used below are 0.3 and 0.1)
        if cur_step < 1:
            # Step 0: plain self-attention.
            hidden_states = self.__call2__(
                attn, hidden_states, None, attention_mask, temb
            )
        else:  # 256 1024 4096
            random_number = random.random()
            if cur_step < 20:
                rand_num = 0.3
            else:
                rand_num = 0.1
            # print(f"hidden state shape {hidden_states.shape[1]}")
            if random_number > rand_num:
                if hidden_states.shape[1] == (height // 32) * (width // 32):
                    indices = indices1024
                else:
                    indices = indices4096
                # print("before attention",hidden_states.shape,attention_mask.shape,encoder_hidden_states.shape if encoder_hidden_states is not None else "None")
                if write:
                    total_batch_size, nums_token, channel = hidden_states.shape
                    img_nums = total_batch_size // 2
                    hidden_states = hidden_states.reshape(
                        -1, img_nums, nums_token, channel
                    )
                    # Selected tokens of each image in the current batch.
                    encoder_arr = [
                        hidden_states[:, img_ind, indices[img_ind], :].reshape(
                            2, -1, channel
                        )
                        for img_ind in range(img_nums)
                    ]
                    for img_ind in range(img_nums):
                        # print(img_nums)
                        # assert img_nums != 1
                        img_ind_list = [i for i in range(img_nums)]
                        # print(img_ind_list,img_ind)
                        img_ind_list.remove(img_ind)
                        # print(img_ind,invert_character_index_dict[img_ind])
                        # print(character_index_dict[invert_character_index_dict[img_ind]])
                        # print(img_ind_list)
                        # print(img_ind,img_ind_list)
                        # Keys/values: selected tokens of the other images followed
                        # by this image's own tokens. (The comprehension variable
                        # does not leak, so the trailing img_ind is the loop's.)
                        encoder_hidden_states_tmp = torch.cat(
                            [encoder_arr[img_ind] for img_ind in img_ind_list]
                            + [hidden_states[:, img_ind, :, :]],
                            dim=1,
                        )

                        hidden_states[:, img_ind, :, :] = self.__call2__(
                            attn,
                            hidden_states[:, img_ind, :, :],
                            encoder_hidden_states_tmp,
                            None,
                            temb,
                        )
                else:
                    _, nums_token, channel = hidden_states.shape
                    # img_nums = total_batch_size // 2
                    # encoder_hidden_states = encoder_hidden_states.reshape(-1,img_nums,nums_token,channel)
                    hidden_states = hidden_states.reshape(2, -1, nums_token, channel)
                    # print(len(indices))
                    # encoder_arr = [encoder_hidden_states[:,img_ind,indices[img_ind],:].reshape(2,-1,channel) for img_ind in range(img_nums)]
                    # Keys/values: banked character tokens followed by the current
                    # image's own tokens.
                    # NOTE(review): only slot 0 of the image axis is updated -
                    # presumably one image is generated per call in read mode; confirm.
                    encoder_hidden_states_tmp = torch.cat(
                        encoder_arr + [hidden_states[:, 0, :, :]], dim=1
                    )
                    # print(len(encoder_arr),encoder_hidden_states_tmp.shape)
                    hidden_states[:, 0, :, :] = self.__call2__(
                        attn,
                        hidden_states[:, 0, :, :],
                        encoder_hidden_states_tmp,
                        None,
                        temb,
                    )
                hidden_states = hidden_states.reshape(-1, nums_token, channel)
            else:
                hidden_states = self.__call2__(
                    attn, hidden_states, None, attention_mask, temb
                )
        # Count attention calls; after total_count calls a denoising step is
        # complete, so advance the step and recompute the index sets.
        attn_count += 1
        if attn_count == total_count:
            attn_count = 0
            cur_step += 1
            indices1024, indices4096 = cal_attn_indice_xl_effcient_memory(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )

        return hidden_states

    def __call2__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Plain scaled-dot-product attention using the projections of `attn`.

        Queries come from `hidden_states`; keys and values come from
        `encoder_hidden_states`, or from `hidden_states` when it is None.
        Returns the attended states after the output projection, optional
        residual connection and division by `attn.rescale_output_factor`.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        # 4-D input is flattened to (batch, height * width, channel) and
        # restored after attention. These height/width are locals of this method.
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        batch_size, sequence_length, channel = hidden_states.shape
        # print(hidden_states.shape)
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(
                attention_mask, sequence_length, batch_size
            )
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(
                batch_size, attn.heads, -1, attention_mask.shape[-1]
            )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        # else:
        #     encoder_hidden_states = encoder_hidden_states.view(-1,self.id_length+1,sequence_length,channel).reshape(-1,(self.id_length+1) * sequence_length,channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # Split into heads: (batch, heads, tokens, head_dim).
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge the heads back into the channel dimension.
        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
383
+
384
+
385
def set_attention_processor(unet, id_length, is_ipadapter=False):
    """Rebuild the module-level ``attn_procs`` map and install a copy on ``unet``.

    Self-attention layers (names ending in ``attn1.processor``) inside
    ``up_blocks`` get a ``SpatialAttnProcessor2_0``; cross-attention layers get
    an ``IPAttnProcessor2_0`` when ``is_ipadapter`` is true; everything else
    gets a plain ``AttnProcessor``.

    Args:
        unet: model exposing ``attn_processors``, ``config``, ``device`` and
            ``set_attn_processor``.
        id_length: forwarded to every ``SpatialAttnProcessor2_0``.
        is_ipadapter: whether cross-attention layers use the IP-Adapter
            processor.
    """
    global attn_procs
    attn_procs = {}
    for proc_name in unet.attn_processors.keys():
        if proc_name.endswith("attn1.processor"):
            cross_attention_dim = None
        else:
            cross_attention_dim = unet.config.cross_attention_dim

        # Channel width of the block owning this layer; only the IP-Adapter
        # processor consumes it.
        if proc_name.startswith("mid_block"):
            hidden_size = unet.config.block_out_channels[-1]
        elif proc_name.startswith("up_blocks"):
            block_id = int(proc_name[len("up_blocks.")])
            reversed_channels = list(reversed(unet.config.block_out_channels))
            hidden_size = reversed_channels[block_id]
        elif proc_name.startswith("down_blocks"):
            block_id = int(proc_name[len("down_blocks.")])
            hidden_size = unet.config.block_out_channels[block_id]

        if cross_attention_dim is None and proc_name.startswith("up_blocks"):
            processor = SpatialAttnProcessor2_0(id_length=id_length)
        elif cross_attention_dim is not None and is_ipadapter:
            processor = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=1,
                num_tokens=4,
            ).to(unet.device, dtype=torch.float16)
        else:
            processor = AttnProcessor()
        attn_procs[proc_name] = processor

    unet.set_attn_processor(copy.deepcopy(attn_procs))
419
+
420
+
421
+ #################################################
422
+ #################################################
423
+ canvas_html = "<div id='canvas-root' style='max-width:400px; margin: 0 auto'></div>"
424
+ load_js = """
425
+ async () => {
426
+ const url = "https://huggingface.co/datasets/radames/gradio-components/raw/main/sketch-canvas.js"
427
+ fetch(url)
428
+ .then(res => res.text())
429
+ .then(text => {
430
+ const script = document.createElement('script');
431
+ script.type = "module"
432
+ script.src = URL.createObjectURL(new Blob([text], { type: 'application/javascript' }));
433
+ document.head.appendChild(script);
434
+ });
435
+ }
436
+ """
437
+
438
+ get_js_colors = """
439
+ async (canvasData) => {
440
+ const canvasEl = document.getElementById("canvas-root");
441
+ return [canvasEl._data]
442
+ }
443
+ """
444
+
445
+ css = """
446
+ #color-bg{display:flex;justify-content: center;align-items: center;}
447
+ .color-bg-item{width: 100%; height: 32px}
448
+ #main_button{width:100%}
449
+ <style>
450
+ """
451
+
452
+
453
def save_single_character_weights(unet, character, description, filepath):
    """Save one character's ``id_bank`` tensors from every spatial processor.

    Args:
        unet: model whose ``attn_processors`` are scanned for
            ``SpatialAttnProcessor2_0`` instances.
        character: key into each processor's ``id_bank``.
        description: stored alongside the tensors under ``"description"``.
        filepath: destination passed to ``torch.save``.
    """
    payload = {}
    payload["description"] = description
    payload["character"] = character
    for attn_name, attn_processor in unet.attn_processors.items():
        if not isinstance(attn_processor, SpatialAttnProcessor2_0):
            continue
        # Move each tensor to the CPU so the file can be serialised and
        # reloaded independently of the device it was produced on.
        payload[attn_name] = {}
        character_bank = attn_processor.id_bank[character]
        for step_key in character_bank.keys():
            payload[attn_name][step_key] = [
                tensor.cpu() for tensor in character_bank[step_key]
            ]
    # Persist everything with torch.save.
    torch.save(payload, filepath)
474
+
475
+
476
def load_single_character_weights(unet, filepath):
    """Load a saved character into the ``id_bank`` of every spatial processor.

    Args:
        unet: model whose ``attn_processors`` are scanned for
            ``SpatialAttnProcessor2_0`` instances; tensors are moved to
            ``unet.device``.
        filepath: file previously written by ``torch.save`` containing a
            ``"character"`` entry plus one entry per processor name.
    """
    # SECURITY: torch.load unpickles the file; only load character files from
    # trusted sources.
    weights_to_load = torch.load(filepath, map_location=torch.device("cpu"))
    character = weights_to_load["character"]
    for attn_name, attn_processor in unet.attn_processors.items():
        if isinstance(attn_processor, SpatialAttnProcessor2_0):
            # Move the stored tensors to the model's device and register them
            # under the character's key.
            attn_processor.id_bank[character] = {
                step_key: [tensor.to(unet.device) for tensor in tensors]
                for step_key, tensors in weights_to_load[attn_name].items()
            }
496
+
497
+
498
def save_results(unet, img_list):
    """Write ``img_list`` as PNG files into a fresh timestamped results folder.

    Images go to ``results/<YYYYmmdd-HHMMSS>/image_<idx>.png``; an (empty)
    ``weights`` sub-folder is created next to them.

    Args:
        unet: currently unused; kept so existing callers keep working
            (per-character weight export is disabled).
        img_list: iterable of objects exposing ``save(path)``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    folder_name = f"results/{timestamp}"
    weight_folder_name = f"{folder_name}/weights"
    # Creating the nested folder also creates its parent; exist_ok avoids the
    # check-then-create race when two runs finish within the same second.
    os.makedirs(weight_folder_name, exist_ok=True)

    for idx, img in enumerate(img_list):
        img.save(os.path.join(folder_name, f"image_{idx}.png"))
515
+
516
+
517
+ #################################################
518
+ title = r"""
519
+ <h1 align="center">StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</h1>
520
+ """
521
+
522
+ description = r"""
523
+ <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'><b>StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation</b></a>.<br>
524
+ ❗️❗️❗️[<b>Important</b>] Personalization steps:<br>
525
+ 1️⃣ Enter a Textual Description for Character, if you add the Ref-Image, making sure to <b>follow the class word</b> you want to customize with the <b>trigger word</b>: `img`, such as: `man img` or `woman img` or `girl img`.<br>
526
+ 2️⃣ Enter the prompt array, each line corrsponds to one generated image.<br>
527
+ 3️�� Choose your preferred style template.<br>
528
+ 4️⃣ Click the <b>Submit</b> button to start customizing.
529
+ """
530
+
531
+ article = r"""
532
+
533
+ If StoryDiffusion is helpful, please help to ⭐ the <a href='https://github.com/HVision-NKU/StoryDiffusion' target='_blank'>Github Repo</a>. Thanks!
534
+ [![GitHub Stars](https://img.shields.io/github/stars/HVision-NKU/StoryDiffusion?style=social)](https://github.com/HVision-NKU/StoryDiffusion)
535
+ ---
536
+ 📝 **Citation**
537
+ <br>
538
+ If our work is useful for your research, please consider citing:
539
+
540
+ ```bibtex
541
+ @article{Zhou2024storydiffusion,
542
+ title={StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation},
543
+ author={Zhou, Yupeng and Zhou, Daquan and Cheng, Ming-Ming and Feng, Jiashi and Hou, Qibin},
544
+ year={2024}
545
+ }
546
+ ```
547
+ 📋 **License**
548
+ <br>
549
+ Apache-2.0 LICENSE.
550
+
551
+ 📧 **Contact**
552
+ <br>
553
+ If you have any questions, please feel free to reach me out at <b>ypzhousdu@gmail.com</b>.
554
+ """
555
+ version = r"""
556
+ <h3 align="center">StoryDiffusion Version 0.02 (test version)</h3>
557
+
558
+ <h5 >1. Support image ref image. (Cartoon Ref image is not support now)</h5>
559
+ <h5 >2. Support Typesetting Style and Captioning.(By default, the prompt is used as the caption for each image. If you need to change the caption, add a # at the end of each line. Only the part after the # will be added as a caption to the image.)</h5>
560
+ <h5 >3. [NC]symbol (The [NC] symbol is used as a flag to indicate that no characters should be present in the generated scene images. If you want do that, prepend the "[NC]" at the beginning of the line. For example, to generate a scene of falling leaves without any character, write: "[NC] The leaves are falling.")</h5>
561
+ <h5 align="center">Tips: </h4>
562
+ """
563
+ #################################################
564
# Module-level state shared with the attention processors and the Gradio
# callbacks. NOTE: `global` statements at module scope have no effect; the
# names below are module globals simply because they are assigned here.
global attn_count, total_count, id_length, total_length, cur_step, cur_model_type
global write
global sa32, sa64
global height, width
attn_count = 0
total_count = 0
cur_step = 0
id_length = 4
total_length = 5
cur_model_type = ""
global attn_procs, unet
attn_procs = {}
###
write = False
###
sa32 = 0.5
sa64 = 0.5
height = 768
width = 768
###
global pipe
global sd_model_path
pipe = None
sd_model_path = models_dict["Unstable"]["path"]  # "SG161222/RealVisXL_V4.0"
single_files = models_dict["Unstable"]["single_files"]
### LOAD Stable Diffusion Pipeline
if single_files:
    pipe = StableDiffusionXLPipeline.from_single_file(
        sd_model_path, torch_dtype=torch.float16
    )
else:
    pipe = StableDiffusionXLPipeline.from_pretrained(
        sd_model_path, torch_dtype=torch.float16, use_safetensors=False
    )
print("pipE.device = ", device)
pipe = pipe.to(device)
pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
# pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.scheduler.set_timesteps(50)
pipe.enable_vae_slicing()
if device != "mps":
    pipe.enable_model_cpu_offload()
unet = pipe.unet
cur_model_type = "Unstable" + "-" + "original"
### Insert PairedAttention
# Self-attention layers inside up_blocks get the paired (spatial) processor;
# every other layer keeps the default processor. total_count ends up as the
# number of spatial processors installed.
for name in unet.attn_processors.keys():
    cross_attention_dim = (
        None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
    )
    # NOTE(review): hidden_size is computed but not used in this loop.
    if name.startswith("mid_block"):
        hidden_size = unet.config.block_out_channels[-1]
    elif name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    elif name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        hidden_size = unet.config.block_out_channels[block_id]
    if cross_attention_dim is None and (name.startswith("up_blocks")):
        attn_procs[name] = SpatialAttnProcessor2_0(id_length=id_length)
        total_count += 1
    else:
        attn_procs[name] = AttnProcessor()
print("successsfully load paired self-attention")
print(f"number of the processor : {total_count}")
unet.set_attn_processor(copy.deepcopy(attn_procs))
global mask1024, mask4096
mask1024, mask4096 = cal_attn_mask_xl(
    total_length,
    id_length,
    sa32,
    sa64,
    height,
    width,
    device=device,
    dtype=torch.float16,
)
640
+
641
+ ######### Gradio Fuction #############
642
+
643
+
644
def swap_to_gallery(images):
    """Return three Gradio updates: show ``images``, show one component, hide one.

    The updates are positional; which components receive them is decided by
    the ``outputs`` list of the event that calls this function.
    """
    populated = gr.update(value=images, visible=True)
    revealed = gr.update(visible=True)
    concealed = gr.update(visible=False)
    return populated, revealed, concealed
650
+
651
+
652
def upload_example_to_gallery(images, prompt, style, negative_prompt):
    """Return the same three updates as a plain upload, for example rows.

    ``prompt``, ``style`` and ``negative_prompt`` are accepted but not used.
    """
    populated = gr.update(value=images, visible=True)
    revealed = gr.update(visible=True)
    concealed = gr.update(visible=False)
    return populated, revealed, concealed
658
+
659
+
660
def remove_back_to_files():
    """Return three Gradio updates: hide the first two outputs, show the third."""
    first_hidden = gr.update(visible=False)
    second_hidden = gr.update(visible=False)
    third_shown = gr.update(visible=True)
    return first_hidden, second_hidden, third_shown
662
+
663
+
664
def remove_tips():
    """Return a single Gradio update that hides its target component."""
    hidden = gr.update(visible=False)
    return hidden
666
+
667
+
668
def apply_style_positive(style_name: str, positive: str):
    """Insert ``positive`` into the positive template of ``style_name``.

    Unknown style names fall back to ``DEFAULT_STYLE_NAME``.
    """
    fallback = styles[DEFAULT_STYLE_NAME]
    positive_template, _negative_template = styles.get(style_name, fallback)
    styled_prompt = positive_template.replace("{prompt}", positive)
    return styled_prompt
671
+
672
+
673
def apply_style(style_name: str, positives: list, negative: str = ""):
    """Apply a style template to several prompts and extend the negative prompt.

    Returns:
        A pair ``(styled_positives, negative_text)`` where ``negative_text`` is
        the style's negative template, a space, then ``negative``.
        Unknown style names fall back to ``DEFAULT_STYLE_NAME``.
    """
    fallback = styles[DEFAULT_STYLE_NAME]
    positive_template, negative_template = styles.get(style_name, fallback)
    styled_positives = []
    for text in positives:
        styled_positives.append(positive_template.replace("{prompt}", text))
    combined_negative = negative_template + " " + negative
    return styled_positives, combined_negative
678
+
679
+
680
def change_visiale_by_model_type(_model_type):
    """Return three visibility updates matching the selected model type.

    Raises:
        ValueError: if ``_model_type`` is neither of the two known labels.
    """
    if _model_type == "Only Using Textual Description":
        visibility = (False, False, False)
    elif _model_type == "Using Ref Images":
        visibility = (True, True, False)
    else:
        raise ValueError("Invalid model type", _model_type)

    first = gr.update(visible=visibility[0])
    second = gr.update(visible=visibility[1])
    third = gr.update(visible=visibility[2])
    return first, second, third
695
+
696
+
697
def load_character_files(character_files: str):
    """Build the character prompt text from saved character files.

    Args:
        character_files: newline-separated file paths; blank lines and
            surrounding whitespace are ignored.

    Returns:
        One line per file, each being the file's ``"character"`` entry
        followed directly by its ``"description"`` entry.

    Raises:
        gr.Error: if no file path is supplied.
    """
    # Drop blank lines so a trailing newline does not become an empty path
    # handed to torch.load.
    character_files_arr = [
        line.strip() for line in (character_files or "").splitlines() if line.strip()
    ]
    if not character_files_arr:
        raise gr.Error("Please set a character file!")
    primarytext = []
    for character_file_name in character_files_arr:
        # SECURITY: torch.load unpickles the file; only load trusted files.
        character_file = torch.load(
            character_file_name, map_location=torch.device("cpu")
        )
        primarytext.append(character_file["character"] + character_file["description"])
    return array2string(primarytext)
708
+
709
+
710
def load_character_files_on_running(unet, character_files: str):
    """Load every listed character file into ``unet``'s attention processors.

    Args:
        unet: model passed through to ``load_single_character_weights``.
        character_files: newline-separated file paths; blank lines and
            surrounding whitespace are ignored.

    Returns:
        ``True`` if at least one file was loaded, ``False`` if no path was
        given.
    """
    # Drop blank lines so a trailing newline does not become an empty path.
    character_files_arr = [
        line.strip() for line in (character_files or "").splitlines() if line.strip()
    ]
    if not character_files_arr:
        return False
    for character_file in character_files_arr:
        load_single_character_weights(unet, character_file)
    return True
717
+
718
+
719
+ ######### Image Generation ##############
720
def process_generation(
    _sd_type,
    _model_type,
    _upload_images,
    _num_steps,
    style_name,
    _Ip_Adapter_Strength,
    _style_strength_ratio,
    guidance_scale,
    seed_,
    sa32_,
    sa64_,
    id_length_,
    general_prompt,
    negative_prompt,
    prompt_array,
    G_height,
    G_width,
    _comic_type,
    font_choice,
    _char_files,
):  # Corrected font_choice usage
    """Generate the story images, yielding the partial gallery as it grows.

    This is a generator: it yields a list of the images produced so far after
    each pipeline call and finally yields the complete result list (with the
    typeset comic pages prepended when ``_comic_type`` requests typesetting).

    Stages:
        1. Validate the inputs and reset the spatial processors' ``id_bank``.
        2. Reload the pipeline if the (model, mode) pair changed.
        3. Either load character files, or generate the reference ("id")
           images for each character with ``write = True``.
        4. Generate the remaining prompts with ``write = False``.
        5. Optionally typeset, save to disk, and yield the final list.

    Raises:
        gr.Error: on invalid user input (too many characters, missing trigger
            word or reference image, ``[NC]`` in an id prompt, mismatched
            upload count, multi-character prompt in reference-image mode).
        NotImplementedError: if the resolved model type is unknown.
    """
    if len(general_prompt.splitlines()) >= 3:
        raise gr.Error(
            "Support for more than three characters is temporarily unavailable due to VRAM limitations, but this issue will be resolved soon."
        )
    _model_type = "Photomaker" if _model_type == "Using Ref Images" else "original"
    if _model_type == "Photomaker" and "img" not in general_prompt:
        raise gr.Error(
            'Please add the triger word " img "  behind the class word you want to customize, such as: man img or woman img'
        )
    if _upload_images is None and _model_type != "original":
        raise gr.Error("Cannot find any input face image!")
    global sa32, sa64, id_length, total_length, attn_procs, unet, cur_model_type
    global write
    global cur_step, attn_count
    global height, width
    height = G_height
    width = G_width
    global pipe
    global sd_model_path, models_dict
    # NOTE(review): this stores the whole model-info entry, whereas module
    # initialisation stores only its "path" value in sd_model_path.
    sd_model_path = models_dict[_sd_type]
    # Drop the cached identity features of the previous run.
    # NOTE(review): id_length here is still the value from the previous call;
    # it is only updated from id_length_ further down.
    for attn_processor in pipe.unet.attn_processors.values():
        if isinstance(attn_processor, SpatialAttnProcessor2_0):
            attn_processor.id_bank = {}
            attn_processor.id_length = id_length
            attn_processor.total_length = id_length + 1
    gc.collect()
    torch.cuda.empty_cache()
    if cur_model_type != _sd_type + "-" + _model_type:
        # apply the style template
        ##### load pipe
        del pipe
        # Release the old UNet as well; otherwise the module-level reference
        # keeps it alive while the new pipeline is being loaded.
        unet = None
        gc.collect()
        if device == "cuda":
            torch.cuda.empty_cache()
        model_info = models_dict[_sd_type]
        model_info["model_type"] = _model_type
        print("device = ", device)
        pipe = load_models(model_info, device=device, photomaker_path=photomaker_path)
        set_attention_processor(pipe.unet, id_length_, is_ipadapter=False)
        ##### ########################
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
        cur_model_type = _sd_type + "-" + _model_type
        pipe.enable_vae_slicing()
        if device != "mps":
            pipe.enable_model_cpu_offload()
    # Always point the shared reference at the active UNet. Previously this was
    # skipped after a reload, so character weights were loaded into the
    # discarded model.
    unet = pipe.unet
    # unet.set_attn_processor(copy.deepcopy(attn_procs))

    load_chars = load_character_files_on_running(unet, character_files=_char_files)

    prompts = prompt_array.splitlines()
    global character_dict, character_index_dict, invert_character_index_dict, ref_indexs_dict, ref_totals
    character_dict, character_list = character_to_dict(general_prompt)

    start_merge_step = int(float(_style_strength_ratio) / 100 * _num_steps)
    if start_merge_step > 30:
        start_merge_step = 30
    print(f"start_merge_step:{start_merge_step}")
    generator = torch.Generator(device=device).manual_seed(seed_)
    sa32, sa64 = sa32_, sa64_
    id_length = id_length_
    clipped_prompts = prompts[:]
    nc_indexs = []
    for ind, prompt in enumerate(clipped_prompts):
        if "[NC]" in prompt:
            nc_indexs.append(ind)
            if ind < id_length:
                raise gr.Error(
                    f"The first {id_length} row is id prompts, cannot use [NC]!"
                )
    prompts = [
        prompt if "[NC]" not in prompt else prompt.replace("[NC]", "")
        for prompt in clipped_prompts
    ]

    # Everything after the last '#' is a caption, not part of the prompt.
    prompts = [
        prompt.rpartition("#")[0] if "#" in prompt else prompt for prompt in prompts
    ]
    print(prompts)
    # id_prompts = prompts[:id_length]
    (
        character_index_dict,
        invert_character_index_dict,
        replace_prompts,
        ref_indexs_dict,
        ref_totals,
    ) = process_original_prompt(character_dict, prompts.copy(), id_length)
    if _model_type != "original":
        input_id_images_dict = {}
        if len(_upload_images) != len(character_dict.keys()):
            raise gr.Error(
                f"You upload images({len(_upload_images)}) is not equal to the number of characters({len(character_dict.keys())})!"
            )
        for ind, img in enumerate(_upload_images):
            input_id_images_dict[character_list[ind]] = [load_image(img)]
    print(character_dict)
    print(character_index_dict)
    print(invert_character_index_dict)
    # real_prompts = prompts[id_length:]
    if device == "cuda":
        torch.cuda.empty_cache()
    write = True
    cur_step = 0

    attn_count = 0
    # id_prompts, negative_prompt = apply_style(style_name, id_prompts, negative_prompt)
    # print(id_prompts)
    setup_seed(seed_)
    total_results = []
    id_images = []
    results_dict = {}
    global cur_character

    if not load_chars:
        for character_key in character_dict.keys():
            cur_character = [character_key]
            ref_indexs = ref_indexs_dict[character_key]
            print(character_key, ref_indexs)
            current_prompts = [replace_prompts[ref_ind] for ref_ind in ref_indexs]
            print(current_prompts)
            setup_seed(seed_)
            generator = torch.Generator(device=device).manual_seed(seed_)
            cur_step = 0
            # NOTE(review): negative_prompt is reassigned on every iteration,
            # so the style's negative text is prepended once per character;
            # when character files are loaded it is never applied. Confirm
            # whether that is intended before changing it.
            cur_positive_prompts, negative_prompt = apply_style(
                style_name, current_prompts, negative_prompt
            )
            if _model_type == "original":
                id_images = pipe(
                    cur_positive_prompts,
                    num_inference_steps=_num_steps,
                    guidance_scale=guidance_scale,
                    height=height,
                    width=width,
                    negative_prompt=negative_prompt,
                    generator=generator,
                ).images
            elif _model_type == "Photomaker":
                id_images = pipe(
                    cur_positive_prompts,
                    input_id_images=input_id_images_dict[character_key],
                    num_inference_steps=_num_steps,
                    guidance_scale=guidance_scale,
                    start_merge_step=start_merge_step,
                    height=height,
                    width=width,
                    negative_prompt=negative_prompt,
                    generator=generator,
                ).images
            else:
                raise NotImplementedError(
                    "You should choice between original and Photomaker!",
                    f"But you choice {_model_type}",
                )

            # total_results = id_images + total_results
            # yield total_results
            print(id_images)
            for ind, img in enumerate(id_images):
                print(ref_indexs[ind])
                results_dict[ref_indexs[ind]] = img
            # real_images = []
            yield [results_dict[ind] for ind in results_dict.keys()]
    write = False
    if not load_chars:
        real_prompts_inds = [
            ind for ind in range(len(prompts)) if ind not in ref_totals
        ]
    else:
        real_prompts_inds = [ind for ind in range(len(prompts))]
    print(real_prompts_inds)

    for real_prompts_ind in real_prompts_inds:
        real_prompt = replace_prompts[real_prompts_ind]
        cur_character = get_ref_character(prompts[real_prompts_ind], character_dict)
        print(cur_character, real_prompt)
        setup_seed(seed_)
        if len(cur_character) > 1 and _model_type == "Photomaker":
            raise gr.Error(
                "Temporarily Not Support Multiple character in Ref Image Mode!"
            )
        generator = torch.Generator(device=device).manual_seed(seed_)
        cur_step = 0
        real_prompt = apply_style_positive(style_name, real_prompt)
        if _model_type == "original":
            results_dict[real_prompts_ind] = pipe(
                real_prompt,
                num_inference_steps=_num_steps,
                guidance_scale=guidance_scale,
                height=height,
                width=width,
                negative_prompt=negative_prompt,
                generator=generator,
            ).images[0]
        elif _model_type == "Photomaker":
            results_dict[real_prompts_ind] = pipe(
                real_prompt,
                input_id_images=(
                    input_id_images_dict[cur_character[0]]
                    if real_prompts_ind not in nc_indexs
                    else input_id_images_dict[character_list[0]]
                ),
                num_inference_steps=_num_steps,
                guidance_scale=guidance_scale,
                start_merge_step=start_merge_step,
                height=height,
                width=width,
                negative_prompt=negative_prompt,
                generator=generator,
                nc_flag=True if real_prompts_ind in nc_indexs else False,
            ).images[0]
        else:
            raise NotImplementedError(
                "You should choice between original and Photomaker!",
                f"But you choice {_model_type}",
            )
        yield [results_dict[ind] for ind in results_dict.keys()]
    total_results = [results_dict[ind] for ind in range(len(prompts))]
    if _comic_type != "No typesetting (default)":
        captions = prompt_array.splitlines()
        captions = [caption.replace("[NC]", "") for caption in captions]
        captions = [
            caption.split("#")[-1] if "#" in caption else caption
            for caption in captions
        ]
        font_path = os.path.join("fonts", font_choice)
        font = ImageFont.truetype(font_path, int(45))
        total_results = (
            get_comic(total_results, _comic_type, captions=captions, font=font)
            + total_results
        )
    save_results(pipe.unet, total_results)

    yield total_results
980
+
981
+
982
def array2string(arr):
    """Join the strings in ``arr`` with newlines (no trailing newline).

    Args:
        arr: iterable of strings.

    Returns:
        The items separated by ``"\\n"``; an empty string for empty input.
    """
    return "\n".join(arr)
991
+
992
+
993
+ #################################################
994
+ #################################################
995
+ ### define the interface
996
+
997
+ css = """
998
+ :root {
999
+ --main-blue: #4A90E2;
1000
+ --tech-purple: #9B59B6;
1001
+ --fresh-green: #2ECC71;
1002
+ --bg-gradient: linear-gradient(135deg, #F5F7FA 0%, #E8F4FF 100%);
1003
+ }
1004
+
1005
+ body {
1006
+ background: var(--bg-gradient);
1007
+ min-height: 100vh;
1008
+ }
1009
+
1010
+ .gr-container {
1011
+ max-width: 1200px!important;
1012
+ margin: 0 auto!important;
1013
+ gap: 40px!important;
1014
+ }
1015
+
1016
+ .upload-section {
1017
+ border: 2px dashed var(--main-blue)!important;
1018
+ border-radius: 20px!important;
1019
+ padding: 30px!important;
1020
+ }
1021
+
1022
+ .generate-btn {
1023
+ background: var(--fresh-green)!important;
1024
+ color: white!important;
1025
+ border-radius: 12px!important;
1026
+ }
1027
+
1028
+ /* 其他样式保持与提供的一致 */
1029
+ """
1030
+
1031
# Prompt template mapping: template label -> general (character) prompt,
# negative prompt and per-scene prompts.
PROMPT_TEMPLATES = {
    "双人冒险故事 🔥": {
        "general": "[Bob] A man, wearing a black suit\n[Alice]a woman, wearing a white shirt",
        "negative": "bad anatomy, bad hands, missing fingers...",  # keep the original negative prompt
        "scenes": [
            "[Bob] at home, read new paper #at home...",
            "[Bob] on the road, near the forest",
            # ... other scenes
        ]
    },
    "夜间森林探险 🌲": {
        "general": "[Bob] A man img, wearing a black suit...",
        "negative": "bad anatomy...",
        # NOTE(review): `[...]` is a list holding the Ellipsis object, not an
        # empty list; joining it as strings will raise TypeError. Replace with
        # real scene prompts.
        "scenes": [...]
    },
    # other templates...
}
1049
+
1050
def array2string(arr):
    """Return the items of ``arr`` joined by newline characters."""
    separator = "\n"
    joined = separator.join(arr)
    return joined
1052
+
1053
def load_example(prompt_key, style, files, height, width):
    """Handle example loading.

    Looks up ``prompt_key`` in ``PROMPT_TEMPLATES`` and returns a dict that is
    meant to map UI components to their new values.

    NOTE(review): ``general_prompt``, ``negative_prompt``, ``prompt_array``,
    ``G_height`` and ``G_width`` have no visible module-level definition in
    this part of the file, so this function presumably raises NameError when
    called -- confirm. Also, ``style`` and ``files`` here are this function's
    own parameters (the example *values*), not the UI components of the same
    name, so they end up as dict keys; a list-valued ``files`` would be
    unhashable.
    """
    template = PROMPT_TEMPLATES[prompt_key]
    return {
        general_prompt: template["general"],
        negative_prompt: template["negative"],
        prompt_array: array2string(template["scenes"]),
        style: style,
        files: files,
        G_height: height,
        G_width: width
    }
1065
+
1066
# Gradio UI definition. NOTE(review): the original indentation was lost in
# this copy; the nesting below is reconstructed from the section comments.
with gr.Blocks(css=css, title="AI研学旅记") as demo:
    # State variables
    binary_matrixes = gr.State([])
    color_layout = gr.State([])

    # Title
    gr.Markdown("<h1 class='main-title'>我的AI研学旅记</h1>")

    with gr.Row(elem_id="container"):
        # Left-hand input area
        with gr.Column(elem_classes="input-area"):
            # === File upload ===
            with gr.Group(elem_classes="upload-section"):
                files = gr.Files(
                    label="上传研学照片",
                    file_types=["image"],
                    file_count="multiple",
                    elem_id="fileInput"
                )
                uploaded_files = gr.Gallery(
                    label="您的照片",
                    columns=5,
                    rows=1,
                    height=200,
                    visible=False
                )

            # === Prompt template selection ===
            prompt_btns = gr.Radio(
                choices=list(PROMPT_TEMPLATES.keys()),
                label="场景模板",
                interactive=True,
                elem_classes="prompt-section"
            )

            # === Style selection ===
            # NOTE(review): the first choice contains mis-encoded characters and
            # therefore does not match the default `value` below.
            style = gr.Dropdown(
                choices=["🎞️ ���本动漫风", "🌸 电影影视风", "🎨 摄影写真风", "🌟 漫画书风", "🌙 皮克斯/迪士尼角色", "📘 线条艺术风"],
                value="🎞️ 日本动漫风",
                label="艺术风格",
                elem_classes="style-select"
            )

            # === Fixed parameters (hidden sliders with preset values) ===
            with gr.Accordion("高级设置", open=False):
                sa32_ = gr.Slider(visible=False, value=0.5)
                sa64_ = gr.Slider(visible=False, value=0.5)
                id_length_ = gr.Slider(visible=False, value=3)
                guidance_scale = gr.Slider(visible=False, value=5)
                num_steps = gr.Slider(visible=False, value=35)

            # === Generate button ===
            generate_btn = gr.Button("✨ 立即生成", elem_classes="generate-btn")

        # Right-hand output area
        with gr.Column(elem_classes="output-section"):
            out_image = gr.Gallery(
                label="生成结果",
                columns=2,
                height="auto",
                elem_classes="output-image"
            )
            loading = gr.HTML("""
            <div class="loading-overlay">
            <div style="text-align: center">
            <div style="font-size: 2rem">🎨 AI创作中...</div>
            <div style="font-size: 1.2rem; color: #666">正在将回忆转化为数字艺术</div>
            </div>
            </div>
            """, visible=False)

    # === Event handling ===
    # NOTE(review): swap_to_gallery returns three updates but only two outputs
    # are listed here.
    files.upload(
        fn=swap_to_gallery,
        inputs=files,
        outputs=[uploaded_files, files]
    )

    # Show the loading overlay, run generation, then hide the overlay again.
    # NOTE(review): process_generation declares 20 positional parameters, but
    # only 10 inputs are wired here, and in a different order than its
    # signature.
    generate_btn.click(
        fn=lambda: gr.update(visible=True),
        outputs=loading
    ).then(
        fn=process_generation,
        inputs=[
            prompt_btns, style, files,
            sa32_, sa64_, id_length_,
            guidance_scale, num_steps,
            gr.State(768), gr.State(768)  # fixed size
        ],
        outputs=out_image
    ).then(
        fn=lambda: gr.update(visible=False),
        outputs=loading
    )

    # === Examples ===
    # NOTE(review): the two gr.State inputs are fresh components unrelated to
    # the ones passed to process_generation above.
    gr.Examples(
        examples=[
            [
                "双人冒险故事 🔥",
                "🌟 漫画书风",
                get_image_path_list("examples/taylor"),
                768,
                768
            ],
            [
                "夜间森林探险 🌲",
                "🎞️ 日本动漫风",
                get_image_path_list("examples/twoperson"),
                1024,
                1024
            ],
            # other examples...
        ],
        inputs=[prompt_btns, style, files, gr.State(768), gr.State(768)],
        outputs=out_image,
        fn=load_example,
        label="预设场景"
    )

# Keep the original launch settings.
# NOTE(review): server_name="0.0.0.0" binds all interfaces and share=True
# requests a public share link -- confirm this exposure is intended.
demo.launch(server_name="0.0.0.0", share=True)
main.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from diffusers import StableDiffusionPipeline
4
+ import torch
5
+ from PIL import Image
6
+ import io
7
+ import base64
8
+
9
app = FastAPI()

# 🔧 Load the model (you can substitute your locally modified StoryDiffusion
# pipeline). NOTE: the string passed to from_pretrained is a placeholder
# meaning "your model path" and must be replaced before running; the pipeline
# is moved to "cuda" at import time.
pipe = StableDiffusionPipeline.from_pretrained("你的模型路径").to("cuda")
13
+
14
class PromptInput(BaseModel):
    # Request body of POST /generate: the text prompt handed to the pipeline.
    prompt: str
16
+
17
@app.post("/generate")
def generate_image(data: PromptInput):
    """Generate one image for ``data.prompt`` and return it as base64 PNG.

    Returns:
        ``{"image_base64": <str>}`` where the value is the PNG bytes of the
        first generated image, base64-encoded and decoded as UTF-8 text.
    """
    generated = pipe(data.prompt).images[0]

    # Encode the image as base64 so a front end can display it directly.
    png_buffer = io.BytesIO()
    generated.save(png_buffer, format="PNG")
    encoded_bytes = base64.b64encode(png_buffer.getvalue())
    return {"image_base64": encoded_bytes.decode("utf-8")}
page.html ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="zh-CN">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>跳转页面</title>
6
+ <style>
7
+ @import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@500&display=swap');
8
+
9
+ body {
10
+ margin: 0;
11
+ padding: 0;
12
+ height: 100vh;
13
+ display: flex;
14
+ justify-content: center;
15
+ align-items: center;
16
+ background: linear-gradient(135deg, #0f0c29, #302b63, #24243e);
17
+ font-family: 'Orbitron', sans-serif;
18
+ color: #00ffe7;
19
+ overflow: hidden;
20
+ }
21
+
22
+ button {
23
+ font-size: 24px;
24
+ padding: 15px 30px;
25
+ background: rgba(0, 255, 231, 0.2);
26
+ border: 2px solid #00ffe7;
27
+ border-radius: 12px;
28
+ color: #00ffe7;
29
+ cursor: pointer;
30
+ box-shadow: 0 0 15px #00ffe7;
31
+ transition: all 0.3s ease;
32
+ backdrop-filter: blur(5px);
33
+ }
34
+
35
+ button:hover {
36
+ background: rgba(0, 255, 231, 0.4);
37
+ box-shadow: 0 0 25px #00ffe7;
38
+ transform: scale(1.05);
39
+ }
40
+
41
+ .glow {
42
+ position: absolute;
43
+ width: 100%;
44
+ height: 100%;
45
+ background: radial-gradient(circle, rgba(0, 255, 231, 0.1) 0%, transparent 70%);
46
+ animation: pulse 5s infinite;
47
+ }
48
+
49
+ @keyframes pulse {
50
+ 0% { transform: scale(1); opacity: 0.5; }
51
+ 50% { transform: scale(1.2); opacity: 0.2; }
52
+ 100% { transform: scale(1); opacity: 0.5; }
53
+ }
54
+ </style>
55
+ </head>
56
+ <body>
57
+ <div class="glow"></div>
58
+ <button onclick="window.location.href='http://localhost:7861/'">gogogo</button>
59
+ </body>
60
+ </html>
predict.py ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prediction interface for Cog ⚙️
2
+ # https://cog.run/python
3
+
4
+ import os
5
+ import copy
6
+ import random
7
+ import subprocess
8
+ import numpy as np
9
+ import time
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from PIL import ImageFont
13
+ from cog import BasePredictor, Input, Path, BaseModel
14
+ from diffusers import StableDiffusionXLPipeline, DDIMScheduler
15
+ from diffusers.utils import load_image
16
+
17
+ from utils import PhotoMakerStableDiffusionXLPipeline
18
+ from utils.style_template import styles
19
+ from utils.gradio_utils import (
20
+ AttnProcessor2_0 as AttnProcessor,
21
+ ) # with torch2 installed
22
+ from utils.gradio_utils import cal_attn_mask_xl
23
+ from utils.utils import get_comic
24
+
25
# URL of the weights tarball fetched by Predictor.setup when MODEL_CACHE is missing.
MODEL_URL = "https://weights.replicate.delivery/default/HVision_NKU/StoryDiffusion.tar"
# Local directory that holds the weights; its existence is what setup() checks.
MODEL_CACHE = "model_weights"
# Style template names offered as `style_name` choices in Predictor.predict.
STYLE_NAMES = list(styles.keys())
# Fallback template used by apply_style / apply_style_positive for unknown names.
DEFAULT_STYLE_NAME = "Japanese Anime"

# NOTE: `global` statements at module scope have no effect and do not create
# the names. These lines only list the module-level state shared between
# Predictor.predict, set_attention_processor and SpatialAttnProcessor2_0;
# the values are assigned at run time inside those functions.
global total_count, attn_count, cur_step, mask1024, mask4096, attn_procs, unet
global sa32, sa64
global write
global height, width


# The bare string below is disabled code, kept for reference only: it shows how
# the pipelines were loaded and saved under MODEL_CACHE. It is never executed.
"""
# load and upload the weights to replicate.delivery for faster booting on Replicate
models_dict = {
    "RealVision": "SG161222/RealVisXL_V4.0",
    "Unstable": "stablediffusionapi/sdxl-unstable-diffusers-y",
}
# photomaker_path = hf_hub_download(repo_id="TencentARC/PhotoMaker", filename="photomaker-v1.bin", repo_type="model")
photomaker_path = f"{MODEL_CACHE}/PhotoMaker/photomaker-v1.bin"

pipe_unstable = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
    models_dict["Unstable"],
    torch_dtype=torch.float16,
    use_safetensors=False,
)
pipe_unstable.save_pretrained(f"{MODEL_CACHE}/Unstable/stablediffusionapi/sdxl-unstable-diffusers-y")

pipe_realvision = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
    models_dict["RealVision"], torch_dtype=torch.float16, use_safetensors=True
)
pipe_realvision.save_pretrained(f"{MODEL_CACHE}/RealVision/SG161222/RealVisXL_V4.0")
"""
57
+
58
+
59
# Structured result returned by Predictor.predict.
class ModelOutput(BaseModel):
    # Path of the combined comic image written by predict().
    comic: Path
    # Paths of the individually saved generated images.
    individual_images: list[Path]
62
+
63
+
64
def download_weights(url, dest):
    """Download ``url`` to ``dest`` by running ``pget -x`` and report timing.

    Prints the source URL, the destination and the elapsed wall-clock seconds.

    Raises:
        subprocess.CalledProcessError: if ``pget`` exits with a non-zero status.
    """
    started_at = time.time()
    for label, value in (("downloading url: ", url), ("downloading to: ", dest)):
        print(label, value)
    pget_command = ["pget", "-x", url, dest]
    subprocess.check_call(pget_command, close_fds=False)
    elapsed = time.time() - started_at
    print("downloading took: ", elapsed)
70
+
71
+
72
def setup_seed(seed):
    """Seed every random number generator used here with ``seed``.

    Covers PyTorch (CPU and all CUDA devices), NumPy and Python's ``random``,
    then switches cuDNN to deterministic mode.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
78
+
79
+
80
def apply_style_positive(style_name: str, positive: str):
    """Return the positive template of ``style_name`` with ``positive`` filled in.

    Unknown style names fall back to ``DEFAULT_STYLE_NAME``. The negative half
    of the template is ignored.
    """
    fallback_template = styles[DEFAULT_STYLE_NAME]
    positive_template, _unused_negative = styles.get(style_name, fallback_template)
    return positive_template.replace("{prompt}", positive)
83
+
84
+
85
def apply_style(style_name: str, positives: list, negative: str = ""):
    """Apply the ``style_name`` template to several prompts at once.

    Unknown style names fall back to ``DEFAULT_STYLE_NAME``.

    Returns:
        A ``(styled_positives, combined_negative)`` tuple: every entry of
        ``positives`` substituted into the positive template, and the
        template's negative prompt joined to ``negative`` by a single space.
    """
    fallback_template = styles[DEFAULT_STYLE_NAME]
    positive_template, negative_template = styles.get(style_name, fallback_template)
    styled_positives = []
    for positive in positives:
        styled_positives.append(positive_template.replace("{prompt}", positive))
    combined_negative = negative_template + " " + negative
    return styled_positives, combined_negative
90
+
91
+
92
def set_attention_processor(unet, id_length, is_ipadapter=False):
    """Install the attention processors used for consistent self-attention.

    Self-attention layers (names ending in ``attn1.processor``) inside
    ``up_blocks`` get a ``SpatialAttnProcessor2_0``; every other layer gets the
    plain ``AttnProcessor`` unless ``is_ipadapter`` is set, in which case
    cross-attention layers get an ``IPAttnProcessor2_0``.

    Side effects: resets the module-level ``total_count`` and sets it to the
    number of ``SpatialAttnProcessor2_0`` instances installed, replaces the
    processors on ``unet`` and prints a short summary.

    Args:
        unet: model exposing ``attn_processors``, ``config``, ``device`` and
            ``set_attn_processor``.
        id_length: forwarded to every ``SpatialAttnProcessor2_0``.
        is_ipadapter: use IP-Adapter processors for cross-attention layers.

    Raises:
        NotImplementedError: ``is_ipadapter`` is true but ``IPAttnProcessor2_0``
            is not available in this module.
        ValueError: an IP-Adapter layer name starts with none of
            ``mid_block`` / ``up_blocks`` / ``down_blocks``.
    """
    global total_count

    # IPAttnProcessor2_0 is neither defined nor imported in this file, so the
    # IP-Adapter path used to die with a NameError half-way through the loop.
    # Fail up front, before any state is touched, with an actionable message.
    if is_ipadapter and "IPAttnProcessor2_0" not in globals():
        raise NotImplementedError(
            "is_ipadapter=True requires IPAttnProcessor2_0, which is not "
            "available in this module."
        )

    total_count = 0
    attn_procs = {}
    for name in unet.attn_processors.keys():
        cross_attention_dim = (
            None
            if name.endswith("attn1.processor")
            else unet.config.cross_attention_dim
        )
        if cross_attention_dim is None:
            if name.startswith("up_blocks"):
                attn_procs[name] = SpatialAttnProcessor2_0(id_length=id_length)
                total_count += 1
            else:
                attn_procs[name] = AttnProcessor()
        elif is_ipadapter:
            # hidden_size is only needed here; resolve it per layer so a stale
            # value from a previous iteration can never be reused.
            if name.startswith("mid_block"):
                hidden_size = unet.config.block_out_channels[-1]
            elif name.startswith("up_blocks"):
                block_id = int(name[len("up_blocks.")])
                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
            elif name.startswith("down_blocks"):
                block_id = int(name[len("down_blocks.")])
                hidden_size = unet.config.block_out_channels[block_id]
            else:
                raise ValueError(f"Unrecognised attention processor name: {name}")
            attn_procs[name] = IPAttnProcessor2_0(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=1,
                num_tokens=4,
            ).to(unet.device, dtype=torch.float16)
        else:
            attn_procs[name] = AttnProcessor()

    unet.set_attn_processor(copy.deepcopy(attn_procs))
    print("Successfully load paired self-attention")
    print(f"Number of the processor : {total_count}")
130
+
131
+
132
+ #################################################
133
+ ########Consistent Self-Attention################
134
+ #################################################
135
class SpatialAttnProcessor2_0(torch.nn.Module):
    r"""
    Self-attention processor for consistent ("paired") self-attention, built on
    PyTorch 2.0 `F.scaled_dot_product_attention`.

    Behaviour is driven by module-level globals (`write`, `cur_step`,
    `attn_count`, `total_count`, `mask1024`, `mask4096`, `sa32`, `sa64`,
    `height`, `width`). While `write` is true each call stores the incoming
    hidden states in `self.id_bank` under the current step; while it is false
    the stored states are concatenated with the current hidden states and used
    as the key/value input.

    Args:
        hidden_size (`int`, *optional*):
            Stored on the instance; not read by any method of this class.
        cross_attention_dim (`int`, *optional*):
            Stored on the instance; not read by any method of this class.
        id_length (`int`, defaults to 4):
            Number of leading batch entries stored as the first id-bank slot.
        device (`str`, defaults to `"cuda"`):
            Device the id-bank tensors are moved to and that is passed to
            `cal_attn_mask_xl`.
        dtype (`torch.dtype`, defaults to `torch.float16`):
            dtype passed to `cal_attn_mask_xl`.
    """

    def __init__(
        self,
        hidden_size=None,
        cross_attention_dim=None,
        id_length=4,
        device="cuda",
        dtype=torch.float16,
    ):
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )
        self.device = device
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        # One extra slot on top of the id entries for the entry being generated.
        self.total_length = id_length + 1
        self.id_length = id_length
        # Maps denoising step -> [first id_length hidden states, the remainder].
        self.id_bank = {}

    def __call__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Process one self-attention call and advance the shared step counters.

        Returns the attended hidden states. Once `attn_count` reaches
        `total_count` the global `cur_step` is incremented and both attention
        masks are regenerated with `cal_attn_mask_xl`.
        """
        global total_count, attn_count, cur_step, mask1024, mask4096
        global sa32, sa64
        global write
        global height, width
        if write:
            # Write pass: remember this step's hidden states, split at id_length.
            self.id_bank[cur_step] = [
                hidden_states[: self.id_length],
                hidden_states[self.id_length :],
            ]
        else:
            # Read pass: interleave the stored states with the current ones.
            # Raises KeyError if no write pass stored an entry for cur_step.
            encoder_hidden_states = torch.cat(
                (
                    self.id_bank[cur_step][0].to(self.device),
                    hidden_states[:1],
                    self.id_bank[cur_step][1].to(self.device),
                    hidden_states[1:],
                )
            )
        # skip in early step
        if cur_step < 5:
            hidden_states = self.__call2__(
                attn, hidden_states, encoder_hidden_states, attention_mask, temb
            )
        else:  # 256 1024 4096
            # Paired attention is applied stochastically: with probability 0.7
            # before step 20 and 0.9 afterwards.
            random_number = random.random()
            if cur_step < 20:
                rand_num = 0.3
            else:
                rand_num = 0.1
            if random_number > rand_num:
                # Pick the mask by token count: (height // 32) * (width // 32)
                # tokens selects mask1024, anything else mask4096.
                if not write:
                    if hidden_states.shape[1] == (height // 32) * (width // 32):
                        attention_mask = mask1024[
                            mask1024.shape[0] // self.total_length * self.id_length :
                        ]
                    else:
                        attention_mask = mask4096[
                            mask4096.shape[0] // self.total_length * self.id_length :
                        ]
                else:
                    if hidden_states.shape[1] == (height // 32) * (width // 32):
                        attention_mask = mask1024[
                            : mask1024.shape[0] // self.total_length * self.id_length,
                            : mask1024.shape[0] // self.total_length * self.id_length,
                        ]
                    else:
                        attention_mask = mask4096[
                            : mask4096.shape[0] // self.total_length * self.id_length,
                            : mask4096.shape[0] // self.total_length * self.id_length,
                        ]
                hidden_states = self.__call1__(
                    attn, hidden_states, encoder_hidden_states, attention_mask, temb
                )
            else:
                # Plain self-attention: no stored states are passed as context.
                hidden_states = self.__call2__(
                    attn, hidden_states, None, attention_mask, temb
                )
        attn_count += 1
        if attn_count == total_count:
            # All counted processors have run for this step: move to the next
            # step and draw fresh masks.
            attn_count = 0
            cur_step += 1
            mask1024, mask4096 = cal_attn_mask_xl(
                self.total_length,
                self.id_length,
                sa32,
                sa64,
                height,
                width,
                device=self.device,
                dtype=self.dtype,
            )

        return hidden_states

    def __call1__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Paired attention: tokens of several batch entries share one sequence.

        The batch is regrouped into sequences of `img_nums * nums_token`
        tokens before attention and restored to `total_batch_size` afterwards.
        """
        residual = hidden_states
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)
        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # `height` / `width` here are locals of this method, not the
            # module-level globals of the same name.
            total_batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                total_batch_size, channel, height * width
            ).transpose(1, 2)
        total_batch_size, nums_token, channel = hidden_states.shape
        # The batch is treated as two equal halves.
        # NOTE(review): presumably the two classifier-free-guidance branches; confirm.
        img_nums = total_batch_size // 2
        hidden_states = hidden_states.view(-1, img_nums, nums_token, channel).reshape(
            -1, img_nums * nums_token, channel
        )

        batch_size, sequence_length, _ = hidden_states.shape

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        else:
            # Group the context into runs of (id_length + 1) entries per sequence.
            encoder_hidden_states = encoder_hidden_states.view(
                -1, self.id_length + 1, nums_token, channel
            ).reshape(-1, (self.id_length + 1) * nums_token, channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Undo the regrouping: back to one sequence per original batch entry.
        hidden_states = hidden_states.transpose(1, 2).reshape(
            total_batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                total_batch_size, channel, height, width
            )
        if attn.residual_connection:
            hidden_states = hidden_states + residual
        hidden_states = hidden_states / attn.rescale_output_factor
        # print(hidden_states.shape)
        return hidden_states

    def __call2__(
        self,
        attn,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        temb=None,
    ):
        """Per-entry attention: every batch entry attends within its own sequence.

        When `encoder_hidden_states` is given it is regrouped into runs of
        `id_length + 1` entries and used as key/value input; otherwise the
        hidden states attend to themselves.
        """
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            # `height` / `width` here are locals of this method, not the
            # module-level globals of the same name.
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        batch_size, sequence_length, channel = hidden_states.shape
        # print(hidden_states.shape)
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(
                attention_mask, sequence_length, batch_size
            )
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(
                batch_size, attn.heads, -1, attention_mask.shape[-1]
            )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states  # B, N, C
        else:
            encoder_hidden_states = encoder_hidden_states.view(
                -1, self.id_length + 1, sequence_length, channel
            ).reshape(-1, (self.id_length + 1) * sequence_length, channel)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
409
+
410
+
411
class Predictor(BasePredictor):
    # Cog predictor that generates a multi-frame comic with a consistent character.

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)

        photomaker_path = f"{MODEL_CACHE}/PhotoMaker/photomaker-v1.bin"

        # Plain SDXL pipelines, used when no reference image is supplied.
        self.sdxl_pipe_unstable = StableDiffusionXLPipeline.from_pretrained(
            f"{MODEL_CACHE}/Unstable/sdxl/stablediffusionapi/sdxl-unstable-diffusers-y",
            torch_dtype=torch.float16,
        )
        self.sdxl_pipe_realvision = StableDiffusionXLPipeline.from_pretrained(
            f"{MODEL_CACHE}/RealVision/sdxl/SG161222/RealVisXL_V4.0",
            torch_dtype=torch.float16,
        )

        # PhotoMaker pipelines, used when a reference image is supplied.
        self.pipe_unstable = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
            f"{MODEL_CACHE}/Unstable/stablediffusionapi/sdxl-unstable-diffusers-y",
            torch_dtype=torch.float16,
            use_safetensors=False,
        )
        self.pipe_unstable.load_photomaker_adapter(
            os.path.dirname(photomaker_path),
            subfolder="",
            weight_name=os.path.basename(photomaker_path),
            trigger_word="img",  # define the trigger word
        )

        self.pipe_realvision = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
            f"{MODEL_CACHE}/RealVision/SG161222/RealVisXL_V4.0",
            torch_dtype=torch.float16,
            use_safetensors=True,
        )
        self.pipe_realvision.load_photomaker_adapter(
            os.path.dirname(photomaker_path),
            subfolder="",
            weight_name=os.path.basename(photomaker_path),
            trigger_word="img",  # define the trigger word
        )
        # NOTE(review): only the RealVision PhotoMaker pipeline gets FreeU and
        # fuse_lora() here; confirm the Unstable one is meant to skip fuse_lora().
        self.pipe_realvision.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)
        self.pipe_realvision.fuse_lora()

    @torch.inference_mode()
    def predict(
        self,
        sd_model: str = Input(
            description="Choose a model",
            choices=["Unstable", "RealVision"],
            default="Unstable",
        ),
        ref_image: Path = Input(
            description="Reference image for the character",
            default=None,
        ),
        character_description: str = Input(
            description="General description of the character. If ref_image above is provided, making sure to follow the class word you want to customize with the trigger word 'img', such as: 'man img' or 'woman img' or 'girl img'",
            default="a man, wearing black suit",
        ),
        negative_prompt: str = Input(
            description="Describe things you do not want to see in the output",
            default="bad anatomy, bad hands, missing fingers, extra fingers, three hands, three legs, bad arms, missing legs, missing arms, poorly drawn face, bad face, fused face, cloned face, three crus, fused feet, fused thigh, extra crus, ugly fingers, horn, cartoon, cg, 3d, unreal, animate, amputation, disconnected limbs",
        ),
        comic_description: str = Input(
            description="Comic Description. Each frame is divided by a new line. Only the first 10 prompts are valid for demo speed! For comic_description NOT using ref_image: (1) Support Typesetting Style and Captioning. By default, the prompt is used as the caption for each image. If you need to change the caption, add a '#' at the end of each line. Only the part after the '#' will be added as a caption to the image. (2) The [NC] symbol is used as a flag to indicate that no characters should be present in the generated scene images. If you want do that, prepend the '[NC]' at the beginning of the line.",
            default="at home, read new paper #at home, The newspaper says there is a treasure house in the forest.\non the road, near the forest\n[NC] The car on the road, near the forest #He drives to the forest in search of treasure.\n[NC]A tiger appeared in the forest, at night \nvery frightened, open mouth, in the forest, at night\nrunning very fast, in the forest, at night\n[NC] A house in the forest, at night #Suddenly, he discovers the treasure house!\nin the house filled with treasure, laughing, at night #He is overjoyed inside the house.",
        ),
        style_name: str = Input(
            description="Style template",
            choices=STYLE_NAMES,
            default=DEFAULT_STYLE_NAME,
        ),
        comic_style: str = Input(
            description="Select the comic style for the combined comic",
            choices=["Four Pannel", "Classic Comic Style"],
            default="Classic Comic Style",
        ),
        style_strength_ratio: int = Input(
            description="Style strength of Ref Image (%), only used if ref_image is provided",
            default=20,
            ge=15,
            le=50,
        ),
        image_width: int = Input(
            description="Width of output image",
            choices=[
                256, 288, 320, 352, 384, 416, 448, 480, 512,
                544, 576, 608, 640, 672, 704, 736, 768, 800,
                832, 864, 896, 928, 960, 992, 1024,
            ],
            default=768,
        ),
        image_height: int = Input(
            description="Height of output image",
            choices=[
                256, 288, 320, 352, 384, 416, 448, 480, 512,
                544, 576, 608, 640, 672, 704, 736, 768, 800,
                832, 864, 896, 928, 960, 992, 1024,
            ],
            default=768,
        ),
        num_steps: int = Input(
            description="Number of sample steps", ge=20, le=50, default=25
        ),
        guidance_scale: float = Input(
            description="Scale for classifier-free guidance", ge=0.1, le=10, default=5
        ),
        seed: int = Input(
            description="Random seed. Leave blank to randomize the seed", default=None
        ),
        sa32_setting: float = Input(
            description="The degree of Paired Attention at 32 x 32 self-attention layers",
            default=0.5,
            ge=0,
            le=1.0,
        ),
        sa64_setting: float = Input(
            description="The degree of Paired Attention at 64 x 64 self-attention layers",
            default=0.5,
            ge=0,
            le=1.0,
        ),
        num_ids: int = Input(
            description="Number of id images in total images. This should not exceed total number of line-separated prompts",
            default=3,
        ),
        output_format: str = Input(
            description="Format of the output images",
            choices=["webp", "jpg", "png"],
            default="webp",
        ),
        output_quality: int = Input(
            description="Quality of the output images, from 0 to 100. 100 is best quality, 0 is lowest quality",
            default=80,
            ge=0,
            le=100,
        ),
    ) -> ModelOutput:
        """Run a single prediction on the model

        The first ``num_ids`` prompts are generated together as identity
        images (write pass of the attention processors); the remaining prompts
        are generated one at a time against the stored identity features.

        Returns:
            ``ModelOutput`` with the combined comic and the individual images,
            saved as ``/tmp/out-<index>.<ext>`` in prompt order.

        Raises:
            AssertionError: when the description is empty, the trigger word
                is missing / ``[NC]`` is used with ``ref_image``, or ``num_ids``
                exceeds the number of prompts.
        """

        # Shared state read by SpatialAttnProcessor2_0 during the pipeline calls.
        global total_count, attn_count, cur_step, mask1024, mask4096, attn_procs, unet
        global sa32, sa64
        global write
        global height, width

        assert (
            len(character_description.strip()) > 0
        ), "Please provide the description of the character."

        if ref_image is not None:
            assert (
                "img" in character_description
            ), "When using ref_image, please add the trigger word 'img' behind the class word you want to customize, such as: man img or woman img"
            assert (
                "[NC]" not in comic_description
            ), "You should not use trigger word [NC] when ref_image is provided."

        height = image_height
        width = image_width
        id_length = num_ids
        sa32 = sa32_setting
        sa64 = sa64_setting

        # Only the first 10 lines are used (see the comic_description help text).
        clipped_prompts = comic_description.splitlines()[:10]
        print(clipped_prompts)
        # Prefix the character description unless the line is flagged [NC].
        prompts = [
            (
                character_description + "," + prompt
                if "[NC]" not in prompt
                else prompt.replace("[NC]", "")
            )
            for prompt in clipped_prompts
        ]
        print(prompts)
        # Everything after the last '#' is a caption, not part of the prompt.
        prompts = [
            prompt.rpartition("#")[0].strip() if "#" in prompt else prompt.strip()
            for prompt in prompts
        ]
        print(prompts)
        assert id_length <= len(
            prompts
        ), "id_length should not exceed total number of line-separated prompts"

        id_prompts = prompts[:id_length]
        real_prompts = prompts[id_length:]

        if seed is None:
            seed = int.from_bytes(os.urandom(2), "big")
        print(f"Using seed: {seed}")

        device = "cuda"
        setup_seed(seed)
        generator = torch.Generator(device=device).manual_seed(seed)

        torch.cuda.empty_cache()

        model_type = "original" if ref_image is None else "Photomaker"

        if model_type == "original":
            # NOTE(review): without a reference image the pipeline is picked
            # from style_name only and sd_model is not consulted; confirm intent.
            pipe = (
                self.sdxl_pipe_realvision
                if style_name == "(No style)"
                else self.sdxl_pipe_unstable
            )
            pipe = pipe.to(device)
        else:
            if sd_model != "RealVision" and style_name != "(No style)":
                pipe = self.pipe_unstable.to(device)
            else:
                pipe = self.pipe_realvision.to(device)
            pipe.id_encoder.to(device)

        # Write pass: the processors record identity features per step.
        write = True
        cur_step = 0
        attn_count = 0

        set_attention_processor(pipe.unet, id_length, is_ipadapter=False)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        # Applied once for both pipeline kinds.
        pipe.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2)

        id_prompts, negative_prompt = apply_style(
            style_name, id_prompts, negative_prompt
        )

        if model_type == "original":
            id_images = pipe(
                id_prompts,
                num_inference_steps=num_steps,
                guidance_scale=guidance_scale,
                height=height,
                width=width,
                negative_prompt=negative_prompt,
                generator=generator,
            ).images
        else:
            input_id_images = [load_image(str(ref_image))]
            start_merge_step = int(float(style_strength_ratio) / 100 * num_steps)
            id_images = pipe(
                id_prompts,
                input_id_images=input_id_images,
                num_inference_steps=num_steps,
                guidance_scale=guidance_scale,
                start_merge_step=start_merge_step,
                height=height,
                width=width,
                negative_prompt=negative_prompt,
                generator=generator,
            ).images

        # Read pass: each remaining prompt attends to the stored identity features.
        real_images = []
        write = False
        for real_prompt in real_prompts:
            cur_step = 0
            real_prompt = apply_style_positive(style_name, real_prompt)
            if model_type == "original":
                real_images.append(
                    pipe(
                        real_prompt,
                        num_inference_steps=num_steps,
                        guidance_scale=guidance_scale,
                        height=height,
                        width=width,
                        negative_prompt=negative_prompt,
                        generator=generator,
                    ).images[0]
                )
            else:
                real_images.append(
                    pipe(
                        real_prompt,
                        input_id_images=input_id_images,
                        num_inference_steps=num_steps,
                        guidance_scale=guidance_scale,
                        start_merge_step=start_merge_step,
                        height=height,
                        width=width,
                        negative_prompt=negative_prompt,
                        generator=generator,
                    ).images[0]
                )

        # Same order as the prompts and as the comic panels. The previous
        # prepend-then-reverse bookkeeping emitted the identity images reversed.
        ordered_images = list(id_images) + real_images

        captions = [caption.replace("[NC]", "") for caption in clipped_prompts]
        captions = [
            caption.split("#")[-1].strip() if "#" in caption else caption.strip()
            for caption in captions
        ]

        comic = get_comic(
            ordered_images,
            comic_style,
            captions=captions,
            font=ImageFont.truetype("./fonts/Inkfree.ttf", int(45)),
        )

        extension = output_format.lower()
        extension = "jpeg" if extension == "jpg" else extension
        comic_out = f"/tmp/comic.{extension}"
        comic[0].save(comic_out)

        save_params = {"format": extension.upper()}
        if output_format != "png":
            save_params["quality"] = output_quality
            save_params["optimize"] = True

        output_paths = []
        for index, sample in enumerate(ordered_images):
            output_filename = f"/tmp/out-{index}.{extension}"
            sample.save(output_filename, **save_params)
            output_paths.append(Path(output_filename))

        return ModelOutput(comic=Path(comic_out), individual_images=output_paths)
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.22.0
2
+ xformers==0.0.20
3
+ torch==2.0.1
4
+ torchvision==0.15.2
5
+ diffusers==0.25.0
6
+ transformers==4.36.2
7
+ huggingface-hub==0.20.2
8
+ spaces==0.19.4
9
+ numpy
10
+ accelerate
11
+ safetensors
12
+ omegaconf
13
+ peft
14
+ httpx==0.27.0
15
+ safetensors==0.4.0
run.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Start the Gradio app in the background, give it time to boot, then expose
# its local port through a cloudflared tunnel.

echo "✨ 启动 Gradio(本地监听 7860 端口)"
python gradio_app_sdxl_specific_id_low_vram.py &  # run in the background
APP_PID=$!

# Background jobs of a non-interactive shell survive the script; stop the app
# when this script exits so it is not left orphaned holding the port and GPU.
trap 'kill "$APP_PID" 2>/dev/null' EXIT

sleep 60

# Do not open a tunnel to nothing if the app died while starting up.
if ! kill -0 "$APP_PID" 2>/dev/null; then
    echo "Gradio app exited during startup; not starting the tunnel." >&2
    exit 1
fi

echo "🌐 尝试使用 cloudflared 暴露端口..."
cloudflared tunnel --url http://localhost:7860
storydiffusionpipeline.py ADDED
File without changes
test.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
"""Print whether CUDA is available plus the cuDNN version and enabled flag."""
import torch

cudnn = torch.backends.cudnn
cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)
cudnn_version = cudnn.version()
print("cuDNN version:", cudnn_version)
print("cuDNN enabled:", cudnn.enabled)
update.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Update History
2
+
3
+ ### Update 2023-05-14
4
+
5
+ - Support two persons; support for more characters will also be possible in the future. In PhotoMaker, currently only one person can appear in a single image.
6
+ - Auto Save generated images in the ‘results’ folder.
7
+ - I have changed the way to fill in prompts; please refer to the example provided.
8
+
9
+ ### Update 2024-05-08
10
+
11
+ - Support [NC] in the Ref Image model (PhotoMaker works best at 1024x1024 but may use a lot of GPU memory; I recommend using as large a resolution as possible).
12
+
13
+ <img src="results_examples/image1.png" height=100>
14
+
15
+ - Merged the contribution by @cryptowooser to support the latest Pillow. You may need to update Pillow if you are using an old version.
16
+
17
+
18
+
19
+ ### Todo
20
+
21
+ - Support adding captions on all images for the Classic Comic typesetting style.
22
+
23
+
24
+
25
+
26
+ ### Welcome to contribute
27
+
28
+ - Various layout styles.