Commit 0f4b503 · update readme
Parent(s): 579d11f

Files changed:
- README.md (+1 / -1)
- chat_gradio_demo.py → app.py (renamed, +77 / -30)
README.md CHANGED

@@ -17,7 +17,7 @@ The **joint training** of visual and language instructions effectively improves
 To install the package in an existing environment, run
 
 ```bash
-git clone
+git clone https://github.com/open-mmlab/Multimodal-GPT.git
 pip install -r requirements.txt
 pip install -e. -v
 ```
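After the editable install, a quick import check confirms the environment can see the package and the demo's core dependencies. This is a minimal sketch; the import name `mmgpt` is an assumption, since the diff never states how the package is exposed.

```python
# Hypothetical post-install smoke test; `mmgpt` as the import name is an
# assumption, while gradio and torch are clearly used by app.py below.
import importlib.util

for name in ("mmgpt", "gradio", "torch"):
    found = importlib.util.find_spec(name) is not None
    print(f"{name}: {'ok' if found else 'missing'}")
```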
chat_gradio_demo.py → app.py RENAMED

@@ -12,12 +12,16 @@ Prompt_Tutorial = "Model Inputs = {Prompt}({seperator}Image:\n<image> if image u
 
 
 class Inferencer:
+
     def __init__(self, finetune_path, llama_path, open_flamingo_path):
         ckpt = torch.load(finetune_path, map_location="cpu")
         if "model_state_dict" in ckpt:
             state_dict = ckpt["model_state_dict"]
             # remove the "module." prefix
-            state_dict = {
+            state_dict = {
+                k[7:]: v
+                for k, v in state_dict.items() if k.startswith("module.")
+            }
         else:
             state_dict = ckpt
         tuning_config = ckpt.get("tuning_config")

@@ -35,16 +39,21 @@ class Inferencer:
             tuning_config=tuning_config,
         )
         model.load_state_dict(state_dict, strict=False)
+        model.half()
         model = model.to("cuda")
+        model.eval()
         tokenizer.padding_side = "left"
         tokenizer.add_eos_token = False
         self.model = model
         self.image_processor = image_processor
         self.tokenizer = tokenizer
 
-    def __call__(self, prompt, imgpaths, max_new_token, num_beams, temperature,
+    def __call__(self, prompt, imgpaths, max_new_token, num_beams, temperature,
+                 top_k, top_p, do_sample):
         if len(imgpaths) > 1:
-            raise gr.Error(
+            raise gr.Error(
+                "Current only support one image, please clear gallery and upload one image"
+            )
         lang_x = self.tokenizer([prompt], return_tensors="pt")
         if len(imgpaths) == 0 or imgpaths is None:
             for layer in self.model.lang_encoder._get_decoder_layers():

@@ -65,7 +74,7 @@ class Inferencer:
             images = (Image.open(fp) for fp in imgpaths)
             vision_x = [self.image_processor(im).unsqueeze(0) for im in images]
             vision_x = torch.cat(vision_x, dim=0)
-            vision_x = vision_x.unsqueeze(1).unsqueeze(0)
+            vision_x = vision_x.unsqueeze(1).unsqueeze(0).half()
 
         output_ids = self.model.generate(
             vision_x=vision_x.cuda(),

@@ -78,13 +87,15 @@ class Inferencer:
             top_p=top_p,
             do_sample=do_sample,
         )[0]
-        generated_text = self.tokenizer.decode(
+        generated_text = self.tokenizer.decode(
+            output_ids, skip_special_tokens=True)
         # print(generated_text)
         result = generated_text.split(response_split)[-1].strip()
         return result
 
 
 class PromptGenerator:
+
     def __init__(
         self,
         prompt_template=TEMPLATE,
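The new `__init__` strips the `module.` prefix that DistributedDataParallel prepends to parameter names when a checkpoint is saved from the wrapped model, so the keys match the unwrapped model before `load_state_dict(strict=False)`. A standalone sketch of the same pattern, using a toy state dict rather than a real checkpoint:

```python
# Toy stand-in for a checkpoint saved from a DDP-wrapped model; the key
# names here are hypothetical, real ones come from torch.load(finetune_path).
state_dict = {
    "module.encoder.weight": 1.0,
    "module.decoder.bias": 2.0,
}

# Same normalization as the new __init__: drop the 7-character "module."
# prefix. Note that keys without the prefix are discarded by the filter.
state_dict = {
    k[7:]: v
    for k, v in state_dict.items() if k.startswith("module.")
}
print(sorted(state_dict))  # ['decoder.bias', 'encoder.weight']
```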
@@ -106,7 +117,7 @@ class PromptGenerator:
     def get_images(self):
         img_list = list()
         if self.buffer_size > 0:
-            all_history = self.all_history[-2 * (self.buffer_size + 1)
+            all_history = self.all_history[-2 * (self.buffer_size + 1):]
         elif self.buffer_size == 0:
             all_history = self.all_history[-2:]
         else:

@@ -125,7 +136,7 @@ class PromptGenerator:
         prompt_template = self.prompt_template.format(**format_dict)
         ret = prompt_template
         if self.buffer_size > 0:
-            all_history = self.all_history[-2 * (self.buffer_size + 1)
+            all_history = self.all_history[-2 * (self.buffer_size + 1):]
         elif self.buffer_size == 0:
             all_history = self.all_history[-2:]
         else:

@@ -134,9 +145,11 @@ class PromptGenerator:
         have_image = False
         for role, message in all_history[::-1]:
             if message:
-                if type(message) is tuple and message[
+                if type(message) is tuple and message[
+                        1] is not None and not have_image:
                     message, _ = message
-                    context.append(self.sep + "Image:\n<image>" + self.sep +
+                    context.append(self.sep + "Image:\n<image>" + self.sep +
+                                   role + ":\n" + message)
                 else:
                     context.append(self.sep + role + ":\n" + message)
             else:

@@ -162,7 +175,8 @@ def to_gradio_chatbot(prompt_generator):
             max_hw, min_hw = max(image.size), min(image.size)
             aspect_ratio = max_hw / min_hw
             max_len, min_len = 800, 400
-            shortest_edge = int(
+            shortest_edge = int(
+                min(max_len / aspect_ratio, min_len, min_hw))
             longest_edge = int(shortest_edge * aspect_ratio)
             H, W = image.size
             if H > W:
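Both `get_images` and the prompt builder now close the history slice as `self.all_history[-2 * (self.buffer_size + 1):]`, which keeps `buffer_size` previous (user, response) pairs plus the current pair; `buffer_size == 0` keeps only the current pair. A small sketch of that arithmetic on a fabricated history list (the two-entries-per-turn layout and the role names are assumptions, matching the demo's default prefixes):

```python
# Illustrative history: one (role, message) entry per side of each turn.
all_history = [
    ["Instruction", "q1"], ["Response", "a1"],
    ["Instruction", "q2"], ["Response", "a2"],
    ["Instruction", "q3"], ["Response", None],  # current, unanswered turn
]

buffer_size = 1  # keep one previous turn plus the current one
kept = all_history[-2 * (buffer_size + 1):]
print(len(kept))      # 4 entries, i.e. the q2/a2 turn and the q3 turn
print(kept[0])        # ['Instruction', 'q2']
```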
@@ -211,10 +225,12 @@ def bot(
     inputs = state.get_prompt()
     image_paths = state.get_images()[-1:]
 
-    inference_results = inferencer(inputs, image_paths, max_new_token,
+    inference_results = inferencer(inputs, image_paths, max_new_token,
+                                   num_beams, temperature, top_k, top_p,
+                                   do_sample)
     state.all_history[-1][-1] = inference_results
-
-    return state, to_gradio_chatbot(state), "", None, inputs
+    memory_allocated = str(torch.cuda.memory_allocated() / 1024**3) + 'GB'
+    return state, to_gradio_chatbot(state), "", None, inputs, memory_allocated
 
 
 def clear(state):

@@ -222,41 +238,57 @@ def clear(state):
     return state, to_gradio_chatbot(state), "", None, ""
 
 
+title_markdown = ("""
+# 🤖 Multi-modal GPT
+[[Project]](https://github.com/open-mmlab/Multimodal-GPT.git)""")
+
+
 def build_conversation_demo():
-    with gr.Blocks() as demo:
+    with gr.Blocks(title="Multi-modal GPT") as demo:
+        gr.Markdown(title_markdown)
+
         state = gr.State(PromptGenerator())
         with gr.Row():
             with gr.Column(scale=3):
+                memory_allocated = gr.Textbox(
+                    value=init_memory, label="Memory")
                 imagebox = gr.Image(type="filepath")
                 # TODO config parameters
                 with gr.Accordion(
-
-
+                        "Parameters",
+                        open=True,
                 ):
-                    max_new_token_bar = gr.Slider(
-
-
+                    max_new_token_bar = gr.Slider(
+                        0, 1024, 512, label="max_new_token", step=1)
+                    num_beams_bar = gr.Slider(
+                        0.0, 10, 3, label="num_beams", step=1)
+                    temperature_bar = gr.Slider(
+                        0.0, 1.0, 1.0, label="temperature", step=0.01)
                     topk_bar = gr.Slider(0, 100, 20, label="top_k", step=1)
                     topp_bar = gr.Slider(0, 1.0, 1.0, label="top_p", step=0.01)
                     do_sample = gr.Checkbox(True, label="do_sample")
                 with gr.Accordion(
-
-
+                        "Prompt",
+                        open=False,
                 ):
-                    with gr.Accordion(
+                    with gr.Accordion(
+                            "Click to hide the tutorial", open=False):
                         gr.Markdown(Prompt_Tutorial)
                     with gr.Row():
                         ai_prefix = gr.Text("Response", label="AI Prefix")
-                        user_prefix = gr.Text(
+                        user_prefix = gr.Text(
+                            "Instruction", label="User Prefix")
                         seperator = gr.Text("\n\n### ", label="Seperator")
-                        history_buffer = gr.Slider(
+                        history_buffer = gr.Slider(
+                            -1, 10, -1, label="History buffer", step=1)
                     prompt = gr.Text(TEMPLATE, label="Prompt")
                     model_inputs = gr.Textbox(label="Actual inputs for Model")
 
             with gr.Column(scale=6):
                 with gr.Row():
                     with gr.Column():
-                        chatbot = gr.Chatbot(elem_id="chatbot").style(
+                        chatbot = gr.Chatbot(elem_id="chatbot").style(
+                            height=750)
                 with gr.Row():
                     with gr.Column(scale=8):
                         textbox = gr.Textbox(

@@ -268,7 +300,10 @@ def build_conversation_demo():
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         gr.Examples(
             examples=[
-                [
+                [
+                    f"{cur_dir}/docs/images/demo_image.jpg",
+                    "What is in this image?"
+                ],
             ],
             inputs=[imagebox, textbox],
         )

@@ -290,7 +325,10 @@ def build_conversation_demo():
                 topp_bar,
                 do_sample,
             ],
-            [
+            [
+                state, chatbot, textbox, imagebox, model_inputs,
+                memory_allocated
+            ],
         )
         submit_btn.click(
             bot,

@@ -310,9 +348,13 @@ def build_conversation_demo():
                 topp_bar,
                 do_sample,
             ],
-            [
+            [
+                state, chatbot, textbox, imagebox, model_inputs,
+                memory_allocated
+            ],
         )
-        clear_btn.click(clear, [state],
+        clear_btn.click(clear, [state],
+                        [state, chatbot, textbox, imagebox, model_inputs])
     return demo
 
 

@@ -320,7 +362,12 @@ if __name__ == "__main__":
     llama_path = "checkpoints/llama-7b_hf"
     open_flamingo_path = "checkpoints/OpenFlamingo-9B/checkpoint.pt"
     finetune_path = "checkpoints/mmgpt-lora-v0-release.pt"
-
+
+    inferencer = Inferencer(
+        llama_path=llama_path,
+        open_flamingo_path=open_flamingo_path,
+        finetune_path=finetune_path)
+    init_memory = str(torch.cuda.memory_allocated() / 1024**3) + 'GB'
     demo = build_conversation_demo()
     demo.queue(concurrency_count=3)
     IP = "0.0.0.0"
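The `bot` function and the new `Memory` textbox surface allocated GPU memory, formatted as `str(torch.cuda.memory_allocated() / 1024**3) + 'GB'`, with the initial value captured right after the model is built. A minimal sketch of that readout; the CPU-only guard is an addition for convenience here, not part of the demo:

```python
import torch


def memory_readout() -> str:
    """Allocated CUDA memory in GiB, formatted like the demo's Memory box."""
    if not torch.cuda.is_available():
        return "0.0GB"  # guard for CPU-only machines; the demo assumes CUDA
    return str(torch.cuda.memory_allocated() / 1024**3) + 'GB'


print(memory_readout())
```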