import os
from huggingface_hub import login, snapshot_download
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from dotenv import load_dotenv
import gradio as gr
from diffusers import FluxPipeline
import torch
import spaces  # Hugging Face Spaces (ZeroGPU) module

# -----------------------
# Authentication
# -----------------------
# Log in first: FLUX.1-dev is a gated repo, so the downloads below need the token.
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if HUGGINGFACE_TOKEN:
    login(token=HUGGINGFACE_TOKEN)

# -----------------------
# Pre-cache models at startup
# -----------------------
snapshot_download("Salesforce/blip-image-captioning-large", etag_timeout=120)
snapshot_download("noamrot/FuseCap", etag_timeout=120)
snapshot_download("black-forest-labs/FLUX.1-dev", etag_timeout=300)

# -----------------------
# Load models
# -----------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP-large captioner
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

# FuseCap captioner (BLIP fine-tuned for fused, detail-rich captions)
fusecap_processor = BlipProcessor.from_pretrained("noamrot/FuseCap")
fusecap_model = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# FLUX.1-dev text-to-image pipeline
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device)

# -----------------------
# Options
# -----------------------
fabrics = ['cotton', 'silk', 'denim', 'linen', 'polyester', 'wool', 'velvet']
patterns = ['striped', 'floral', 'geometric', 'abstract', 'solid', 'polka dots']
textile_designs = ['woven texture', 'embroidery', 'printed fabric', 'hand-dyed', 'quilting']

# -----------------------
# Inference function
# -----------------------
@spaces.GPU(duration=150)
def generate_caption_and_image(image, fabric, pattern, design):
    if not (image and fabric and pattern and design):
        return None

    img = image.convert("RGB")

    # Caption with FuseCap (prompted generation)
    inputs = fusecap_processor(img, "a picture of ", return_tensors="pt").to(device)
    out = fusecap_model.generate(**inputs, num_beams=3)
    caption_fusecap = fusecap_processor.decode(out[0], skip_special_tokens=True)

    # Caption with BLIP-large (unconditional captioning)
    inputs = blip_processor(img, return_tensors="pt").to(device)
    out = blip_model.generate(**inputs)
    caption_blip = blip_processor.decode(out[0], skip_special_tokens=True)

    # Compose the text-to-image prompt from the two captions and the user's selections
    prompt = (
        f"Design a high-quality, stylish clothing item that combines the essence of {caption_blip} and {caption_fusecap}. "
        f"Use luxurious {fabric} fabric with intricate {design} design elements. "
        f"Incorporate {pattern} patterns to elevate the garment's aesthetic. "
        "Ensure sophistication, innovation, and timeless elegance."
    )

    # Generate the design image with FLUX
    result = pipe(
        prompt,
        height=1024,
        width=1024,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]
    return result

# -----------------------
# Gradio UI
# -----------------------
# No live=True here: each call runs a ~150 s GPU generation, so the explicit Submit button is used.
iface = gr.Interface(
    fn=generate_caption_and_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Radio(fabrics, label="Select Fabric"),
        gr.Radio(patterns, label="Select Pattern"),
        gr.Radio(textile_designs, label="Select Textile Design"),
    ],
    outputs=gr.Image(label="Generated Design"),
)

iface.launch()
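# -----------------------
# Assumed dependencies (sketch)
# -----------------------
# The imports above imply roughly the following requirements.txt for the Space.
# This list is inferred from the code, not taken from the source; adjust and pin versions as needed.
#
#   torch
#   transformers
#   diffusers
#   huggingface_hub
#   accelerate        # assumption: commonly required by diffusers pipelines
#   sentencepiece     # assumption: used by FLUX's T5 text encoder
#   gradio
#   spaces
#   python-dotenv
#   Pillow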