"""Gradio demo that transcribes text from an image with Llama-3.2 Vision."""

from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
import gradio as gr
import spaces

# Initialize model and processor once at import time so every request
# reuses the same weights.
ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,
).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

# Upper bound on the number of tokens generated per transcription.
MAX_NEW_TOKENS = 250

# Instruction sent to the model together with the image.
PROMPT = (
    "Output ONLY the raw text exactly as it appears in the image. Do not add anything.\n\n"
    "The image may contain both handwritten and printed text in French and/or English, including punctuation and underscores.\n\n"
    "Your task: Transcribe all visible text exactly, preserving:\n"
    "- All characters, accents, punctuation, spacing, and line breaks.\n"
    "- The original reading order and layout, including tables and forms if present.\n\n"
    "Rules:\n"
    "- Do NOT add any explanations, summaries, comments, or extra text.\n"
    "- Do NOT duplicate any content.\n"
    "- Do NOT indicate blank space.\n"
    "- Do NOT separate handwritten and printed text.\n"
    "- Do NOT confuse '.' (a period) with '|' (a border).\n\n"
    "Only extract the text that is actually visible in the image, and nothing else."
)


@spaces.GPU
def extract_text(image):
    """Transcribe the text visible in an image file.

    Args:
        image: Path of the image to read, as supplied by the Gradio
            ``Image`` input configured with ``type="filepath"``. ``None``
            when the user submitted the form without uploading anything.

    Returns:
        The model's transcription with surrounding whitespace stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image.")

    # Open inside a context manager so the file handle is released as soon
    # as the RGB copy has been made.
    with Image.open(image) as source:
        rgb_image = source.convert("RGB")

    # Create message structure
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": PROMPT},
                {"type": "image"},
            ],
        }
    ]

    # Process input
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        text=texts,
        images=[rgb_image],
        return_tensors="pt",
    ).to("cuda")

    # Generate output
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them instead of stripping "user"/"assistant" markers from
    # the decoded string, which would also delete those words whenever they
    # occur in the transcribed text itself.
    prompt_length = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_length:]
    result = processor.decode(new_tokens, skip_special_tokens=True).strip()

    print(result)
    return result


# Create Gradio interface
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Image(type="filepath", label="Upload Image"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="Handwritten Text Extractor",
    description="Upload an image containing handwritten text to extract its content.",
)

# Launch the app
demo.launch(debug=True)