# 🎨X-Omni: Reinforcement Learning Makes Discrete Autoregressive Image Generative Models Great Again

import gradio as gr
import numpy as np
import random

import spaces #[uncomment to use ZeroGPU]
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation import GenerationConfig


# Runtime configuration: target device, weight dtype, and the model
# identifiers handed to `from_pretrained` / `init_vision` below.
# NOTE(review): `device` is not referenced in the visible code, which calls
# `.cuda()` directly — confirm whether it is used elsewhere.
device = "cuda"
torch_dtype = torch.bfloat16
model_name_or_path = "X-Omni/X-Omni-En"
flux_model_name_or_path = "zhangxiaosong18/FLUX.1-dev-VAE"

# The tokenizer and model are created once at import time and shared by every
# call to `generate_image`.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, 
    torch_dtype=torch_dtype,
    # `init_vision`, `set_generation_mode` and `mmdecode` are presumably
    # supplied by the checkpoint's remote code — TODO confirm.
    trust_remote_code=True,
).cuda()
# presumably sets up the image decoder from the FLUX VAE checkpoint — TODO
# confirm against the model's remote code.
model.init_vision(flux_model_name_or_path)
model.set_generation_mode('image')
# Switch to evaluation mode; this script only runs generation.
model.eval()

@spaces.GPU(duration=199)  # ZeroGPU decorator from the `spaces` package
def generate_image(
    image_prompt,
    image_size,
    top_p,
    min_p,
    seed,
):
    """Generate one image from a text prompt using the module-level model.

    Args:
        image_prompt: Prompt text; surrounding whitespace is stripped.
        image_size: Two integers joined by ``'x'`` (e.g. ``'1152x1152'``).
            Each is integer-divided by 16 to obtain the token grid that is
            written into the image prefix and used as the token budget.
        top_p: Forwarded to ``GenerationConfig`` as ``top_p``.
        min_p: Forwarded to ``GenerationConfig`` as ``min_p``.
        seed: Passed to ``torch.manual_seed`` before generation and again
            before decoding.

    Returns:
        The first image returned by ``model.mmdecode``.
    """
    prompt_text = image_prompt.strip()

    # First two 'x'-separated numbers, each mapped to a count of 16-px steps.
    dims = [int(part) for part in image_size.split('x')]
    token_h = dims[0] // 16
    token_w = dims[1] // 16
    image_prefix = f'<SOM>{token_h} {token_w}<IMAGE>'

    # The model's multimodal special tokens are excluded from sampling.
    suppressed_ids = tokenizer.convert_tokens_to_ids(model.config.mm_special_tokens)
    sampling_config = GenerationConfig(
        max_new_tokens=token_h * token_w,
        do_sample=True,
        temperature=1.0,
        min_p=min_p,
        top_p=top_p,
        guidance_scale=1.0,
        suppress_tokens=suppressed_ids,
    )

    encoded = tokenizer(
        [prompt_text + image_prefix],
        return_tensors='pt',
        padding='longest',
        padding_side='left',
    )
    prompt_ids = encoded.input_ids.cuda()
    prompt_mask = encoded.attention_mask.cuda()

    # Seed before sampling so the same seed is applied on every call.
    torch.manual_seed(seed)
    generated = model.generate(
        inputs=prompt_ids,
        attention_mask=prompt_mask,
        generation_config=sampling_config,
    )

    # Re-seed before decoding, as the original flow does.
    torch.manual_seed(seed)
    _, images = model.mmdecode(tokenizer, generated[0], skip_special_tokens=False)
    return images[0]


# Raw example prompts; surrounding whitespace is stripped when the rows are
# built below.
_example_prompts = (
    '''
A formal letter document with a professional tone. Create a document that includes  a section starting with "To, Mr. Edward Robertson," aligned to the left. Underneath, place the date "Date: 27th July 2025" also aligned to the left. Begin the body of the letter with "Dear Sir," indented slightly from the left margin. The first paragraph should state, "I am writing to you with intent of purchasing your property located at #765, Lincoln Street, New York." The second paragraph should read, "I want to propose a purchase price of $100,000 for your property. I am willing to pay you $20,000 as advance." The closing remarks should be, "Kindly let me know what do you think of the offer and we can make a few changes as per your requirements." followed by "Regards," and then "William Specter". Finally, add a logo with a feather graphic in the bottom right corner.
    ''',
    '''
A visually engaging, collage-style presentation slide combining diverse typographic styles, sizes, and orientations along with layered artistic textures. Dominating the upper half in eye-catching bold lettering is the phrase "IMAGINE, CREATE, IMPACT". Set diagonally below this headline, in elegant cursive script with lighter strokes, is the complementary phrase "Unlocking Creativity Through Collaboration". Spread across the lower left, a smaller clear typeface features "Interactive Sessions & Workshops", neatly enclosed in a minimalist rectangular frame. Decorative artistic splashes, geometric overlaps, subtle textures, and soft color gradients unify the diverse textual elements, creating a contemporary, dynamic, and inspiring visual design.
    ''',
)

# One row per prompt: [prompt, image size, top_p, min_p, seed].
examples = [
    [raw_prompt.strip(), '1152x1152', 1.0, 0.03, 0]
    for raw_prompt in _example_prompts
]


# Custom stylesheet passed to `gr.Blocks(css=...)`: elements with class `app`
# are capped at 800px wide and centred horizontally.
css = """
.app {
    max-width: 800px !important;
    margin: 0 auto !important;
}
"""

with gr.Blocks(css=css) as demo:
    # Page header: title, model link, and project / paper / code / model links.
    header_html = '''
<h1 style="text-align:center">🎨X-Omni: Reinforcement Learning Makes Discrete Autoregressive Image Generative Models Great Again</h1>
<h3 style="text-align:center">Model: <a href="https://huggingface.co/X-Omni/X-Omni-En">X-Omni-En</a> (support English text rendering)</h3>
<p align="center">
  <a href="https://x-omni-team.github.io">🏠 Project Page</a> |
  <a href="https://arxiv.org/pdf/2507.22058">📄 Paper</a> |
  <a href="https://github.com/X-Omni-Team/X-Omni">💻​ Code</a> |
  <a href="https://huggingface.co/collections/X-Omni/x-omni-models-6888aadcc54baad7997d7982">🤗 HuggingFace Model</a>
</p>
    '''.strip()
    gr.HTML(header_html)

    # Prompt entry.
    with gr.Row():
        textbox = gr.Textbox(
            lines=2,
            placeholder='text prompt for image generation',
            show_label=False,
        )

    # Output component that receives the result of `generate_image`.
    image = gr.Image(show_label=False, type='pil')

    # Trigger button plus the collapsed sampling controls.
    with gr.Row():
        button = gr.Button("Generate", variant="primary")
        with gr.Accordion("Advanced Settings", open=False):
            image_size = gr.Dropdown(
                label="Image Size",
                choices=["1152x1152", "1152x768", "768x1152"],
                value="1152x1152",
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.0,
                maximum=1.0,
                value=1.0,
                step=0.01,
            )
            min_p = gr.Slider(
                label="Min P",
                minimum=0.0,
                maximum=1.0,
                value=0.03,
                step=0.01,
            )
            seed_input = gr.Number(label="Seed", value=0, precision=0)

    # Same component order as the parameters of `generate_image`; shared by
    # the example gallery and the button handler.
    generation_inputs = (textbox, image_size, top_p, min_p, seed_input)

    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=generation_inputs,
            outputs=image,
            fn=generate_image,
            cache_examples=False,
            run_on_click=True,
        )

    button.click(
        generate_image,
        inputs=generation_inputs,
        outputs=image,
    )

# Start the Gradio app only when this file is executed as a script.
# `ssr_mode=False` is forwarded to `demo.launch`; per the Gradio docs this
# turns off server-side rendering — confirm against the installed version.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)