import gradio as gr import torch from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig from peft import PeftModel # We'll load the base model and then the adapter # --- Configuration --- # IMPORTANT: Use the base model name for initial loading, # as the uploaded model might be the merged one which we now know has issues # when re-loaded without quantization settings. # Or, if you want to load your *merged* model (eldntr/cendol-large-lora-indo2minang), # you need to load it with the same BitsAndBytesConfig. # Let's assume you push your 'final_merged_model' from the notebook to eldntr/cendol-large-lora-indo2minang MODEL_ID = "eldntr/cendol-large-lora-indo2minang" # Ensure the base model used for fine-tuning is also available or mentioned # It seems from your notebook you used "indonlp/cendol-mt5-large-inst" as the base BASE_MODEL_ID = "indonlp/cendol-mt5-large-inst" # Setup device (menggunakan GPU jika ada, jika tidak CPU) # If running on a GPU, we will attempt 8-bit loading. # If running on CPU (e.g., free Hugging Face Space), 8-bit loading won't apply directly. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # --- Model Loading --- try: tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) if device.type == "cuda": print(f"Attempting to load model {MODEL_ID} with 8-bit quantization on GPU...") quantization_config = BitsAndBytesConfig(load_in_8bit=True) # Load the merged model with the same quantization config # that it was likely saved with if you pushed the `final_merged_model` model = AutoModelForSeq2SeqLM.from_pretrained( MODEL_ID, quantization_config=quantization_config, device_map="auto", # Use "auto" for proper multi-GPU/CPU offloading with 8-bit # No torch_dtype specified here if it's 8-bit. BitsAndBytes handles the dtypes. low_cpu_mem_usage=True # Still good practice ) # You don't need prepare_model_for_kbit_training if it's already merged and you're not training. print(f"Model {MODEL_ID} berhasil dimuat dengan kuantisasi 8-bit ke {next(model.parameters()).device}") else: print(f"CUDA not available. Loading model {MODEL_ID} in full precision (float32) on CPU...") model = AutoModelForSeq2SeqLM.from_pretrained( MODEL_ID, torch_dtype=torch.float32, # Force float32 on CPU low_cpu_mem_usage=True # Still good practice ).to(device) print(f"Model {MODEL_ID} berhasil dimuat ke {device}") except Exception as e: print(f"Fatal error loading model: {e}") error_msg = "Gagal memuat model. Ini mungkin karena model disimpan dengan konfigurasi khusus (misalnya 8-bit quantization) dan perlu dimuat dengan konfigurasi yang sama. Pastikan `bitsandbytes` terinstal dan coba lagi." if "Only Tensors of floating point and complex dtype can require gradients" in str(e): error_msg += " Kesalahan ini mengindikasikan masalah tipe data bobot yang kemungkinan besar disebabkan oleh ketidaksesuaian saat memuat model terkuantisasi." raise gr.Error(error_msg) # --- Translation Function (rest of the code remains the same) --- PROMPT_PREFIX = "Terjemahkan Indonesia ke Minangkabau: " MAX_SOURCE_LENGTH = 128 MAX_TARGET_LENGTH = 128 def translate(text_input): """ Fungsi utama untuk melakukan translasi. """ if not text_input: return "" prompt = f"Terjemahkan Indonesia ke Minangkabau: {text_input}" inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_SOURCE_LENGTH).to(device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=MAX_TARGET_LENGTH, num_beams=5, early_stopping=True ) translation = tokenizer.decode(outputs[0], skip_special_tokens=True) return translation # --- Gradio Interface (rest of the code remains the same) --- demo = gr.Interface( fn=translate, inputs=gr.Textbox( label="Teks Bahasa Indonesia", placeholder="Ketik kalimat di sini...", lines=4 ), outputs=gr.Textbox( label="Terjemahan Minangkabau", interactive=False, # Output box tidak bisa diedit lines=4 ), title="Penerjemah Indonesia ke Minangkabau", description="Aplikasi demo ini menggunakan model Cendol MT5 yang telah di-fine-tune pada dataset terjemahan Indonesia-Minangkabau.", article=f"Model yang digunakan: {MODEL_ID}. Dibuat dengan Gradio dan dihosting di Hugging Face Spaces.", examples=[ ["Saya ingin pergi ke pasar besok pagi."], ["Apa makanan favoritmu?"], ["Rumah makan itu sangat terkenal di kota ini."] ]) # --- Launch Application --- if __name__ == "__main__": demo.launch()