Spaces:

tuanhqv123
/

Qwen-Qwen3-1.7B

Running

App Files Files Community

tuanhqv123 committed on Jun 14

Commit

ad57d9c

verified ·

1 Parent(s): 010fa18

Update app.py

Browse files

Files changed (1) hide show

app.py +277 -40

app.py CHANGED Viewed

@@ -1,11 +1,14 @@
-from fastapi import FastAPI
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import json
-app = FastAPI()
-# Load models
 models = {}
 tokenizers = {}
@@ -14,85 +17,319 @@ MODEL_CONFIGS = {
     "qwen3-4b": "Qwen/Qwen3-4B"
 }
 @app.on_event("startup")
 async def load_models():
-    for model_key, model_name in MODEL_CONFIGS.items():
-        print(f"Loading {model_name}...")
-        tokenizers[model_key] = AutoTokenizer.from_pretrained(model_name, resume_download=True, timeout=300)
-        models[model_key] = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype="auto",
-            device_map="auto",
-             resume_download=True, timeout=300
-        )
-    print("All models loaded!")
 @app.post("/v1/chat/completions")
-def chat_completions(request: dict):
     try:
         model_name = request.get("model", "qwen3-1.7b")
         messages = request.get("messages", [])
         temperature = request.get("temperature", 0.7)
         max_tokens = request.get("max_tokens", 1024)
-        # Chọn model
-        if "4b" in model_name.lower() or "4" in model_name:
             model_key = "qwen3-4b"
         else:
             model_key = "qwen3-1.7b"
         if model_key not in models:
-            return {"error": f"Model {model_key} not loaded"}
         tokenizer = tokenizers[model_key]
         model = models[model_key]
-        # Format messages cho Qwen3 - QUAN TRỌNG: dùng apply_chat_template
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True,
-            enable_thinking=False  # Tắt thinking mode để response nhanh hơn
         )
-        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-        # Generate với temperature
-        generated_ids = model.generate(
-            **model_inputs,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            do_sample=True if temperature > 0 else False,
-            pad_token_id=tokenizer.eos_token_id
-        )
         # Extract response - chỉ lấy phần mới generate
-        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
         response = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
-        # Format response theo OpenAI API để tương thích với AiService
         return {
             "choices": [{
                 "message": {
                     "content": response,
                     "role": "assistant"
-                }
             }],
-            "model": model_key
         }
     except Exception as e:
-        print(f"Error: {str(e)}")
         return {
             "choices": [{
                 "message": {
-                    "content": f"Error processing request: {str(e)}",
                     "role": "assistant"
-                }
             }],
-            "error": str(e)
         }
-@app.get("/")
-def health_check():
-    return {"status": "API is running", "models": list(MODEL_CONFIGS.keys())}

+from fastapi import FastAPI, HTTPException
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import json
+import time
+from typing import Dict, Any, Optional
+import os
+app = FastAPI(title="Qwen3 API", description="API for Qwen3 models", version="1.0.0")
+# Global variables that hold the loaded models and their tokenizers
 models = {}
 tokenizers = {}
     "qwen3-4b": "Qwen/Qwen3-4B"
 }
+def download_model_safely(model_name: str, max_retries: int = 3):
+    """Download a tokenizer/model pair, retrying when an attempt fails.
+
+    Args:
+        model_name: Repository id passed to ``from_pretrained``.
+        max_retries: Maximum number of attempts; must be at least 1.
+
+    Returns:
+        A ``(tokenizer, model)`` tuple.
+
+    Raises:
+        ValueError: If ``max_retries`` is less than 1.
+        Exception: Whatever the final failed attempt raised.
+    """
+    # Without this guard the loop body never runs and the function falls
+    # through returning None, which callers then fail to unpack.
+    if max_retries < 1:
+        raise ValueError(f"max_retries must be at least 1, got {max_retries}")
+    for attempt in range(max_retries):
+        try:
+            print(f"Downloading {model_name} (attempt {attempt + 1}/{max_retries})...")
+            # Download the tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                resume_download=True,
+                timeout=600,
+                trust_remote_code=True,
+                cache_dir=None  # Use the default cache
+            )
+            # Download the model with settings tuned for the free tier
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype="auto",
+                device_map="auto",
+                resume_download=True,
+                timeout=600,
+                trust_remote_code=True,
+                cache_dir=None,
+                low_cpu_mem_usage=True  # Reduce memory usage while loading
+            )
+            print(f"Successfully loaded {model_name}")
+            return tokenizer, model
+        except Exception as e:
+            print(f"Download failed (attempt {attempt + 1}): {str(e)}")
+            if attempt == max_retries - 1:
+                # Last attempt: re-raise the active exception unchanged.
+                raise
+            time.sleep(30)  # Wait before retry
+def load_model_on_demand(model_key: str):
+    """Make sure ``model_key`` is loaded, evicting another model first if needed.
+
+    Does nothing when the model is already present in ``models``. Raises
+    ``ValueError`` for a key that is missing from ``MODEL_CONFIGS``.
+    """
+    if model_key in models:
+        return
+    if model_key not in MODEL_CONFIGS:
+        raise ValueError(f"Unknown model key: {model_key}")
+    repo_id = MODEL_CONFIGS[model_key]
+    print(f"Loading {repo_id} on demand...")
+    # Memory management: keep only one model resident because of the
+    # free-tier limit, so drop the first registered entry before loading.
+    if models:
+        evicted_key = next(iter(models))
+        print(f"Unloading {evicted_key} to free memory...")
+        del models[evicted_key]
+        del tokenizers[evicted_key]
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    tokenizers[model_key], models[model_key] = download_model_safely(repo_id)
+    print(f"{repo_id} loaded successfully!")
 @app.on_event("startup")
 async def load_models():
+    """Try to preload the default model at startup; a failure is only logged."""
+    default_key = "qwen3-1.7b"
+    try:
+        print("Loading default model: Qwen3-1.7B...")
+        loaded_pair = download_model_safely("Qwen/Qwen3-1.7B")
+        tokenizers[default_key], models[default_key] = loaded_pair
+        print("Default model loaded successfully!")
+    except Exception as startup_error:
+        print(f"Failed to load default model: {str(startup_error)}")
+        print("Server will continue running, models will be loaded on demand")
+@app.get("/")
+def health_check():
+    """Report that the API is up, plus the configured and loaded model keys."""
+    configured_keys = list(MODEL_CONFIGS)
+    loaded_keys = list(models)
+    return {
+        "status": "API is running",
+        "available_models": configured_keys,
+        "loaded_models": loaded_keys,
+        "version": "1.0.0",
+        "message": "Qwen3 API Service"
+    }
+@app.get("/models")
+def list_models():
+    """Return the configured model mapping, the loaded keys, and both counts."""
+    loaded_keys = list(models)
+    summary = {"available_models": MODEL_CONFIGS}
+    summary["loaded_models"] = loaded_keys
+    summary["total_available"] = len(MODEL_CONFIGS)
+    summary["total_loaded"] = len(models)
+    return summary
 @app.post("/v1/chat/completions")
+def chat_completions(request: Dict[str, Any]):
+    """
+    OpenAI-compatible chat completions endpoint.
+
+    Reads ``model``, ``messages``, ``temperature`` and ``max_tokens`` from the
+    request body, loads the selected model on demand, and returns an
+    OpenAI-style ``chat.completion`` payload. An empty ``messages`` value
+    raises HTTP 400; any other failure is returned as a response body that
+    carries an ``error`` object. Kept compatible with the existing AiService
+    client code.
+    """
     try:
+        # Parse request parameters
         model_name = request.get("model", "qwen3-1.7b")
         messages = request.get("messages", [])
         temperature = request.get("temperature", 0.7)
         max_tokens = request.get("max_tokens", 1024)
+        # Validate input
+        if not messages:
+            raise HTTPException(status_code=400, detail="Messages cannot be empty")
+        # Derive the model key from the model name (kept compatible with agents.py)
+        if "4b" in model_name.lower() or "4" in model_name.lower():
             model_key = "qwen3-4b"
         else:
             model_key = "qwen3-1.7b"
+        print(f"Using model: {model_key} for request")
+        # Load the model if it is not loaded yet
         if model_key not in models:
+            load_model_on_demand(model_key)
+        # Get the model and tokenizer
         tokenizer = tokenizers[model_key]
         model = models[model_key]
+        # Format the messages for Qwen3 using apply_chat_template
+        # This step is essential for Qwen3 compatibility
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True,
+            enable_thinking=False  # Disable thinking mode for simpler, faster responses
         )
+        # Tokenize input
+        model_inputs = tokenizer([text], return_tensors="pt")
+        # Move to device if available
+        if torch.cuda.is_available():
+            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
+        # Generate the response with tuned parameters
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **model_inputs,
+                max_new_tokens=min(max_tokens, 2048),  # Cap max tokens to avoid timeouts
+                temperature=temperature,
+                do_sample=True if temperature > 0 else False,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                repetition_penalty=1.1,  # Discourage repetition
+                top_p=0.9 if temperature > 0 else None,
+                use_cache=True  # Speed up generation
+            )
         # Extract response - chỉ lấy phần mới generate
+        # Use key lookup: the CUDA branch above replaces model_inputs with a
+        # plain dict, on which attribute access (.input_ids) would fail.
+        input_length = model_inputs["input_ids"].shape[1]
+        output_ids = generated_ids[0][input_length:].tolist()
         response = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
+        # Clean up response
+        if not response:
+            response = "I apologize, but I couldn't generate a proper response. Please try again."
+        # Format the response per the OpenAI API so it stays compatible with AiService
         return {
             "choices": [{
                 "message": {
                     "content": response,
                     "role": "assistant"
+                },
+                "finish_reason": "stop",
+                "index": 0
             }],
+            "model": model_key,
+            "usage": {
+                "prompt_tokens": input_length,
+                "completion_tokens": len(output_ids),
+                "total_tokens": input_length + len(output_ids)
+            },
+            "object": "chat.completion",
+            "created": int(time.time())
         }
+    except HTTPException:
+        # Let deliberate HTTP errors (e.g. the 400 above) reach the client as-is.
+        raise
     except Exception as e:
+        print(f"Error in chat_completions: {str(e)}")
+        # Return the error in an OpenAI-API-compatible format
         return {
             "choices": [{
                 "message": {
+                    "content": f"I encountered an error while processing your request: {str(e)}",
                     "role": "assistant"
+                },
+                "finish_reason": "error",
+                "index": 0
             }],
+            "error": {
+                "message": str(e),
+                "type": "internal_error",
+                "code": "processing_error"
+            },
+            "model": "qwen3-1.7b"
         }
+@app.post("/generate")
+def simple_generate(request: Dict[str, Any]):
+    """
+    Simple text-generation endpoint intended for quick testing.
+
+    Reads ``text``, ``model``, ``max_tokens`` and ``temperature`` from the
+    request body and returns the decoded model output together with the
+    model key and the input text.
+
+    Raises:
+        HTTPException: 400 when ``text`` is empty; 500 for any other failure.
+    """
+    try:
+        text = request.get("text", "")
+        model_name = request.get("model", "qwen3-1.7b")
+        max_tokens = request.get("max_tokens", 100)
+        temperature = request.get("temperature", 0.7)
+        if not text:
+            raise HTTPException(status_code=400, detail="Text cannot be empty")
+        # Determine model key
+        if "4b" in model_name.lower():
+            model_key = "qwen3-4b"
+        else:
+            model_key = "qwen3-1.7b"
+        # Load the model if needed
+        if model_key not in models:
+            load_model_on_demand(model_key)
+        tokenizer = tokenizers[model_key]
+        model = models[model_key]
+        # Simple generation
+        inputs = tokenizer(text, return_tensors="pt")
+        if torch.cuda.is_available():
+            inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                do_sample=True if temperature > 0 else False,
+                pad_token_id=tokenizer.eos_token_id
+            )
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return {
+            "generated_text": response,
+            "model": model_key,
+            "input_text": text
+        }
+    except HTTPException:
+        # HTTPException is an Exception subclass; without this clause the 400
+        # raised for empty text would be rewrapped as a 500 below.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+@app.get("/health")
+def health():
+    """Lightweight liveness probe: status, current Unix time, loaded-model count."""
+    checked_at = int(time.time())
+    loaded_count = len(models)
+    return {"status": "healthy", "timestamp": checked_at, "models_loaded": loaded_count}
+@app.get("/status")
+def status():
+    """Return detailed service status: models, CUDA information, and endpoints."""
+    has_cuda = torch.cuda.is_available()
+    if has_cuda:
+        # Total memory of CUDA device 0, as reported by its device properties.
+        device_memory = torch.cuda.get_device_properties(0).total_memory
+    else:
+        device_memory = None
+    memory_report = {
+        "total_models": len(models),
+        "cuda_available": has_cuda,
+        "cuda_memory": device_memory
+    }
+    model_report = {
+        "available": MODEL_CONFIGS,
+        "loaded": list(models),
+        "memory_usage": memory_report
+    }
+    endpoint_paths = ["/v1/chat/completions", "/generate", "/models", "/health", "/status"]
+    return {
+        "service": "Qwen3 API",
+        "status": "running",
+        "models": model_report,
+        "endpoints": endpoint_paths
+    }
+# Error handlers
+@app.exception_handler(404)
+async def not_found_handler(request, exc):
+    """Return a JSON error body with HTTP status 404.
+
+    Args:
+        request: The incoming request (unused).
+        exc: The exception that triggered the handler (unused).
+    """
+    # Exception handlers must return a Response object; unlike route
+    # functions, a bare dict is not serialized for them.
+    from fastapi.responses import JSONResponse
+    return JSONResponse(
+        status_code=404,
+        content={
+            "error": {
+                "message": "Endpoint not found",
+                "type": "not_found_error",
+                "code": 404
+            }
+        }
+    )
+@app.exception_handler(500)
+async def internal_error_handler(request, exc):
+    """Return a JSON error body with HTTP status 500.
+
+    Args:
+        request: The incoming request (unused).
+        exc: The exception that triggered the handler (unused).
+    """
+    # Exception handlers must return a Response object; unlike route
+    # functions, a bare dict is not serialized for them.
+    from fastapi.responses import JSONResponse
+    return JSONResponse(
+        status_code=500,
+        content={
+            "error": {
+                "message": "Internal server error",
+                "type": "internal_server_error",
+                "code": 500
+            }
+        }
+    )
+# Start a uvicorn server when this file is executed as a script.
+if __name__ == "__main__":
+    import uvicorn
+    # Bind to all interfaces on port 7860.
+    uvicorn.run(app, host="0.0.0.0", port=7860)