Spaces:

tuanhqv123
/

Qwen-Qwen3-1.7B

Running

App Files Files Community

tuanhqv123 committed on Jun 14

Commit

d5238da

verified ·

1 Parent(s): e8bbc64

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -85

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import time
 import asyncio
 from typing import Dict, Any, Optional
 import logging
 import traceback
@@ -78,6 +80,49 @@ def load_model_on_demand(model_key: str):
         models[model_key] = model
         logger.info(f"{model_name} loaded successfully!")
 @app.on_event("startup")
 async def load_models():
     """Load default model"""
@@ -99,7 +144,7 @@ def health_check():
         "available_models": list(MODEL_CONFIGS.keys()),
         "loaded_models": list(models.keys()),
         "version": "1.0.0",
-        "message": "Qwen3 API Service - OpenAI Compatible"
     }
 @app.get("/models")
@@ -114,19 +159,20 @@ def list_models():
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Dict[str, Any]):
-    """OpenAI-compatible chat completions endpoint - FIXED AttributeError"""
     try:
         logger.info("=== CHAT COMPLETIONS REQUEST START ===")
-        logger.info(f"Request payload: {request}")
         # Parse request parameters
         model_name = request.get("model", "qwen3-1.7b")
         messages = request.get("messages", [])
         temperature = request.get("temperature", 0.7)
         max_tokens = request.get("max_tokens", 200)
         logger.info(f"Model: {model_name}, Temperature: {temperature}, Max tokens: {max_tokens}")
-        logger.info(f"Messages: {messages}")
         # Validate input
         if not messages:
@@ -151,6 +197,12 @@ async def chat_completions(request: Dict[str, Any]):
         model = models[model_key]
         logger.info(f"Got tokenizer and model for {model_key}")
         # Format messages - FORCE DISABLE thinking mode
         logger.info("Formatting messages with apply_chat_template...")
         try:
@@ -161,28 +213,28 @@ async def chat_completions(request: Dict[str, Any]):
                 enable_thinking=False  # CRITICAL: Force disable thinking
             )
-            # REMOVE thinking tags if present
-            if "<think>" in text:
                 logger.warning("Found thinking tags in formatted text, removing...")
-                text = text.replace("<think>\n\n</think>\n\n", "")
-                text = text.replace("<think></think>", "")
-                # Remove any remaining thinking content
-                import re
                 text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
-            logger.info(f"Formatted text (first 200 chars): {text[:200]}...")
         except Exception as e:
             logger.error(f"Error in apply_chat_template: {str(e)}")
             # Fallback to simple format WITHOUT thinking
             text = ""
             for msg in messages:
-                if msg["role"] == "user":
                     text += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
                 elif msg["role"] == "assistant":
                     text += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
             text += "<|im_start|>assistant\n"  # NO thinking tags
-            logger.info(f"Using fallback formatting: {text}")
         # Tokenize input
         logger.info("Tokenizing input...")
@@ -204,7 +256,7 @@ async def chat_completions(request: Dict[str, Any]):
                 with torch.no_grad():
                     generated_ids = model.generate(
                         **model_inputs,
-                        max_new_tokens=min(max_tokens, 100),
                         temperature=temperature,
                         do_sample=True if temperature > 0 else False,
                         pad_token_id=tokenizer.eos_token_id,
@@ -251,7 +303,7 @@ async def chat_completions(request: Dict[str, Any]):
                 "model": model_key
             }
-        # FIXED: Extract response - handle both tensor and dict cases
         logger.info("Extracting response...")
         try:
             # Get input length correctly
@@ -260,33 +312,34 @@ async def chat_completions(request: Dict[str, Any]):
             elif isinstance(model_inputs, dict) and 'input_ids' in model_inputs:
                 input_length = model_inputs['input_ids'].shape[1]
             else:
-                logger.error("Cannot find input_ids in model_inputs")
                 input_length = 0
             # Extract output tokens
-            if torch.is_tensor(generated_ids):
-                output_ids = generated_ids[0][input_length:].tolist()
-            else:
-                output_ids = generated_ids[0][input_length:].tolist()
             response = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
-            logger.info(f"Generated response: {response}")
         except Exception as e:
             logger.error(f"Error extracting response: {str(e)}")
-            # Fallback: decode entire generated sequence
-            try:
-                if torch.is_tensor(generated_ids):
-                    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
-                else:
-                    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
-                # Remove the original prompt from response
-                if text in response:
-                    response = response.replace(text, "").strip()
-                logger.info(f"Fallback response: {response}")
-            except Exception as e2:
-                logger.error(f"Fallback extraction also failed: {str(e2)}")
-                response = "Error extracting response"
         # Clean up response
         if not response:
@@ -333,56 +386,6 @@ async def chat_completions(request: Dict[str, Any]):
             "model": "qwen3-1.7b"
         }
-@app.post("/generate")
-async def simple_generate(request: Dict[str, Any]):
-    """Simple generate endpoint for testing"""
-    try:
-        text = request.get("text", "")
-        model_name = request.get("model", "qwen3-1.7b")
-        max_tokens = request.get("max_tokens", 50)
-        temperature = request.get("temperature", 0.7)
-        if not text:
-            raise HTTPException(status_code=400, detail="Text cannot be empty")
-        # Determine model key
-        if "4b" in model_name.lower():
-            model_key = "qwen3-4b"
-        else:
-            model_key = "qwen3-1.7b"
-        # Load model if needed
-        if model_key not in models:
-            load_model_on_demand(model_key)
-        tokenizer = tokenizers[model_key]
-        model = models[model_key]
-        # Simple generation
-        inputs = tokenizer(text, return_tensors="pt")
-        if hasattr(model, 'device'):
-            inputs = {k: v.to(model.device) for k, v in inputs.items()}
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                do_sample=True if temperature > 0 else False,
-                pad_token_id=tokenizer.eos_token_id
-            )
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return {
-            "generated_text": response,
-            "model": model_key,
-            "input_text": text
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
 @app.get("/health")
 def health():
     """Simple health check"""

 import torch
 import time
 import asyncio
+import json
+import re
 from typing import Dict, Any, Optional
 import logging
 import traceback
         models[model_key] = model
         logger.info(f"{model_name} loaded successfully!")
def extract_json_from_response(text: str) -> str:
    """Extract the first JSON object embedded in a model response.

    Any ``<think>...</think>`` section is removed first. The cleaned text is
    then scanned for the first ``{`` from which a complete, valid JSON object
    can be decoded; the exact substring for that object is returned. Nested
    objects and braces inside string values are handled by the JSON decoder.

    Args:
        text: Raw decoded model output.

    Returns:
        The substring holding the first valid JSON object. If none parses,
        the first flat ``{...}`` span (which may not be valid JSON) is
        returned, and if there is no such span, the cleaned text itself.
    """
    # Remove thinking tags completely
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()

    # A brace-matching regex cannot cope with nested objects or with "}"
    # inside strings, so let the JSON decoder find where the object ends.
    decoder = json.JSONDecoder()
    start = cleaned.find('{')
    while start != -1:
        try:
            _, end = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; try the next candidate.
            start = cleaned.find('{', start + 1)
        else:
            return cleaned[start:end]

    # Nothing parsed: hand back a flat {...} span so the caller can still
    # attempt its own repair, matching the previous behaviour.
    flat_match = re.search(r'\{[^{}]*\}', cleaned)
    if flat_match:
        return flat_match.group(0)

    # If no JSON found, return the cleaned text
    return cleaned
def format_structured_prompt(messages: list, json_schema: dict) -> list:
    """Return a copy of the conversation with JSON-schema instructions added.

    The instructions are appended to the content of every system message.
    If the conversation contains no system message, a new system message
    holding only the instructions is inserted at the front, so the schema
    is always communicated to the model.

    Args:
        messages: Chat messages, each a dict with "role" and "content" keys.
        json_schema: The "json_schema" object of the request's
            response_format; its "schema" entry is embedded in the prompt.

    Returns:
        A new list of message dicts. The input list and its dicts are not
        modified; non-system messages are reused as-is.
    """
    schema_info = json_schema.get('schema', {})

    # Create clear JSON format instructions
    json_instructions = f"""
You must respond with a valid JSON object only. No explanations, no markdown, no additional text.
Required JSON format:
{json.dumps(schema_info, indent=2)}
Example response format: {{"type": "examschedule"}}
"""

    # Build the conversation
    formatted_messages = []
    has_system_message = False
    for msg in messages:
        if msg.get("role") == "system":
            has_system_message = True
            # Append JSON instructions to system message
            content = msg["content"] + "\n" + json_instructions
            formatted_messages.append({"role": "system", "content": content})
        else:
            formatted_messages.append(msg)

    if not has_system_message:
        # Without this, a request with only user/assistant turns would
        # never see the schema instructions at all.
        formatted_messages.insert(
            0, {"role": "system", "content": json_instructions.strip()}
        )

    return formatted_messages
 @app.on_event("startup")
 async def load_models():
     """Load default model"""
         "available_models": list(MODEL_CONFIGS.keys()),
         "loaded_models": list(models.keys()),
         "version": "1.0.0",
+        "message": "Qwen3 API Service - OpenAI Compatible with Structured Output"
     }
 @app.get("/models")
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Dict[str, Any]):
+    """OpenAI-compatible chat completions endpoint với Structured Output support"""
     try:
         logger.info("=== CHAT COMPLETIONS REQUEST START ===")
+        logger.info(f"Request payload: {json.dumps(request, ensure_ascii=False, indent=2)}")
         # Parse request parameters
         model_name = request.get("model", "qwen3-1.7b")
         messages = request.get("messages", [])
         temperature = request.get("temperature", 0.7)
         max_tokens = request.get("max_tokens", 200)
+        response_format = request.get("response_format", None)
         logger.info(f"Model: {model_name}, Temperature: {temperature}, Max tokens: {max_tokens}")
+        logger.info(f"Response format: {response_format}")
         # Validate input
         if not messages:
         model = models[model_key]
         logger.info(f"Got tokenizer and model for {model_key}")
+        # Handle structured output
+        if response_format and response_format.get("type") == "json_schema":
+            json_schema = response_format.get("json_schema", {})
+            logger.info("Structured output requested, formatting messages with JSON schema")
+            messages = format_structured_prompt(messages, json_schema)
         # Format messages - FORCE DISABLE thinking mode
         logger.info("Formatting messages with apply_chat_template...")
         try:
                 enable_thinking=False  # CRITICAL: Force disable thinking
             )
+            # AGGRESSIVE thinking mode removal
+            if "<think>" in text or "think>" in text:
                 logger.warning("Found thinking tags in formatted text, removing...")
                 text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+                text = re.sub(r'<think>\s*</think>', '', text)
+                text = text.replace("<think>", "").replace("</think>", "")
+            logger.info(f"Formatted text (first 300 chars): {text[:300]}...")
         except Exception as e:
             logger.error(f"Error in apply_chat_template: {str(e)}")
             # Fallback to simple format WITHOUT thinking
             text = ""
             for msg in messages:
+                if msg["role"] == "system":
+                    text += f"<|im_start|>system\n{msg['content']}<|im_end|>\n"
+                elif msg["role"] == "user":
                     text += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
                 elif msg["role"] == "assistant":
                     text += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
             text += "<|im_start|>assistant\n"  # NO thinking tags
+            logger.info(f"Using fallback formatting")
         # Tokenize input
         logger.info("Tokenizing input...")
                 with torch.no_grad():
                     generated_ids = model.generate(
                         **model_inputs,
+                        max_new_tokens=min(max_tokens, 200),
                         temperature=temperature,
                         do_sample=True if temperature > 0 else False,
                         pad_token_id=tokenizer.eos_token_id,
                 "model": model_key
             }
+        # Extract response
         logger.info("Extracting response...")
         try:
             # Get input length correctly
             elif isinstance(model_inputs, dict) and 'input_ids' in model_inputs:
                 input_length = model_inputs['input_ids'].shape[1]
             else:
                 input_length = 0
             # Extract output tokens
+            output_ids = generated_ids[0][input_length:].tolist()
             response = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
+            # Handle structured output
+            if response_format and response_format.get("type") == "json_schema":
+                response = extract_json_from_response(response)
+                logger.info(f"Extracted JSON response: {response}")
+                # Validate JSON
+                try:
+                    json.loads(response)
+                except json.JSONDecodeError:
+                    logger.warning("Generated response is not valid JSON, attempting to fix...")
+                    # Try to extract just the JSON part
+                    json_match = re.search(r'\{.*\}', response)
+                    if json_match:
+                        response = json_match.group(0)
+                    else:
+                        response = '{"type": "other"}'  # Fallback
+            logger.info(f"Final response: {response}")
         except Exception as e:
             logger.error(f"Error extracting response: {str(e)}")
+            response = "Error extracting response"
         # Clean up response
         if not response:
             "model": "qwen3-1.7b"
         }
 @app.get("/health")
 def health():
     """Simple health check"""