# llava-chat/configs/model_config.py
"""
Configuration for LLaVA models.
"""
from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class VisionConfig:
"""Configuration for the vision encoder."""
model_name: str = "openai/clip-vit-large-patch14-336"
image_size: int = 336
patch_size: int = 14
hidden_size: int = 1024
num_attention_heads: int = 16
num_hidden_layers: int = 24
intermediate_size: int = 4096
projection_dim: int = 768
dropout: float = 0.0
attention_dropout: float = 0.0
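
    # With the defaults above, the encoder yields (336 / 14) ** 2 = 576 patch
    # embeddings per image (plus a CLS token), so each image contributes
    # roughly 576 visual tokens to the language model's context.
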
@dataclass
class LanguageConfig:
"""Configuration for the language model."""
model_name: str = "lmsys/vicuna-7b-v1.5"
hidden_size: int = 4096
num_attention_heads: int = 32
num_hidden_layers: int = 32
intermediate_size: int = 11008
max_position_embeddings: int = 2048
vocab_size: int = 32000
rms_norm_eps: float = 1e-6
use_cache: bool = True
rope_theta: float = 10000.0
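
    # Note: the visual tokens share this context window, so a 576-token image
    # leaves about 2048 - 576 = 1472 positions for text with these defaults.
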
@dataclass
class ProjectorConfig:
"""Configuration for the projection layer."""
input_dim: int = 1024 # Vision encoder hidden size
hidden_dim: int = 4096 # Projection hidden dimension
output_dim: int = 4096 # Language model hidden size
dropout: float = 0.1
num_layers: int = 2
activation: str = "gelu"
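

# Illustrative sketch (not part of the original module): how an MLP projector
# might be assembled from ProjectorConfig. Assumes PyTorch is available and
# follows the LLaVA-1.5-style Linear -> GELU -> Linear stack; the function
# name and structure are this sketch's own, not the project's API.
def build_projector_sketch(cfg: ProjectorConfig):
    import torch.nn as nn  # local import keeps this module importable without torch

    act = nn.GELU if cfg.activation == "gelu" else nn.ReLU
    dims = [cfg.input_dim] + [cfg.hidden_dim] * (cfg.num_layers - 1) + [cfg.output_dim]
    layers = []
    for i in range(len(dims) - 1):
        layers.append(nn.Linear(dims[i], dims[i + 1]))
        if i < len(dims) - 2:  # activation + dropout between layers, none after the last
            layers.append(act())
            layers.append(nn.Dropout(cfg.dropout))
    return nn.Sequential(*layers)
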
@dataclass
class TrainingConfig:
"""Configuration for training."""
batch_size: int = 32
gradient_accumulation_steps: int = 1
learning_rate: float = 2e-5
weight_decay: float = 0.0
num_train_epochs: int = 1
max_steps: int = -1
warmup_steps: int = 0
lr_scheduler_type: str = "cosine"
logging_steps: int = 100
save_steps: int = 1000
eval_steps: int = 1000
save_total_limit: int = 3
fp16: bool = True
bf16: bool = False
seed: int = 42
gradient_checkpointing: bool = False
optim: str = "adamw_torch"
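
    # The effective batch size is batch_size * gradient_accumulation_steps
    # (32 * 1 = 32 with these defaults); when GPU memory is tight, raise the
    # accumulation steps rather than batch_size.
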
@dataclass
class LLaVAConfig:
"""Configuration for the LLaVA model."""
    # Dataclass instances are mutable defaults: `vision: VisionConfig = VisionConfig()`
    # raises ValueError on Python 3.11+ and silently shares one instance across
    # all LLaVAConfig objects on older versions, so use default_factory instead.
    vision: VisionConfig = field(default_factory=VisionConfig)
    language: LanguageConfig = field(default_factory=LanguageConfig)
    projector: ProjectorConfig = field(default_factory=ProjectorConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)
# Additional configurations
max_length: int = 2048
temperature: float = 0.7
top_p: float = 0.9
repetition_penalty: float = 1.0
@classmethod
def from_dict(cls, config_dict: Dict[str, Any]) -> "LLaVAConfig":
"""Create a configuration from a dictionary."""
vision_config = VisionConfig(**config_dict.get("vision", {}))
language_config = LanguageConfig(**config_dict.get("language", {}))
projector_config = ProjectorConfig(**config_dict.get("projector", {}))
training_config = TrainingConfig(**config_dict.get("training", {}))
# Get additional configurations
additional_config = {k: v for k, v in config_dict.items()
if k not in ["vision", "language", "projector", "training"]}
# Create and return the configuration
config = cls(
vision=vision_config,
language=language_config,
projector=projector_config,
training=training_config,
**additional_config
)
return config
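
    # Note: unknown keys in any sub-dict raise TypeError, since they are
    # passed straight into the dataclass constructors. The explicit to_dict
    # below is functionally equivalent to dataclasses.asdict(self).
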
def to_dict(self) -> Dict[str, Any]:
"""Convert the configuration to a dictionary."""
config_dict = {
"vision": {
"model_name": self.vision.model_name,
"image_size": self.vision.image_size,
"patch_size": self.vision.patch_size,
"hidden_size": self.vision.hidden_size,
"num_attention_heads": self.vision.num_attention_heads,
"num_hidden_layers": self.vision.num_hidden_layers,
"intermediate_size": self.vision.intermediate_size,
"projection_dim": self.vision.projection_dim,
"dropout": self.vision.dropout,
"attention_dropout": self.vision.attention_dropout
},
"language": {
"model_name": self.language.model_name,
"hidden_size": self.language.hidden_size,
"num_attention_heads": self.language.num_attention_heads,
"num_hidden_layers": self.language.num_hidden_layers,
"intermediate_size": self.language.intermediate_size,
"max_position_embeddings": self.language.max_position_embeddings,
"vocab_size": self.language.vocab_size,
"rms_norm_eps": self.language.rms_norm_eps,
"use_cache": self.language.use_cache,
"rope_theta": self.language.rope_theta
},
"projector": {
"input_dim": self.projector.input_dim,
"hidden_dim": self.projector.hidden_dim,
"output_dim": self.projector.output_dim,
"dropout": self.projector.dropout,
"num_layers": self.projector.num_layers,
"activation": self.projector.activation
},
"training": {
"batch_size": self.training.batch_size,
"gradient_accumulation_steps": self.training.gradient_accumulation_steps,
"learning_rate": self.training.learning_rate,
"weight_decay": self.training.weight_decay,
"num_train_epochs": self.training.num_train_epochs,
"max_steps": self.training.max_steps,
"warmup_steps": self.training.warmup_steps,
"lr_scheduler_type": self.training.lr_scheduler_type,
"logging_steps": self.training.logging_steps,
"save_steps": self.training.save_steps,
"eval_steps": self.training.eval_steps,
"save_total_limit": self.training.save_total_limit,
"fp16": self.training.fp16,
"bf16": self.training.bf16,
"seed": self.training.seed,
"gradient_checkpointing": self.training.gradient_checkpointing,
"optim": self.training.optim
},
"max_length": self.max_length,
"temperature": self.temperature,
"top_p": self.top_p,
"repetition_penalty": self.repetition_penalty
}
return config_dict

# Default configurations for different model sizes.
LLAVA_7B_CONFIG = LLaVAConfig(
language=LanguageConfig(
model_name="lmsys/vicuna-7b-v1.5",
hidden_size=4096,
num_attention_heads=32,
num_hidden_layers=32,
intermediate_size=11008
),
projector=ProjectorConfig(
input_dim=1024,
hidden_dim=4096,
output_dim=4096
)
)

LLAVA_13B_CONFIG = LLaVAConfig(
language=LanguageConfig(
model_name="lmsys/vicuna-13b-v1.5",
hidden_size=5120,
num_attention_heads=40,
num_hidden_layers=40,
intermediate_size=13824
),
projector=ProjectorConfig(
input_dim=1024,
hidden_dim=5120,
output_dim=5120
)
)
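

# Quick usage sketch (not in the original file): serialize a preset to JSON
# and rebuild it, exercising the to_dict/from_dict round trip.
if __name__ == "__main__":
    import json

    as_json = json.dumps(LLAVA_7B_CONFIG.to_dict(), indent=2)
    restored = LLaVAConfig.from_dict(json.loads(as_json))
    assert restored.language.hidden_size == 4096
    print(as_json)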