""" | |
Configuration for LLaVA models. | |
""" | |
from dataclasses import dataclass | |
from typing import Optional, List, Dict, Any | |


@dataclass
class VisionConfig:
    """Configuration for the vision encoder."""
    model_name: str = "openai/clip-vit-large-patch14-336"
    image_size: int = 336
    patch_size: int = 14
    hidden_size: int = 1024
    num_attention_heads: int = 16
    num_hidden_layers: int = 24
    intermediate_size: int = 4096
    projection_dim: int = 768
    dropout: float = 0.0
    attention_dropout: float = 0.0
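    # Note: with image_size=336 and patch_size=14, the encoder produces
    # (336 / 14) ** 2 = 576 patch tokens per image (plus one CLS token).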


@dataclass
class LanguageConfig:
    """Configuration for the language model."""
    model_name: str = "lmsys/vicuna-7b-v1.5"
    hidden_size: int = 4096
    num_attention_heads: int = 32
    num_hidden_layers: int = 32
    intermediate_size: int = 11008
    max_position_embeddings: int = 2048
    vocab_size: int = 32000
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    rope_theta: float = 10000.0


@dataclass
class ProjectorConfig:
    """Configuration for the projection layer."""
    input_dim: int = 1024   # Vision encoder hidden size
    hidden_dim: int = 4096  # Projection hidden dimension
    output_dim: int = 4096  # Language model hidden size
    dropout: float = 0.1
    num_layers: int = 2
    activation: str = "gelu"
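

# Illustrative sketch (an assumption, not part of this module's public API):
# how ProjectorConfig fields would typically parameterize a LLaVA-style MLP
# that maps vision features into the language model's embedding space.
# Requires PyTorch; the real projector implementation lives elsewhere.
def build_projector_example(config: ProjectorConfig):
    """Build an MLP projector from a ProjectorConfig (for illustration only)."""
    import torch.nn as nn  # local import keeps this config module dependency-free

    activation = {"gelu": nn.GELU, "relu": nn.ReLU}[config.activation]
    layers = [nn.Linear(config.input_dim, config.hidden_dim), activation()]
    # Extra hidden layers when num_layers > 2.
    for _ in range(config.num_layers - 2):
        layers += [nn.Linear(config.hidden_dim, config.hidden_dim), activation()]
    layers += [nn.Dropout(config.dropout),
               nn.Linear(config.hidden_dim, config.output_dim)]
    return nn.Sequential(*layers)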


@dataclass
class TrainingConfig:
    """Configuration for training."""
    batch_size: int = 32
    gradient_accumulation_steps: int = 1
    learning_rate: float = 2e-5
    weight_decay: float = 0.0
    num_train_epochs: int = 1
    max_steps: int = -1
    warmup_steps: int = 0
    lr_scheduler_type: str = "cosine"
    logging_steps: int = 100
    save_steps: int = 1000
    eval_steps: int = 1000
    save_total_limit: int = 3
    fp16: bool = True
    bf16: bool = False
    seed: int = 42
    gradient_checkpointing: bool = False
    optim: str = "adamw_torch"
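    # Note: the effective global batch size is batch_size *
    # gradient_accumulation_steps (times the number of devices,
    # which this config does not track).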


@dataclass
class LLaVAConfig:
    """Configuration for the LLaVA model."""
    vision: VisionConfig = field(default_factory=VisionConfig)
    language: LanguageConfig = field(default_factory=LanguageConfig)
    projector: ProjectorConfig = field(default_factory=ProjectorConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)

    # Additional configurations
    max_length: int = 2048
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.0

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> "LLaVAConfig":
        """Create a configuration from a dictionary."""
        vision_config = VisionConfig(**config_dict.get("vision", {}))
        language_config = LanguageConfig(**config_dict.get("language", {}))
        projector_config = ProjectorConfig(**config_dict.get("projector", {}))
        training_config = TrainingConfig(**config_dict.get("training", {}))

        # Collect the remaining top-level keys (max_length, temperature, ...)
        additional_config = {
            k: v for k, v in config_dict.items()
            if k not in ("vision", "language", "projector", "training")
        }

        return cls(
            vision=vision_config,
            language=language_config,
            projector=projector_config,
            training=training_config,
            **additional_config,
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert the configuration to a dictionary."""
        config_dict = {
            "vision": {
                "model_name": self.vision.model_name,
                "image_size": self.vision.image_size,
                "patch_size": self.vision.patch_size,
                "hidden_size": self.vision.hidden_size,
                "num_attention_heads": self.vision.num_attention_heads,
                "num_hidden_layers": self.vision.num_hidden_layers,
                "intermediate_size": self.vision.intermediate_size,
                "projection_dim": self.vision.projection_dim,
                "dropout": self.vision.dropout,
                "attention_dropout": self.vision.attention_dropout,
            },
            "language": {
                "model_name": self.language.model_name,
                "hidden_size": self.language.hidden_size,
                "num_attention_heads": self.language.num_attention_heads,
                "num_hidden_layers": self.language.num_hidden_layers,
                "intermediate_size": self.language.intermediate_size,
                "max_position_embeddings": self.language.max_position_embeddings,
                "vocab_size": self.language.vocab_size,
                "rms_norm_eps": self.language.rms_norm_eps,
                "use_cache": self.language.use_cache,
                "rope_theta": self.language.rope_theta,
            },
            "projector": {
                "input_dim": self.projector.input_dim,
                "hidden_dim": self.projector.hidden_dim,
                "output_dim": self.projector.output_dim,
                "dropout": self.projector.dropout,
                "num_layers": self.projector.num_layers,
                "activation": self.projector.activation,
            },
            "training": {
                "batch_size": self.training.batch_size,
                "gradient_accumulation_steps": self.training.gradient_accumulation_steps,
                "learning_rate": self.training.learning_rate,
                "weight_decay": self.training.weight_decay,
                "num_train_epochs": self.training.num_train_epochs,
                "max_steps": self.training.max_steps,
                "warmup_steps": self.training.warmup_steps,
                "lr_scheduler_type": self.training.lr_scheduler_type,
                "logging_steps": self.training.logging_steps,
                "save_steps": self.training.save_steps,
                "eval_steps": self.training.eval_steps,
                "save_total_limit": self.training.save_total_limit,
                "fp16": self.training.fp16,
                "bf16": self.training.bf16,
                "seed": self.training.seed,
                "gradient_checkpointing": self.training.gradient_checkpointing,
                "optim": self.training.optim,
            },
            "max_length": self.max_length,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "repetition_penalty": self.repetition_penalty,
        }
        return config_dict


# Default configurations for different model sizes
LLAVA_7B_CONFIG = LLaVAConfig(
    language=LanguageConfig(
        model_name="lmsys/vicuna-7b-v1.5",
        hidden_size=4096,
        num_attention_heads=32,
        num_hidden_layers=32,
        intermediate_size=11008,
    ),
    projector=ProjectorConfig(
        input_dim=1024,
        hidden_dim=4096,
        output_dim=4096,
    ),
)

LLAVA_13B_CONFIG = LLaVAConfig(
    language=LanguageConfig(
        model_name="lmsys/vicuna-13b-v1.5",
        hidden_size=5120,
        num_attention_heads=40,
        num_hidden_layers=40,
        intermediate_size=13824,
    ),
    projector=ProjectorConfig(
        input_dim=1024,
        hidden_dim=5120,
        output_dim=5120,
    ),
)
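

if __name__ == "__main__":
    # Minimal usage sketch (added for illustration, not part of the original
    # training pipeline): round-trip a preset through to_dict()/from_dict(),
    # e.g. for JSON serialization.
    config = LLAVA_13B_CONFIG
    restored = LLaVAConfig.from_dict(config.to_dict())
    assert restored.language.hidden_size == config.language.hidden_size
    assert restored.projector.output_dim == 5120
    print(f"Restored {restored.language.model_name} config, "
          f"max_length={restored.max_length}")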