from enum import Enum
from typing import Any

from pydantic import BaseModel, ConfigDict, model_validator
from typing_extensions import Self

EOS_ID: int = 2


class InitStdFactor(str, Enum):
DISABLED = "disabled" # Init std is divided by 1.0
GLOBAL_DEPTH = "global_depth" # Init std is divided by sqrt(2*n_layers)
CURRENT_DEPTH = "current_depth" # Init std is divided by sqrt(2*depth)
DIM_RATIO = "dim_ratio" # Init std is divided by model_dim/4096
class PatchingModeEnum(str, Enum):
entropy = "entropy"
bpe = "bpe"
bpe_patcher = "bpe_patcher"
space = "space"
static = "static"
byte = "byte"
class LMTransformerArgs(BaseModel):
"""Arguments for the Language Model Transformer (used as entropy model for patching)"""
model_config = ConfigDict()
# Basic architecture
dim: int = 512
n_layers: int = 8
head_dim: int | None = None
n_heads: int | None = None
n_kv_heads: int | None = None
# Transformer configuration
max_seqlen: int = 1024
norm_eps: float = 1e-5
dropout: float = 0
vocab_size: int = -1
sliding_window: int | None = None
# Feedforward
ffn_dim_multiplier: float | None = None
multiple_of: int = 256
# Positional encoding
rope_theta: float = 10000.0
rope_use_fp32_in_outer_product: bool = False
# Attention
attn_impl: str = "sdpa"
attn_bias_type: str = "causal"
# Initialization
init_base_std: float | None = None
init_std_factor: InitStdFactor = InitStdFactor.DISABLED
# Embedding dimensions
dim_token_emb: int | None = None
# Model behavior
weight_tying: bool = False
seed: int = 42
# Special token config
    eos_id: int = EOS_ID


class ByteLatentTransformerArgs(BaseModel):
    """Arguments for the Byte Latent Transformer (the main BLT model)."""

    model_config = ConfigDict()

    # Basic model configuration
seed: int = 42
vocab_size: int = -1
# Main architecture dimensions (these will be used for creating transformer args)
dim: int = 512
n_layers: int = 8
head_dim: int | None = None
n_heads: int | None = None
n_kv_heads: int | None = None
# Component-specific dimensions
dim_global: int = 512
dim_local_decoder: int = 512
dim_local_encoder: int = 512
n_layers_global: int = 8
n_layers_local_decoder: int = 8
n_layers_local_encoder: int = 8
n_heads_global: int = 8
n_heads_local_decoder: int = 8
n_heads_local_encoder: int = 8
n_kv_heads_global: int | None = None
# Transformer configuration (needed by transformer components)
max_seqlen: int = 1024
norm_eps: float = 1e-5
dropout: float = 0
# Feedforward (needed by transformer components)
ffn_dim_multiplier: float = 1.0
multiple_of: int = 256
# Positional encoding (needed by transformer components)
rope_theta: float = 10000.0
rope_use_fp32_in_outer_product: bool = False
# Attention (needed by transformer components)
attn_impl: str = "sdpa"
attn_bias_type: str = "causal"
# Initialization (needed by transformer components)
init_base_std: float | None = None
init_std_factor: InitStdFactor = InitStdFactor.DISABLED
# Embedding dimensions (needed by transformer components)
dim_token_emb: int | None = None
# Patching configuration
patch_in_forward: bool = False
realtime_patching: bool = True
patch_size: float | None = None
patching_mode: str | None = None
patching_threshold: float | None = None
patching_threshold_add: float | None = None
monotonicity: bool = False
patching_batch_size: int = 1
patching_device: str = "cuda"
max_patch_length: int | None = None
entropy_model_checkpoint_dir: str | None = None
# Cross attention configurations
cross_attn_encoder: bool = False
cross_attn_decoder: bool = False
cross_attn_window_encoder: int | None = None
cross_attn_window_decoder: int | None = None
cross_attn_k: int | None = None
cross_attn_nheads: int | None = None
cross_attn_all_layers_decoder: bool = False
cross_attn_all_layers_encoder: bool = False
cross_attn_use_flex_attention: bool = True
cross_attn_init_by_pooling: bool = False
# Encoder configurations
use_local_encoder_transformer: bool = False
max_encoder_seq_length: int | None = None
encoder_hash_byte_group_size: Any | None = None
encoder_hash_byte_group_vocab: int = 30000
encoder_hash_byte_group_nb_functions: int = 3
encoder_enable_byte_ngrams: bool = False
encoder_ngram_to_size_str: str | None = None
downsampling_by_pooling: str | None = None
# Architecture and dimensions
dim_token: int | None = None
share_encoder_decoder_emb: bool = True
weight_tying: bool = False
# Attention configuration
local_attention_window_len: int | None = None
use_rope: bool = True
# Performance optimization
sequence_parallel: bool = False
loss_parallel: bool = False
fuse_sequence_parallel: bool = False
use_fsdp: bool = True
# Parameter mixing
pm_size: int = 0
# Special token config
    eos_id: int = EOS_ID

    @model_validator(mode="after")
    def check_hash_byte_sizes(self) -> Self:
        # Accept a comma-separated string (e.g. "3,4,5") for the hash byte
        # group sizes and normalize it to a list of ints.
        if isinstance(self.encoder_hash_byte_group_size, str):
            self.encoder_hash_byte_group_size = [
                int(x) for x in self.encoder_hash_byte_group_size.split(",") if len(x) > 0
            ]
        return self
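

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream configuration module):
# it shows how the config classes above can be instantiated and how the
# `check_hash_byte_sizes` validator normalizes a comma-separated string.
# All field values below are arbitrary examples, not recommended settings.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Entropy-model config used for patching (example dimensions only).
    entropy_model_args = LMTransformerArgs(
        dim=768,
        n_layers=14,
        vocab_size=260,
    )

    # Main BLT config; the hash byte group sizes are given as a string here
    # and converted to a list of ints by the validator above.
    blt_args = ByteLatentTransformerArgs(
        vocab_size=260,
        patching_mode=PatchingModeEnum.entropy.value,
        encoder_hash_byte_group_size="3,4,5",
    )

    print(blt_args.encoder_hash_byte_group_size)  # -> [3, 4, 5]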