Chapter 13
15 min read
Section 63 of 75

Model Configuration and Setup

Training Translation Model

Introduction

This section brings together all components we've built to configure and set up the complete translation model. We'll define configurations for different model sizes and set up the training environment.


1.1 Model Architecture Configuration

Configuration Classes

🐍python
import json
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, Optional
5
6
@dataclass
class ModelConfig:
    """
    Configuration for the Transformer translation model.

    Defines all architecture hyperparameters. The configuration is validated
    when an instance is created and can be round-tripped through a JSON file
    with ``save`` and ``load``.

    Raises:
        ValueError: If ``num_heads`` is not positive, or ``d_model`` is not
            divisible by ``num_heads``.
    """
    # Vocabulary
    vocab_size: int = 8000
    pad_id: int = 0

    # Architecture
    d_model: int = 512
    num_heads: int = 8
    num_encoder_layers: int = 6
    num_decoder_layers: int = 6
    d_ff: int = 2048
    max_seq_len: int = 128

    # Regularization
    dropout: float = 0.1
    attention_dropout: float = 0.1

    # Initialization
    init_std: float = 0.02

    def __post_init__(self):
        """Validate configuration."""
        # Raise explicitly instead of using ``assert``: assertions are removed
        # under ``python -O``, which would silently disable the validation.
        if self.num_heads <= 0:
            raise ValueError(f"num_heads ({self.num_heads}) must be positive")
        if self.d_model % self.num_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by num_heads ({self.num_heads})"
            )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)

    def save(self, path: str):
        """Save configuration to JSON file."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, path: str) -> 'ModelConfig':
        """Load configuration from JSON file."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Keys must match the dataclass field names; validation runs again
        # through __post_init__.
        return cls(**data)
53
54
@dataclass
class TrainingConfig:
    """
    Configuration for training the translation model.

    Defines optimization and training hyperparameters. All fields have
    defaults, so ``TrainingConfig()`` yields a complete configuration.
    """
    # Batching
    batch_size: int = 64
    max_tokens: int = 4096
    accumulation_steps: int = 1

    # Training
    num_epochs: int = 30
    max_steps: Optional[int] = None

    # Optimization
    learning_rate: float = 1e-4
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.98
    adam_epsilon: float = 1e-9

    # Scheduler
    warmup_steps: int = 4000
    scheduler_type: str = "transformer"  # or "cosine"

    # Regularization
    gradient_clip: float = 1.0
    label_smoothing: float = 0.1

    # Checkpointing
    save_every: int = 1000
    eval_every: int = 500
    keep_best: int = 5
    checkpoint_dir: str = "checkpoints"

    # Logging
    log_every: int = 100
    log_dir: str = "logs"

    # Device
    device: str = "cuda"
    mixed_precision: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary (field name -> value, in declaration order)."""
        return asdict(self)
102
103
@dataclass
class ExperimentConfig:
    """
    Complete experiment configuration.

    Combines the model and training configurations with the experiment
    name, random seed, and data/output paths.
    """
    # Experiment name
    name: str = "multi30k_de_en"
    seed: int = 42

    # Paths
    data_dir: str = "data/multi30k"
    tokenizer_path: str = "data/tokenizer/tokenizer.json"
    output_dir: str = "outputs"

    # Configs (default_factory gives every experiment its own instances)
    model: ModelConfig = field(default_factory=ModelConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)

    def save(self, path: str):
        """
        Save complete config as a single JSON document.

        Args:
            path: Destination file; anything ``open()`` accepts works,
                including a ``pathlib.Path``.
        """
        config_dict = {
            'name': self.name,
            'seed': self.seed,
            'data_dir': self.data_dir,
            'tokenizer_path': self.tokenizer_path,
            'output_dir': self.output_dir,
            # Nested configs are stored as plain dictionaries.
            'model': self.model.to_dict(),
            'training': self.training.to_dict(),
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(config_dict, f, indent=2)
137
138
def show_default_configs():
    """
    Show default configurations.

    Prints every field of a default ``ModelConfig`` and ``TrainingConfig``
    as ``key: value`` lines under a section heading.
    """
    print("Default Configurations")
    print("=" * 60)

    sections = (
        ("Model Configuration", ModelConfig()),
        ("Training Configuration", TrainingConfig()),
    )

    for title, config in sections:
        print(f"\n{title}:")
        print("-" * 40)
        for key, value in config.to_dict().items():
            print(f"  {key}: {value}")


show_default_configs()

1.2 Predefined Model Sizes

Small, Base, and Large Configurations

🐍python
def get_model_config(size: str = "base") -> ModelConfig:
    """
    Get predefined model configuration.

    Available sizes:
    - tiny: For testing
    - small: Quick training
    - base: Standard
    - large: Best quality

    Parameter counts depend on ``vocab_size``; ``compare_model_sizes``
    prints an estimate for each preset.

    Args:
        size: Model size name

    Returns:
        A new ModelConfig for the specified size

    Raises:
        ValueError: If ``size`` is not one of the predefined names.
    """
    # Store only the keyword arguments so that just the requested
    # configuration is instantiated (and validated) per call.
    presets = {
        "tiny": dict(
            d_model=128,
            num_heads=2,
            num_encoder_layers=2,
            num_decoder_layers=2,
            d_ff=256,
            dropout=0.1,
        ),
        "small": dict(
            d_model=256,
            num_heads=4,
            num_encoder_layers=3,
            num_decoder_layers=3,
            d_ff=512,
            dropout=0.1,
        ),
        "base": dict(
            d_model=512,
            num_heads=8,
            num_encoder_layers=6,
            num_decoder_layers=6,
            d_ff=2048,
            dropout=0.1,
        ),
        "large": dict(
            d_model=1024,
            num_heads=16,
            num_encoder_layers=6,
            num_decoder_layers=6,
            d_ff=4096,
            dropout=0.3,
        ),
    }

    if size not in presets:
        raise ValueError(f"Unknown size: {size}. Choose from {list(presets.keys())}")

    return ModelConfig(**presets[size])
56
57
def compare_model_sizes():
    """
    Compare different model configurations.

    Prints one table row per predefined size with its main hyperparameters
    and a rough parameter-count estimate (weight matrices only; biases and
    LayerNorm parameters are ignored), followed by usage recommendations.
    """
    print("Model Size Comparison")
    print("=" * 60)

    sizes = ["tiny", "small", "base", "large"]

    print(f"\n{'Size':<10} {'d_model':<10} {'Heads':<8} {'Layers':<10} {'d_ff':<8} {'~Params':<12}")
    print("-" * 65)

    for size in sizes:
        config = get_model_config(size)

        # Estimate parameters
        # Embeddings: vocab_size * d_model * 2 (src + tgt)
        # Encoder: num_layers * (4*d_model*d_model + 2*d_model*d_ff)
        #   - self-attention has four d_model x d_model projections (Q, K, V, O)
        # Decoder: num_layers * (4*d_model*d_model + 4*d_model*d_model + 2*d_model*d_ff)
        #   - self-attention AND cross-attention each have four projections
        # Output: d_model * vocab_size

        attn_params = 4 * config.d_model ** 2
        ffn_params = 2 * config.d_model * config.d_ff

        embed_params = config.vocab_size * config.d_model * 2
        enc_params = config.num_encoder_layers * (attn_params + ffn_params)
        dec_params = config.num_decoder_layers * (2 * attn_params + ffn_params)
        output_params = config.d_model * config.vocab_size

        total_params = embed_params + enc_params + dec_params + output_params

        # Pad the combined "enc+dec" cell to the header's column width so the
        # following columns stay aligned.
        layers = f"{config.num_encoder_layers}+{config.num_decoder_layers}"

        print(f"{size:<10} {config.d_model:<10} {config.num_heads:<8} "
              f"{layers:<10} "
              f"{config.d_ff:<8} {total_params/1e6:.1f}M")

    print("""

    RECOMMENDATIONS:
    ────────────────

    tiny:  For debugging and testing code
           ~1 minute per epoch on CPU

    small: For quick experiments
           ~5 minutes per epoch on GPU
           BLEU: ~25-30

    base:  For best results on Multi30k
           ~15 minutes per epoch on GPU
           BLEU: ~30-35 (our target)

    large: For maximum quality
           ~30+ minutes per epoch on GPU
           BLEU: ~35-40
    """)


compare_model_sizes()

1.3 Building the Model

Model Factory

🐍python
import torch
import torch.nn as nn
3
4
def build_model(config: ModelConfig) -> nn.Module:
    """
    Build Transformer translation model from configuration.

    Uses all the components we've built throughout the course.

    NOTE: In this section the function is a demonstration stub. It prints
    the key architecture settings and returns ``None``; the intended
    implementation is sketched in the comments below.

    Args:
        config: ModelConfig instance

    Returns:
        Transformer model (currently the placeholder ``None``)
    """
    # This would import from our implementations
    # from src.model import Transformer

    # For demonstration, show the expected interface
    print("Building model with config:")
    print(f"  d_model: {config.d_model}")
    print(f"  num_heads: {config.num_heads}")
    print(f"  encoder_layers: {config.num_encoder_layers}")
    print(f"  decoder_layers: {config.num_decoder_layers}")

    # Actual implementation would be:
    #
    # model = Transformer(
    #     vocab_size=config.vocab_size,
    #     d_model=config.d_model,
    #     num_heads=config.num_heads,
    #     num_encoder_layers=config.num_encoder_layers,
    #     num_decoder_layers=config.num_decoder_layers,
    #     d_ff=config.d_ff,
    #     max_seq_len=config.max_seq_len,
    #     dropout=config.dropout,
    #     attention_dropout=config.attention_dropout,
    #     pad_id=config.pad_id,
    # )
    #
    # # Initialize weights
    # model.apply(lambda m: init_weights(m, config.init_std))
    #
    # return model

    return None  # Placeholder
49
50
def init_weights(module: nn.Module, std: float = 0.02):
    """
    Initialize model weights.

    Intended to be used with ``model.apply(...)``, which calls it once per
    submodule. Modules of other types are left untouched.

    - ``nn.Linear``: weights ~ N(0, std^2), bias zeroed.
    - ``nn.Embedding``: weights ~ N(0, std^2); the ``padding_idx`` row, if
      any, is reset to zeros afterwards.
    - ``nn.LayerNorm``: weight set to ones, bias to zeros (when present).

    Args:
        module: Module to initialize in place
        std: Standard deviation of the normal distribution
    """
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        # normal_ overwrites the whole table, including the padding row that
        # nn.Embedding keeps at zero; restore it.
        if module.padding_idx is not None:
            nn.init.zeros_(module.weight[module.padding_idx])

    elif isinstance(module, nn.LayerNorm):
        # weight/bias are None when the norm has no affine parameters.
        if module.weight is not None:
            nn.init.ones_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
68
69
def count_parameters(model: nn.Module) -> int:
    """Count trainable parameters (those with ``requires_grad=True``)."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
73
74
def model_summary(model: nn.Module, config: ModelConfig):
    """
    Print model summary.

    When ``model`` is ``None`` (e.g. the placeholder returned by
    ``build_model``), only the architecture section is printed.

    Args:
        model: Model to summarize, or None
        config: ModelConfig describing the architecture
    """
    print("Model Summary")
    print("=" * 60)

    if model is not None:
        # Report total and trainable counts separately; they differ as soon
        # as any parameter is frozen.
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = count_parameters(model)

        print(f"\nTotal parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params:,}")

        # Memory estimation (fp32: 4 bytes per parameter)
        memory_mb = total_params * 4 / 1024 / 1024
        print(f"Model size (fp32): {memory_mb:.1f} MB")

        # Training memory estimation (model + gradients + optimizer states)
        training_memory = memory_mb * 4  # Rough estimate
        print(f"Training memory estimate: {training_memory:.1f} MB")

    print(f"""
    Architecture:
      Encoder: {config.num_encoder_layers} layers
      Decoder: {config.num_decoder_layers} layers
      Attention heads: {config.num_heads}
      Model dimension: {config.d_model}
      FFN dimension: {config.d_ff}
      Max sequence length: {config.max_seq_len}
      Vocabulary size: {config.vocab_size}
    """)

1.4 Optimizer and Scheduler Setup

Training Components

🐍python
import math
2
3
def setup_optimizer(
    model: nn.Module,
    config: TrainingConfig
) -> torch.optim.Optimizer:
    """
    Set up AdamW optimizer with proper configuration.

    Uses weight decay only on non-bias, non-LayerNorm parameters. The
    optimizer always has two parameter groups: index 0 with
    ``config.weight_decay`` and index 1 with no weight decay.

    Args:
        model: Model to optimize
        config: Training configuration

    Returns:
        Configured optimizer
    """
    # Separate parameters for weight decay
    decay_params = []
    no_decay_params = []
    no_decay_markers = ('bias', 'LayerNorm', 'layer_norm')

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        # No weight decay for biases and LayerNorm. Matching on names alone
        # misses norm layers registered under other attribute names (e.g.
        # "norm1"), so every parameter with fewer than two dimensions
        # (biases, norm gains) is exempt as well.
        if param.ndim < 2 or any(marker in name for marker in no_decay_markers):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    optimizer_groups = [
        {'params': decay_params, 'weight_decay': config.weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]

    optimizer = torch.optim.AdamW(
        optimizer_groups,
        lr=config.learning_rate,
        betas=(config.adam_beta1, config.adam_beta2),
        eps=config.adam_epsilon,
    )

    return optimizer
47
48
def setup_scheduler(
    optimizer: torch.optim.Optimizer,
    config: TrainingConfig,
    num_training_steps: int
):
    """
    Set up learning rate scheduler.

    Both schedules are ``LambdaLR`` multipliers on the optimizer's base
    learning rate and peak at 1.0, so ``config.learning_rate`` is the peak
    learning rate reached at the end of warmup.

    - "transformer": linear warmup followed by inverse-square-root decay
      (the shape of the original Transformer schedule).
    - "cosine": linear warmup followed by cosine decay to zero at
      ``num_training_steps``; the multiplier stays at zero afterwards.

    Args:
        optimizer: Optimizer instance
        config: Training configuration
        num_training_steps: Total number of training steps

    Returns:
        LR scheduler

    Raises:
        ValueError: If ``config.scheduler_type`` is not recognized.
    """
    if config.scheduler_type == "transformer":
        # Guard against warmup_steps == 0, which would divide by zero.
        warmup = max(1, config.warmup_steps)

        # Original Transformer schedule, normalized by warmup ** 0.5 so the
        # multiplier peaks at 1.0. Unnormalized, it peaks at warmup ** -0.5
        # (about 0.016 for 4000 steps), which would shrink the configured
        # learning rate by almost two orders of magnitude.
        def lr_lambda(step):
            step = max(step, 1)
            return min(
                (warmup / step) ** 0.5,
                step / warmup
            )
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    elif config.scheduler_type == "cosine":
        # At least one decay step, so num_training_steps <= warmup_steps
        # cannot divide by zero.
        decay_steps = max(1, num_training_steps - config.warmup_steps)

        # Linear warmup + cosine decay
        def lr_lambda(step):
            if step < config.warmup_steps:
                return step / config.warmup_steps
            # Clamp so the rate stays at zero instead of rising again when
            # training runs past num_training_steps.
            progress = min(1.0, (step - config.warmup_steps) / decay_steps)
            return 0.5 * (1 + math.cos(math.pi * progress))

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    else:
        raise ValueError(f"Unknown scheduler: {config.scheduler_type}")

    return scheduler
90
91
def setup_criterion(config: TrainingConfig, vocab_size: int, pad_id: int):
    """
    Set up loss function with label smoothing.

    Args:
        config: Training configuration (provides ``label_smoothing``)
        vocab_size: Vocabulary size (currently unused; kept in the signature
            for callers)
        pad_id: Padding token ID; targets equal to it are ignored by the loss

    Returns:
        Loss criterion
    """
    # Using cross-entropy with label smoothing
    criterion = nn.CrossEntropyLoss(
        ignore_index=pad_id,
        label_smoothing=config.label_smoothing
    )

    return criterion
111
112
def demonstrate_setup():
    """
    Demonstrate training setup.

    Prints the source of a ``setup_training`` function that wires together
    the model, optimizer, scheduler, criterion and mixed-precision scaler.
    The code is only displayed, not executed.
    """
    print("Training Setup Demonstration")
    print("=" * 60)

    code = '''
# Complete setup code

def setup_training(model_config, training_config, data_module):
    """
    Set up all training components.
    """
    # Build model
    model = build_model(model_config)
    model = model.to(training_config.device)

    # Set up optimizer
    optimizer = setup_optimizer(model, training_config)

    # Estimate training steps
    train_loader = data_module.train_dataloader()
    steps_per_epoch = len(train_loader)
    total_steps = steps_per_epoch * training_config.num_epochs

    # Set up scheduler
    scheduler = setup_scheduler(
        optimizer,
        training_config,
        total_steps
    )

    # Set up criterion
    criterion = setup_criterion(
        training_config,
        model_config.vocab_size,
        model_config.pad_id
    )

    # Mixed precision scaler
    scaler = None
    if training_config.mixed_precision and training_config.device == "cuda":
        scaler = torch.cuda.amp.GradScaler()

    print(f"Training setup complete:")
    print(f"  Steps per epoch: {steps_per_epoch}")
    print(f"  Total steps: {total_steps}")
    print(f"  Warmup steps: {training_config.warmup_steps}")

    return {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'criterion': criterion,
        'scaler': scaler,
    }
'''

    print(code)


demonstrate_setup()

1.5 Complete Setup Script

Ready-to-Run Setup

🐍python
def complete_setup_script():
    """
    Show complete setup script.

    Prints the source of a ``setup.py`` script that builds every training
    component from an ``ExperimentConfig``. The script is only displayed,
    not executed.
    """
    print("Complete Setup Script")
    print("=" * 60)

    # Raw string: the embedded script contains "\n" escapes inside its own
    # print() calls, which must be shown literally rather than turned into
    # real line breaks in the middle of a statement.
    script = r'''
# setup.py - Complete training setup

import torch
import random
import numpy as np
from pathlib import Path

def set_seed(seed: int):
    """Set all random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def main():
    # 1. Configuration
    experiment_config = ExperimentConfig(
        name="multi30k_base",
        seed=42,
        data_dir="data/multi30k",
        tokenizer_path="data/tokenizer/tokenizer.json",
        output_dir="outputs/multi30k_base",
        model=get_model_config("base"),
        training=TrainingConfig(
            num_epochs=30,
            max_tokens=4096,
            learning_rate=1e-4,
            warmup_steps=4000,
            label_smoothing=0.1,
            gradient_clip=1.0,
            device="cuda" if torch.cuda.is_available() else "cpu",
        ),
    )

    # Create output directory
    Path(experiment_config.output_dir).mkdir(parents=True, exist_ok=True)

    # Save configuration
    experiment_config.save(
        Path(experiment_config.output_dir) / "config.json"
    )

    # 2. Set seed
    set_seed(experiment_config.seed)

    # 3. Setup data
    data_config = DataConfig(
        data_dir=experiment_config.data_dir,
        tokenizer_path=experiment_config.tokenizer_path,
        max_tokens=experiment_config.training.max_tokens,
    )

    data_module = Multi30kDataModule(data_config)
    data_module.setup()

    # Update vocab size from tokenizer
    experiment_config.model.vocab_size = data_module.vocab_size
    experiment_config.model.pad_id = data_module.pad_id

    # 4. Build model
    model = build_model(experiment_config.model)
    model = model.to(experiment_config.training.device)

    print(f"\nModel parameters: {count_parameters(model):,}")

    # 5. Setup optimizer
    optimizer = setup_optimizer(model, experiment_config.training)

    # 6. Setup scheduler
    train_loader = data_module.train_dataloader()
    total_steps = len(train_loader) * experiment_config.training.num_epochs

    scheduler = setup_scheduler(
        optimizer,
        experiment_config.training,
        total_steps
    )

    # 7. Setup criterion
    criterion = setup_criterion(
        experiment_config.training,
        experiment_config.model.vocab_size,
        experiment_config.model.pad_id
    )

    # 8. Setup mixed precision (gradient scaling only applies on CUDA)
    scaler = None
    if (experiment_config.training.mixed_precision
            and experiment_config.training.device == "cuda"):
        scaler = torch.cuda.amp.GradScaler()

    print("\nSetup complete! Ready to train.")

    return {
        'config': experiment_config,
        'data_module': data_module,
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'criterion': criterion,
        'scaler': scaler,
    }


if __name__ == "__main__":
    components = main()
'''

    print(script)


complete_setup_script()

Summary

Configuration Summary

| Component | Purpose | Key Settings |
|---|---|---|
| ModelConfig | Architecture | d_model=512, layers=6, heads=8 |
| TrainingConfig | Optimization | lr=1e-4, warmup=4000, epochs=30 |
| ExperimentConfig | Complete setup | Paths, seeds, combined configs |
🐍python
# For ~30 BLEU target
model_config = ModelConfig(
    d_model=512,
    num_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    d_ff=2048,
    dropout=0.1,
)

training_config = TrainingConfig(
    max_tokens=4096,
    num_epochs=30,
    learning_rate=1e-4,
    warmup_steps=4000,
    label_smoothing=0.1,
    gradient_clip=1.0,
)

Exercises

Configuration

  • Create a configuration for training on CPU (smaller model, shorter training).
  • Implement configuration validation (check all paths exist, GPU available, etc.).
  • Add command-line argument parsing for configurations.

Analysis

  • Compare training curves with different model sizes.
  • Experiment with different warmup_steps values.

Next Section Preview: In the next section, we'll implement the Complete Training Script that puts everything together.

Loading comments...