Introduction
This section brings together all components we've built to configure and set up the complete translation model. We'll define configurations for different model sizes and set up the training environment.
1.1 Model Architecture Configuration
Configuration Classes
🐍python
1from dataclasses import dataclass, field, asdict
2from typing import Optional, Dict, Any
3import json
4from pathlib import Path
5
6
@dataclass
class ModelConfig:
    """
    Configuration for the Transformer translation model.

    Defines all architecture hyperparameters and can round-trip itself
    through a JSON file via ``save`` / ``load``.

    Raises:
        ValueError: If ``num_heads`` is not positive, or if ``d_model`` is
            not divisible by ``num_heads``.
    """
    # Vocabulary
    vocab_size: int = 8000
    pad_id: int = 0

    # Architecture
    d_model: int = 512
    num_heads: int = 8
    num_encoder_layers: int = 6
    num_decoder_layers: int = 6
    d_ff: int = 2048
    max_seq_len: int = 128

    # Regularization
    dropout: float = 0.1
    attention_dropout: float = 0.1

    # Initialization
    init_std: float = 0.02

    def __post_init__(self):
        """Validate configuration."""
        # Raise explicitly instead of using ``assert``: assertions are
        # removed under ``python -O`` and would let bad configs through.
        if self.num_heads <= 0:
            raise ValueError(f"num_heads ({self.num_heads}) must be positive")
        if self.d_model % self.num_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by num_heads ({self.num_heads})"
            )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)

    def save(self, path: str):
        """Save configuration to JSON file."""
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, path: str) -> 'ModelConfig':
        """
        Load configuration from JSON file.

        The file must contain only keys that are fields of this class;
        validation in ``__post_init__`` runs on the loaded values.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls(**data)
53
54
@dataclass
class TrainingConfig:
    """
    Hyperparameters that control how the translation model is trained.

    Groups batching, optimisation, scheduling, regularisation,
    checkpointing, logging and device settings in a single object.
    """
    # --- batching ---
    batch_size: int = 64
    max_tokens: int = 4096
    accumulation_steps: int = 1

    # --- run length ---
    num_epochs: int = 30
    max_steps: Optional[int] = None

    # --- optimizer (Adam-family) ---
    learning_rate: float = 1e-4
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.98
    adam_epsilon: float = 1e-9

    # --- learning-rate schedule ---
    warmup_steps: int = 4000
    scheduler_type: str = "transformer"  # or "cosine"

    # --- regularization ---
    gradient_clip: float = 1.0
    label_smoothing: float = 0.1

    # --- checkpointing ---
    save_every: int = 1000
    eval_every: int = 500
    keep_best: int = 5
    checkpoint_dir: str = "checkpoints"

    # --- logging ---
    log_every: int = 100
    log_dir: str = "logs"

    # --- hardware ---
    device: str = "cuda"
    mixed_precision: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain ``{name: value}`` dictionary."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping
102
103
@dataclass
class ExperimentConfig:
    """
    Top-level description of one experiment.

    Bundles the run identity (name, seed), the filesystem locations it
    uses, and the nested model and training configurations.
    """
    # Identity
    name: str = "multi30k_de_en"
    seed: int = 42

    # Filesystem locations
    data_dir: str = "data/multi30k"
    tokenizer_path: str = "data/tokenizer/tokenizer.json"
    output_dir: str = "outputs"

    # Nested configurations (fresh instance per experiment)
    model: ModelConfig = field(default_factory=ModelConfig)
    training: TrainingConfig = field(default_factory=TrainingConfig)

    def save(self, path: str):
        """Write the whole experiment, including nested configs, to a JSON file."""
        top_level_keys = ('name', 'seed', 'data_dir', 'tokenizer_path', 'output_dir')
        payload = {key: getattr(self, key) for key in top_level_keys}
        # Nested configs serialize themselves through their own to_dict().
        payload['model'] = self.model.to_dict()
        payload['training'] = self.training.to_dict()

        with open(path, 'w') as handle:
            json.dump(payload, handle, indent=2)
137
138
def show_default_configs():
    """
    Print the default model and training configurations.

    Each configuration is shown under its own heading as one
    ``key: value`` line per field.
    """
    print("Default Configurations")
    print("=" * 60)

    # Both defaults are instantiated up front, then rendered the same way.
    sections = (
        ("\nModel Configuration:", ModelConfig()),
        ("\nTraining Configuration:", TrainingConfig()),
    )

    for heading, cfg in sections:
        print(heading)
        print("-" * 40)
        for key, value in cfg.to_dict().items():
            print(f" {key}: {value}")
158
159
show_default_configs()
1.2 Predefined Model Sizes
Small, Base, and Large Configurations
🐍python
def get_model_config(size: str = "base") -> ModelConfig:
    """
    Return a new ModelConfig for one of the named presets.

    Presets (fields not listed keep the ModelConfig defaults):
        - tiny:  d_model=128,  2 heads,  2+2 layers, d_ff=256,  dropout=0.1
        - small: d_model=256,  4 heads,  3+3 layers, d_ff=512,  dropout=0.1
        - base:  d_model=512,  8 heads,  6+6 layers, d_ff=2048, dropout=0.1
        - large: d_model=1024, 16 heads, 6+6 layers, d_ff=4096, dropout=0.3

    Args:
        size: Preset name.

    Returns:
        A freshly constructed ModelConfig for the preset.

    Raises:
        ValueError: If ``size`` is not one of the preset names.
    """
    # (d_model, num_heads, encoder layers, decoder layers, d_ff, dropout)
    presets = {
        "tiny": (128, 2, 2, 2, 256, 0.1),
        "small": (256, 4, 3, 3, 512, 0.1),
        "base": (512, 8, 6, 6, 2048, 0.1),
        "large": (1024, 16, 6, 6, 4096, 0.3),
    }

    if size not in presets:
        raise ValueError(f"Unknown size: {size}. Choose from {list(presets.keys())}")

    d_model, heads, enc_layers, dec_layers, d_ff, dropout = presets[size]
    return ModelConfig(
        d_model=d_model,
        num_heads=heads,
        num_encoder_layers=enc_layers,
        num_decoder_layers=dec_layers,
        d_ff=d_ff,
        dropout=dropout,
    )
56
57
def compare_model_sizes():
    """
    Compare different model configurations.

    Prints one table row per preset (tiny, small, base, large) with its
    dimensions and a rough parameter estimate, followed by usage
    recommendations. The estimate counts weight matrices only; biases
    and LayerNorm parameters are ignored.
    """
    print("Model Size Comparison")
    print("=" * 60)

    sizes = ["tiny", "small", "base", "large"]

    print(f"\n{'Size':<10} {'d_model':<10} {'Heads':<8} {'Layers':<10} {'d_ff':<8} {'~Params':<12}")
    print("-" * 65)

    for size in sizes:
        config = get_model_config(size)

        # Estimate parameters
        # Embeddings: vocab_size * d_model * 2 (src + tgt)
        # Encoder layer: self-attention (Q, K, V, O = 4*d_model^2)
        #                + feed-forward (2*d_model*d_ff)
        # Decoder layer: self-attention (4*d_model^2)
        #                + cross-attention (Q, K, V, O = 4*d_model^2)
        #                + feed-forward (2*d_model*d_ff)
        # Output: d_model * vocab_size
        attn_params = 4 * config.d_model ** 2
        ffn_params = 2 * config.d_model * config.d_ff

        embed_params = config.vocab_size * config.d_model * 2
        enc_params = config.num_encoder_layers * (attn_params + ffn_params)
        dec_params = config.num_decoder_layers * (2 * attn_params + ffn_params)
        output_params = config.d_model * config.vocab_size

        total_params = embed_params + enc_params + dec_params + output_params

        # Pad the combined "enc+dec" cell as one unit so it lines up with
        # the 10-wide 'Layers' header column.
        layers = f"{config.num_encoder_layers}+{config.num_decoder_layers}"

        print(f"{size:<10} {config.d_model:<10} {config.num_heads:<8} "
              f"{layers:<10} "
              f"{config.d_ff:<8} {total_params/1e6:.1f}M")

    print("""

 RECOMMENDATIONS:
 ────────────────

 tiny: For debugging and testing code
 ~1 minute per epoch on CPU

 small: For quick experiments
 ~5 minutes per epoch on GPU
 BLEU: ~25-30

 base: For best results on Multi30k
 ~15 minutes per epoch on GPU
 BLEU: ~30-35 (our target)

 large: For maximum quality
 ~30+ minutes per epoch on GPU
 BLEU: ~35-40
 """)
110
111
compare_model_sizes()
1.3 Building the Model
Model Factory
🐍python
1import torch
2import torch.nn as nn
3
4
def build_model(config: 'ModelConfig') -> Optional[nn.Module]:
    """
    Build Transformer translation model from configuration.

    Demonstration stub: it prints the main architecture settings and
    returns ``None``. The intended construction code is kept below as
    comments until the course's ``Transformer`` class is importable.

    Args:
        config: ModelConfig instance

    Returns:
        ``None`` for now (placeholder for the Transformer model).
    """
    # This would import from our implementations
    # from src.model import Transformer

    # For demonstration, show the expected interface
    print("Building model with config:")
    print(f" d_model: {config.d_model}")
    print(f" num_heads: {config.num_heads}")
    print(f" encoder_layers: {config.num_encoder_layers}")
    print(f" decoder_layers: {config.num_decoder_layers}")

    # Actual implementation would be:
    #
    #     model = Transformer(
    #         vocab_size=config.vocab_size,
    #         d_model=config.d_model,
    #         num_heads=config.num_heads,
    #         num_encoder_layers=config.num_encoder_layers,
    #         num_decoder_layers=config.num_decoder_layers,
    #         d_ff=config.d_ff,
    #         max_seq_len=config.max_seq_len,
    #         dropout=config.dropout,
    #         attention_dropout=config.attention_dropout,
    #         pad_id=config.pad_id,
    #     )
    #
    #     # Initialize weights
    #     model.apply(lambda m: init_weights(m, config.init_std))
    #
    #     return model

    return None  # Placeholder
49
50
def init_weights(module: nn.Module, std: float = 0.02):
    """
    Initialize the weights of a single module in place.

    Linear and embedding weights are drawn from a normal distribution
    with mean 0 and standard deviation ``std``; linear biases are zeroed;
    LayerNorm is reset to weight 1 and bias 0. Any other module type is
    left untouched, so the function is safe to pass to ``model.apply``.

    Args:
        module: Module whose own parameters should be initialized.
        std: Standard deviation for the normal initialization.
    """
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        # normal_ overwrites the whole table, including the padding row
        # that nn.Embedding zeroes at construction; restore it so the
        # padding token keeps an all-zero vector.
        if module.padding_idx is not None:
            with torch.no_grad():
                module.weight[module.padding_idx].zero_()

    elif isinstance(module, nn.LayerNorm):
        # weight/bias are None when the norm has no affine parameters.
        if module.weight is not None:
            nn.init.ones_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
68
69
def count_parameters(model: nn.Module) -> int:
    """Return the number of elements across parameters that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable
73
74
def model_summary(model: nn.Module, config: 'ModelConfig'):
    """
    Print model summary.

    When ``model`` is not None, reports the total and the trainable
    parameter counts separately, plus rough fp32 memory estimates. The
    architecture section is printed from ``config`` in every case.

    Args:
        model: Model to summarize, or None to print only the architecture.
        config: Configuration the model was (or will be) built from.
    """
    print("Model Summary")
    print("=" * 60)

    if model is not None:
        # Frozen parameters count toward the total but not toward the
        # trainable figure, so the two are computed independently.
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(
            p.numel() for p in model.parameters() if p.requires_grad
        )

        print(f"\nTotal parameters: {total_params:,}")
        print(f"Trainable parameters: {trainable_params:,}")

        # Memory estimation (fp32): 4 bytes per parameter
        memory_mb = total_params * 4 / 1024 / 1024
        print(f"Model size (fp32): {memory_mb:.1f} MB")

        # Training memory estimation (model + gradients + optimizer states)
        training_memory = memory_mb * 4  # Rough estimate
        print(f"Training memory estimate: {training_memory:.1f} MB")

    print(f"""
 Architecture:
 Encoder: {config.num_encoder_layers} layers
 Decoder: {config.num_decoder_layers} layers
 Attention heads: {config.num_heads}
 Model dimension: {config.d_model}
 FFN dimension: {config.d_ff}
 Max sequence length: {config.max_seq_len}
 Vocabulary size: {config.vocab_size}
 """)


# 1.4 Optimizer and Scheduler Setup
Training Components
🐍python
1import math
2
3
def setup_optimizer(
    model: nn.Module,
    config: 'TrainingConfig'
) -> torch.optim.Optimizer:
    """
    Set up AdamW optimizer with proper configuration.

    Uses weight decay only on non-bias, non-LayerNorm parameters.
    LayerNorm parameters are recognized by module type, so they are
    exempt whatever attribute name the model gives them; the original
    name-based checks are kept for custom norm implementations.

    Args:
        model: Model to optimize
        config: Training configuration

    Returns:
        Configured optimizer with two parameter groups: decayed
        parameters first, then the exempt ones (weight_decay 0.0).
    """
    # Parameters owned directly by an nn.LayerNorm, identified by object
    # identity. A name test alone misses norms stored under names such as
    # "norm1" or a Sequential index.
    norm_param_ids = {
        id(param)
        for module in model.modules()
        if isinstance(module, nn.LayerNorm)
        for param in module.parameters(recurse=False)
    }

    # Separate parameters for weight decay
    decay_params = []
    no_decay_params = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        # No weight decay for biases and LayerNorm
        exempt = (
            id(param) in norm_param_ids
            or 'bias' in name
            or 'LayerNorm' in name
            or 'layer_norm' in name
        )
        if exempt:
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    optimizer_groups = [
        {'params': decay_params, 'weight_decay': config.weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]

    optimizer = torch.optim.AdamW(
        optimizer_groups,
        lr=config.learning_rate,
        betas=(config.adam_beta1, config.adam_beta2),
        eps=config.adam_epsilon,
    )

    return optimizer
47
48
def setup_scheduler(
    optimizer: torch.optim.Optimizer,
    config: 'TrainingConfig',
    num_training_steps: int
):
    """
    Set up learning rate scheduler.

    Both schedules return a multiplier applied to the optimizer's base
    learning rate and reach 1.0 at the end of warmup, so
    ``config.learning_rate`` is the peak learning rate in either case.

    Args:
        optimizer: Optimizer instance
        config: Training configuration
        num_training_steps: Total number of training steps

    Returns:
        LR scheduler

    Raises:
        ValueError: If ``config.scheduler_type`` is not "transformer"
            or "cosine".
    """
    if config.scheduler_type == "transformer":
        # Original Transformer schedule (linear warmup, then inverse
        # square-root decay), rescaled by warmup_steps ** 0.5. Without
        # that factor the multiplier peaks at warmup_steps ** -0.5
        # (about 0.016 for 4000 steps) and shrinks the configured
        # learning rate by the same amount.
        warmup = max(config.warmup_steps, 1)  # avoid 0 ** negative power

        def lr_lambda(step):
            step = max(step, 1)
            return min(step / warmup, (warmup / step) ** 0.5)

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    elif config.scheduler_type == "cosine":
        # Linear warmup + cosine decay
        warmup = config.warmup_steps
        # At least one decay step, so a run no longer than the warmup
        # does not divide by zero.
        decay_steps = max(1, num_training_steps - warmup)

        def lr_lambda(step):
            if step < warmup:
                return step / warmup
            # Clamp so the multiplier stays at 0 past the planned end
            # instead of following the cosine back up.
            progress = min(1.0, (step - warmup) / decay_steps)
            return 0.5 * (1 + math.cos(math.pi * progress))

        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    else:
        raise ValueError(f"Unknown scheduler: {config.scheduler_type}")

    return scheduler
90
91
def setup_criterion(config: TrainingConfig, vocab_size: int, pad_id: int):
    """
    Create the training loss: cross-entropy with label smoothing.

    Args:
        config: Training configuration; only ``label_smoothing`` is read.
        vocab_size: Vocabulary size (accepted but not used by this function).
        pad_id: Padding token ID; targets equal to it are ignored by the loss.

    Returns:
        An ``nn.CrossEntropyLoss`` instance.
    """
    loss_options = {
        'ignore_index': pad_id,
        'label_smoothing': config.label_smoothing,
    }
    return nn.CrossEntropyLoss(**loss_options)
111
112
def demonstrate_setup():
    """
    Demonstrate training setup.

    Prints a reference ``setup_training`` function as source text; the
    snippet is only displayed, never executed here. It is stored with
    normal four-space indentation so the printed code is valid Python.
    """
    print("Training Setup Demonstration")
    print("=" * 60)

    code = '''
# Complete setup code

def setup_training(model_config, training_config, data_module):
    """
    Set up all training components.
    """
    # Build model
    model = build_model(model_config)
    model = model.to(training_config.device)

    # Set up optimizer
    optimizer = setup_optimizer(model, training_config)

    # Estimate training steps
    train_loader = data_module.train_dataloader()
    steps_per_epoch = len(train_loader)
    total_steps = steps_per_epoch * training_config.num_epochs

    # Set up scheduler
    scheduler = setup_scheduler(
        optimizer,
        training_config,
        total_steps
    )

    # Set up criterion
    criterion = setup_criterion(
        training_config,
        model_config.vocab_size,
        model_config.pad_id
    )

    # Mixed precision scaler
    scaler = None
    if training_config.mixed_precision and training_config.device == "cuda":
        scaler = torch.cuda.amp.GradScaler()

    print(f"Training setup complete:")
    print(f" Steps per epoch: {steps_per_epoch}")
    print(f" Total steps: {total_steps}")
    print(f" Warmup steps: {training_config.warmup_steps}")

    return {
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'criterion': criterion,
        'scaler': scaler,
    }
'''

    print(code)
173
174
demonstrate_setup()
1.5 Complete Setup Script
Ready-to-Run Setup
🐍python
def complete_setup_script():
    """
    Show complete setup script.

    Prints a reference ``setup.py`` as source text; the script is only
    displayed, never executed here.
    """
    print("Complete Setup Script")
    print("=" * 60)

    # Raw string: the script itself contains "\n" escapes inside its own
    # string literals. In a non-raw literal they would become real line
    # breaks and split those literals in the printed code.
    script = r'''
# setup.py - Complete training setup

import torch
import random
import numpy as np
from pathlib import Path

def set_seed(seed: int):
    """Set all random seeds for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def main():
    # 1. Configuration
    experiment_config = ExperimentConfig(
        name="multi30k_base",
        seed=42,
        data_dir="data/multi30k",
        tokenizer_path="data/tokenizer/tokenizer.json",
        output_dir="outputs/multi30k_base",
        model=get_model_config("base"),
        training=TrainingConfig(
            num_epochs=30,
            max_tokens=4096,
            learning_rate=1e-4,
            warmup_steps=4000,
            label_smoothing=0.1,
            gradient_clip=1.0,
            device="cuda" if torch.cuda.is_available() else "cpu",
        ),
    )

    # Create output directory
    Path(experiment_config.output_dir).mkdir(parents=True, exist_ok=True)

    # 2. Set seed
    set_seed(experiment_config.seed)

    # 3. Setup data
    data_config = DataConfig(
        data_dir=experiment_config.data_dir,
        tokenizer_path=experiment_config.tokenizer_path,
        max_tokens=experiment_config.training.max_tokens,
    )

    data_module = Multi30kDataModule(data_config)
    data_module.setup()

    # Update vocab size from tokenizer
    experiment_config.model.vocab_size = data_module.vocab_size
    experiment_config.model.pad_id = data_module.pad_id

    # Save configuration (only now, so the file records the
    # tokenizer-derived vocab_size and pad_id the model is built with)
    experiment_config.save(
        Path(experiment_config.output_dir) / "config.json"
    )

    # 4. Build model
    model = build_model(experiment_config.model)
    model = model.to(experiment_config.training.device)

    print(f"\nModel parameters: {count_parameters(model):,}")

    # 5. Setup optimizer
    optimizer = setup_optimizer(model, experiment_config.training)

    # 6. Setup scheduler
    train_loader = data_module.train_dataloader()
    total_steps = len(train_loader) * experiment_config.training.num_epochs

    scheduler = setup_scheduler(
        optimizer,
        experiment_config.training,
        total_steps
    )

    # 7. Setup criterion
    criterion = setup_criterion(
        experiment_config.training,
        experiment_config.model.vocab_size,
        experiment_config.model.pad_id
    )

    # 8. Setup mixed precision (gradient scaling only applies on CUDA)
    scaler = None
    if (experiment_config.training.mixed_precision
            and experiment_config.training.device == "cuda"):
        scaler = torch.cuda.amp.GradScaler()

    print("\nSetup complete! Ready to train.")

    return {
        'config': experiment_config,
        'data_module': data_module,
        'model': model,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'criterion': criterion,
        'scaler': scaler,
    }


if __name__ == "__main__":
    components = main()
'''

    print(script)
121
122
complete_setup_script()
Summary
Configuration Summary
| Component | Purpose | Key Settings |
|---|---|---|
| ModelConfig | Architecture | d_model=512, layers=6, heads=8 |
| TrainingConfig | Optimization | lr=1e-4, warmup=4000, epochs=30 |
| ExperimentConfig | Complete setup | Paths, seeds, combined configs |
Recommended Settings for Multi30k
🐍python
# Settings aimed at the ~30 BLEU target
model_config = ModelConfig(
    # width
    d_model=512,
    d_ff=2048,
    num_heads=8,
    # depth
    num_encoder_layers=6,
    num_decoder_layers=6,
    # regularization
    dropout=0.1,
)

training_config = TrainingConfig(
    # run length and batching
    num_epochs=30,
    max_tokens=4096,
    # optimization
    learning_rate=1e-4,
    warmup_steps=4000,
    gradient_clip=1.0,
    label_smoothing=0.1,
)

# Exercises
Configuration
- Create a configuration for training on CPU (smaller model, shorter training).
- Implement configuration validation (check all paths exist, GPU available, etc.).
- Add command-line argument parsing for configurations.
Analysis
- Compare training curves with different model sizes.
- Experiment with different warmup_steps values.
Next Section Preview: In the next section, we'll implement the Complete Training Script that puts everything together.