Raw text data requires preprocessing before it can be used for training. This section covers text normalization, cleaning, and preparing Multi30k data for our translation model.
2.1 Text Normalization
Standard Preprocessing Steps
πpython
import json
import re
import unicodedata
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
4
5
class TextNormalizer:
    """
    Text normalization for machine translation.

    Applies consistent preprocessing to source and target text, in
    this order:
    - Unicode normalization
    - Accent stripping (optional)
    - Whitespace normalization
    - Punctuation spacing
    - Case handling (optional)

    Every listed punctuation mark is split off as its own token. This
    includes apostrophes and the separators inside numbers, so
    "1,000.50" becomes "1 , 000 . 50".

    Args:
        lowercase: Whether to lowercase text
        normalize_unicode: Unicode normalization form ('NFC', 'NFD',
            'NFKC' or 'NFKD'); a falsy value disables the step
        strip_accents: Whether to remove accents

    Raises:
        ValueError: If normalize_unicode is set to an unknown form

    Example:
        >>> normalizer = TextNormalizer(lowercase=True)
        >>> normalizer.normalize(" Hello, World! ")
        'hello , world !'
    """

    # Forms accepted by unicodedata.normalize().
    _UNICODE_FORMS = ('NFC', 'NFD', 'NFKC', 'NFKD')

    # Punctuation that becomes a stand-alone token; compiled once
    # because normalize() runs for every sentence of the corpus.
    _PUNCTUATION_RE = re.compile(r'([.,!?;:\'"()\[\]{}])')

    def __init__(
        self,
        lowercase: bool = False,
        normalize_unicode: str = 'NFC',
        strip_accents: bool = False
    ):
        # Fail at construction time rather than on the first sentence.
        if normalize_unicode and normalize_unicode not in self._UNICODE_FORMS:
            raise ValueError(
                f"Unknown Unicode normalization form: {normalize_unicode!r}"
            )

        self.lowercase = lowercase
        self.normalize_unicode = normalize_unicode
        self.strip_accents = strip_accents

    def normalize(self, text: str) -> str:
        """
        Apply all normalization steps.

        Args:
            text: Raw input sentence

        Returns:
            The normalized sentence, with single spaces between tokens
            and no leading/trailing whitespace
        """
        form = self.normalize_unicode

        # Unicode normalization
        if form:
            text = unicodedata.normalize(form, text)

        # Strip accents (for some languages)
        if self.strip_accents:
            text = self._strip_accents(text)
            # _strip_accents leaves the text decomposed (NFD), which
            # also splits characters that carry no accent at all
            # (e.g. Hangul syllables); restore the requested form.
            if form:
                text = unicodedata.normalize(form, text)

        # Whitespace normalization
        text = self._normalize_whitespace(text)

        # Separate punctuation
        text = self._separate_punctuation(text)

        # Lowercase
        if self.lowercase:
            text = text.lower()

        return text.strip()

    def _strip_accents(self, text: str) -> str:
        """Remove accent marks from text (result is in NFD form)."""
        decomposed = unicodedata.normalize('NFD', text)
        # 'Mn' (Mark, nonspacing) is the category of combining accents.
        return ''.join(
            char for char in decomposed
            if unicodedata.category(char) != 'Mn'
        )

    def _normalize_whitespace(self, text: str) -> str:
        """Normalize all whitespace to single spaces."""
        return ' '.join(text.split())

    def _separate_punctuation(self, text: str) -> str:
        """Add spaces around punctuation."""
        spaced = self._PUNCTUATION_RE.sub(r' \1 ', text)

        # Clean up the multiple spaces introduced above
        return ' '.join(spaced.split())


# Example usage:
πpython
# Run the default normalizer over a few representative inputs and print
# each raw string next to its normalized form.
normalizer = TextNormalizer(lowercase=False)

examples = [
    " Extra whitespace here ",
    "Hello,World! No spaces",
    "Über München fährt",  # German with umlauts
    "It's a \"test\"",
    "Numbers: 1,000.50",
]

for text in examples:
    normalized = normalizer.normalize(text)
    print(f"{text!r:<40} {normalized!r:<40}")


# 2.2 German-Specific Preprocessing
Handling German Text
πpython
class GermanPreprocessor(TextNormalizer):
    """
    German-specific text preprocessing.

    Optional transliteration applied before the base normalization:
    - Umlauts (ä, ö, ü and their capitals) to ae, oe, ue
    - Eszett (ß) to ss

    Args:
        lowercase: Whether to lowercase text
        normalize_umlauts: Convert ä→ae, ö→oe, etc.
        handle_eszett: Convert ß→ss
    """

    def __init__(
        self,
        lowercase: bool = False,
        normalize_umlauts: bool = False,
        handle_eszett: bool = False
    ):
        super().__init__(lowercase=lowercase)
        self.normalize_umlauts = normalize_umlauts
        self.handle_eszett = handle_eszett

        # Umlaut mapping
        self.umlaut_map = {
            'ä': 'ae', 'Ä': 'Ae',
            'ö': 'oe', 'Ö': 'Oe',
            'ü': 'ue', 'Ü': 'Ue',
        }

    def normalize(self, text: str) -> str:
        """
        Apply German-specific normalization.

        Args:
            text: Raw German sentence

        Returns:
            The sentence after the enabled transliterations and the
            base-class normalization
        """
        if self.handle_eszett or self.normalize_umlauts:
            # Compose first: a decomposed umlaut ('a' + U+0308) would
            # otherwise never match the single-character keys below.
            text = unicodedata.normalize('NFC', text)

        # Handle eszett (lowercase ß and capital U+1E9E)
        if self.handle_eszett:
            text = text.replace('ß', 'ss').replace('\u1e9e', 'SS')

        # Handle umlauts
        if self.normalize_umlauts:
            for umlaut, replacement in self.umlaut_map.items():
                text = text.replace(umlaut, replacement)

        # Apply base normalization
        return super().normalize(text)


# Examples with different settings:
πpython
# Compare the output of differently configured German preprocessors on
# sentences that contain umlauts and eszett.
examples = [
    "Das Mädchen läuft über die Straße.",
    "Der Bär frisst Äpfel.",
    "Größe und Schönheit.",
]

# Different settings
preprocessors = [
    ("Standard", GermanPreprocessor()),
    ("Normalized", GermanPreprocessor(normalize_umlauts=True, handle_eszett=True)),
    ("Lowercase", GermanPreprocessor(lowercase=True)),
]

for example in examples:
    print(f"\nOriginal: {example}")
    for name, prep in preprocessors:
        result = prep.normalize(example)
        print(f" {name}: {result}")


# 2.3 English Preprocessing
Handling English Text
πpython
class EnglishPreprocessor(TextNormalizer):
    """
    English-specific text preprocessing.

    Special handling for:
    - Contractions ("won't" becomes "wo n't", "I'm" becomes "I 'm")
    - Possessives ("John's" becomes "John 's")

    Matching is case-sensitive and only applies to a contraction that
    directly follows a word character and ends at a word boundary, so
    an opening quote such as "'super'" is not treated as a clitic.

    Args:
        lowercase: Whether to lowercase text
    """

    # Private-use code point that stands in for a clitic's apostrophe
    # while the base class separates punctuation; without it " 'm"
    # would be split again into "' m". Input that already contains this
    # code point is not supported.
    _APOSTROPHE_GUARD = '\ue000'

    def __init__(self, lowercase: bool = False):
        super().__init__(lowercase=lowercase)

        # Common contractions to handle
        self.contractions = {
            "n't": " n't",
            "'re": " 're",
            "'ve": " 've",
            "'ll": " 'll",
            "'d": " 'd",
            "'m": " 'm",
            "'s": " 's",
        }

    def normalize(self, text: str) -> str:
        """
        Apply English-specific normalization.

        Args:
            text: Raw English sentence

        Returns:
            The normalized sentence with contractions kept as single
            tokens that retain their apostrophe
        """
        guard = self._APOSTROPHE_GUARD

        # Handle contractions before punctuation separation
        for contraction, replacement in self.contractions.items():
            pattern = r'(?<=\w)' + re.escape(contraction) + r'\b'
            shielded = replacement.replace("'", guard)
            # A callable replacement keeps re.sub from interpreting
            # backslashes in the replacement text.
            text = re.sub(pattern, lambda _match, s=shielded: s, text)

        # Apply base normalization
        text = super().normalize(text)

        # Put the real apostrophes back
        return text.replace(guard, "'")


# Example processing:
πpython
# Print a few sentences with contractions and possessives next to their
# normalized form.
prep = EnglishPreprocessor()

examples = [
    "I'm going to the store.",
    "They've been waiting.",
    "It's John's car.",
    "We won't be late.",
    "She'd have known.",
]

for text in examples:
    result = prep.normalize(text)
    print(f"{text:<35} {result:<40}")


# 2.4 Complete Preprocessing Pipeline
Multi30k Preprocessor
πpython
1from dataclasses import dataclass
2from typing import Dict, Any
3
4
@dataclass
class PreprocessingConfig:
    """
    Configuration for preprocessing.

    Raises:
        ValueError: If min_length is negative or max_length is smaller
            than min_length
    """
    lowercase: bool = False
    normalize_umlauts: bool = False  # Keep German characters
    handle_contractions: bool = True
    # Length bounds are counted in whitespace-separated tokens.
    min_length: int = 1
    max_length: int = 100

    def __post_init__(self):
        # Reject bounds that no sentence could ever satisfy, so a typo
        # does not silently filter out the whole corpus.
        if self.min_length < 0:
            raise ValueError(
                f"min_length must be >= 0, got {self.min_length}"
            )
        if self.max_length < self.min_length:
            raise ValueError(
                f"max_length ({self.max_length}) must be >= "
                f"min_length ({self.min_length})"
            )
13
14
class Multi30kPreprocessor:
    """
    Complete preprocessing pipeline for Multi30k.

    Handles both German source and English target text
    with appropriate language-specific processing.

    Args:
        config: PreprocessingConfig instance

    Example:
        >>> prep = Multi30kPreprocessor(PreprocessingConfig())
        >>> src, tgt = prep.process_pair(german, english)
    """

    def __init__(self, config: "PreprocessingConfig"):
        self.config = config

        # Language-specific preprocessors
        self.src_preprocessor = GermanPreprocessor(
            lowercase=config.lowercase,
            normalize_umlauts=config.normalize_umlauts
        )

        if config.handle_contractions:
            self.tgt_preprocessor = EnglishPreprocessor(
                lowercase=config.lowercase
            )
        else:
            # Contraction handling is the only thing the English
            # preprocessor adds, so the plain normalizer disables it.
            self.tgt_preprocessor = TextNormalizer(
                lowercase=config.lowercase
            )

    def process_source(self, text: str) -> str:
        """Process German source text."""
        return self.src_preprocessor.normalize(text)

    def process_target(self, text: str) -> str:
        """Process English target text."""
        return self.tgt_preprocessor.normalize(text)

    def process_pair(
        self,
        source: str,
        target: str
    ) -> Tuple[str, str]:
        """
        Process a source-target pair.

        Returns:
            Tuple of (processed_source, processed_target)
        """
        return self.process_source(source), self.process_target(target)

    def is_valid_pair(
        self,
        source: str,
        target: str
    ) -> bool:
        """
        Check if pair passes filtering criteria.

        A pair is valid when both sides are non-empty and their
        whitespace-token counts lie within
        [config.min_length, config.max_length].
        """
        shortest = self.config.min_length
        longest = self.config.max_length

        for sentence in (source, target):
            length = len(sentence.split())

            # Empty check (matters when min_length is 0)
            if length == 0:
                return False

            # Length checks
            if length < shortest or length > longest:
                return False

        return True

    def process_corpus(
        self,
        sources: List[str],
        targets: List[str],
        filter_invalid: bool = True
    ) -> Tuple[List[str], List[str]]:
        """
        Process entire parallel corpus.

        Args:
            sources: Source sentences
            targets: Target sentences
            filter_invalid: Whether to filter invalid pairs

        Returns:
            Tuple of (processed_sources, processed_targets)

        Raises:
            ValueError: If sources and targets differ in length
        """
        # zip() would silently drop the surplus sentences and hide a
        # misaligned corpus, so refuse it up front.
        if len(sources) != len(targets):
            raise ValueError(
                f"Parallel corpus is misaligned: {len(sources)} source "
                f"vs {len(targets)} target sentences"
            )

        processed_src = []
        processed_tgt = []
        filtered_count = 0

        for src, tgt in zip(sources, targets):
            # Process
            src_proc, tgt_proc = self.process_pair(src, tgt)

            # Filter if needed
            if filter_invalid and not self.is_valid_pair(src_proc, tgt_proc):
                filtered_count += 1
                continue

            processed_src.append(src_proc)
            processed_tgt.append(tgt_proc)

        if filtered_count > 0:
            print(f"Filtered {filtered_count} invalid pairs")

        return processed_src, processed_tgt


# 2.5 Saving Processed Data
Export Functions
πpython
1from pathlib import Path
2import json
3
4
def save_processed_data(
    sources: List[str],
    targets: List[str],
    output_dir: str,
    prefix: str = "processed"
):
    """
    Save processed data to files.

    Writes one sentence per line to "<prefix>.de" (sources) and
    "<prefix>.en" (targets) inside output_dir, creating the directory
    if needed.

    Args:
        sources: Processed source sentences
        targets: Processed target sentences
        output_dir: Output directory (anything Path() accepts)
        prefix: File prefix

    Raises:
        ValueError: If sources and targets differ in length
    """
    # The two files are aligned line by line, so unequal lengths would
    # produce a corrupt parallel corpus.
    if len(sources) != len(targets):
        raise ValueError(
            f"Cannot save misaligned data: {len(sources)} source "
            f"vs {len(targets)} target sentences"
        )

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save as separate files
    src_path = output_path / f"{prefix}.de"
    tgt_path = output_path / f"{prefix}.en"

    for path, sentences in ((src_path, sources), (tgt_path, targets)):
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in sentences)

    print(f"Saved {len(sources)} pairs to {output_dir}/")
    print(f" Source: {src_path}")
    print(f" Target: {tgt_path}")


# Complete Workflow
πpython
1# preprocess_multi30k.py
2
3from pathlib import Path
4
def main():
    """
    Preprocess every Multi30k split and write the results to disk.

    Reads "<split>.de" / "<split>.en" from data/multi30k, runs the
    preprocessing pipeline, and saves the processed pairs plus the
    configuration to data/processed.

    NOTE(review): load_parallel_data and save_processing_config are not
    defined in this section; they are assumed to be provided elsewhere.
    """
    # Paths
    data_dir = Path("data/multi30k")
    output_dir = Path("data/processed")

    # Configuration
    config = PreprocessingConfig(
        lowercase=False,
        min_length=2,
        max_length=100
    )

    preprocessor = Multi30kPreprocessor(config)

    # Process each split
    splits = ['train', 'val', 'test_2016_flickr']

    for split in splits:
        print(f"Processing {split}...")

        # Load
        src_file = data_dir / f"{split}.de"
        tgt_file = data_dir / f"{split}.en"

        sources, targets = load_parallel_data(src_file, tgt_file)

        # Process
        proc_src, proc_tgt = preprocessor.process_corpus(
            sources, targets
        )

        # Save
        save_processed_data(
            proc_src, proc_tgt,
            output_dir,
            prefix=split
        )

    # Save config
    save_processing_config(config, output_dir / "config.json")

    print("Preprocessing complete!")


if __name__ == "__main__":
    main()


# 2.6 Data Statistics After Preprocessing
Analyzing Processed Data
πpython
def analyze_preprocessing_results(
    raw_sources: List[str],
    raw_targets: List[str],
    proc_sources: List[str],
    proc_targets: List[str]
):
    """
    Compare raw and processed data statistics.

    Prints pair counts, average sentence lengths in whitespace tokens,
    source character totals and up to three sample sentences.

    Args:
        raw_sources: Source sentences before preprocessing
        raw_targets: Target sentences before preprocessing
        proc_sources: Source sentences after preprocessing
        proc_targets: Target sentences after preprocessing
    """
    print("\nPreprocessing Analysis")
    print("=" * 60)

    # Basic counts
    print(f"\nPair counts:")
    print(f" Raw: {len(raw_sources):,}")
    print(f" Processed: {len(proc_sources):,}")
    print(f" Filtered: {len(raw_sources) - len(proc_sources):,}")

    # Average lengths
    def avg_len(sentences):
        # An empty list (e.g. everything was filtered) averages to 0
        # instead of dividing by zero.
        if not sentences:
            return 0.0
        return sum(len(s.split()) for s in sentences) / len(sentences)

    print(f"\nAverage lengths (words):")
    print(f" Raw source: {avg_len(raw_sources):.1f}")
    print(f" Processed source: {avg_len(proc_sources):.1f}")
    print(f" Raw target: {avg_len(raw_targets):.1f}")
    print(f" Processed target: {avg_len(proc_targets):.1f}")

    # Character changes
    def total_chars(sentences):
        return sum(len(s) for s in sentences)

    raw_src_chars = total_chars(raw_sources)
    proc_src_chars = total_chars(proc_sources)

    print(f"\nCharacter counts (source):")
    print(f" Raw: {raw_src_chars:,}")
    print(f" Processed: {proc_src_chars:,}")

    # Sample comparison
    print(f"\nSample comparisons:")
    print("-" * 60)

    # Bound by the shortest list so that filtered (shorter) processed
    # data cannot cause an IndexError.
    # NOTE: rows are paired by position; once pairs have been filtered
    # out, row i of the processed data need not come from raw row i.
    sample_count = min(
        3,
        len(raw_sources), len(raw_targets),
        len(proc_sources), len(proc_targets),
    )

    for i in range(sample_count):
        print(f"\n[{i+1}] Raw DE: {raw_sources[i]}")
        print(f" Proc DE: {proc_sources[i]}")
        print(f" Raw EN: {raw_targets[i]}")
        print(f" Proc EN: {proc_targets[i]}")


# Summary
| Step | Purpose |
|---|---|
| Unicode normalization | Consistent character encoding |
| Whitespace normalization | Single spaces |
| Punctuation separation | Separate tokens |
| Language-specific | Handle contractions, umlauts |
| Filtering | Remove invalid pairs |
Configuration for Multi30k
πpython
# Recommended settings for Multi30k
config = PreprocessingConfig(
    lowercase=False,          # Preserve case (helps with names)
    normalize_umlauts=False,  # Keep German characters
    min_length=2,
    max_length=100,
)


# Key Considerations
- Keep preprocessing consistent across train/val/test
- Save configuration for reproducibility
- Don't over-process - subword tokenization handles most complexity
- Filter carefully - don't remove valid short sentences
Exercises
Implementation
- Add number normalization (e.g., "1,000" → "1000").
- Implement URL and email detection/handling.
- Create a preprocessing report generator.
Analysis
- Compare BLEU scores with different preprocessing settings.
- Find examples where preprocessing improves/hurts translation.
Next Section Preview
In the next section, we'll cover Vocabulary and Tokenizer Setup—building the subword tokenizer for our translation model.