Introduction
While BLEU is the most common metric, other metrics capture different aspects of translation quality. This section covers ChrF, TER, and introduces METEOR concepts. Each metric has strengths for different scenarios.
ChrF - Character F-Score
Why Character-Level?
📝text
1Problem with word-level BLEU for German:
2
3Reference: "Bundesausbildungsförderungsgesetz" (1 word)
4Hypothesis: "Bundes ausbildungs förderungs gesetz" (4 words)
5
6Word-level BLEU: 0% match (different tokenization!)
7Character-level: ~95% overlap
8
9German, Finnish, Turkish, and other morphologically
10rich languages benefit from character-level metrics.
ChrF Implementation
🐍python
1from collections import Counter
2from typing import List, Dict, Tuple
3import math
4
5
def extract_char_ngrams(text: str, n: int) -> Counter:
    """
    Count the character n-grams of ``text``.

    Every window of ``n`` consecutive characters is counted. Spaces are
    kept, so windows that span a word boundary contribute n-grams too.

    Args:
        text: Input text
        n: N-gram order (number of characters per n-gram)

    Returns:
        Counter mapping each n-gram to its number of occurrences. Empty
        when ``text`` is shorter than ``n`` or when ``n`` is not positive.
    """
    if n < 1:
        # A non-positive order has no meaningful n-grams; without this guard
        # n == 0 would count len(text) + 1 empty strings.
        return Counter()

    # range() is empty when the text is shorter than n, giving an empty Counter.
    return Counter(text[i:i + n] for i in range(len(text) - n + 1))
27
28
def chrf_precision_recall(
    hypothesis: str,
    reference: str,
    n: int
) -> Tuple[float, float]:
    """
    Compute character n-gram precision and recall.

    Matches are clipped: an n-gram occurring k times in the hypothesis is
    credited at most as many times as it occurs in the reference.

    Args:
        hypothesis: Hypothesis text
        reference: Reference text
        n: N-gram order

    Returns:
        (precision, recall). A side that has no n-grams of this order
        contributes 0.0 for its ratio.
    """
    hyp_ngrams = extract_char_ngrams(hypothesis, n)
    ref_ngrams = extract_char_ngrams(reference, n)

    # Counter intersection keeps min(count_hyp, count_ref) per n-gram,
    # which is exactly the clipped match count.
    matches = sum((hyp_ngrams & ref_ngrams).values())

    hyp_total = sum(hyp_ngrams.values())
    ref_total = sum(ref_ngrams.values())

    precision = matches / hyp_total if hyp_total > 0 else 0.0
    recall = matches / ref_total if ref_total > 0 else 0.0

    return precision, recall
60
61
def f_score(precision: float, recall: float, beta: float = 1.0) -> float:
    """
    Compute F-score from precision and recall.

    F_β = (1 + β²) × (precision × recall) / (β² × precision + recall)

    Args:
        precision: Precision value
        recall: Recall value
        beta: Beta parameter (1.0 = balanced, 2.0 = recall-weighted)

    Returns:
        F-score, or 0.0 whenever the formula's denominator is zero
        (both inputs zero, or beta == 0 together with recall == 0).
    """
    beta_sq = beta ** 2
    denominator = beta_sq * precision + recall

    # Checking the denominator itself (not only precision + recall) also
    # covers beta == 0 with recall == 0, which would otherwise divide by zero.
    if precision + recall == 0 or denominator == 0:
        return 0.0

    return (1 + beta_sq) * precision * recall / denominator
81
82
class ChrFScore:
    """
    Character n-gram F-score (ChrF) metric.

    ChrF is particularly good for:
    - Morphologically rich languages (German, Finnish, etc.)
    - Languages with no clear word boundaries
    - Capturing partial word matches

    Formula:
        ChrF_β = F_β of character n-grams, averaged over the n-gram orders
        (1 to max_n) that both texts are long enough to contain.

    Orders longer than either text are skipped instead of being scored
    as 0, so short inputs are not penalised for n-grams they cannot have
    (e.g. identical two-character strings score 1.0).

    Args:
        max_n: Maximum character n-gram order (default: 6)
        beta: Beta for F-score (default: 2.0; values above 1 weight
            recall more heavily than precision)

    Example:
        >>> chrf = ChrFScore()
        >>> score = chrf.sentence_score("the cat sat", "the cat sits")
    """

    def __init__(self, max_n: int = 6, beta: float = 2.0):
        self.max_n = max_n
        self.beta = beta

        # Accumulators for corpus-level
        self.reset()

    def reset(self):
        """Reset accumulators (one precision list and one recall list per order)."""
        self.precisions = [[] for _ in range(self.max_n)]
        self.recalls = [[] for _ in range(self.max_n)]

    def _usable_orders(self, hypothesis: str, reference: str) -> range:
        """Return the n-gram orders for which both texts have at least one n-gram."""
        highest = min(len(hypothesis), len(reference), self.max_n)
        return range(1, highest + 1)

    def sentence_score(
        self,
        hypothesis: str,
        reference: str
    ) -> float:
        """
        Compute ChrF for a single sentence.

        Args:
            hypothesis: Hypothesis text
            reference: Reference text

        Returns:
            ChrF score (0 to 1); 0.0 when no n-gram order is usable
            (for example when either text is empty).
        """
        f_scores = []

        for n in self._usable_orders(hypothesis, reference):
            precision, recall = chrf_precision_recall(hypothesis, reference, n)
            f_scores.append(f_score(precision, recall, self.beta))

        # Average F-scores across the usable n-gram orders
        return sum(f_scores) / len(f_scores) if f_scores else 0.0

    def add(self, hypothesis: str, reference: str):
        """
        Add a sentence pair for corpus-level computation.

        Only orders that both texts can contain are recorded, so a short
        sentence does not inject zeros into higher-order averages.
        """
        for n in self._usable_orders(hypothesis, reference):
            precision, recall = chrf_precision_recall(hypothesis, reference, n)
            self.precisions[n - 1].append(precision)
            self.recalls[n - 1].append(recall)

    def corpus_score(self) -> Dict[str, float]:
        """
        Compute corpus-level ChrF.

        Per order, the recorded sentence-level precisions and recalls are
        averaged and combined into one F-score; 'chrf' is the mean of those
        F-scores over the orders that have at least one recorded sentence.

        Returns:
            Dictionary with:
              'chrf': corpus score (0.0 if nothing was recorded)
              'f_scores': per-order F-scores, always max_n entries
                  (0.0 for orders without data)
              'beta', 'max_n': the configuration used
        """
        avg_f_scores = []
        orders_with_data = 0

        for n in range(self.max_n):
            precisions = self.precisions[n]
            recalls = self.recalls[n]

            if not precisions:
                # No sentence was long enough for this order.
                avg_f_scores.append(0.0)
                continue

            avg_p = sum(precisions) / len(precisions)
            avg_r = sum(recalls) / len(recalls)
            avg_f_scores.append(f_score(avg_p, avg_r, self.beta))
            orders_with_data += 1

        return {
            'chrf': sum(avg_f_scores) / orders_with_data if orders_with_data else 0.0,
            'f_scores': avg_f_scores,
            'beta': self.beta,
            'max_n': self.max_n,
        }
170
def test_chrf():
    """
    Test ChrF implementation.

    Prints a table of sentence-level ChrF scores for a few hypothesis /
    reference pairs and checks basic sanity properties: an exact match
    scores 1.0, and unrelated text scores below a near match.
    """
    print("ChrF Score Test")
    print("=" * 60)

    chrf = ChrFScore(max_n=6, beta=2.0)

    # Test cases: (hypothesis, reference, note)
    test_cases = [
        ("the cat sat on the mat", "the cat sat on the mat", "Exact match"),
        ("the cat sat on the mat", "the cat sits on the mat", "One word different"),
        ("Bundesausbildungsförderungsgesetz", "Bundesausbildungsförderung", "German compound"),
        ("hello world", "goodbye moon", "Completely different"),
    ]

    print(f"{'Hypothesis':<35} {'Reference':<35} {'ChrF':<10}")
    print("-" * 85)

    scores = {}
    for hyp, ref, note in test_cases:
        score = chrf.sentence_score(hyp, ref)
        scores[note] = score
        # Truncate long strings so they fit the 35-character columns.
        hyp_display = hyp[:32] + "..." if len(hyp) > 35 else hyp
        ref_display = ref[:32] + "..." if len(ref) > 35 else ref
        print(f"{hyp_display:<35} {ref_display:<35} {score:.4f}")

    # Back the "passed" message with actual checks.
    assert abs(scores["Exact match"] - 1.0) < 1e-9
    assert scores["Completely different"] < scores["One word different"] < 1.0

    print("\n ChrF test passed!")
199
200test_chrf()
ChrF++ (With Word N-grams)
Combining Character and Word Level
ChrF++ adds word-level n-grams to ChrF, providing benefits of both character and word-level matching.
🐍python
class ChrFPlusPlus:
    """
    ChrF++ metric combining character and word n-grams.

    ChrF++ adds word-level n-grams to ChrF, providing
    benefits of both character and word-level matching.

    The score is the mean of the per-order F-scores over every character
    order and word order that both texts are long enough to contain.
    Orders neither side can form are skipped rather than scored as 0, so
    a one-word exact match is not penalised for having no word bigrams.

    Args:
        char_order: Max character n-gram order (default: 6)
        word_order: Max word n-gram order (default: 2)
        beta: Beta for F-score (default: 2.0)
    """

    def __init__(
        self,
        char_order: int = 6,
        word_order: int = 2,
        beta: float = 2.0
    ):
        self.char_order = char_order
        self.word_order = word_order
        self.beta = beta

    def sentence_score(
        self,
        hypothesis: str,
        reference: str
    ) -> float:
        """
        Compute ChrF++ for a single sentence.

        Args:
            hypothesis: Hypothesis text
            reference: Reference text

        Returns:
            Mean F-score over the usable character and word orders;
            0.0 when no order is usable (e.g. either text is empty).
        """
        f_scores = []

        # Character n-grams: only orders both strings are long enough for
        max_char = min(len(hypothesis), len(reference), self.char_order)
        for n in range(1, max_char + 1):
            p, r = chrf_precision_recall(hypothesis, reference, n)
            f_scores.append(f_score(p, r, self.beta))

        # Word n-grams (whitespace tokenisation), same restriction
        hyp_words = hypothesis.split()
        ref_words = reference.split()

        max_word = min(len(hyp_words), len(ref_words), self.word_order)
        for n in range(1, max_word + 1):
            p, r = self._word_precision_recall(hyp_words, ref_words, n)
            f_scores.append(f_score(p, r, self.beta))

        return sum(f_scores) / len(f_scores) if f_scores else 0.0

    def _word_precision_recall(
        self,
        hyp_words: List[str],
        ref_words: List[str],
        n: int
    ) -> Tuple[float, float]:
        """
        Compute word n-gram precision and recall with clipped matches.

        Args:
            hyp_words: Hypothesis tokens
            ref_words: Reference tokens
            n: Word n-gram order

        Returns:
            (precision, recall); a side without n-grams of this order
            contributes 0.0 for its ratio.
        """
        def count_ngrams(words: List[str]) -> Counter:
            # Tuples make the n-grams hashable Counter keys.
            return Counter(
                tuple(words[i:i + n]) for i in range(len(words) - n + 1)
            )

        hyp_ngrams = count_ngrams(hyp_words)
        ref_ngrams = count_ngrams(ref_words)

        # Counter intersection = per-n-gram min(count_hyp, count_ref)
        matches = sum((hyp_ngrams & ref_ngrams).values())

        hyp_total = sum(hyp_ngrams.values())
        ref_total = sum(ref_ngrams.values())

        precision = matches / hyp_total if hyp_total > 0 else 0.0
        recall = matches / ref_total if ref_total > 0 else 0.0

        return precision, recall
77
78
def test_chrfpp():
    """
    Test ChrF++ implementation.

    Prints sentence-level ChrF and ChrF++ side by side for a few
    hypothesis / reference pairs so the two metrics can be compared.
    """
    print("ChrF++ Test")
    print("=" * 60)

    chrf = ChrFScore()
    chrfpp = ChrFPlusPlus()

    # (hypothesis, reference) pairs
    examples = [
        ("The quick brown fox", "The quick brown fox"),
        ("The quick brown fox", "A quick brown fox"),
        ("completely different", "totally other words"),
    ]

    print(f"{'Hypothesis':<25} {'ChrF':<10} {'ChrF++':<10}")
    print("-" * 50)

    for hyp, ref in examples:
        chrf_score = chrf.sentence_score(hyp, ref)
        chrfpp_score = chrfpp.sentence_score(hyp, ref)
        # Pad scores to the header's column widths so the table lines up.
        print(f"{hyp:<25} {chrf_score:<10.4f} {chrfpp_score:<10.4f}")
102
103
104test_chrfpp()
TER - Translation Edit Rate
Measuring Edit Distance
📝text
1TER = Translation Edit Rate
2
3Measures minimum edits needed to change hypothesis into reference.
4
5Edit operations:
6 - Insert: Add a word
7 - Delete: Remove a word
8 - Substitute: Replace a word
9 - Shift: Move a sequence of words
10
11TER = (# of edits) / (# of reference words)
12
13Lower is better! (Unlike BLEU)
🐍python
def levenshtein_distance(
    hypothesis: List[str],
    reference: List[str]
) -> int:
    """
    Compute word-level Levenshtein distance.

    Counts the minimum number of single-token insertions, deletions and
    substitutions that turn ``hypothesis`` into ``reference``.

    Args:
        hypothesis: Hypothesis tokens
        reference: Reference tokens

    Returns:
        Minimum edit distance
    """
    # Only the previous DP row is needed to fill the current one, so keep
    # two rows instead of the full (m + 1) x (n + 1) table.
    # Row 0: turning an empty hypothesis into reference[:j] takes j inserts.
    previous = list(range(len(reference) + 1))

    for i, hyp_token in enumerate(hypothesis, start=1):
        # Column 0: turning hypothesis[:i] into an empty reference takes i deletes.
        current = [i]
        for j, ref_token in enumerate(reference, start=1):
            if hyp_token == ref_token:
                cost = previous[j - 1]
            else:
                cost = 1 + min(
                    previous[j],      # Delete
                    current[j - 1],   # Insert
                    previous[j - 1],  # Substitute
                )
            current.append(cost)
        previous = current

    return previous[-1]
39
40
class TERScore:
    """
    Translation Edit Rate (TER) metric.

    TER measures the minimum number of edits needed to
    change a hypothesis into the reference.

    Lower TER = Better translation

    Note: This is a simplified version without shift operations.
    Full TER includes phrase shifts.

    Args:
        normalize: Whether sentence_score() divides the edit count by the
            reference length. corpus_score() always reports a rate.

    Example:
        >>> ter = TERScore()
        >>> score = ter.sentence_score("the cat sat", "a cat sits")
    """

    def __init__(self, normalize: bool = True):
        self.normalize = normalize
        self.reset()

    def reset(self):
        """Reset accumulators."""
        self.total_edits = 0
        self.total_ref_length = 0

    @staticmethod
    def _edit_stats(hypothesis: str, reference: str) -> Tuple[int, int]:
        """Return (word-level edit count, reference token count) for one pair."""
        ref_tokens = reference.split()
        edits = levenshtein_distance(hypothesis.split(), ref_tokens)
        return edits, len(ref_tokens)

    def sentence_score(
        self,
        hypothesis: str,
        reference: str
    ) -> float:
        """
        Compute TER for a single sentence.

        Args:
            hypothesis: Hypothesis text
            reference: Reference text

        Returns:
            TER score (lower is better). The raw edit count is returned
            when normalisation is off or the reference has no tokens.
        """
        edits, ref_len = self._edit_stats(hypothesis, reference)

        if self.normalize and ref_len > 0:
            return edits / ref_len
        return float(edits)

    def add(self, hypothesis: str, reference: str):
        """Add sentence pair for corpus-level."""
        edits, ref_len = self._edit_stats(hypothesis, reference)

        self.total_edits += edits
        self.total_ref_length += ref_len

    def corpus_score(self) -> float:
        """
        Compute corpus-level TER: total edits / total reference tokens.

        Returns:
            The edit rate over everything added since the last reset().
            When no reference tokens were accumulated, the raw edit total
            is returned (0.0 for an empty accumulator), mirroring
            sentence_score(), so edits against empty references are not
            reported as a perfect score.
        """
        if self.total_ref_length == 0:
            return float(self.total_edits)
        return self.total_edits / self.total_ref_length
109
def test_ter():
    """
    Test TER implementation.

    Prints sentence-level TER for a few hypothesis / reference pairs and
    checks that an exact match scores 0 while every other pair scores
    above 0.
    """
    print("TER Score Test")
    print("=" * 60)

    ter = TERScore()

    # (hypothesis, reference, note)
    test_cases = [
        ("the cat sat on the mat", "the cat sat on the mat", "Exact"),
        ("the cat sat on the mat", "the cat sits on the mat", "1 substitution"),
        ("the cat on the mat", "the cat sat on the mat", "1 deletion"),
        ("cat sat mat", "the cat sat on the mat", "Missing words"),
        ("completely different sentence", "the cat sat", "Very different"),
    ]

    print(f"{'Hypothesis':<30} {'TER':<10} {'Note':<20}")
    print("-" * 65)

    scores = []
    for hyp, ref, note in test_cases:
        score = ter.sentence_score(hyp, ref)
        scores.append(score)
        # Truncate long hypotheses so they fit the 30-character column.
        hyp_display = hyp[:27] + "..." if len(hyp) > 30 else hyp
        # Pad the score to the header's 10-character column so Note lines up.
        print(f"{hyp_display:<30} {score:<10.4f} {note:<20}")

    # Back the "passed" message with actual checks.
    assert scores[0] == 0.0
    assert all(score > 0 for score in scores[1:])

    print("\n TER test passed!")
136
137
138test_ter()
METEOR Concepts
Beyond Simple N-gram Matching
📝text
1METEOR = Metric for Evaluation of Translation with
2 Explicit ORdering
3
4Key innovations over BLEU:
5────────────────────────
6
71. ALIGNMENT-BASED:
8 - Finds optimal word alignment between hyp and ref
9 - Uses multiple matching strategies
10
112. MULTIPLE MATCHERS:
12 - Exact match: "cat" = "cat"
13 - Stem match: "cats" = "cat" (via stemming)
14 - Synonym match: "cat" = "feline" (via WordNet)
15 - Paraphrase: "how are you" = "how's it going"
16
173. RECALL-ORIENTED:
18 - Weights recall more than precision
19 - F_mean = (10 × P × R) / (R + 9 × P)
20
214. FRAGMENTATION PENALTY:
22 - Penalizes translations with many "chunks"
23 - Encourages contiguous matching
METEOR Scoring
📝text
11. Compute alignment (maximize matches)
22. Calculate precision and recall
33. Compute harmonic mean (recall-weighted)
44. Apply fragmentation penalty
5
6Final score = F_mean × (1 - penalty)
7
8
9CHUNKS and FRAGMENTATION:
10────────────────────────
11
12Reference: "the cat sat on the mat"
13Hypothesis: "on the mat sat the cat"
14
15Matches: [on][the][mat][sat][the][cat]
16Chunks: 3 (non-contiguous groups)
17
18Penalty = γ × (chunks/matches)^β
19(Typically γ=0.5, β=3)
Why METEOR is Better (Sometimes)
📝text
1Reference: "The cat is sitting on the mat"
2Hypothesis: "A feline sits on the rug"
3
4BLEU: Low (few exact n-gram matches)
5METEOR: Higher (synonyms match: cat=feline, mat=rug)
6
7
8LIMITATIONS:
9────────────
10
11- Requires language resources (stemmer, WordNet)
12- Slower than BLEU
13- Primarily designed for English
14- Less standardized than BLEU
Metric Selection Guide
Decision Tree
📝text
1Q: What language pair?
2│
3├─ Morphologically rich (DE, FI, TR, etc.)
4│ └─→ Use ChrF or ChrF++ primarily
5│ Also report BLEU for comparability
6│
7├─ Agglutinative or unsegmented scripts (JA, KO, ZH)
8│ └─→ Use ChrF (handles character compounds)
9│ Consider language-specific metrics
10│
11└─ English / Romance languages
12 └─→ Use BLEU (standard)
13 Add METEOR if resources available
14
15
16Q: What's your use case?
17│
18├─ Research paper
19│ └─→ BLEU (for comparability) + ChrF/METEOR
20│ Report tokenization details!
21│
22├─ Development/debugging
23│ └─→ BLEU (fast, good enough)
24│
25├─ Final evaluation
26│ └─→ Multiple metrics + human eval if possible
27│ BLEU + ChrF + COMET (if available)
28│
29└─ Quality estimation (no refs)
30 └─→ COMET-QE, XMover, or similar
Metric Strengths
| Metric | Focus | Strengths | Weaknesses |
|---|---|---|---|
| BLEU | Word n-grams | Standard, fast | No synonyms |
| ChrF | Char n-grams | Morphology | May miss words |
| TER | Edit distance | Intuitive | No positive reward |
| METEOR | Alignment | Synonyms, stems | Slow, EN-centric |
Recommended Combinations
- Standard setup: BLEU + ChrF (complementary)
- For German/morphological: ChrF + BLEU (ChrF primary)
- For publication: BLEU + ChrF + COMET (comprehensive)
- Quick development: BLEU only (fast iteration)
Combined Evaluator
Multi-Metric Evaluation
A combined evaluator computes multiple metrics at once, providing comprehensive evaluation:
- BLEU (corpus-level)
- ChrF
- TER
- Sentence-level statistics
🐍python
class TranslationEvaluator:
    """
    Combined evaluator computing multiple metrics.

    Provides comprehensive evaluation with:
    - BLEU (corpus-level)
    - ChrF
    - TER
    - Sentence-level statistics

    Example:
        >>> evaluator = TranslationEvaluator()
        >>> results = evaluator.evaluate(hypotheses, references)
    """

    def __init__(self):
        self.bleu = None  # Will use our BLEUScore
        self.chrf = ChrFScore()
        self.ter = TERScore()

    def evaluate(
        self,
        hypotheses: List[str],
        references: List[List[str]]
    ) -> Dict:
        """
        Evaluate translations with multiple metrics.

        NOTE: the body is elided in this section; as written the method is
        a placeholder and returns None.

        Args:
            hypotheses: List of hypothesis strings
            references: List of reference lists

        Returns:
            Dictionary with all metrics
        """
        # Implementation details...
        pass

    def format_results(self, results: Dict) -> str:
        """
        Format results for display.

        Args:
            results: Mapping with the keys 'num_sentences', 'bleu', 'chrf',
                'ter' and 'precisions' (a sequence of BLEU n-gram
                precisions, lowest order first).

        Returns:
            Multi-line report string with one "<n>-gram" line per entry
            in results['precisions'].
        """
        rule = "=" * 50
        lines = [
            rule,
            "TRANSLATION EVALUATION RESULTS",
            rule,
            f"Sentences evaluated: {results['num_sentences']}",
            "",
            "Corpus-level scores:",
            f" BLEU: {results['bleu']:.2f}",
            f" ChrF: {results['chrf']:.2f}",
            f" TER: {results['ter']:.2f} (lower is better)",
            "",
            "BLEU precisions:",
        ]
        # One line per supplied precision instead of four hard-coded indices,
        # so shorter or longer precision lists are handled too.
        lines.extend(
            f" {order}-gram: {value:.1f}%"
            for order, value in enumerate(results['precisions'], start=1)
        )
        lines.append(rule)
        return "\n".join(lines)


# Summary
Metric Comparison
| Metric | Focus | Strengths | Weaknesses |
|---|---|---|---|
| BLEU | Word n-grams | Standard, fast | No synonyms |
| ChrF | Char n-grams | Morphology | May miss words |
| TER | Edit distance | Intuitive | No positive reward |
| METEOR | Alignment | Synonyms, stems | Slow, EN-centric |
Recommendations
- Always report BLEU for comparability
- Add ChrF for morphologically rich languages
- Use TER to understand error types
- Consider METEOR when synonym matching matters
Next Steps
In the final section, we'll put everything together in a Practical Evaluation Pipeline for evaluating our translation model.