Introduction
Autonomous agents improve their outputs through iterative refinement — cycles of self-critique and revision. This capability lets agents produce higher-quality results by continuously evaluating and improving their own work.
Section Overview: We'll explore self-critique patterns, revision loops, convergence detection, and quality gates for iterative refinement in autonomous agents.
Self-Critique Patterns
Basic Self-Critique
🐍python
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from pydantic import BaseModel, Field
from typing import List, Optional


class Critique(BaseModel):
    """Structured critique output returned by the self-critic."""
    strengths: List[str] = Field(description="What works well")
    weaknesses: List[str] = Field(description="What needs improvement")
    suggestions: List[str] = Field(description="Specific improvements")
    # ge/le constrain the model's score to the [0, 1] range at parse time.
    score: float = Field(ge=0, le=1, description="Quality score")
    needs_revision: bool = Field(description="Whether revision is needed")
class SelfCritic:
    """Self-critique system for agent outputs.

    Uses structured output so every critique parses into the `Critique`
    model rather than free-form text.
    """

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
        # Bind the Critique schema so responses are validated/typed.
        self.structured_llm = self.llm.with_structured_output(Critique)

    def critique(
        self,
        output: str,
        criteria: List[str],
        goal: str
    ) -> Critique:
        """Generate a structured critique of `output`.

        Args:
            output: The text to evaluate.
            criteria: Evaluation criteria, rendered as a bullet list.
            goal: What the output was supposed to achieve.

        Returns:
            A `Critique` with strengths, weaknesses, suggestions, a
            0-1 score, and a revision flag.
        """
        criteria_str = "\n".join(f"- {c}" for c in criteria)

        prompt = f"""
Critically evaluate this output:

Goal: {goal}

Output to evaluate:
{output}

Evaluation criteria:
{criteria_str}

Be thorough but fair. Identify both strengths and weaknesses.
"""
        messages = [
            SystemMessage(content="You are a critical evaluator."),
            HumanMessage(content=prompt)
        ]

        return self.structured_llm.invoke(messages)
54# Usage
55critic = SelfCritic()
56critique = critic.critique(
57 output="AI agents are computer programs that can work autonomously.",
58 criteria=[
59 "Technical accuracy",
60 "Depth of explanation",
61 "Clarity and readability",
62 "Actionable insights"
63 ],
64 goal="Explain AI agents to a technical audience"
65)
66
67print(f"Score: {critique.score}")
68print(f"Needs revision: {critique.needs_revision}")
69print(f"Weaknesses: {critique.weaknesses}")Multi-Perspective Critique
🐍python
class MultiPerspectiveCritic:
    """Critique content from multiple professional perspectives."""

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)

    def critique_from_perspectives(
        self,
        output: str,
        perspectives: List[dict]
    ) -> List[dict]:
        """Get critiques from multiple perspectives.

        Args:
            output: The text to evaluate.
            perspectives: Dicts with "role" and "focus" keys.

        Returns:
            A list of {"perspective", "feedback"} dicts. (The previous
            annotation of List[Critique] was wrong: this method returns
            plain dicts, not Critique models.)
        """
        critiques = []

        for perspective in perspectives:
            role = perspective["role"]
            focus = perspective["focus"]

            prompt = f"""
You are a {role} evaluating this output.
Focus on: {focus}

Output:
{output}

Provide your critique from your professional perspective.
"""
            messages = [
                SystemMessage(content=f"You are a {role}."),
                HumanMessage(content=prompt)
            ]

            response = self.llm.invoke(messages)
            critiques.append({
                "perspective": role,
                "feedback": response.content
            })

        return critiques

    def synthesize_critiques(self, critiques: List[dict]) -> dict:
        """Combine per-perspective critiques into one prioritized plan."""
        all_feedback = "\n\n".join([
            f"{c['perspective']}: {c['feedback']}"
            for c in critiques
        ])

        prompt = f"""
Synthesize these critiques from different perspectives:

{all_feedback}

Create a prioritized list of improvements.
"""
        messages = [
            SystemMessage(content="Synthesize feedback into improvements."),
            HumanMessage(content=prompt)
        ]

        response = self.llm.invoke(messages)
        return {"synthesis": response.content}
64
65# Usage
66critic = MultiPerspectiveCritic()
67critiques = critic.critique_from_perspectives(
68 output="Technical document about AI agents...",
69 perspectives=[
70 {"role": "Technical Accuracy Reviewer", "focus": "correctness"},
71 {"role": "UX Writer", "focus": "clarity and readability"},
72 {"role": "Subject Matter Expert", "focus": "depth and completeness"}
73 ]
74)Revision Loops
Iterative Revision System
🐍python
from dataclasses import dataclass
from typing import List, Callable


@dataclass
class RevisionResult:
    """Result of a single revision-loop iteration."""
    content: str                  # the version of the content that was critiqued
    iteration: int                # zero-based iteration index
    score: float                  # critique score awarded to `content`
    improvements_made: List[str]  # suggestions to apply in the next revision
    remaining_issues: List[str]   # weaknesses identified by the critique
class RevisionLoop:
    """Iterative critique-and-revise loop.

    Critiques the content, revises it, and repeats until the target
    score is reached or `max_iterations` runs out. Every critiqued
    version is recorded in `history` with the score it received.
    """

    def __init__(
        self,
        max_iterations: int = 5,
        target_score: float = 0.85
    ):
        self.max_iterations = max_iterations
        self.target_score = target_score
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
        self.critic = SelfCritic()
        self.history: List[RevisionResult] = []

    def run(
        self,
        initial_content: str,
        goal: str,
        criteria: List[str]
    ) -> RevisionResult:
        """Run the revision loop until convergence.

        Returns the first version meeting `target_score`, or the
        highest-scoring critiqued version once iterations are exhausted.
        """
        content = initial_content

        for iteration in range(self.max_iterations):
            critique = self.critic.critique(content, criteria, goal)

            # BUGFIX: record each score with the content that was actually
            # critiqued. The previous version stored the *revised* text
            # alongside the *pre-revision* score, so history entries were
            # inconsistent and the best-version lookup could return
            # content that never earned its recorded score.
            result = RevisionResult(
                content=content,
                iteration=iteration,
                score=critique.score,
                improvements_made=critique.suggestions,
                remaining_issues=critique.weaknesses
            )
            self.history.append(result)

            # Stop early once the quality target is met.
            if critique.score >= self.target_score:
                return result

            content = self._revise(content, critique, goal)

        # Out of iterations: return the best critiqued version seen.
        return max(self.history, key=lambda r: r.score)

    def _revise(
        self,
        content: str,
        critique: Critique,
        goal: str
    ) -> str:
        """Generate a revised version addressing the critique's issues."""
        prompt = f"""
Revise this content based on the critique:

Original goal: {goal}

Current content:
{content}

Critique:
Weaknesses: {critique.weaknesses}
Suggestions: {critique.suggestions}

Create an improved version addressing the issues.
"""
        messages = [
            SystemMessage(content="Revise content based on feedback."),
            HumanMessage(content=prompt)
        ]

        response = self.llm.invoke(messages)
        return response.content
106# Usage
107loop = RevisionLoop(max_iterations=3, target_score=0.8)
108result = loop.run(
109 initial_content="AI agents are autonomous programs.",
110 goal="Comprehensive explanation of AI agents",
111 criteria=["Accuracy", "Depth", "Clarity"]
112)
113
114print(f"Final score: {result.score}")
115print(f"Iterations: {result.iteration}")Progressive Refinement
🐍python
1class ProgressiveRefiner:
2 """Refine content through progressive stages."""
3
4 def __init__(self):
5 self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
6 self.stages = [
7 self._stage_structure,
8 self._stage_content,
9 self._stage_clarity,
10 self._stage_polish
11 ]
12
13 def refine(self, draft: str, goal: str) -> str:
14 """Progressively refine through all stages."""
15 content = draft
16
17 for i, stage in enumerate(self.stages):
18 print(f"Stage {i+1}: {stage.__name__}")
19 content = stage(content, goal)
20
21 return content
22
23 def _stage_structure(self, content: str, goal: str) -> str:
24 """Improve overall structure and organization."""
25 prompt = f"""
26Focus on STRUCTURE only:
27- Improve organization
28- Add clear sections if needed
29- Ensure logical flow
30
31Goal: {goal}
32
33Content:
34{content}
35
36Output the restructured content.
37"""
38 return self._invoke(prompt)
39
40 def _stage_content(self, content: str, goal: str) -> str:
41 """Enhance content depth and accuracy."""
42 prompt = f"""
43Focus on CONTENT only:
44- Add missing information
45- Verify accuracy
46- Include examples
47
48Goal: {goal}
49
50Content:
51{content}
52
53Output the enhanced content.
54"""
55 return self._invoke(prompt)
56
57 def _stage_clarity(self, content: str, goal: str) -> str:
58 """Improve clarity and readability."""
59 prompt = f"""
60Focus on CLARITY only:
61- Simplify complex sentences
62- Define technical terms
63- Use active voice
64
65Goal: {goal}
66
67Content:
68{content}
69
70Output the clarified content.
71"""
72 return self._invoke(prompt)
73
74 def _stage_polish(self, content: str, goal: str) -> str:
75 """Final polish and consistency."""
76 prompt = f"""
77Focus on POLISH only:
78- Fix grammar and spelling
79- Ensure consistency
80- Perfect formatting
81
82Goal: {goal}
83
84Content:
85{content}
86
87Output the polished content.
88"""
89 return self._invoke(prompt)
90
91 def _invoke(self, prompt: str) -> str:
92 messages = [
93 SystemMessage(content="Refine the content as directed."),
94 HumanMessage(content=prompt)
95 ]
96 return self.llm.invoke(messages).contentConvergence Detection
Detecting When to Stop
🐍python
1from dataclasses import dataclass
2from typing import List
3import numpy as np
4
5
6@dataclass
7class ConvergenceMetrics:
8 """Metrics for convergence detection."""
9 score_history: List[float]
10 content_similarity: float
11 improvement_rate: float
12 is_converged: bool
13 reason: str
14
15
16class ConvergenceDetector:
17 """Detect when iterative refinement should stop."""
18
19 def __init__(
20 self,
21 min_improvement: float = 0.02,
22 plateau_window: int = 3,
23 similarity_threshold: float = 0.95
24 ):
25 self.min_improvement = min_improvement
26 self.plateau_window = plateau_window
27 self.similarity_threshold = similarity_threshold
28 self.score_history: List[float] = []
29 self.content_history: List[str] = []
30
31 def check(self, score: float, content: str) -> ConvergenceMetrics:
32 """Check if refinement has converged."""
33 self.score_history.append(score)
34 self.content_history.append(content)
35
36 # Not enough history
37 if len(self.score_history) < 2:
38 return ConvergenceMetrics(
39 score_history=self.score_history,
40 content_similarity=0.0,
41 improvement_rate=1.0,
42 is_converged=False,
43 reason="Not enough iterations"
44 )
45
46 # Calculate metrics
47 improvement = self._calculate_improvement()
48 similarity = self._calculate_similarity()
49
50 # Check convergence conditions
51 is_converged, reason = self._check_conditions(improvement, similarity)
52
53 return ConvergenceMetrics(
54 score_history=self.score_history,
55 content_similarity=similarity,
56 improvement_rate=improvement,
57 is_converged=is_converged,
58 reason=reason
59 )
60
61 def _calculate_improvement(self) -> float:
62 """Calculate recent improvement rate."""
63 if len(self.score_history) < 2:
64 return 1.0
65
66 recent = self.score_history[-self.plateau_window:]
67 if len(recent) < 2:
68 return abs(self.score_history[-1] - self.score_history[-2])
69
70 # Average improvement over window
71 improvements = [
72 recent[i] - recent[i-1]
73 for i in range(1, len(recent))
74 ]
75 return np.mean(improvements)
76
77 def _calculate_similarity(self) -> float:
78 """Calculate similarity between last two versions."""
79 if len(self.content_history) < 2:
80 return 0.0
81
82 # Simple Jaccard similarity
83 prev_words = set(self.content_history[-2].lower().split())
84 curr_words = set(self.content_history[-1].lower().split())
85
86 intersection = len(prev_words & curr_words)
87 union = len(prev_words | curr_words)
88
89 return intersection / union if union > 0 else 0.0
90
91 def _check_conditions(
92 self,
93 improvement: float,
94 similarity: float
95 ) -> tuple[bool, str]:
96 """Check all convergence conditions."""
97
98 # Condition 1: Score plateau
99 if improvement < self.min_improvement:
100 return True, f"Score plateau (improvement: {improvement:.4f})"
101
102 # Condition 2: High content similarity
103 if similarity > self.similarity_threshold:
104 return True, f"Content converged (similarity: {similarity:.2f})"
105
106 # Condition 3: Perfect score
107 if self.score_history[-1] >= 0.99:
108 return True, "Perfect score achieved"
109
110 # Condition 4: Score regression
111 if len(self.score_history) >= 3:
112 if self.score_history[-1] < self.score_history[-2] < self.score_history[-3]:
113 return True, "Score regression detected"
114
115 return False, "Continuing refinement"Quality Gates
Multi-Criteria Quality Gates
🐍python
from dataclasses import dataclass
from typing import Callable, Dict, List


@dataclass
class QualityGate:
    """A single quality gate."""
    name: str
    check: Callable[[str], float]  # scoring function returning 0-1
    threshold: float               # minimum score required to pass
    required: bool = True          # required gates block overall approval


@dataclass
class GateResult:
    """Result of one quality gate check."""
    gate_name: str
    score: float
    passed: bool
    threshold: float


class QualityGateSystem:
    """System of quality gates for content validation.

    Fixes vs. the previous version: `List` is now imported (it was used
    without an import, a NameError), and the unused `self.llm`
    attribute was removed (it referenced ChatOpenAI, which this snippet
    never imports or calls).
    """

    def __init__(self):
        self.gates: List[QualityGate] = []

    def add_gate(self, gate: QualityGate):
        """Register a quality gate."""
        self.gates.append(gate)

    def check_all(self, content: str) -> Dict:
        """Run every gate against `content`.

        Returns a dict with "passed" (overall verdict — only *required*
        gates can fail it), "results" (per-gate GateResult list), and
        "summary" (human-readable pass count).
        """
        results = []
        all_passed = True

        for gate in self.gates:
            score = gate.check(content)
            passed = score >= gate.threshold

            # Optional gates are reported but never block approval.
            if gate.required and not passed:
                all_passed = False

            results.append(GateResult(
                gate_name=gate.name,
                score=score,
                passed=passed,
                threshold=gate.threshold
            ))

        return {
            "passed": all_passed,
            "results": results,
            "summary": self._summarize(results)
        }

    def _summarize(self, results: List[GateResult]) -> str:
        """One-line pass/total summary."""
        passed = sum(1 for r in results if r.passed)
        total = len(results)
        return f"{passed}/{total} gates passed"
# Example quality gates
def accuracy_check(content: str) -> float:
    """LLM-judged factual-accuracy score, always within [0, 1].

    Returns a neutral 0.5 when the model's reply is not a bare number;
    numeric replies are clamped so an out-of-range answer cannot break
    the gate's 0-1 contract.
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    messages = [
        SystemMessage(content="Rate the factual accuracy from 0-1."),
        HumanMessage(content=f"Content: {content}")
    ]
    response = llm.invoke(messages)
    try:
        return min(1.0, max(0.0, float(response.content)))
    except ValueError:
        return 0.5
def readability_check(content: str) -> float:
    """Heuristic readability score from average sentence length.

    ~15-20 words per sentence scores 1.0; very short or very long
    averages score 0.5; anything in between scores 0.75.

    Fixes vs. the previous version: empty "sentences" (e.g. the split
    artifact after a trailing period) no longer drag the average down,
    and text with no sentences returns a neutral 0.5 instead of
    counting a single empty sentence.
    """
    sentences = [s for s in content.split('.') if s.strip()]
    if not sentences:
        return 0.5  # nothing measurable
    avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
    # Ideal: 15-20 words per sentence
    if 15 <= avg_length <= 20:
        return 1.0
    elif avg_length < 10 or avg_length > 30:
        return 0.5
    return 0.75
def completeness_check(content: str) -> float:
    """LLM-judged completeness score, always within [0, 1].

    Returns a neutral 0.5 when the model's reply is not a bare number;
    numeric replies are clamped to keep the gate's 0-1 contract.
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    messages = [
        SystemMessage(content="Rate completeness from 0-1 (covers all key points)."),
        HumanMessage(content=f"Content: {content}")
    ]
    response = llm.invoke(messages)
    try:
        return min(1.0, max(0.0, float(response.content)))
    except ValueError:
        return 0.5
108# Usage
109system = QualityGateSystem()
110system.add_gate(QualityGate("Accuracy", accuracy_check, 0.8, required=True))
111system.add_gate(QualityGate("Readability", readability_check, 0.7))
112system.add_gate(QualityGate("Completeness", completeness_check, 0.8, required=True))
113
114result = system.check_all("Content to evaluate...")
115print(f"Passed: {result['passed']}")
116print(f"Summary: {result['summary']}")Key Takeaways
- Self-critique enables agents to identify weaknesses and generate specific improvement suggestions.
- Multi-perspective critique provides more comprehensive feedback by evaluating from different viewpoints.
- Revision loops systematically improve content until quality targets are met.
- Convergence detection prevents unnecessary iterations when improvement plateaus.
- Quality gates ensure outputs meet required standards before proceeding.
Next Section Preview: We'll examine the limitations and challenges of autonomous agents.