Chapter 17
15 min read
Section 107 of 175

Iterative Refinement

AutoGPT and Autonomous Agents

Introduction

Autonomous agents improve their outputs through iterative refinement - cycles of self-critique and revision. This capability enables agents to produce higher-quality results by continuously evaluating and improving their work.

Section Overview: We'll explore self-critique patterns, revision loops, convergence detection, and quality gates for iterative refinement in autonomous agents.

Self-Critique Patterns

Basic Self-Critique

🐍python
1from langchain_openai import ChatOpenAI
2from langchain_core.messages import SystemMessage, HumanMessage
3from pydantic import BaseModel, Field
4from typing import List, Optional
5
6
class Critique(BaseModel):
    """Structured critique output produced by the evaluator LLM."""
    # Aspects of the evaluated output that already work well.
    strengths: List[str] = Field(description="What works well")
    # Problems a reviser should address.
    weaknesses: List[str] = Field(description="What needs improvement")
    # Concrete, actionable edit suggestions.
    suggestions: List[str] = Field(description="Specific improvements")
    # Overall quality in [0, 1]; range enforced by pydantic ge/le validators.
    score: float = Field(ge=0, le=1, description="Quality score")
    # Evaluator's verdict on whether another revision pass is warranted.
    needs_revision: bool = Field(description="Whether revision is needed")
14
15
class SelfCritic:
    """Self-critique system: asks an LLM to evaluate agent outputs.

    The LLM response is parsed into a structured ``Critique`` via
    ``with_structured_output``.
    """

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
        self.structured_llm = self.llm.with_structured_output(Critique)

    def critique(
        self,
        output: str,
        criteria: List[str],
        goal: str
    ) -> Critique:
        """Evaluate ``output`` against ``criteria`` in light of ``goal``."""
        bullet_list = "\n".join(f"- {item}" for item in criteria)

        # Assemble the evaluation prompt piece by piece.
        prompt = (
            "\nCritically evaluate this output:\n"
            f"\nGoal: {goal}\n"
            f"\nOutput to evaluate:\n{output}\n"
            f"\nEvaluation criteria:\n{bullet_list}\n"
            "\nBe thorough but fair. Identify both strengths and weaknesses.\n"
        )

        conversation = [
            SystemMessage(content="You are a critical evaluator."),
            HumanMessage(content=prompt),
        ]
        return self.structured_llm.invoke(conversation)
52
53
# Usage
# Evaluate a deliberately thin one-liner against four criteria; the
# structured result exposes score / needs_revision / weaknesses fields.
critic = SelfCritic()
critique = critic.critique(
    output="AI agents are computer programs that can work autonomously.",
    criteria=[
        "Technical accuracy",
        "Depth of explanation",
        "Clarity and readability",
        "Actionable insights"
    ],
    goal="Explain AI agents to a technical audience"
)

print(f"Score: {critique.score}")
print(f"Needs revision: {critique.needs_revision}")
print(f"Weaknesses: {critique.weaknesses}")

Multi-Perspective Critique

🐍python
class MultiPerspectiveCritic:
    """Critique an output from multiple reviewer perspectives.

    Each perspective is a dict with ``role`` (who is reviewing) and
    ``focus`` (what that reviewer should concentrate on).
    """

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)

    def critique_from_perspectives(
        self,
        output: str,
        perspectives: List[dict]
    ) -> List[dict]:
        """Get one critique per perspective.

        Returns a list of ``{"perspective": role, "feedback": text}`` dicts.
        (Fix: the previous ``List[Critique]`` return annotation was wrong —
        this method builds plain dicts, which is also the shape
        ``synthesize_critiques`` consumes.)
        """
        critiques = []

        for perspective in perspectives:
            role = perspective["role"]
            focus = perspective["focus"]

            prompt = f"""
You are a {role} evaluating this output.
Focus on: {focus}

Output:
{output}

Provide your critique from your professional perspective.
"""
            messages = [
                SystemMessage(content=f"You are a {role}."),
                HumanMessage(content=prompt)
            ]

            response = self.llm.invoke(messages)
            critiques.append({
                "perspective": role,
                "feedback": response.content
            })

        return critiques

    def synthesize_critiques(self, critiques: List[dict]) -> dict:
        """Combine per-perspective critiques into one prioritized list.

        Returns ``{"synthesis": <LLM text>}``.
        """
        all_feedback = "\n\n".join(
            f"{c['perspective']}: {c['feedback']}"
            for c in critiques
        )

        prompt = f"""
Synthesize these critiques from different perspectives:

{all_feedback}

Create a prioritized list of improvements.
"""
        messages = [
            SystemMessage(content="Synthesize feedback into improvements."),
            HumanMessage(content=prompt)
        ]

        response = self.llm.invoke(messages)
        return {"synthesis": response.content}
63
64
# Usage
# Gather feedback on the same document from three distinct reviewer roles.
critic = MultiPerspectiveCritic()
critiques = critic.critique_from_perspectives(
    output="Technical document about AI agents...",
    perspectives=[
        {"role": "Technical Accuracy Reviewer", "focus": "correctness"},
        {"role": "UX Writer", "focus": "clarity and readability"},
        {"role": "Subject Matter Expert", "focus": "depth and completeness"}
    ]
)

Revision Loops

Iterative Revision System

🐍python
1from dataclasses import dataclass
2from typing import List, Callable
3
4
@dataclass
class RevisionResult:
    """Result of a revision iteration."""
    content: str                   # content associated with this iteration
    iteration: int                 # zero-based iteration counter
    score: float                   # quality score reported by the critic this iteration
    improvements_made: List[str]   # critic suggestions recorded for this iteration
    remaining_issues: List[str]    # critic weaknesses recorded for this iteration
13
14
class RevisionLoop:
    """Iterative revision system.

    Alternates critique and revision until the critic's score reaches
    ``target_score`` or ``max_iterations`` critique passes have run.
    """

    def __init__(
        self,
        max_iterations: int = 5,
        target_score: float = 0.85
    ):
        self.max_iterations = max_iterations
        self.target_score = target_score
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
        self.critic = SelfCritic()
        self.history: List[RevisionResult] = []

    def run(
        self,
        initial_content: str,
        goal: str,
        criteria: List[str]
    ) -> RevisionResult:
        """Run the revision loop; return the best critiqued version.

        Bug fix: each ``RevisionResult`` now pairs a piece of content with
        the critique score *of that same content*. Previously the revised
        text was stored alongside the pre-revision score, so the final
        ``max(..., key=score)`` could return the wrong version, and the
        last revision was never the one the score described.
        """
        content = initial_content

        for iteration in range(self.max_iterations):
            # Critique the current version.
            critique = self.critic.critique(content, criteria, goal)

            result = RevisionResult(
                content=content,
                iteration=iteration,
                score=critique.score,
                improvements_made=critique.suggestions,
                remaining_issues=critique.weaknesses
            )
            self.history.append(result)

            # Stop as soon as the target quality is reached.
            if critique.score >= self.target_score:
                return result

            # Otherwise revise and go around again.
            content = self._revise(content, critique, goal)

        if not self.history:
            # max_iterations == 0: nothing was critiqued; return the input as-is.
            return RevisionResult(
                content=initial_content,
                iteration=0,
                score=0.0,
                improvements_made=[],
                remaining_issues=[]
            )

        # Target never reached: return the best-scoring critiqued version.
        return max(self.history, key=lambda r: r.score)

    def _revise(
        self,
        content: str,
        critique: Critique,
        goal: str
    ) -> str:
        """Generate a revised version of ``content`` based on ``critique``."""
        prompt = f"""
Revise this content based on the critique:

Original goal: {goal}

Current content:
{content}

Critique:
Weaknesses: {critique.weaknesses}
Suggestions: {critique.suggestions}

Create an improved version addressing the issues.
"""
        messages = [
            SystemMessage(content="Revise content based on feedback."),
            HumanMessage(content=prompt)
        ]

        response = self.llm.invoke(messages)
        return response.content
104
105
# Usage
# Revise a one-line draft for up to 3 iterations or until score >= 0.8.
loop = RevisionLoop(max_iterations=3, target_score=0.8)
result = loop.run(
    initial_content="AI agents are autonomous programs.",
    goal="Comprehensive explanation of AI agents",
    criteria=["Accuracy", "Depth", "Clarity"]
)

print(f"Final score: {result.score}")
print(f"Iterations: {result.iteration}")

Progressive Refinement

🐍python
class ProgressiveRefiner:
    """Refine content through fixed, ordered stages.

    Order matters: structure first, then content, then clarity, and
    finally surface polish.
    """

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
        self.stages = [
            self._stage_structure,
            self._stage_content,
            self._stage_clarity,
            self._stage_polish
        ]

    def refine(self, draft: str, goal: str) -> str:
        """Run ``draft`` through every refinement stage in order."""
        text = draft
        for number, stage in enumerate(self.stages, start=1):
            print(f"Stage {number}: {stage.__name__}")
            text = stage(text, goal)
        return text

    def _stage_structure(self, content: str, goal: str) -> str:
        """Stage 1: improve overall structure and organization."""
        return self._apply(
            "STRUCTURE",
            ["Improve organization",
             "Add clear sections if needed",
             "Ensure logical flow"],
            "Output the restructured content.",
            content, goal,
        )

    def _stage_content(self, content: str, goal: str) -> str:
        """Stage 2: enhance content depth and accuracy."""
        return self._apply(
            "CONTENT",
            ["Add missing information",
             "Verify accuracy",
             "Include examples"],
            "Output the enhanced content.",
            content, goal,
        )

    def _stage_clarity(self, content: str, goal: str) -> str:
        """Stage 3: improve clarity and readability."""
        return self._apply(
            "CLARITY",
            ["Simplify complex sentences",
             "Define technical terms",
             "Use active voice"],
            "Output the clarified content.",
            content, goal,
        )

    def _stage_polish(self, content: str, goal: str) -> str:
        """Stage 4: final polish and consistency."""
        return self._apply(
            "POLISH",
            ["Fix grammar and spelling",
             "Ensure consistency",
             "Perfect formatting"],
            "Output the polished content.",
            content, goal,
        )

    def _apply(self, aspect, bullets, closing, content, goal):
        """Build the single-aspect directive prompt and send it to the LLM."""
        bullet_block = "\n".join(f"- {item}" for item in bullets)
        prompt = (
            f"\nFocus on {aspect} only:\n{bullet_block}\n"
            f"\nGoal: {goal}\n"
            f"\nContent:\n{content}\n"
            f"\n{closing}\n"
        )
        return self._invoke(prompt)

    def _invoke(self, prompt: str) -> str:
        directive = [
            SystemMessage(content="Refine the content as directed."),
            HumanMessage(content=prompt)
        ]
        return self.llm.invoke(directive).content

Convergence Detection

Detecting When to Stop

🐍python
1from dataclasses import dataclass
2from typing import List
3import numpy as np
4
5
@dataclass
class ConvergenceMetrics:
    """Metrics for convergence detection."""
    score_history: List[float]   # all scores observed so far, oldest first
    content_similarity: float    # word-overlap similarity of the last two versions
    improvement_rate: float      # mean score delta over the recent window
    is_converged: bool           # True when any stop condition fired
    reason: str                  # human-readable explanation of the decision
14
15
class ConvergenceDetector:
    """Detect when iterative refinement should stop.

    Stop conditions, checked in order:
      1. score plateau  - mean recent improvement below ``min_improvement``
      2. content similarity above ``similarity_threshold``
      3. near-perfect score (>= 0.99)
      4. two consecutive score drops (regression)
    """

    def __init__(
        self,
        min_improvement: float = 0.02,
        plateau_window: int = 3,
        similarity_threshold: float = 0.95
    ):
        self.min_improvement = min_improvement            # plateau cutoff
        self.plateau_window = plateau_window              # how many scores count as "recent"
        self.similarity_threshold = similarity_threshold  # Jaccard cutoff
        self.score_history: List[float] = []
        self.content_history: List[str] = []

    def check(self, score: float, content: str) -> ConvergenceMetrics:
        """Record this iteration's score/content and report convergence.

        Fix: the returned ``score_history`` is now a snapshot copy; the
        original returned the live internal list, which later ``check``
        calls would mutate under the caller.
        """
        self.score_history.append(score)
        self.content_history.append(content)

        # Need at least two data points before any trend can be measured.
        if len(self.score_history) < 2:
            return ConvergenceMetrics(
                score_history=list(self.score_history),  # snapshot, not live state
                content_similarity=0.0,
                improvement_rate=1.0,
                is_converged=False,
                reason="Not enough iterations"
            )

        improvement = self._calculate_improvement()
        similarity = self._calculate_similarity()
        is_converged, reason = self._check_conditions(improvement, similarity)

        return ConvergenceMetrics(
            score_history=list(self.score_history),  # snapshot, not live state
            content_similarity=similarity,
            improvement_rate=improvement,
            is_converged=is_converged,
            reason=reason
        )

    def _calculate_improvement(self) -> float:
        """Mean score delta over the last ``plateau_window`` scores."""
        if len(self.score_history) < 2:
            return 1.0

        recent = self.score_history[-self.plateau_window:]
        if len(recent) < 2:
            # Degenerate window (plateau_window == 1): use the last overall delta.
            return abs(self.score_history[-1] - self.score_history[-2])

        deltas = [b - a for a, b in zip(recent, recent[1:])]
        # Plain-Python mean keeps the result a builtin float
        # (np.mean on a <=3-element list returned np.float64 for no benefit).
        return sum(deltas) / len(deltas)

    def _calculate_similarity(self) -> float:
        """Jaccard similarity between the word sets of the last two versions."""
        if len(self.content_history) < 2:
            return 0.0

        prev_words = set(self.content_history[-2].lower().split())
        curr_words = set(self.content_history[-1].lower().split())

        union = len(prev_words | curr_words)
        return len(prev_words & curr_words) / union if union > 0 else 0.0

    def _check_conditions(
        self,
        improvement: float,
        similarity: float
    ) -> tuple[bool, str]:
        """Evaluate all stop conditions; returns (converged, reason)."""
        # Condition 1: average gains too small to justify more passes.
        if improvement < self.min_improvement:
            return True, f"Score plateau (improvement: {improvement:.4f})"

        # Condition 2: revisions barely change the text anymore.
        if similarity > self.similarity_threshold:
            return True, f"Content converged (similarity: {similarity:.2f})"

        # Condition 3: effectively perfect score.
        if self.score_history[-1] >= 0.99:
            return True, "Perfect score achieved"

        # Condition 4: two consecutive drops - revision is making things worse.
        if len(self.score_history) >= 3:
            if self.score_history[-1] < self.score_history[-2] < self.score_history[-3]:
                return True, "Score regression detected"

        return False, "Continuing refinement"

Quality Gates

Multi-Criteria Quality Gates

🐍python
1from dataclasses import dataclass
2from typing import Callable, Dict
3
4
@dataclass
class QualityGate:
    """A single quality gate."""
    name: str                      # human-readable gate name used in reports
    check: Callable[[str], float]  # Returns score 0-1
    threshold: float               # minimum score the check must reach to pass
    required: bool = True          # if True, failing this gate blocks overall approval
12
13
@dataclass
class GateResult:
    """Result of a quality gate check."""
    gate_name: str    # name of the gate that produced this result
    score: float      # score the gate's check callable returned
    passed: bool      # True when score >= threshold
    threshold: float  # threshold the score was compared against
21
22
class QualityGateSystem:
    """Runs a configurable set of quality gates over a piece of content."""

    def __init__(self):
        self.gates: List[QualityGate] = []
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0)

    def add_gate(self, gate: QualityGate):
        """Register one more gate to run during ``check_all``."""
        self.gates.append(gate)

    def check_all(self, content: str) -> Dict:
        """Run every registered gate against ``content``.

        A failing *required* gate blocks overall approval; optional gates
        are reported but never block.
        """
        results = []
        blocked = False

        for gate in self.gates:
            value = gate.check(content)
            ok = value >= gate.threshold

            if not ok and gate.required:
                blocked = True

            results.append(GateResult(
                gate_name=gate.name,
                score=value,
                passed=ok,
                threshold=gate.threshold
            ))

        return {
            "passed": not blocked,
            "results": results,
            "summary": self._summarize(results)
        }

    def _summarize(self, results: List[GateResult]) -> str:
        """Render a short '<passed>/<total> gates passed' line."""
        cleared = [r for r in results if r.passed]
        return f"{len(cleared)}/{len(results)} gates passed"
64
65
66# Example quality gates
def accuracy_check(content: str) -> float:
    """LLM-judged factual accuracy of ``content``, clamped to [0, 1].

    Returns a neutral 0.5 when the model's reply cannot be parsed as a
    number.
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    messages = [
        SystemMessage(content="Rate the factual accuracy from 0-1."),
        HumanMessage(content=f"Content: {content}")
    ]
    response = llm.invoke(messages)
    try:
        # Clamp: the model may reply with an out-of-range number
        # (e.g. "90"), which would break downstream threshold logic.
        return min(1.0, max(0.0, float(response.content)))
    except ValueError:
        return 0.5
79
80
def readability_check(content: str) -> float:
    """Heuristic readability score based on average sentence length.

    Fix: splitting on '.' produced empty trailing "sentences" that
    dragged the average down — a single 17-word sentence ending in '.'
    previously averaged 8.5 words and scored 0.5 instead of 1.0. Empty
    fragments are now ignored.
    """
    # Drop blank fragments from trailing/consecutive periods.
    sentences = [s for s in content.split('.') if s.strip()]
    if not sentences:
        # Nothing to measure; neutral score (matches old behavior for "").
        return 0.5
    avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
    # Ideal: 15-20 words per sentence
    if 15 <= avg_length <= 20:
        return 1.0
    elif avg_length < 10 or avg_length > 30:
        return 0.5
    return 0.75
92
93
def completeness_check(content: str) -> float:
    """LLM-judged completeness of ``content``, clamped to [0, 1].

    Returns a neutral 0.5 when the model's reply cannot be parsed as a
    number.
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    messages = [
        SystemMessage(content="Rate completeness from 0-1 (covers all key points)."),
        HumanMessage(content=f"Content: {content}")
    ]
    response = llm.invoke(messages)
    try:
        # Same clamping as accuracy_check, for consistency.
        return min(1.0, max(0.0, float(response.content)))
    except ValueError:
        return 0.5
106
107
# Usage
# Accuracy and completeness are required gates; readability is advisory.
system = QualityGateSystem()
system.add_gate(QualityGate("Accuracy", accuracy_check, 0.8, required=True))
system.add_gate(QualityGate("Readability", readability_check, 0.7))
system.add_gate(QualityGate("Completeness", completeness_check, 0.8, required=True))

result = system.check_all("Content to evaluate...")
print(f"Passed: {result['passed']}")
print(f"Summary: {result['summary']}")

Key Takeaways

  • Self-critique enables agents to identify weaknesses and generate specific improvement suggestions.
  • Multi-perspective critique provides more comprehensive feedback by evaluating from different viewpoints.
  • Revision loops systematically improve content until quality targets are met.
  • Convergence detection prevents unnecessary iterations when improvement plateaus.
  • Quality gates ensure outputs meet required standards before proceeding.
Next Section Preview: We'll examine the limitations and challenges of autonomous agents.