Introduction
Autonomous agents improve their outputs through iterative refinement — cycles of self-critique and revision. This capability lets agents produce higher-quality results by continuously evaluating and improving their own work.
Section Overview: We'll explore self-critique patterns, revision loops, convergence detection, and quality gates for iterative refinement in autonomous agents.
Self-Critique Patterns
Basic Self-Critique
🐍python
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from pydantic import BaseModel, Field
from typing import List, Optional


class Critique(BaseModel):
    """Structured critique output returned by the self-critic."""
    strengths: List[str] = Field(description="What works well")
    weaknesses: List[str] = Field(description="What needs improvement")
    suggestions: List[str] = Field(description="Specific improvements")
    # ge/le constrain the model's score to the [0, 1] range at parse time.
    score: float = Field(ge=0, le=1, description="Quality score")
    needs_revision: bool = Field(description="Whether revision is needed")
class SelfCritic:
    """Self-critique system for agent outputs.

    Uses structured output so every critique parses into the `Critique`
    model rather than free-form text.
    """

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.2)
        # Bind the Critique schema so responses are validated/typed.
        self.structured_llm = self.llm.with_structured_output(Critique)

    def critique(
        self,
        output: str,
        criteria: List[str],
        goal: str
    ) -> Critique:
        """Generate a structured critique of `output`.

        Args:
            output: The text to evaluate.
            criteria: Evaluation criteria, rendered as a bullet list.
            goal: What the output was supposed to achieve.

        Returns:
            A `Critique` with strengths, weaknesses, suggestions, a
            0-1 score, and a revision flag.
        """
        criteria_str = "\n".join(f"- {c}" for c in criteria)

        prompt = f"""
Critically evaluate this output:

Goal: {goal}

Output to evaluate:
{output}

Evaluation criteria:
{criteria_str}

Be thorough but fair. Identify both strengths and weaknesses.
"""
        messages = [
            SystemMessage(content="You are a critical evaluator."),
            HumanMessage(content=prompt)
        ]

        return self.structured_llm.invoke(messages)
54# Usage
55critic = SelfCritic()
56critique = critic.critique(
57 output="AI agents are computer programs that can work autonomously.",
58 criteria=[
59 "Technical accuracy",
60 "Depth of explanation",
61 "Clarity and readability",
62 "Actionable insights"
63 ],
64 goal="Explain AI agents to a technical audience"
65)
66
67print(f"Score: {critique.score}")
68print(f"Needs revision: {critique.needs_revision}")
69print(f"Weaknesses: {critique.weaknesses}")Multi-Perspective Critique
🐍python
class MultiPerspectiveCritic:
    """Critique content from multiple professional perspectives."""

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)

    def critique_from_perspectives(
        self,
        output: str,
        perspectives: List[dict]
    ) -> List[dict]:
        """Get critiques from multiple perspectives.

        Args:
            output: The text to evaluate.
            perspectives: Dicts with "role" and "focus" keys.

        Returns:
            A list of {"perspective", "feedback"} dicts. (The previous
            annotation of List[Critique] was wrong: this method returns
            plain dicts, not Critique models.)
        """
        critiques = []

        for perspective in perspectives:
            role = perspective["role"]
            focus = perspective["focus"]

            prompt = f"""
You are a {role} evaluating this output.
Focus on: {focus}

Output:
{output}

Provide your critique from your professional perspective.
"""
            messages = [
                SystemMessage(content=f"You are a {role}."),
                HumanMessage(content=prompt)
            ]

            response = self.llm.invoke(messages)
            critiques.append({
                "perspective": role,
                "feedback": response.content
            })

        return critiques

    def synthesize_critiques(self, critiques: List[dict]) -> dict:
        """Combine per-perspective critiques into one prioritized plan."""
        all_feedback = "\n\n".join([
            f"{c['perspective']}: {c['feedback']}"
            for c in critiques
        ])

        prompt = f"""
Synthesize these critiques from different perspectives:

{all_feedback}

Create a prioritized list of improvements.
"""
        messages = [
            SystemMessage(content="Synthesize feedback into improvements."),
            HumanMessage(content=prompt)
        ]

        response = self.llm.invoke(messages)
        return {"synthesis": response.content}
64
65# Usage
66critic = MultiPerspectiveCritic()
67critiques = critic.critique_from_perspectives(
68 output="Technical document about AI agents...",
69 perspectives=[
70 {"role": "Technical Accuracy Reviewer", "focus": "correctness"},
71 {"role": "UX Writer", "focus": "clarity and readability"},
72 {"role": "Subject Matter Expert", "focus": "depth and completeness"}
73 ]
74)Revision Loops
Iterative Revision System
🐍python
from dataclasses import dataclass
from typing import List, Callable


@dataclass
class RevisionResult:
    """Result of a single revision-loop iteration."""
    content: str                  # the version of the content that was critiqued
    iteration: int                # zero-based iteration index
    score: float                  # critique score awarded to `content`
    improvements_made: List[str]  # suggestions to apply in the next revision
    remaining_issues: List[str]   # weaknesses identified by the critique
class RevisionLoop:
    """Iterative critique-and-revise loop.

    Critiques the content, revises it, and repeats until the target
    score is reached or `max_iterations` runs out. Every critiqued
    version is recorded in `history` with the score it received.
    """

    def __init__(
        self,
        max_iterations: int = 5,
        target_score: float = 0.85
    ):
        self.max_iterations = max_iterations
        self.target_score = target_score
        self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
        self.critic = SelfCritic()
        self.history: List[RevisionResult] = []

    def run(
        self,
        initial_content: str,
        goal: str,
        criteria: List[str]
    ) -> RevisionResult:
        """Run the revision loop until convergence.

        Returns the first version meeting `target_score`, or the
        highest-scoring critiqued version once iterations are exhausted.
        """
        content = initial_content

        for iteration in range(self.max_iterations):
            critique = self.critic.critique(content, criteria, goal)

            # BUGFIX: record each score with the content that was actually
            # critiqued. The previous version stored the *revised* text
            # alongside the *pre-revision* score, so history entries were
            # inconsistent and the best-version lookup could return
            # content that never earned its recorded score.
            result = RevisionResult(
                content=content,
                iteration=iteration,
                score=critique.score,
                improvements_made=critique.suggestions,
                remaining_issues=critique.weaknesses
            )
            self.history.append(result)

            # Stop early once the quality target is met.
            if critique.score >= self.target_score:
                return result

            content = self._revise(content, critique, goal)

        # Out of iterations: return the best critiqued version seen.
        return max(self.history, key=lambda r: r.score)

    def _revise(
        self,
        content: str,
        critique: Critique,
        goal: str
    ) -> str:
        """Generate a revised version addressing the critique's issues."""
        prompt = f"""
Revise this content based on the critique:

Original goal: {goal}

Current content:
{content}

Critique:
Weaknesses: {critique.weaknesses}
Suggestions: {critique.suggestions}

Create an improved version addressing the issues.
"""
        messages = [
            SystemMessage(content="Revise content based on feedback."),
            HumanMessage(content=prompt)
        ]

        response = self.llm.invoke(messages)
        return response.content
106# Usage
107loop = RevisionLoop(max_iterations=3, target_score=0.8)
108result = loop.run(
109 initial_content="AI agents are autonomous programs.",
110 goal="Comprehensive explanation of AI agents",
111 criteria=["Accuracy", "Depth", "Clarity"]
112)
113
114print(f"Final score: {result.score}")
115print(f"Iterations: {result.iteration}")Progressive Refinement
🐍python
1class ProgressiveRefiner:
2 """Refine content through progressive stages."""
3
4 def __init__(self):
5 self.llm = ChatOpenAI(model="gpt-4o", temperature=0.3)
6 self.stages = [
7 self._stage_structure,
8 self._stage_content,
9 self._stage_clarity,
10 self._stage_polish
11 ]
12
13 def refine(self, draft: str, goal: str) -> str:
14 """Progressively refine through all stages."""
15 content = draft
16
17 for i, stage in enumerate(self.stages):
18 print(f"Stage {i+1}: {stage.__name__}")
19 content = stage(content, goal)
20
21 return content
22
23 def _stage_structure(self, content: str, goal: str) -> str:
24 """Improve overall structure and organization."""
25 prompt = f"""
26Focus on STRUCTURE only:
27- Improve organization
28- Add clear sections if needed
29- Ensure logical flow
30
31Goal: {goal}
32
33Content:
34{content}
35
36Output the restructured content.
37"""
38 return self._invoke(prompt)
39
40 def _stage_content(self, content: str, goal: str) -> str:
41 """Enhance content depth and accuracy."""
42 prompt = f"""
43Focus on CONTENT only:
44- Add missing information
45- Verify accuracy
46- Include examples
47
48Goal: {goal}
49
50Content:
51{content}
52
53Output the enhanced content.
54"""
55 return self._invoke(prompt)
56
57 def _stage_clarity(self, content: str, goal: str) -> str:
58 """Improve clarity and readability."""
59 prompt = f"""
60Focus on CLARITY only:
61- Simplify complex sentences
62- Define technical terms
63- Use active voice
64
65Goal: {goal}
66
67Content:
68{content}
69
70Output the clarified content.
71"""
72 return self._invoke(prompt)
73
74 def _stage_polish(self, content: str, goal: str) -> str:
75 """Final polish and consistency."""
76 prompt = f"""
77Focus on POLISH only:
78- Fix grammar and spelling
79- Ensure consistency
80- Perfect formatting
81
82Goal: {goal}
83
84Content:
85{content}
86
87Output the polished content.
88"""
89 return self._invoke(prompt)
90
91 def _invoke(self, prompt: str) -> str:
92 messages = [
93 SystemMessage(content="Refine the content as directed."),
94 HumanMessage(content=prompt)
95 ]
96 return self.llm.invoke(messages).contentConvergence Detection
Detecting When to Stop
🐍python
1from dataclasses import dataclass
2from typing import List
3import numpy as np
4
5
6@dataclass
7class ConvergenceMetrics:
8 """Metrics for convergence detection."""
9 score_history: List[float]
10 content_similarity: float
11 improvement_rate: float
12 is_converged: bool
13 reason: str
14
15
16class ConvergenceDetector:
17 """Detect when iterative refinement should stop."""
18
19 def __init__(
20 self,
21 min_improvement: float = 0.02,
22 plateau_window: int = 3,
23 similarity_threshold: float = 0.95
24 ):
25 self.min_improvement = min_improvement
26 self.plateau_window = plateau_window
27 self.similarity_threshold = similarity_threshold
28 self.score_history: List[float] = []
29 self.content_history: List[str] = []
30
31 def check(self, score: float, content: str) -> ConvergenceMetrics:
32 """Check if refinement has converged."""
33 self.score_history.append(score)
34 self.content_history.append(content)
35
36 # Not enough history
37 if len(self.score_history) < 2:
38 return ConvergenceMetrics(
39 score_history=self.score_history,
40 content_similarity=0.0,
41 improvement_rate=1.0,
42 is_converged=False,
43 reason="Not enough iterations"
44 )
45
46 # Calculate metrics
47 improvement = self._calculate_improvement()
48 similarity = self._calculate_similarity()
49
50 # Check convergence conditions
51 is_converged, reason = self._check_conditions(improvement, similarity)
52
53 return ConvergenceMetrics(
54 score_history=self.score_history,
55 content_similarity=similarity,
56 improvement_rate=improvement,
57 is_converged=is_converged,
58 reason=reason
59 )
60
61 def _calculate_improvement(self) -> float:
62 """Calculate recent improvement rate."""
63 if len(self.score_history) < 2:
64 return 1.0
65
66 recent = self.score_history[-self.plateau_window:]
67 if len(recent) < 2:
68 return abs(self.score_history[-1] - self.score_history[-2])
69
70 # Average improvement over window
71 improvements = [
72 recent[i] - recent[i-1]
73 for i in range(1, len(recent))
74 ]
75 return np.mean(improvements)
76
77 def _calculate_similarity(self) -> float:
78 """Calculate similarity between last two versions."""
79 if len(self.content_history) < 2:
80 return 0.0
81
82 # Simple Jaccard similarity
83 prev_words = set(self.content_history[-2].lower().split())
84 curr_words = set(self.content_history[-1].lower().split())
85
86 intersection = len(prev_words & curr_words)
87 union = len(prev_words | curr_words)
88
89 return intersection / union if union > 0 else 0.0
90
91 def _check_conditions(
92 self,
93 improvement: float,
94 similarity: float
95 ) -> tuple[bool, str]:
96 """Check all convergence conditions."""
97
98 # Condition 1: Score plateau
99 if improvement < self.min_improvement:
100 return True, f"Score plateau (improvement: {improvement:.4f})"
101
102 # Condition 2: High content similarity
103 if similarity > self.similarity_threshold:
104 return True, f"Content converged (similarity: {similarity:.2f})"
105
106 # Condition 3: Perfect score
107 if self.score_history[-1] >= 0.99:
108 return True, "Perfect score achieved"
109
110 # Condition 4: Score regression
111 if len(self.score_history) >= 3:
112 if self.score_history[-1] < self.score_history[-2] < self.score_history[-3]:
113 return True, "Score regression detected"
114
115 return False, "Continuing refinement"Quality Gates
Multi-Criteria Quality Gates
🐍python
from dataclasses import dataclass
from typing import Callable, Dict, List


@dataclass
class QualityGate:
    """A single quality gate."""
    name: str
    check: Callable[[str], float]  # scoring function returning 0-1
    threshold: float               # minimum score required to pass
    required: bool = True          # required gates block overall approval


@dataclass
class GateResult:
    """Result of one quality gate check."""
    gate_name: str
    score: float
    passed: bool
    threshold: float


class QualityGateSystem:
    """System of quality gates for content validation.

    Fixes vs. the previous version: `List` is now imported (it was used
    without an import, a NameError), and the unused `self.llm`
    attribute was removed (it referenced ChatOpenAI, which this snippet
    never imports or calls).
    """

    def __init__(self):
        self.gates: List[QualityGate] = []

    def add_gate(self, gate: QualityGate):
        """Register a quality gate."""
        self.gates.append(gate)

    def check_all(self, content: str) -> Dict:
        """Run every gate against `content`.

        Returns a dict with "passed" (overall verdict — only *required*
        gates can fail it), "results" (per-gate GateResult list), and
        "summary" (human-readable pass count).
        """
        results = []
        all_passed = True

        for gate in self.gates:
            score = gate.check(content)
            passed = score >= gate.threshold

            # Optional gates are reported but never block approval.
            if gate.required and not passed:
                all_passed = False

            results.append(GateResult(
                gate_name=gate.name,
                score=score,
                passed=passed,
                threshold=gate.threshold
            ))

        return {
            "passed": all_passed,
            "results": results,
            "summary": self._summarize(results)
        }

    def _summarize(self, results: List[GateResult]) -> str:
        """One-line pass/total summary."""
        passed = sum(1 for r in results if r.passed)
        total = len(results)
        return f"{passed}/{total} gates passed"
# Example quality gates
def accuracy_check(content: str) -> float:
    """LLM-judged factual-accuracy score, always within [0, 1].

    Returns a neutral 0.5 when the model's reply is not a bare number;
    numeric replies are clamped so an out-of-range answer cannot break
    the gate's 0-1 contract.
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    messages = [
        SystemMessage(content="Rate the factual accuracy from 0-1."),
        HumanMessage(content=f"Content: {content}")
    ]
    response = llm.invoke(messages)
    try:
        return min(1.0, max(0.0, float(response.content)))
    except ValueError:
        return 0.5
def readability_check(content: str) -> float:
    """Heuristic readability score from average sentence length.

    ~15-20 words per sentence scores 1.0; very short or very long
    averages score 0.5; anything in between scores 0.75.

    Fixes vs. the previous version: empty "sentences" (e.g. the split
    artifact after a trailing period) no longer drag the average down,
    and text with no sentences returns a neutral 0.5 instead of
    counting a single empty sentence.
    """
    sentences = [s for s in content.split('.') if s.strip()]
    if not sentences:
        return 0.5  # nothing measurable
    avg_length = sum(len(s.split()) for s in sentences) / len(sentences)
    # Ideal: 15-20 words per sentence
    if 15 <= avg_length <= 20:
        return 1.0
    elif avg_length < 10 or avg_length > 30:
        return 0.5
    return 0.75
def completeness_check(content: str) -> float:
    """LLM-judged completeness score, always within [0, 1].

    Returns a neutral 0.5 when the model's reply is not a bare number;
    numeric replies are clamped to keep the gate's 0-1 contract.
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=0)
    messages = [
        SystemMessage(content="Rate completeness from 0-1 (covers all key points)."),
        HumanMessage(content=f"Content: {content}")
    ]
    response = llm.invoke(messages)
    try:
        return min(1.0, max(0.0, float(response.content)))
    except ValueError:
        return 0.5
108# Usage
109system = QualityGateSystem()
110system.add_gate(QualityGate("Accuracy", accuracy_check, 0.8, required=True))
111system.add_gate(QualityGate("Readability", readability_check, 0.7))
112system.add_gate(QualityGate("Completeness", completeness_check, 0.8, required=True))
113
114result = system.check_all("Content to evaluate...")
115print(f"Passed: {result['passed']}")
116print(f"Summary: {result['summary']}")Key Takeaways
- Self-critique enables agents to identify weaknesses and generate specific improvement suggestions.
- Multi-perspective critique provides more comprehensive feedback by evaluating from different viewpoints.
- Revision loops systematically improve content until quality targets are met.
- Convergence detection prevents unnecessary iterations when improvement plateaus.
- Quality gates ensure outputs meet required standards before proceeding.
Next Section Preview: We'll examine the limitations and challenges of autonomous agents.