Introduction
Web search is the cornerstone of any research agent. It enables the agent to discover relevant sources, find up-to-date information, and explore topics beyond its training data. This section covers how to integrate various search APIs into your research agent.
Key Insight: A well-designed search integration should handle multiple search providers, optimize queries for better results, and gracefully handle rate limits and errors.
Search API Options
Several APIs are available for programmatic web search. Each has different strengths, pricing, and rate limits:
| API | Strengths | Considerations |
|---|---|---|
| Google Custom Search | High quality results, extensive index | 100 queries/day free, $5 per 1000 after |
| Bing Search API | Good quality, generous free tier | 1000 queries/month free |
| SerpAPI | Multiple search engines, easy integration | 100 searches/month free |
| Brave Search API | Privacy-focused, independent index | 2000 queries/month free |
| Tavily AI | Optimized for AI agents, semantic search | 1000 queries/month free |
Choosing a Search Provider
- For prototyping: Brave Search or Tavily offer generous free tiers
- For production: Consider SerpAPI for reliability or Google for quality
- For specialized research: Combine multiple providers for comprehensive coverage
Implementing Search
Let's implement a flexible search system that can work with multiple providers:
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import aiohttp
import asyncio
6
7
@dataclass
class SearchResult:
    """A single search result returned by a provider.

    The score fields default to 0.0 and are populated later by
    ResultProcessor.process_results; declaring them here makes that
    contract explicit instead of relying on ad-hoc attribute injection.
    """
    title: str
    url: str
    snippet: str
    position: int  # 1-based rank within the provider's response
    source: str  # Which search engine returned this
    # Scores filled in by ResultProcessor (0.0 until processed).
    relevance_score: float = 0.0
    quality_score: float = 0.0
    combined_score: float = 0.0
16
17
class SearchProvider(ABC):
    """Abstract interface that every concrete search backend implements."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier for this provider (e.g. 'brave')."""
        ...

    @abstractmethod
    async def search(
        self,
        query: str,
        num_results: int = 10
    ) -> List[SearchResult]:
        """Run a search and return up to ``num_results`` results."""
        ...
33
34
class BraveSearchProvider(SearchProvider):
    """Brave Search API integration."""

    def __init__(self, api_key: str):
        # Subscription token issued by the Brave Search API dashboard.
        self.api_key = api_key
        self.base_url = "https://api.search.brave.com/res/v1/web/search"

    @property
    def name(self) -> str:
        return "brave"

    async def search(
        self,
        query: str,
        num_results: int = 10
    ) -> List[SearchResult]:
        """Search the web via Brave and return parsed results.

        Args:
            query: Search query string.
            num_results: Desired result count; Brave serves at most 20
                per request, so larger values are clamped.

        Returns:
            Results in API order, with 1-based positions.

        Raises:
            Exception: On any non-200 HTTP response.
        """
        headers = {
            "Accept": "application/json",
            "X-Subscription-Token": self.api_key
        }
        params = {
            "q": query,
            # Brave caps a single request at 20 results.
            "count": min(num_results, 20)
        }

        async with aiohttp.ClientSession() as session:
            async with session.get(
                self.base_url,
                headers=headers,
                params=params
            ) as response:
                if response.status != 200:
                    # BUG FIX: the message used [...] instead of f-string
                    # braces, so the status code was never interpolated.
                    raise Exception(f"Search failed: {response.status}")

                data = await response.json()

                results = []
                web_results = data.get("web", {}).get("results", [])

                for i, result in enumerate(web_results):
                    results.append(SearchResult(
                        title=result.get("title", ""),
                        url=result.get("url", ""),
                        snippet=result.get("description", ""),
                        position=i + 1,
                        source=self.name
                    ))

                return results
84
85
class TavilySearchProvider(SearchProvider):
    """Tavily AI Search API - optimized for AI agents."""

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://api.tavily.com/search"

    @property
    def name(self) -> str:
        return "tavily"

    async def search(
        self,
        query: str,
        num_results: int = 10,
        search_depth: str = "basic"
    ) -> List[SearchResult]:
        """Search via Tavily and return parsed results.

        Args:
            query: Search query string.
            num_results: Maximum number of results to request.
            search_depth: "basic" (fast) or "advanced" (deeper crawl),
                per the Tavily API.

        Returns:
            Results in API order, with 1-based positions.

        Raises:
            Exception: On any non-200 HTTP response.
        """
        # Tavily takes the API key in the JSON body rather than a header.
        payload = {
            "api_key": self.api_key,
            "query": query,
            "max_results": num_results,
            "search_depth": search_depth,
            "include_answer": True,
            "include_raw_content": False
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(
                self.base_url,
                json=payload
            ) as response:
                if response.status != 200:
                    # BUG FIX: the message used [...] instead of f-string
                    # braces, so the status code was never interpolated.
                    raise Exception(f"Search failed: {response.status}")

                data = await response.json()

                results = []
                for i, result in enumerate(data.get("results", [])):
                    results.append(SearchResult(
                        title=result.get("title", ""),
                        url=result.get("url", ""),
                        snippet=result.get("content", ""),
                        position=i + 1,
                        source=self.name
                    ))

                return results
133
134
class SerpAPIProvider(SearchProvider):
    """SerpAPI - supports multiple search engines."""

    def __init__(self, api_key: str, engine: str = "google"):
        self.api_key = api_key
        self.engine = engine  # e.g. "google", "bing", "duckduckgo"
        self.base_url = "https://serpapi.com/search"

    @property
    def name(self) -> str:
        # BUG FIX: was a literal string with [...] placeholders, so the
        # engine name was never interpolated.
        return f"serpapi_{self.engine}"

    async def search(
        self,
        query: str,
        num_results: int = 10
    ) -> List[SearchResult]:
        """Search via SerpAPI and return parsed organic results.

        Args:
            query: Search query string.
            num_results: Number of results to request.

        Returns:
            Organic results in API order, with 1-based positions.

        Raises:
            Exception: On any non-200 HTTP response.
        """
        params = {
            "api_key": self.api_key,
            "engine": self.engine,
            "q": query,
            "num": num_results
        }

        async with aiohttp.ClientSession() as session:
            async with session.get(
                self.base_url,
                params=params
            ) as response:
                if response.status != 200:
                    # BUG FIX: [...] instead of f-string braces.
                    raise Exception(f"Search failed: {response.status}")

                data = await response.json()

                results = []
                organic_results = data.get("organic_results", [])

                for i, result in enumerate(organic_results):
                    results.append(SearchResult(
                        title=result.get("title", ""),
                        url=result.get("link", ""),
                        snippet=result.get("snippet", ""),
                        position=i + 1,
                        source=self.name
                    ))

                return results
                return results
Multi-Provider Search Manager
For robust research, combine multiple search providers:
class SearchManager:
    """
    Manages multiple search providers with fallback and deduplication.
    """

    def __init__(self):
        self.providers: List[SearchProvider] = []
        # Cache keyed by "{query}:{num_results}:{use_all_providers}".
        self.cache: Dict[str, List[SearchResult]] = {}
        # Reserved for per-provider quota tracking (not yet enforced).
        self.rate_limits: Dict[str, int] = {}

    def add_provider(self, provider: SearchProvider) -> None:
        """Add a search provider; registration order is fallback priority."""
        self.providers.append(provider)

    async def search(
        self,
        query: str,
        num_results: int = 10,
        use_all_providers: bool = False
    ) -> List[SearchResult]:
        """
        Search across providers.

        Args:
            query: Search query
            num_results: Number of results wanted
            use_all_providers: If True, search all providers and merge results

        Returns:
            List of search results
        """
        # BUG FIX: the key used [...] placeholders, so every query
        # collapsed onto the same literal cache key and collided.
        cache_key = f"{query}:{num_results}:{use_all_providers}"
        if cache_key in self.cache:
            return self.cache[cache_key]

        if use_all_providers:
            results = await self._search_all_providers(query, num_results)
        else:
            results = await self._search_with_fallback(query, num_results)

        # Cache results so repeated queries cost no API calls.
        self.cache[cache_key] = results
        return results

    async def _search_all_providers(
        self,
        query: str,
        num_results: int
    ) -> List[SearchResult]:
        """Query every provider concurrently and merge their results."""
        tasks = [
            provider.search(query, num_results)
            for provider in self.providers
        ]

        all_results = []
        # return_exceptions=True keeps one failing provider from
        # cancelling the whole gather.
        provider_results = await asyncio.gather(*tasks, return_exceptions=True)

        for results in provider_results:
            if isinstance(results, Exception):
                continue
            all_results.extend(results)

        # Deduplicate by URL, keeping the first occurrence.
        seen_urls = set()
        unique_results = []
        for result in all_results:
            if result.url not in seen_urls:
                seen_urls.add(result.url)
                unique_results.append(result)

        return unique_results[:num_results * 2]  # Return more for variety

    async def _search_with_fallback(
        self,
        query: str,
        num_results: int
    ) -> List[SearchResult]:
        """Try providers in registration order until one returns results."""
        for provider in self.providers:
            try:
                results = await provider.search(query, num_results)
                if results:
                    return results
            except Exception as e:
                # BUG FIX: message used [...] so neither the provider name
                # nor the error was ever interpolated.
                print(f"Provider {provider.name} failed: {e}")
                continue

        return []

    def clear_cache(self) -> None:
        """Clear the search cache."""
        self.cache.clear()
        self.cache.clear()
Query Optimization
The quality of search results depends heavily on the query. An LLM can help generate optimized search queries:
class QueryOptimizer:
    """
    Optimize search queries for better results.

    Expects an LLM client exposing an async ``generate(prompt) -> str``
    method for query expansion and follow-up generation.
    """

    def __init__(self, llm_client):
        self.llm = llm_client

    async def expand_query(self, question: str) -> List[str]:
        """
        Expand a research question into up to 5 search queries.
        """
        # BUG FIX: all prompt placeholders below used [...] instead of
        # f-string braces, so nothing was ever interpolated.
        prompt = f"""Given the research question below, generate 5 different
search queries that would help find comprehensive information.

Research question: {question}

Generate queries that:
1. Use different phrasings of the core question
2. Include specific terminology
3. Target different aspects of the topic
4. Mix broad and specific queries

Return ONLY the queries, one per line, no numbering or explanation."""

        response = await self.llm.generate(prompt)
        queries = [q.strip() for q in response.strip().split("\n") if q.strip()]
        return queries[:5]

    def add_search_operators(self, query: str, site: Optional[str] = None) -> str:
        """Add search operators (site:, exact phrase) to improve specificity."""
        operators = []

        if site:
            operators.append(f"site:{site}")

        # Add quotes for exact phrases
        if len(query.split()) > 3:
            # Quote the leading words as a likely key phrase
            words = query.split()
            if len(words) >= 4:
                phrase = " ".join(words[:4])
                operators.append(f'"{phrase}"')

        if operators:
            return f"{query} {' '.join(operators)}"
        return query

    async def generate_follow_up_queries(
        self,
        original_query: str,
        results: List[SearchResult],
        gaps: List[str]
    ) -> List[str]:
        """
        Generate up to 3 follow-up queries based on initial results and
        identified information gaps.
        """
        results_summary = "\n".join([
            f"- {r.title}: {r.snippet[:100]}..."
            for r in results[:5]
        ])

        gaps_text = "\n".join([f"- {gap}" for gap in gaps])

        prompt = f"""Based on the original research query and initial results,
generate follow-up search queries to fill in gaps.

Original query: {original_query}

Initial results found:
{results_summary}

Information gaps identified:
{gaps_text}

Generate 3 targeted search queries to fill these gaps.
Return ONLY the queries, one per line."""

        response = await self.llm.generate(prompt)
        return [q.strip() for q in response.strip().split("\n") if q.strip()][:3]
81
82
class QueryBuilder:
    """
    Build structured search queries with a fluent interface.
    """

    def __init__(self):
        self.terms: List[str] = []
        self.required: List[str] = []
        self.excluded: List[str] = []
        self.site: Optional[str] = None
        self.file_type: Optional[str] = None
        self.date_range: Optional[str] = None  # reserved; not used by build()

    def add_term(self, term: str) -> "QueryBuilder":
        """Add a plain search term."""
        self.terms.append(term)
        return self

    def require(self, term: str) -> "QueryBuilder":
        """Require a term to be present (+term)."""
        self.required.append(term)
        return self

    def exclude(self, term: str) -> "QueryBuilder":
        """Exclude a term (-term)."""
        self.excluded.append(term)
        return self

    def from_site(self, site: str) -> "QueryBuilder":
        """Restrict to a specific site (site:)."""
        self.site = site
        return self

    def of_type(self, file_type: str) -> "QueryBuilder":
        """Restrict to a file type (filetype:)."""
        self.file_type = file_type
        return self

    def build(self) -> str:
        """Build the final query string.

        BUG FIX: every operator fragment below previously used [...]
        instead of f-string braces, producing literal "[term]" text.
        """
        parts = []

        # Base terms
        parts.extend(self.terms)

        # Required terms
        for term in self.required:
            parts.append(f"+{term}")

        # Excluded terms
        for term in self.excluded:
            parts.append(f"-{term}")

        # Site restriction
        if self.site:
            parts.append(f"site:{self.site}")

        # File type
        if self.file_type:
            parts.append(f"filetype:{self.file_type}")

        return " ".join(parts)
146
147# Usage example
148def build_academic_query(topic: str) -> str:
149 """Build a query optimized for academic sources."""
150 return (
151 QueryBuilder()
152 .add_term(topic)
153 .add_term("research")
154 .exclude("buy")
155 .exclude("course")
156 .from_site("scholar.google.com")
157 .build()
    )
Result Processing
Raw search results need processing before they're useful for research:
from urllib.parse import urlparse
3
class ResultProcessor:
    """
    Process and enrich search results: scoring, deduplication, and
    optional LLM re-ranking.
    """

    def __init__(self, llm_client = None):
        self.llm = llm_client
        # Prior quality scores (0..1) for well-known domains.
        self.domain_quality = {
            "wikipedia.org": 0.8,
            "arxiv.org": 0.9,
            "github.com": 0.7,
            "stackoverflow.com": 0.7,
            "medium.com": 0.5,
            "reddit.com": 0.4,
        }

    def process_results(
        self,
        results: List[SearchResult],
        query: str
    ) -> List[SearchResult]:
        """Score each result in place and return them sorted best-first."""
        processed = []

        for result in results:
            relevance = self._calculate_relevance(result, query)
            quality = self._calculate_quality(result)

            # Annotate the result with its scores; relevance is weighted
            # slightly above source quality.
            result.relevance_score = relevance
            result.quality_score = quality
            result.combined_score = (relevance * 0.6) + (quality * 0.4)

            processed.append(result)

        processed.sort(key=lambda r: r.combined_score, reverse=True)
        return processed

    def _calculate_relevance(
        self,
        result: SearchResult,
        query: str
    ) -> float:
        """Calculate relevance of result to query (0..1)."""
        query_terms = set(query.lower().split())
        # BUG FIX: an empty/whitespace query previously raised
        # ZeroDivisionError in the overlap calculations below.
        if not query_terms:
            return 0.0

        title_terms = set(result.title.lower().split())
        snippet_terms = set(result.snippet.lower().split())

        # Term overlap with title and snippet
        title_overlap = len(query_terms & title_terms) / len(query_terms)
        snippet_overlap = len(query_terms & snippet_terms) / len(query_terms)

        # Position bonus (earlier results usually more relevant)
        position_score = 1.0 / (1 + result.position * 0.1)

        return (title_overlap * 0.4) + (snippet_overlap * 0.4) + (position_score * 0.2)

    def _calculate_quality(self, result: SearchResult) -> float:
        """Calculate quality score based on the result's domain."""
        domain = urlparse(result.url).netloc.replace("www.", "")

        # Check known domains first
        for known_domain, score in self.domain_quality.items():
            if known_domain in domain:
                return score

        # Default scoring based on TLD
        if ".edu" in domain or ".gov" in domain:
            return 0.85
        elif ".org" in domain:
            return 0.6

        return 0.5

    def filter_duplicates(
        self,
        results: List[SearchResult]
    ) -> List[SearchResult]:
        """Remove duplicate or near-duplicate results, keeping the first."""
        seen_urls = set()
        seen_titles = set()
        filtered = []

        for result in results:
            # Normalize URL (case, trailing slash) before comparing
            normalized_url = result.url.lower().rstrip("/")

            if normalized_url in seen_urls:
                continue

            # Truncated, case-folded titles catch near-duplicates
            title_key = result.title.lower()[:50]
            if title_key in seen_titles:
                continue

            seen_urls.add(normalized_url)
            seen_titles.add(title_key)
            filtered.append(result)

        return filtered

    async def rank_with_llm(
        self,
        results: List[SearchResult],
        query: str
    ) -> List[SearchResult]:
        """Use LLM to re-rank results; falls back to input order on failure."""
        if not self.llm or len(results) < 3:
            return results

        # BUG FIX: both f-strings below used [...] placeholders, so the
        # prompt contained no actual query or result text.
        results_text = "\n".join([
            f"{i+1}. {r.title}\n   {r.snippet[:150]}..."
            for i, r in enumerate(results[:10])
        ])

        prompt = f"""Given the research query and search results below,
rank the results from most to least relevant.

Query: {query}

Results:
{results_text}

Return ONLY the numbers in order of relevance (most relevant first).
Example: 3, 1, 5, 2, 4, 6, 7, 8, 9, 10"""

        try:
            response = await self.llm.generate(prompt)
            # Parse the comma-separated ranking
            ranking = [int(n.strip()) for n in response.split(",")]

            # Reorder results, ignoring out-of-range ranks
            reordered = []
            for rank in ranking:
                if 1 <= rank <= len(results):
                    reordered.append(results[rank - 1])

            # Add any results the LLM omitted
            for result in results:
                if result not in reordered:
                    reordered.append(result)

            return reordered
        except Exception:
            # BUG FIX: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit. Fall back to original order.
            return results
            return results
Summary
In this section, we built a comprehensive web search system for our research agent:
- Multiple Providers: Integration with Brave, Tavily, and SerpAPI for robust search coverage
- Search Manager: Fallback handling, caching, and result deduplication
- Query Optimization: LLM-powered query expansion and search operators
- Result Processing: Relevance scoring, quality assessment, and intelligent ranking
In the next section, we'll implement web scraping and content extraction to gather detailed information from the sources we discover.