| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841 |
import html
import re
import time
from urllib.parse import parse_qs, quote_plus, urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class BrowserTool:
    """Web search tool that tries several engines and previews the top pages.

    ``run`` is the entry point: it takes a query, asks DuckDuckGo (non-Chinese
    queries) and/or a list of fallback engines for results, fetches a short
    content preview for the first few hits and returns everything as one
    formatted string. Progress is reported with ``print``.

    Every result is a dict with at least ``title``, ``url`` and ``snippet``.
    """

    name = "browser_search"
    description = "执行网页搜索(支持多种搜索引擎和内容提取)"

    # Last query handled by run(). Defined on the class so helpers that read
    # it do not raise AttributeError when they are called before run().
    last_query = ""

    _FULL_UA = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    _SHORT_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
    _HTML_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"

    # Engines tried in order by _search_searx.
    _SEARCH_INSTANCES = (
        {"name": "Searx.xyz", "url": "https://searx.xyz/search", "timeout": 10, "type": "searx"},
        {"name": "Searx.be", "url": "https://searx.be/search", "timeout": 10, "type": "searx"},
        {"name": "Brave搜索", "url": "https://search.brave.com/search", "timeout": 8, "type": "brave"},
        {"name": "Ecosia", "url": "https://www.ecosia.org/search", "timeout": 8, "type": "ecosia"},
        {"name": "Qwant", "url": "https://www.qwant.com", "timeout": 8, "type": "qwant"},
    )

    # Navigation / advertising words that disqualify a title. Matched as whole
    # ASCII words so that "ad" no longer rejects "Reading" or "Madrid".
    _SKIP_TITLE_RE = re.compile(
        r"(?<![a-z])(?:next|previous|more|about|help|settings|privacy|terms"
        r"|feedback|donate|install|download|login|register|sign in|sign up"
        r"|ad|ads|sponsored|promotion)(?![a-z])"
    )
    # CJK text has no word boundaries, so these stay substring matches.
    _SKIP_TITLE_CJK = ("广告", "推广")

    # Characters allowed to survive _clean_text.
    _TITLE_STRIP_RE = re.compile(r"[^\w\s\u4e00-\u9fff.,!?;:()\[\]{}\"'-]")

    # Characters allowed to survive _clean_and_format_content. Full-width
    # punctuation and curly quotes are written as escapes so they cannot be
    # silently folded into their ASCII look-alikes.
    _CONTENT_STRIP_RE = re.compile(
        r"[^\w\s\u4e00-\u9fff.,!?;:()\[\]{}\"'"
        r"\u3002\uff0c\uff01\uff1f\uff1a\uff1b\uff08\uff09\u3010\u3011"
        r"\u201c\u201d\u2018\u2019\u3001-]"
    )
    _SENTENCE_END_RE = re.compile(r"[\u3002\uff01\uff1f.!?]")

    _NAV_WORDS_RE = re.compile(
        r"(?<![a-z])(?:privacy|terms|home|login|register|contact|about)(?![a-z])"
    )
    _NAV_WORDS_CJK = ("导航", "菜单", "首页", "登录", "注册", "搜索", "联系", "关于")
    # Only text shorter than this is checked for navigation words: a menu is
    # short and keyword-dense, an article merely mentions such words.
    _NAV_CHECK_MAX_LEN = 200

    # Strings returned by _extract_content_from_url when it fails.
    _FETCH_FAILED = "内容获取失败"
    _EXTRACT_FAILED_PREFIX = "内容提取失败"

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------

    def get_parameters(self):
        """Return the parameter schema: a single required string ``input``."""
        return {
            "input": {"type": "str", "description": "搜索关键词", "required": True}
        }

    @staticmethod
    def _contains_chinese(text):
        """Return True when *text* contains a character in U+4E00..U+9FFF."""
        return any('\u4e00' <= char <= '\u9fff' for char in text)

    def _is_valid_result(self, title, url):
        """Return True when *title* looks like a real search hit.

        Rejects titles shorter than three characters (after stripping) and
        titles containing a navigation or advertising word. ASCII words are
        matched as whole words, case-insensitively; the Chinese indicators are
        matched as substrings. *url* is accepted for interface compatibility
        and is not inspected.
        """
        if not title or len(title.strip()) < 3:
            return False

        lowered = title.lower()
        if self._SKIP_TITLE_RE.search(lowered):
            return False
        return not any(marker in lowered for marker in self._SKIP_TITLE_CJK)

    def _clean_text(self, text):
        """Collapse whitespace, drop unusual characters, cap at 200 chars."""
        if not text:
            return ""
        collapsed = re.sub(r'\s+', ' ', text.strip())
        return self._TITLE_STRIP_RE.sub('', collapsed)[:200]

    def _get_search_suggestions(self, query):
        """Return Google and Bing search links for *query* as pseudo-results.

        The query is URL-encoded so spaces and ``&`` cannot break the link.
        """
        encoded = quote_plus(query)
        return [
            {
                'title': f'Google搜索: {query}',
                'url': f'https://www.google.com/search?q={encoded}',
                'snippet': '使用Google搜索引擎',
                'source': 'Google'
            },
            {
                'title': f'Bing搜索: {query}',
                'url': f'https://www.bing.com/search?q={encoded}',
                'snippet': '使用Bing搜索引擎',
                'source': 'Bing'
            }
        ]

    @staticmethod
    def _unwrap_duckduckgo_href(href):
        """Return the target of a DuckDuckGo ``/l/?uddg=<url>`` redirect link.

        Any other href is returned unchanged.
        NOTE(review): the redirect shape is what DuckDuckGo's HTML endpoint is
        understood to emit for result links; confirm against live markup.
        """
        if not href:
            return href
        parts = urlparse(href)
        host = parts.hostname or ''
        if parts.path.rstrip('/') != '/l' or (host and not host.endswith('duckduckgo.com')):
            return href
        target = parse_qs(parts.query).get('uddg')
        return target[0] if target else href

    # ------------------------------------------------------------------
    # Multi-engine search
    # ------------------------------------------------------------------

    def _search_searx(self, query, limit=5):
        """Try each configured engine in order until one yields results.

        Returns:
            ``(results, True)`` from the first engine that finds something,
            otherwise ``(suggestions, False)`` where *suggestions* comes from
            ``_get_search_suggestions``. The flag is False in the fallback
            case so callers can tell real hits from suggestion links.
        """
        for instance in self._SEARCH_INSTANCES:
            print(f"🔍 尝试 {instance['name']}...")
            try:
                found = self._try_search_instance(instance, query, limit)
            except Exception as e:
                # Best effort: report and move on to the next engine.
                print(f"⚠️ {instance['name']} 失败: {str(e)[:50]}")
                continue
            if found:
                print(f"✅ {instance['name']} 搜索成功,找到 {len(found)} 个结果")
                return found, True

        print("🔗 所有搜索引擎失败,提供搜索建议")
        return self._get_search_suggestions(query), False

    def _try_search_instance(self, instance, query, limit):
        """Dispatch to the handler for ``instance['type']``; None if unknown."""
        handlers = {
            'searx': self._try_searx_instance,
            'duckduckgo': self._try_duckduckgo_instance,
            'startpage': self._try_startpage_instance,
            'qwant': self._try_qwant_instance,
            'brave': self._try_brave_instance,
            'ecosia': self._try_ecosia_instance,
        }
        handler = handlers.get(instance['type'])
        return handler(instance, query, limit) if handler else None

    def _fetch_soup(self, instance, params, headers):
        """GET ``instance['url']`` and parse it; None on a non-200 status.

        Network errors propagate to the caller.
        """
        response = requests.get(
            instance['url'],
            params=params,
            headers=headers,
            timeout=instance['timeout']
        )
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def _collect_html_results(self, containers, pick_elements, source, limit,
                              base_url="", rewrite_href=None):
        """Turn result containers into result dicts.

        Args:
            containers: parsed elements, one per candidate result.
            pick_elements: callable mapping a container to
                ``(link_element, snippet_element)``; either may be None.
            source: value stored under the ``source`` key.
            limit: maximum number of *valid* results to return. Counting
                valid results (rather than slicing the containers first)
                keeps filtered-out entries from shrinking the result list.
            base_url: when given, relative hrefs are resolved against it.
            rewrite_href: optional callable applied to each raw href.
        """
        results = []
        for box in containers:
            if len(results) >= limit:
                break
            link_elem, snippet_elem = pick_elements(box)
            if link_elem is None:
                continue

            title = self._clean_text(link_elem.get_text())
            href = link_elem.get('href', '')
            if rewrite_href is not None:
                href = rewrite_href(href)
            if href and base_url:
                href = urljoin(base_url, href)
            snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem is not None else ''

            if self._is_valid_result(title, href):
                results.append({
                    'title': title,
                    'url': href,
                    'snippet': snippet[:200],
                    'source': source
                })
        return results

    def _try_searx_instance(self, instance, query, limit):
        """Query a Searx instance through its JSON API.

        Chinese queries ask for zh-CN results and include Baidu among the
        upstream engines. Returns a non-empty list of results, or None on a
        timeout, request error, non-200 status, unparsable body or no hits.
        """
        is_chinese = self._contains_chinese(query)

        params = {
            'q': query,
            'format': 'json',
            'engines': 'google,bing,duckduckgo,yandex' if not is_chinese else 'google,bing,yandex,baidu',
            'language': 'zh-CN' if is_chinese else 'auto'
        }
        headers = {
            "User-Agent": self._FULL_UA,
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8" if is_chinese else "en-US,en;q=0.9"
        }

        try:
            response = requests.get(
                instance['url'],
                params=params,
                headers=headers,
                timeout=instance['timeout']
            )
        except requests.Timeout:
            print(f"⚠️ {instance['name']} 请求超时")
            return None
        except Exception as e:
            print(f"⚠️ {instance['name']} 请求异常: {str(e)[:50]}")
            return None

        if response.status_code != 200:
            print(f"⚠️ Searx返回状态码: {response.status_code}")
            return None

        try:
            results = []
            for item in response.json().get('results', []):
                if len(results) >= limit:
                    break
                title = self._clean_text(item.get('title', ''))
                url = item.get('url', '')
                if self._is_valid_result(title, url):
                    results.append({
                        'title': title,
                        'url': url,
                        'snippet': self._clean_text(item.get('content', ''))[:200],
                        'source': f"{instance['name']}/{item.get('engine', 'unknown')}"
                    })
        except Exception as e:
            print(f"⚠️ 解析Searx响应失败: {str(e)[:50]}")
            return None

        return results or None

    def _try_duckduckgo_instance(self, instance, query, limit):
        """Query a DuckDuckGo HTML endpoint; request errors propagate."""
        params = {'q': query, 'kl': 'cn-zh'}
        headers = {"User-Agent": self._SHORT_UA, "Accept": self._HTML_ACCEPT}
        soup = self._fetch_soup(instance, params, headers)
        if soup is None:
            return None
        return self._extract_duckduckgo_results_from_soup(soup, limit)

    def _try_startpage_instance(self, instance, query, limit):
        """Query a Startpage endpoint; request errors propagate."""
        params = {
            'query': query,
            'cat': 'web',
            'pl': 'ext-ff',
            'extVersion': '1.3.0'
        }
        headers = {"User-Agent": self._SHORT_UA, "Accept": self._HTML_ACCEPT}
        soup = self._fetch_soup(instance, params, headers)
        if soup is None:
            return None
        return self._extract_startpage_results(soup, limit)

    def _try_qwant_instance(self, instance, query, limit):
        """Query a Qwant endpoint; request errors propagate."""
        params = {'q': query, 't': 'web', 'locale': 'zh_CN'}
        headers = {"User-Agent": self._SHORT_UA, "Accept": self._HTML_ACCEPT}
        soup = self._fetch_soup(instance, params, headers)
        if soup is None:
            return None
        return self._extract_qwant_results(soup, limit)

    def _try_brave_instance(self, instance, query, limit):
        """Query Brave Search; returns results, or None on any failure.

        NOTE(review): the CSS classes are guesses carried over from the
        original code and need checking against Brave's live markup.
        """
        params = {'q': query, 'source': 'web'}
        headers = {
            "User-Agent": self._FULL_UA,
            "Accept": self._HTML_ACCEPT,
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
        try:
            soup = self._fetch_soup(instance, params, headers)
            if soup is None:
                return None
            results = self._collect_html_results(
                soup.find_all('div', class_=['result', 'web-result']),
                lambda box: (
                    box.find('a') or box.find('h2'),
                    box.find('p') or box.find('span', class_='snippet'),
                ),
                'Brave',
                limit,
                base_url=instance['url'],
            )
        except Exception as e:
            # Still never raises, but the failure is no longer silent.
            print(f"⚠️ {instance.get('name', 'Brave')} 请求异常: {str(e)[:50]}")
            return None
        return results or None

    def _try_ecosia_instance(self, instance, query, limit):
        """Query Ecosia; returns results, or None on any failure.

        NOTE(review): the CSS classes are guesses carried over from the
        original code and need checking against Ecosia's live markup.
        """
        params = {'q': query}
        headers = {
            "User-Agent": self._FULL_UA,
            "Accept": self._HTML_ACCEPT,
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
        }
        try:
            soup = self._fetch_soup(instance, params, headers)
            if soup is None:
                return None
            results = self._collect_html_results(
                soup.find_all('div', class_=['result', 'web-result', 'result__body']),
                lambda box: (
                    box.find('a') or box.find('h2'),
                    box.find('p') or box.find('span', class_='result__snippet'),
                ),
                'Ecosia',
                limit,
                base_url=instance['url'],
            )
        except Exception as e:
            # Still never raises, but the failure is no longer silent.
            print(f"⚠️ {instance.get('name', 'Ecosia')} 请求异常: {str(e)[:50]}")
            return None
        return results or None

    # ------------------------------------------------------------------
    # HTML result extraction
    # ------------------------------------------------------------------

    def _extract_duckduckgo_results_from_soup(self, soup, limit):
        """Extract results from DuckDuckGo ``div.result`` containers.

        Redirect links are unwrapped to their target URL and
        protocol-relative links are made absolute.
        """
        return self._collect_html_results(
            soup.find_all('div', class_='result'),
            lambda box: (
                box.find('a', class_='result__a'),
                box.find('a', class_='result__snippet'),
            ),
            'DuckDuckGo',
            limit,
            base_url="https://duckduckgo.com/html/",
            rewrite_href=self._unwrap_duckduckgo_href,
        )

    def _extract_startpage_results(self, soup, limit):
        """Extract results from Startpage ``div.w-gl__result`` containers."""
        def pick(box):
            heading = box.find('h3')
            link = heading.find('a') if heading is not None else None
            return link, box.find('p', class_='w-gl__description')

        return self._collect_html_results(
            soup.find_all('div', class_='w-gl__result'), pick, 'Startpage', limit
        )

    def _extract_qwant_results(self, soup, limit):
        """Extract results from Qwant ``div.result`` containers."""
        return self._collect_html_results(
            soup.find_all('div', class_='result'),
            lambda box: (
                box.find('a', class_='result--web'),
                box.find('p', class_='result__desc'),
            ),
            'Qwant',
            limit,
        )

    def _extract_duckduckgo_results(self, soup, limit=5, include_suggestions=True):
        """Scrape outbound links from an arbitrary DuckDuckGo HTML page.

        Strategy, in order:
          1. every ``<a href>`` whose resolved target is an absolute http(s)
             URL outside duckduckgo.com (redirect links are unwrapped first);
          2. if fewer than two links were found, URLs that appear in the
             page text, titled by their domain;
          3. if there are still fewer than two and *include_suggestions* is
             true, Google/Bing links for ``self.last_query``.

        Args:
            soup: parsed page.
            limit: maximum number of results returned.
            include_suggestions: pass False to get only real hits, so the
                caller can detect "nothing found" and try another engine.

        Returns:
            De-duplicated (by URL) list of result dicts, each also carrying a
            ``link_element`` key (the source tag or None).
        """
        found = []

        for link in soup.find_all('a', href=True):
            href = self._unwrap_duckduckgo_href(link.get('href', ''))
            if not href:
                continue
            href = urljoin("https://duckduckgo.com/html/", href)
            parts = urlparse(href)
            if parts.scheme not in ('http', 'https'):
                continue  # javascript:, mailto:, fragments, ...
            if (parts.hostname or '').endswith('duckduckgo.com'):
                continue  # DuckDuckGo's own navigation

            title = self._clean_text(link.get_text(strip=True))
            if len(title) > 3 and self._is_valid_result(title, href):
                found.append({
                    'title': title,
                    'url': href,
                    'snippet': '',
                    'link_element': link
                })

        if len(found) < 2:
            print("⚠️ 外部链接较少,尝试文本提取")
            page_text = soup.get_text()
            for url in re.findall(r"https?://[^\s<>\"'()]+", page_text)[:limit]:
                domain = urlparse(url).netloc
                title = domain.replace('www.', '').title()
                if self._is_valid_result(title, url):
                    found.append({
                        'title': title,
                        'url': url,
                        'snippet': f'来自 {domain}',
                        'link_element': None
                    })

        if len(found) < 2 and include_suggestions:
            print("⚠️ 搜索结果有限,提供搜索建议")
            query = self.last_query
            encoded = quote_plus(query)
            found.extend([
                {
                    'title': f'在Google搜索 "{query}"',
                    'url': f'https://www.google.com/search?q={encoded}',
                    'snippet': '使用Google搜索引擎',
                    'link_element': None
                },
                {
                    'title': f'在Bing搜索 "{query}"',
                    'url': f'https://www.bing.com/search?q={encoded}',
                    'snippet': '使用Bing搜索引擎',
                    'link_element': None
                }
            ])

        seen_urls = set()
        unique = []
        for result in found:
            if len(unique) >= limit:
                break
            if result['url'] and result['url'] not in seen_urls:
                seen_urls.add(result['url'])
                unique.append(result)
        return unique

    def _fallback_extraction(self, soup, limit=5):
        """Last-resort extraction: headings first, then plain text lines.

        Returns up to *limit* results with empty ``url`` and ``snippet``.
        """
        results = []

        for tag in ["h1", "h2", "h3", "h4"]:
            if len(results) >= limit:
                break  # stop scanning further heading levels once full
            for elem in soup.find_all(tag):
                if len(results) >= limit:
                    break
                title = self._clean_text(elem.get_text(strip=True))
                if self._is_valid_result(title, ""):
                    results.append({"title": title, "url": "", "snippet": ""})

        if not results:
            for block in soup.get_text().split('\n'):
                if len(results) >= limit:
                    break
                block = self._clean_text(block)
                if 20 < len(block) < 150:
                    results.append({"title": block, "url": "", "snippet": ""})

        return results

    # ------------------------------------------------------------------
    # Page content preview
    # ------------------------------------------------------------------

    def _extract_content_from_url(self, url, max_length=300):
        """Fetch *url* and return a short plain-text preview.

        Returns:
            The preview (truncated to *max_length* plus ``...``), possibly an
            empty string when no usable sentence was found; the fixed string
            ``内容获取失败`` on a non-200 response; or ``内容提取失败: <reason>``
            when an exception occurred. Never raises.
        """
        try:
            headers = {
                "User-Agent": self._FULL_UA,
                "Accept": self._HTML_ACCEPT,
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
            }

            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code != 200:
                return self._FETCH_FAILED

            # Without a charset in Content-Type, requests assumes ISO-8859-1
            # for text/*, which garbles Chinese pages. Sniff the body instead.
            if not response.encoding or response.encoding.lower() == 'iso-8859-1':
                response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop markup that never holds article text.
            for node in soup(["script", "style", "nav", "footer", "header", "aside", "advertisement"]):
                node.decompose()

            # A separator keeps adjacent text nodes from being glued together.
            content = self._extract_main_content(soup) or soup.get_text(" ", strip=True)
            content = self._clean_and_format_content(content)

            if len(content) > max_length:
                return content[:max_length] + "..."
            return content

        except Exception as e:
            return f"{self._EXTRACT_FAILED_PREFIX}: {str(e)[:50]}"

    def _extract_main_content(self, soup):
        """Return the text of the first container that passes the quality check.

        Selectors are tried from most to least specific; returns ``""`` when
        none of them yields quality content.
        """
        selectors = (
            # 1. article markup
            'article', 'main article', '.article-content', '.post-content',
            # 2. main content region
            'main', '.main', '.content', '.main-content',
            # 3. common content class names
            '.entry-content', '.post-body', '.article-body', '.content-area',
            # 4. generic wrappers
            '.container', '.wrapper', '.page-content',
            # 5. whole body as a last resort
            'body',
        )
        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            text = element.get_text(" ", strip=True)
            if self._is_quality_content(text):
                return text
        return ""

    def _is_quality_content(self, content):
        """Return True when *content* looks like prose rather than a menu.

        Requires at least 50 characters and at least two sentences longer
        than 10 characters, split on Chinese or Western sentence terminators.
        Text shorter than ``_NAV_CHECK_MAX_LEN`` is also rejected when it
        contains a navigation word; longer text is not, because articles
        routinely mention words such as "about" or "home".
        """
        if not content or len(content) < 50:
            return False

        if len(content) < self._NAV_CHECK_MAX_LEN:
            lowered = content.lower()
            if self._NAV_WORDS_RE.search(lowered):
                return False
            if any(word in lowered for word in self._NAV_WORDS_CJK):
                return False

        sentences = self._SENTENCE_END_RE.split(content)
        meaningful = [s for s in sentences if len(s.strip()) > 10]
        return len(meaningful) >= 2

    def _clean_and_format_content(self, content):
        """Reduce page text to at most three medium-length sentences.

        Whitespace is collapsed and characters outside word characters, CJK
        ideographs and common ASCII/Chinese punctuation are removed. The text
        is then split on sentence terminators; the first three pieces whose
        length is strictly between 10 and 100 are joined with ``。``.
        Returns ``""`` when no piece qualifies.
        """
        if not content:
            return ""

        content = re.sub(r'\s+', ' ', content.strip())
        content = self._CONTENT_STRIP_RE.sub('', content)

        picked = []
        for sentence in self._SENTENCE_END_RE.split(content):
            sentence = sentence.strip()
            if 10 < len(sentence) < 100:
                picked.append(sentence)
                if len(picked) >= 3:
                    break

        return '。'.join(picked)

    def _enhance_search_results(self, results, limit=3):
        """Replace the snippets of the first *limit* results with page previews.

        The result dicts are updated in place: ``enhanced`` is True when a
        preview was fetched and stored in ``snippet``. When the fetch fails or
        yields nothing, the engine's own snippet is kept and ``enhanced`` is
        False. Results beyond *limit* are passed through untouched.

        Returns:
            A new list holding the same dicts in the same order.
        """
        enhanced = []

        for result in results[:limit]:
            url = result.get('url')
            fetched = False
            if url and url.startswith('http'):
                print(f"📄 提取内容: {result['title'][:30]}...")
                preview = self._extract_content_from_url(url)
                fetched = (
                    bool(preview)
                    and preview != self._FETCH_FAILED
                    and not preview.startswith(self._EXTRACT_FAILED_PREFIX)
                )
                if fetched:
                    result['snippet'] = preview
            result['enhanced'] = fetched
            enhanced.append(result)

        enhanced.extend(results[limit:])
        return enhanced

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def _search_duckduckgo_html(self, query, limit):
        """Query DuckDuckGo's HTML endpoint, retrying once.

        Returns real results only (no suggestion links); an empty list means
        the caller should fall back to another engine. A 202 response aborts
        immediately because that page needs JavaScript.
        """
        url = f"https://duckduckgo.com/html/?q={quote_plus(query)}"
        # No explicit Accept-Encoding: requests advertises exactly the codecs
        # it can decode. Forcing "br" yields an unreadable body when no
        # brotli package is installed.
        headers = {
            "User-Agent": self._FULL_UA,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1"
        }

        max_retries = 2  # kept small so the fallback engines are reached quickly
        for attempt in range(max_retries):
            try:
                print(f"🔍 尝试DuckDuckGo搜索: {query} (尝试 {attempt + 1}/{max_retries})")
                response = requests.get(url, headers=headers, timeout=10)

                if response.status_code == 202:
                    print("⚠️ DuckDuckGo返回202(需要JavaScript),切换到Searx...")
                    return []

                # Very short bodies are treated as failed responses.
                if response.status_code == 200 and len(response.text) >= 1000:
                    soup = BeautifulSoup(response.text, "html.parser")
                    results = self._extract_duckduckgo_results(
                        soup, limit, include_suggestions=False
                    )
                    if results:
                        print(f"✅ DuckDuckGo搜索成功,找到 {len(results)} 个结果")
                        return results
            except Exception as e:
                print(f"⚠️ DuckDuckGo尝试失败: {str(e)[:50]}")

            if attempt < max_retries - 1:
                time.sleep(1)

        return []

    def _format_results(self, results, query):
        """Render result dicts as a numbered, human-readable string."""
        if not results:
            return f"未找到关于 '{query}' 的搜索结果。请尝试使用不同的关键词。"

        blocks = []
        for i, result in enumerate(results, 1):
            text = f"{i}. {result['title']}"
            if result.get('url'):
                text += f"\n 🔗 {result['url']}"
            snippet = result.get('snippet')
            if snippet:
                if result.get('enhanced'):
                    text += f"\n 📄 内容预览: {snippet}"
                else:
                    text += f"\n 📝 {snippet}"
            blocks.append(text)
        return "\n\n".join(blocks)

    def run(self, parameters):
        """Search the web and return formatted results.

        Args:
            parameters: a dict with the query under ``"input"``, or the
                query itself. Non-string values are converted with ``str``.

        Returns:
            A formatted result list, or an error message when the query is
            empty. Chinese queries go straight to the multi-engine search;
            other queries try DuckDuckGo first. If nothing is found, Google
            and Bing search links are returned instead (and are not fetched
            for previews).
        """
        raw_query = parameters.get("input", "") if isinstance(parameters, dict) else parameters
        query = str(raw_query).strip() if raw_query else ""
        if not query:
            return "错误:搜索关键词不能为空"

        self.last_query = query  # read by _extract_duckduckgo_results
        limit = 5

        genuine = True  # False when `results` only holds suggestion links
        if self._contains_chinese(query):
            # DuckDuckGo tends to answer 202 here, so skip it.
            print("🌐 检测到中文查询,使用多引擎搜索策略...")
            results, genuine = self._search_searx(query, limit)
            if genuine and results:
                print(f"✅ 中文搜索成功,找到 {len(results)} 个结果")
        else:
            results = self._search_duckduckgo_html(query, limit)
            if not results:
                print("🌐 DuckDuckGo失败,切换到Searx搜索引擎...")
                results, genuine = self._search_searx(query, limit)
                if genuine and results:
                    print(f"✅ Searx搜索成功,找到 {len(results)} 个结果")

        if not results:
            results = self._get_search_suggestions(query)
            genuine = False

        if genuine:
            print("🚀 增强搜索结果,提取内容预览...")
            results = self._enhance_search_results(results, limit=3)

        return self._format_results(results, query)
|