browser_tool.py 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from urllib.parse import quote_plus, urljoin
  4. import time
  5. import re
  6. import html
class BrowserTool:
    """Web search tool: queries search engines and returns formatted results."""

    # Identifier under which this tool is exposed.
    name = "browser_search"
    # Human-readable summary of the tool (runtime string, kept verbatim).
    description = "执行网页搜索(支持多种搜索引擎和内容提取)"
  10. def get_parameters(self):
  11. return {
  12. "input": {"type": "str", "description": "搜索关键词", "required": True}
  13. }
  14. def _is_valid_result(self, title, url):
  15. """验证搜索结果的有效性"""
  16. if not title or len(title.strip()) < 3:
  17. return False
  18. # 过滤导航链接和无意义内容
  19. skip_keywords = [
  20. "next", "previous", "more", "about", "help", "settings",
  21. "privacy", "terms", "feedback", "donate", "install",
  22. "download", "login", "register", "sign in", "sign up"
  23. ]
  24. title_lower = title.lower()
  25. if any(keyword in title_lower for keyword in skip_keywords):
  26. return False
  27. # 过滤广告和推广链接
  28. ad_indicators = ["ad", "sponsored", "promotion", "广告", "推广"]
  29. if any(indicator in title_lower for indicator in ad_indicators):
  30. return False
  31. return True
  32. def _clean_text(self, text):
  33. """清理文本内容"""
  34. if not text:
  35. return ""
  36. # 移除多余空白字符
  37. text = re.sub(r'\s+', ' ', text.strip())
  38. # 移除特殊字符
  39. text = re.sub(r'[^\w\s\u4e00-\u9fff.,!?;:()[\]{}"\'-]', '', text)
  40. return text[:200] # 限制长度
  41. def _search_searx(self, query, limit=5):
  42. """使用多个搜索引擎实例 - 稳定版,优先支持中文搜索"""
  43. # 精选多个稳定的搜索引擎,优先支持中文
  44. search_instances = [
  45. {
  46. "name": "Searx.xyz",
  47. "url": "https://searx.xyz/search",
  48. "timeout": 10,
  49. "type": "searx"
  50. },
  51. {
  52. "name": "Searx.be",
  53. "url": "https://searx.be/search",
  54. "timeout": 10,
  55. "type": "searx"
  56. },
  57. {
  58. "name": "Brave搜索",
  59. "url": "https://search.brave.com/search",
  60. "timeout": 8,
  61. "type": "brave"
  62. },
  63. {
  64. "name": "Ecosia",
  65. "url": "https://www.ecosia.org/search",
  66. "timeout": 8,
  67. "type": "ecosia"
  68. },
  69. {
  70. "name": "Qwant",
  71. "url": "https://www.qwant.com",
  72. "timeout": 8,
  73. "type": "qwant"
  74. }
  75. ]
  76. for instance in search_instances:
  77. try:
  78. print(f"🔍 尝试 {instance['name']}...")
  79. result = self._try_search_instance(instance, query, limit)
  80. if result and len(result) > 0:
  81. print(f"✅ {instance['name']} 搜索成功,找到 {len(result)} 个结果")
  82. return result, True
  83. except Exception as e:
  84. print(f"⚠️ {instance['name']} 失败: {str(e)[:50]}")
  85. continue # 静默失败,快速切换
  86. # 快速降级到搜索建议
  87. print("🔗 所有搜索引擎失败,提供搜索建议")
  88. return self._get_search_suggestions(query), True
  89. def _try_search_instance(self, instance, query, limit):
  90. """尝试单个搜索引擎实例"""
  91. if instance['type'] == 'searx':
  92. return self._try_searx_instance(instance, query, limit)
  93. elif instance['type'] == 'duckduckgo':
  94. return self._try_duckduckgo_instance(instance, query, limit)
  95. elif instance['type'] == 'startpage':
  96. return self._try_startpage_instance(instance, query, limit)
  97. elif instance['type'] == 'qwant':
  98. return self._try_qwant_instance(instance, query, limit)
  99. elif instance['type'] == 'brave':
  100. return self._try_brave_instance(instance, query, limit)
  101. elif instance['type'] == 'ecosia':
  102. return self._try_ecosia_instance(instance, query, limit)
  103. else:
  104. return None
  105. def _try_searx_instance(self, instance, query, limit):
  106. """尝试Searx实例 - 优化中文搜索支持"""
  107. # 检测是否为中文查询
  108. is_chinese = any('\u4e00' <= char <= '\u9fff' for char in query)
  109. params = {
  110. 'q': query,
  111. 'format': 'json',
  112. 'engines': 'google,bing,duckduckgo,yandex' if not is_chinese else 'google,bing,yandex,baidu',
  113. 'language': 'zh-CN' if is_chinese else 'auto'
  114. }
  115. headers = {
  116. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  117. "Accept": "application/json, text/plain, */*",
  118. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8" if is_chinese else "en-US,en;q=0.9"
  119. }
  120. try:
  121. response = requests.get(
  122. instance['url'],
  123. params=params,
  124. headers=headers,
  125. timeout=instance['timeout']
  126. )
  127. if response.status_code == 200:
  128. try:
  129. data = response.json()
  130. results = []
  131. for item in data.get('results', [])[:limit]:
  132. title = self._clean_text(item.get('title', ''))
  133. url = item.get('url', '')
  134. content = item.get('content', '')
  135. if self._is_valid_result(title, url):
  136. results.append({
  137. 'title': title,
  138. 'url': url,
  139. 'snippet': self._clean_text(content)[:200],
  140. 'source': f"{instance['name']}/{item.get('engine', 'unknown')}"
  141. })
  142. return results if results else None
  143. except Exception as e:
  144. print(f"⚠️ 解析Searx响应失败: {str(e)[:50]}")
  145. return None
  146. else:
  147. print(f"⚠️ Searx返回状态码: {response.status_code}")
  148. return None
  149. except requests.Timeout:
  150. print(f"⚠️ {instance['name']} 请求超时")
  151. return None
  152. except Exception as e:
  153. print(f"⚠️ {instance['name']} 请求异常: {str(e)[:50]}")
  154. return None
  155. def _try_duckduckgo_instance(self, instance, query, limit):
  156. """尝试DuckDuckGo实例"""
  157. params = {
  158. 'q': query,
  159. 'kl': 'cn-zh'
  160. }
  161. headers = {
  162. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
  163. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  164. }
  165. response = requests.get(
  166. instance['url'],
  167. params=params,
  168. headers=headers,
  169. timeout=instance['timeout']
  170. )
  171. if response.status_code == 200:
  172. soup = BeautifulSoup(response.text, 'html.parser')
  173. return self._extract_duckduckgo_results_from_soup(soup, limit)
  174. return None
  175. def _try_startpage_instance(self, instance, query, limit):
  176. """尝试Startpage实例"""
  177. params = {
  178. 'query': query,
  179. 'cat': 'web',
  180. 'pl': 'ext-ff',
  181. 'extVersion': '1.3.0'
  182. }
  183. headers = {
  184. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
  185. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  186. }
  187. response = requests.get(
  188. instance['url'],
  189. params=params,
  190. headers=headers,
  191. timeout=instance['timeout']
  192. )
  193. if response.status_code == 200:
  194. soup = BeautifulSoup(response.text, 'html.parser')
  195. return self._extract_startpage_results(soup, limit)
  196. return None
  197. def _try_qwant_instance(self, instance, query, limit):
  198. """尝试Qwant实例"""
  199. params = {
  200. 'q': query,
  201. 't': 'web',
  202. 'locale': 'zh_CN'
  203. }
  204. headers = {
  205. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
  206. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  207. }
  208. response = requests.get(
  209. instance['url'],
  210. params=params,
  211. headers=headers,
  212. timeout=instance['timeout']
  213. )
  214. if response.status_code == 200:
  215. soup = BeautifulSoup(response.text, 'html.parser')
  216. return self._extract_qwant_results(soup, limit)
  217. return None
  218. def _try_brave_instance(self, instance, query, limit):
  219. """尝试Brave搜索实例"""
  220. params = {
  221. 'q': query,
  222. 'source': 'web'
  223. }
  224. headers = {
  225. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  226. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  227. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
  228. }
  229. try:
  230. response = requests.get(
  231. instance['url'],
  232. params=params,
  233. headers=headers,
  234. timeout=instance['timeout']
  235. )
  236. if response.status_code == 200:
  237. soup = BeautifulSoup(response.text, 'html.parser')
  238. # Brave搜索结果提取(需要根据实际HTML结构调整)
  239. results = []
  240. result_divs = soup.find_all('div', class_=['result', 'web-result'])
  241. for div in result_divs[:limit]:
  242. title_elem = div.find('a') or div.find('h2')
  243. snippet_elem = div.find('p') or div.find('span', class_='snippet')
  244. if title_elem:
  245. title = self._clean_text(title_elem.get_text())
  246. url = title_elem.get('href', '')
  247. snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
  248. if self._is_valid_result(title, url):
  249. results.append({
  250. 'title': title,
  251. 'url': url,
  252. 'snippet': snippet[:200],
  253. 'source': 'Brave'
  254. })
  255. return results if results else None
  256. except Exception:
  257. return None
  258. return None
  259. def _try_ecosia_instance(self, instance, query, limit):
  260. """尝试Ecosia搜索实例"""
  261. params = {
  262. 'q': query
  263. }
  264. headers = {
  265. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  266. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  267. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
  268. }
  269. try:
  270. response = requests.get(
  271. instance['url'],
  272. params=params,
  273. headers=headers,
  274. timeout=instance['timeout']
  275. )
  276. if response.status_code == 200:
  277. soup = BeautifulSoup(response.text, 'html.parser')
  278. # Ecosia搜索结果提取(需要根据实际HTML结构调整)
  279. results = []
  280. result_divs = soup.find_all('div', class_=['result', 'web-result', 'result__body'])
  281. for div in result_divs[:limit]:
  282. title_elem = div.find('a') or div.find('h2')
  283. snippet_elem = div.find('p') or div.find('span', class_='result__snippet')
  284. if title_elem:
  285. title = self._clean_text(title_elem.get_text())
  286. url = title_elem.get('href', '')
  287. snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
  288. if self._is_valid_result(title, url):
  289. results.append({
  290. 'title': title,
  291. 'url': url,
  292. 'snippet': snippet[:200],
  293. 'source': 'Ecosia'
  294. })
  295. return results if results else None
  296. except Exception:
  297. return None
  298. return None
  299. def _extract_duckduckgo_results_from_soup(self, soup, limit):
  300. """从DuckDuckGo HTML中提取结果"""
  301. results = []
  302. # 查找搜索结果
  303. result_divs = soup.find_all('div', class_='result')
  304. for div in result_divs[:limit]:
  305. title_elem = div.find('a', class_='result__a')
  306. snippet_elem = div.find('a', class_='result__snippet')
  307. if title_elem:
  308. title = self._clean_text(title_elem.get_text())
  309. url = title_elem.get('href', '')
  310. snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
  311. if self._is_valid_result(title, url):
  312. results.append({
  313. 'title': title,
  314. 'url': url,
  315. 'snippet': snippet[:200],
  316. 'source': 'DuckDuckGo'
  317. })
  318. return results
  319. def _extract_startpage_results(self, soup, limit):
  320. """从Startpage HTML中提取结果"""
  321. results = []
  322. # 查找搜索结果
  323. result_divs = soup.find_all('div', class_='w-gl__result')
  324. for div in result_divs[:limit]:
  325. title_elem = div.find('h3')
  326. link_elem = title_elem.find('a') if title_elem else None
  327. snippet_elem = div.find('p', class_='w-gl__description')
  328. if link_elem:
  329. title = self._clean_text(link_elem.get_text())
  330. url = link_elem.get('href', '')
  331. snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
  332. if self._is_valid_result(title, url):
  333. results.append({
  334. 'title': title,
  335. 'url': url,
  336. 'snippet': snippet[:200],
  337. 'source': 'Startpage'
  338. })
  339. return results
  340. def _extract_qwant_results(self, soup, limit):
  341. """从Qwant HTML中提取结果"""
  342. results = []
  343. # 查找搜索结果
  344. result_divs = soup.find_all('div', class_='result')
  345. for div in result_divs[:limit]:
  346. title_elem = div.find('a', class_='result--web')
  347. snippet_elem = div.find('p', class_='result__desc')
  348. if title_elem:
  349. title = self._clean_text(title_elem.get_text())
  350. url = title_elem.get('href', '')
  351. snippet = self._clean_text(snippet_elem.get_text()) if snippet_elem else ''
  352. if self._is_valid_result(title, url):
  353. results.append({
  354. 'title': title,
  355. 'url': url,
  356. 'snippet': snippet[:200],
  357. 'source': 'Qwant'
  358. })
  359. return results
  360. def _extract_duckduckgo_results(self, soup, limit=5):
  361. """提取DuckDuckGo搜索结果"""
  362. results = []
  363. # DuckDuckGo现在返回202状态码,需要JavaScript渲染
  364. # 我们尝试从HTML中提取任何有用的信息
  365. # 方法1:查找所有外部链接
  366. all_links = soup.find_all('a', href=True)
  367. external_links = []
  368. for link in all_links:
  369. href = link.get('href', '')
  370. title = self._clean_text(link.get_text(strip=True))
  371. # 过滤外部链接(非DuckDuckGo内部链接)
  372. if (href and
  373. not href.startswith('javascript:') and
  374. not href.startswith('#') and
  375. 'duckduckgo.com' not in href and
  376. len(title) > 3 and
  377. self._is_valid_result(title, href)):
  378. external_links.append({
  379. 'title': title,
  380. 'url': href,
  381. 'snippet': '',
  382. 'link_element': link
  383. })
  384. # 方法2:如果外部链接不够,尝试从页面文本中提取信息
  385. if len(external_links) < 2:
  386. print("⚠️ 外部链接较少,尝试文本提取")
  387. # 查找页面中的主要文本内容
  388. text_content = soup.get_text()
  389. # 尝试提取URL模式
  390. import re
  391. url_pattern = r'https?://[^\s<>"\'()]+'
  392. urls = re.findall(url_pattern, text_content)
  393. for url in urls[:limit]:
  394. # 从URL中提取可能的标题
  395. domain = url.split('/')[2] if '/' in url else url
  396. title = domain.replace('www.', '').title()
  397. if self._is_valid_result(title, url):
  398. external_links.append({
  399. 'title': title,
  400. 'url': url,
  401. 'snippet': f'来自 {domain}',
  402. 'link_element': None
  403. })
  404. # 方法3:如果还是没有足够结果,提供搜索建议
  405. if len(external_links) < 2:
  406. print("⚠️ 搜索结果有限,提供搜索建议")
  407. suggestions = [
  408. {
  409. 'title': f'在Google搜索 "{self.last_query}"',
  410. 'url': f'https://www.google.com/search?q={self.last_query}',
  411. 'snippet': '使用Google搜索引擎',
  412. 'link_element': None
  413. },
  414. {
  415. 'title': f'在Bing搜索 "{self.last_query}"',
  416. 'url': f'https://www.bing.com/search?q={self.last_query}',
  417. 'snippet': '使用Bing搜索引擎',
  418. 'link_element': None
  419. }
  420. ]
  421. external_links.extend(suggestions)
  422. # 去重并限制结果数量
  423. seen_urls = set()
  424. unique_results = []
  425. for result in external_links:
  426. if result['url'] and result['url'] not in seen_urls:
  427. seen_urls.add(result['url'])
  428. unique_results.append(result)
  429. if len(unique_results) >= limit:
  430. break
  431. return unique_results
  432. def _extract_content_from_url(self, url, max_length=300):
  433. """从URL提取主要内容"""
  434. try:
  435. headers = {
  436. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  437. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  438. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
  439. }
  440. response = requests.get(url, headers=headers, timeout=10)
  441. if response.status_code != 200:
  442. return "内容获取失败"
  443. soup = BeautifulSoup(response.text, 'html.parser')
  444. # 移除脚本和样式标签
  445. for script in soup(["script", "style", "nav", "footer", "header", "aside", "advertisement"]):
  446. script.decompose()
  447. # 智能内容提取策略
  448. content = self._extract_main_content(soup)
  449. if not content:
  450. content = soup.get_text(strip=True)
  451. # 清理和优化内容
  452. content = self._clean_and_format_content(content)
  453. return content[:max_length] + "..." if len(content) > max_length else content
  454. except Exception as e:
  455. return f"内容提取失败: {str(e)[:50]}"
  456. def _extract_main_content(self, soup):
  457. """智能提取页面主要内容"""
  458. # 优先级策略:从最具体到最通用
  459. extraction_strategies = [
  460. # 1. 文章相关标签
  461. ['article', 'main article', '.article-content', '.post-content'],
  462. # 2. 主要内容区域
  463. ['main', '.main', '.content', '.main-content'],
  464. # 3. 常见内容类名
  465. ['.entry-content', '.post-body', '.article-body', '.content-area'],
  466. # 4. 通用容器
  467. ['.container', '.wrapper', '.page-content'],
  468. # 5. 最后尝试body
  469. ['body']
  470. ]
  471. for strategy in extraction_strategies:
  472. for selector in strategy:
  473. element = soup.select_one(selector)
  474. if element:
  475. content = element.get_text(strip=True)
  476. # 验证内容质量
  477. if self._is_quality_content(content):
  478. return content
  479. return ""
  480. def _is_quality_content(self, content):
  481. """验证内容质量"""
  482. if not content or len(content) < 50:
  483. return False
  484. # 过滤导航和菜单内容
  485. nav_keywords = ['导航', '菜单', '首页', '登录', '注册', '搜索', '联系', '关于', 'privacy', 'terms', 'home', 'login', 'register', 'contact', 'about']
  486. content_lower = content.lower()
  487. for keyword in nav_keywords:
  488. if keyword in content_lower:
  489. return False
  490. # 检查是否包含有意义的句子
  491. sentences = content.split('。')
  492. meaningful_sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
  493. return len(meaningful_sentences) >= 2
  494. def _clean_and_format_content(self, content):
  495. """清理和格式化内容"""
  496. if not content:
  497. return ""
  498. # 移除多余空白
  499. content = re.sub(r'\s+', ' ', content.strip())
  500. # 移除特殊字符,保留中文标点
  501. content = re.sub(r'[^\w\s\u4e00-\u9fff.,!?;:()[\]{}"\'。,!?:;()【】""''-]', '', content)
  502. # 移除重复的换行和空格
  503. content = re.sub(r'\n\s*\n', '\n', content)
  504. content = re.sub(r' {2,}', ' ', content)
  505. # 提取前几个有意义的句子
  506. sentences = re.split(r'[。!?.!?]', content)
  507. meaningful_sentences = []
  508. for sentence in sentences:
  509. sentence = sentence.strip()
  510. if len(sentence) > 10 and len(sentence) < 100: # 合理的句子长度
  511. meaningful_sentences.append(sentence)
  512. if len(meaningful_sentences) >= 3: # 最多3个句子
  513. break
  514. return '。'.join(meaningful_sentences)
  515. def _enhance_search_results(self, results, limit=3):
  516. """增强搜索结果,提取内容预览"""
  517. enhanced_results = []
  518. for i, result in enumerate(results):
  519. if i >= limit: # 只增强前几个结果
  520. break
  521. if result['url'] and result['url'].startswith('http'):
  522. print(f"📄 提取内容: {result['title'][:30]}...")
  523. content = self._extract_content_from_url(result['url'])
  524. result['snippet'] = content
  525. result['enhanced'] = True
  526. else:
  527. result['enhanced'] = False
  528. enhanced_results.append(result)
  529. # 添加未增强的结果
  530. enhanced_results.extend(results[limit:])
  531. return enhanced_results
  532. def _fallback_extraction(self, soup, limit=5):
  533. """备用结果提取方法"""
  534. results = []
  535. # 方法1:提取标题元素
  536. for tag in ["h1", "h2", "h3", "h4"]:
  537. elements = soup.find_all(tag)
  538. for elem in elements:
  539. if len(results) >= limit:
  540. break
  541. title = self._clean_text(elem.get_text(strip=True))
  542. if self._is_valid_result(title, ""):
  543. results.append({
  544. "title": title,
  545. "url": "",
  546. "snippet": ""
  547. })
  548. # 方法2:提取文本块
  549. if not results:
  550. text_blocks = soup.get_text().split('\n')
  551. for block in text_blocks:
  552. if len(results) >= limit:
  553. break
  554. block = self._clean_text(block)
  555. if len(block) > 20 and len(block) < 150:
  556. results.append({
  557. "title": block,
  558. "url": "",
  559. "snippet": ""
  560. })
  561. return results
  562. def run(self, parameters):
  563. # 确保参数处理的安全性
  564. if isinstance(parameters, dict):
  565. query = parameters.get("input", "")
  566. else:
  567. query = str(parameters) if parameters else ""
  568. # 参数验证
  569. if not query or not query.strip():
  570. return "错误:搜索关键词不能为空"
  571. query = query.strip()
  572. self.last_query = query # 保存查询用于建议
  573. limit = 5 # 增加结果数量
  574. # URL 编码查询参数
  575. encoded_query = quote_plus(query)
  576. url = f"https://duckduckgo.com/html/?q={encoded_query}"
  577. # 使用更真实的User-Agent
  578. headers = {
  579. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  580. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  581. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  582. "Accept-Encoding": "gzip, deflate, br",
  583. "DNT": "1",
  584. "Connection": "keep-alive",
  585. "Upgrade-Insecure-Requests": "1"
  586. }
  587. # 检测是否为中文查询
  588. is_chinese = any('\u4e00' <= char <= '\u9fff' for char in query)
  589. # 对于中文搜索,直接使用Searx搜索引擎,跳过DuckDuckGo(避免202问题)
  590. if is_chinese:
  591. print(f"🌐 检测到中文查询,使用多引擎搜索策略...")
  592. searx_results, searx_success = self._search_searx(query, limit)
  593. if searx_success and searx_results:
  594. results = searx_results
  595. search_engine = "Searx多引擎"
  596. print(f"✅ 中文搜索成功,找到 {len(results)} 个结果")
  597. else:
  598. # 如果Searx失败,提供搜索建议
  599. print("⚠️ 所有搜索引擎失败,提供搜索建议")
  600. results = self._get_search_suggestions(query)
  601. search_engine = "搜索建议"
  602. else:
  603. # 英文搜索:先尝试DuckDuckGo,失败后使用Searx
  604. max_retries = 2 # 减少重试次数,快速切换到Searx
  605. duckduckgo_success = False
  606. for attempt in range(max_retries):
  607. try:
  608. print(f"🔍 尝试DuckDuckGo搜索: {query} (尝试 {attempt + 1}/{max_retries})")
  609. response = requests.get(url, headers=headers, timeout=10)
  610. # DuckDuckGo经常返回202,直接跳过
  611. if response.status_code == 202:
  612. print("⚠️ DuckDuckGo返回202(需要JavaScript),切换到Searx...")
  613. break
  614. if response.status_code != 200:
  615. if attempt < max_retries - 1:
  616. time.sleep(1)
  617. continue
  618. break
  619. # 检查响应内容
  620. if len(response.text) < 1000:
  621. if attempt < max_retries - 1:
  622. time.sleep(1)
  623. continue
  624. break
  625. soup = BeautifulSoup(response.text, "html.parser")
  626. results = self._extract_duckduckgo_results(soup, limit)
  627. if results and len(results) > 0:
  628. duckduckgo_success = True
  629. search_engine = "DuckDuckGo"
  630. print(f"✅ DuckDuckGo搜索成功,找到 {len(results)} 个结果")
  631. break
  632. except Exception as e:
  633. print(f"⚠️ DuckDuckGo尝试失败: {str(e)[:50]}")
  634. if attempt < max_retries - 1:
  635. time.sleep(1)
  636. continue
  637. break
  638. # 如果DuckDuckGo失败,使用Searx
  639. if not duckduckgo_success:
  640. print("🌐 DuckDuckGo失败,切换到Searx搜索引擎...")
  641. searx_results, searx_success = self._search_searx(query, limit)
  642. if searx_success and searx_results:
  643. results = searx_results
  644. search_engine = "Searx多引擎"
  645. print(f"✅ Searx搜索成功,找到 {len(results)} 个结果")
  646. else:
  647. print("⚠️ 所有搜索引擎失败,提供搜索建议")
  648. results = self._get_search_suggestions(query)
  649. search_engine = "搜索建议"
  650. # 增强搜索结果(提取内容预览)
  651. if results:
  652. print("🚀 增强搜索结果,提取内容预览...")
  653. enhanced_results = self._enhance_search_results(results, limit=3)
  654. results = enhanced_results
  655. # 格式化输出结果
  656. if results:
  657. formatted_results = []
  658. for i, result in enumerate(results, 1):
  659. result_text = f"{i}. {result['title']}"
  660. if result['url']:
  661. result_text += f"\n 🔗 {result['url']}"
  662. if result['snippet']:
  663. # 如果是增强的结果,显示内容预览
  664. if result.get('enhanced'):
  665. result_text += f"\n 📄 内容预览: {result['snippet']}"
  666. else:
  667. result_text += f"\n 📝 {result['snippet']}"
  668. formatted_results.append(result_text)
  669. return "\n\n".join(formatted_results)
  670. else:
  671. return f"未找到关于 '{query}' 的搜索结果。请尝试使用不同的关键词。"
  672. def _get_search_suggestions(self, query):
  673. """快速提供搜索建议"""
  674. return [
  675. {
  676. 'title': f'Google搜索: {query}',
  677. 'url': f'https://www.google.com/search?q={query}',
  678. 'snippet': '使用Google搜索引擎',
  679. 'source': 'Google'
  680. },
  681. {
  682. 'title': f'Bing搜索: {query}',
  683. 'url': f'https://www.bing.com/search?q={query}',
  684. 'snippet': '使用Bing搜索引擎',
  685. 'source': 'Bing'
  686. }
  687. ]
  688. return "搜索失败,已多次重试。请稍后再试。"