repo_analyzer.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
# specialist/repo_analyzer.py
"""GitHub repository analysis specialist."""
  3. import re
  4. from typing import Dict, List, Optional
  5. import requests
  6. from hello_agents import HelloAgentsLLM
  7. class RepoAnalyzerAgent:
  8. """
  9. GitHub 仓库分析专家
  10. 功能:
  11. - 从 GitHub URL 提取仓库信息
  12. - 获取项目基本信息(描述、语言、stars等)
  13. - 获取并分析 README 内容
  14. - 识别技术栈
  15. - 推断前置知识要求
  16. """
  17. GITHUB_API_BASE = "https://api.github.com"
  18. def __init__(self, llm: HelloAgentsLLM, github_token: Optional[str] = None):
  19. """
  20. 初始化 RepoAnalyzerAgent
  21. Args:
  22. llm: HelloAgentsLLM 实例
  23. github_token: GitHub API Token(可选,用于提高速率限制)
  24. """
  25. self.llm = llm
  26. self.github_token = github_token
  27. self.headers = {}
  28. if github_token:
  29. self.headers["Authorization"] = f"token {github_token}"
  30. def _extract_repo_info(self, url: str) -> tuple[str, str]:
  31. """
  32. 从 GitHub URL 提取 owner 和 repo 名称
  33. Args:
  34. url: GitHub URL(如 https://github.com/vuejs/core)
  35. Returns:
  36. (owner, repo) 元组
  37. """
  38. # 去掉 .git 后缀
  39. url = url.rstrip(".git")
  40. # 提取 owner 和 repo
  41. parts = url.rstrip("/").split("/")
  42. if len(parts) >= 2:
  43. owner = parts[-2]
  44. repo = parts[-1]
  45. return owner, repo
  46. raise ValueError(f"无法解析 GitHub URL: {url}")
  47. def _fetch_repo_info(self, owner: str, repo: str) -> Dict:
  48. """
  49. 获取仓库基本信息
  50. Args:
  51. owner: 仓库所有者
  52. repo: 仓库名称
  53. Returns:
  54. 仓库信息字典
  55. """
  56. url = f"{self.GITHUB_API_BASE}/repos/{owner}/{repo}"
  57. response = requests.get(url, headers=self.headers, timeout=10)
  58. response.raise_for_status()
  59. return response.json()
  60. def _fetch_readme(self, owner: str, repo: str) -> Optional[str]:
  61. """
  62. 获取 README 内容
  63. Args:
  64. owner: 仓库所有者
  65. repo: 仓库名称
  66. Returns:
  67. README 文本内容,如果不存在则返回 None
  68. """
  69. try:
  70. url = f"{self.GITHUB_API_BASE}/repos/{owner}/{repo}/readme"
  71. response = requests.get(url, headers=self.headers, timeout=10)
  72. if response.status_code == 200:
  73. data = response.json()
  74. # README 内容是 base64 编码的
  75. import base64
  76. content = base64.b64decode(data["content"]).decode("utf-8")
  77. return content
  78. except Exception:
  79. pass
  80. return None
  81. def _extract_tech_stack_from_text(self, text: str) -> List[str]:
  82. """
  83. 从文本中提取技术栈关键词
  84. Args:
  85. text: 文本内容
  86. Returns:
  87. 技术栈列表
  88. """
  89. # 常见技术关键词
  90. tech_keywords = [
  91. "React",
  92. "Vue",
  93. "Angular",
  94. "Svelte",
  95. "TypeScript",
  96. "JavaScript",
  97. "Python",
  98. "Java",
  99. "Go",
  100. "Rust",
  101. "Node.js",
  102. "Django",
  103. "Flask",
  104. "FastAPI",
  105. "Express",
  106. "TensorFlow",
  107. "PyTorch",
  108. "Keras",
  109. "Docker",
  110. "Kubernetes",
  111. "MongoDB",
  112. "PostgreSQL",
  113. "MySQL",
  114. "Redis",
  115. "TailwindCSS",
  116. "Bootstrap",
  117. "CSS",
  118. "HTML",
  119. ]
  120. found_techs = []
  121. text_lower = text.lower()
  122. for tech in tech_keywords:
  123. if tech.lower() in text_lower:
  124. found_techs.append(tech)
  125. return found_techs
  126. def _analyze_with_llm(
  127. self, repo_info: Dict, readme: Optional[str]
  128. ) -> Dict[str, any]:
  129. """
  130. 使用 LLM 深度分析仓库
  131. Args:
  132. repo_info: 仓库基本信息
  133. readme: README 内容(可选)
  134. Returns:
  135. 分析结果字典
  136. """
  137. # 构建分析提示
  138. repo_name = repo_info.get("name", "unknown")
  139. description = repo_info.get("description", "")
  140. language = repo_info.get("language", "")
  141. topics = repo_info.get("topics", [])
  142. user_prompt = f"""请分析以下 GitHub 仓库并提取学习相关信息:
  143. 【仓库名称】
  144. {repo_name}
  145. 【描述】
  146. {description}
  147. 【主要语言】
  148. {language}
  149. 【主题标签】
  150. {', '.join(topics) if topics else '无'}
  151. """
  152. if readme:
  153. user_prompt += f"""
  154. 【README 内容】
  155. {readme[:2000]} # 限制长度
  156. """
  157. user_prompt += """
  158. 请提供以下信息(JSON格式):
  159. {
  160. "domain": "学习领域(如 web-development, data-science 等)",
  161. "tech_stack": ["技术1", "技术2", "..."],
  162. "prerequisites": ["前置知识1", "前置知识2", "..."],
  163. "learning_difficulty": "初级/中级/高级",
  164. "estimated_weeks": 学习所需周数(整数)
  165. }
  166. """
  167. messages = [
  168. {
  169. "role": "system",
  170. "content": "你是一个技术教育专家,擅长分析开源项目并提取学习相关信息。",
  171. },
  172. {"role": "user", "content": user_prompt},
  173. ]
  174. try:
  175. response = self.llm.invoke(messages)
  176. # 简化实现:返回基本信息(实际应该解析 LLM 返回的 JSON)
  177. return {
  178. "domain": repo_name.lower().replace("-", " "),
  179. "tech_stack": self._extract_tech_stack_from_text(
  180. description + " " + language
  181. ),
  182. "prerequisites": [],
  183. "learning_difficulty": "中级",
  184. "estimated_weeks": 4,
  185. }
  186. except Exception:
  187. # 降级:使用基于规则的分析
  188. return {
  189. "domain": repo_name.lower().replace("-", " "),
  190. "tech_stack": [language] if language else [],
  191. "prerequisites": [],
  192. "learning_difficulty": "中级",
  193. "estimated_weeks": 4,
  194. }
  195. def analyze(self, github_url: str) -> Dict[str, any]:
  196. """
  197. 分析 GitHub 仓库
  198. Args:
  199. github_url: GitHub 仓库 URL
  200. Returns:
  201. 分析结果字典,包含:
  202. - domain: 学习领域
  203. - tech_stack: 技术栈列表
  204. - prerequisites: 前置知识列表
  205. - description: 项目描述
  206. - language: 主要语言
  207. - stars: Star 数量
  208. """
  209. # 提取仓库信息
  210. owner, repo = self._extract_repo_info(github_url)
  211. # 获取基本信息
  212. repo_info = self._fetch_repo_info(owner, repo)
  213. # 获取 README
  214. readme = self._fetch_readme(owner, repo)
  215. # 提取技术栈(基于规则)
  216. tech_stack = []
  217. if repo_info.get("language"):
  218. tech_stack.append(repo_info["language"])
  219. if readme:
  220. tech_stack.extend(self._extract_tech_stack_from_text(readme))
  221. # 去重
  222. tech_stack = list(set(tech_stack))
  223. # 使用 LLM 深度分析(如果可用)
  224. llm_analysis = self._analyze_with_llm(repo_info, readme)
  225. # 合并结果
  226. result = {
  227. "domain": llm_analysis.get("domain", repo.lower().replace("-", " ")),
  228. "tech_stack": tech_stack,
  229. "prerequisites": llm_analysis.get("prerequisites", []),
  230. "description": repo_info.get("description", ""),
  231. "language": repo_info.get("language", ""),
  232. "stars": repo_info.get("stargazers_count", 0),
  233. "learning_difficulty": llm_analysis.get("learning_difficulty", "中级"),
  234. "estimated_weeks": llm_analysis.get("estimated_weeks", 4),
  235. }
  236. return result