# moki/hello-agents — mirror of https://github.com/datawhalechina/hello-agents.git
# specialist/repo_analyzer.py
"""GitHub 仓库分析专家"""

import base64
import json
import re
from typing import Any, Dict, List, Optional

import requests
from hello_agents import HelloAgentsLLM


class RepoAnalyzerAgent:
    """
    GitHub repository analysis specialist.

    Responsibilities:
    - Extract the owner/repo pair from a GitHub URL
    - Fetch basic repository metadata (description, language, stars, ...)
    - Fetch the README and scan it for known technology keywords
    - Ask the LLM for learning-oriented metadata (domain, prerequisites,
      difficulty, estimated duration), degrading to rule-based defaults
      when the LLM is unavailable or its answer cannot be parsed
    """

    GITHUB_API_BASE = "https://api.github.com"

    # Timeout (seconds) applied to every GitHub API request.
    REQUEST_TIMEOUT = 10

    # Maximum number of README characters embedded in the LLM prompt.
    README_PROMPT_LIMIT = 2000

    # Technology names recognised by the rule-based scanner. Results are
    # reported in this order.
    _TECH_KEYWORDS = (
        "React",
        "Vue",
        "Angular",
        "Svelte",
        "TypeScript",
        "JavaScript",
        "Python",
        "Java",
        "Go",
        "Rust",
        "Node.js",
        "Django",
        "Flask",
        "FastAPI",
        "Express",
        "TensorFlow",
        "PyTorch",
        "Keras",
        "Docker",
        "Kubernetes",
        "MongoDB",
        "PostgreSQL",
        "MySQL",
        "Redis",
        "TailwindCSS",
        "Bootstrap",
        "CSS",
        "HTML",
    )

    # One case-insensitive pattern per keyword. The look-arounds only reject
    # ASCII letters, so "Java" no longer matches inside "JavaScript" and "Go"
    # no longer matches inside "algorithm", while "Python3", "HTML5" and
    # keywords embedded in CJK text (no spaces around them) still match.
    _TECH_PATTERNS = tuple(
        (
            tech,
            re.compile(
                rf"(?<![A-Za-z]){re.escape(tech)}(?![A-Za-z])", re.IGNORECASE
            ),
        )
        for tech in _TECH_KEYWORDS
    )

    def __init__(self, llm: "HelloAgentsLLM", github_token: Optional[str] = None):
        """
        Initialise the agent.

        Args:
            llm: HelloAgentsLLM instance used for the deep analysis step.
            github_token: Optional GitHub API token; when given it is sent as
                an ``Authorization`` header (raises the API rate limit).
        """
        self.llm = llm
        self.github_token = github_token
        self.headers = {}
        if github_token:
            self.headers["Authorization"] = f"token {github_token}"

    def _extract_repo_info(self, url: str) -> tuple[str, str]:
        """
        Extract the owner and repository name from a GitHub URL.

        Accepts HTTPS URLs (with or without a trailing slash, ``.git`` suffix,
        query string, fragment or deeper path such as ``/tree/main``), SSH
        URLs (``git@github.com:owner/repo.git``) and the bare ``owner/repo``
        form.

        Args:
            url: GitHub URL, e.g. https://github.com/vuejs/core

        Returns:
            ``(owner, repo)`` tuple.

        Raises:
            ValueError: If no owner/repo pair can be found in ``url``.
        """
        if not isinstance(url, str) or not url.strip():
            raise ValueError(f"无法解析 GitHub URL: {url}")

        # Drop any query string / fragment, then trailing slashes.
        cleaned = re.split(r"[?#]", url.strip(), maxsplit=1)[0].rstrip("/")

        match = re.search(
            r"github\.com[/:]([^/]+)/([^/]+)", cleaned, re.IGNORECASE
        )
        if match:
            # Take the first two path segments after the host so deep links
            # (".../owner/repo/tree/main") still resolve to the repository.
            owner, repo = match.group(1), match.group(2)
        elif "github.com" in cleaned.lower():
            # A GitHub URL without both segments (e.g. a user profile page).
            raise ValueError(f"无法解析 GitHub URL: {url}")
        else:
            # Fallback for the bare "owner/repo" form.
            parts = cleaned.split("/")
            if len(parts) < 2:
                raise ValueError(f"无法解析 GitHub URL: {url}")
            owner, repo = parts[-2], parts[-1]

        # removesuffix removes the literal ".git" only; str.rstrip(".git")
        # would strip any trailing '.', 'g', 'i', 't' characters and mangle
        # names such as "digit".
        repo = repo.removesuffix(".git")

        if not owner or not repo:
            raise ValueError(f"无法解析 GitHub URL: {url}")
        return owner, repo

    def _fetch_repo_info(self, owner: str, repo: str) -> Dict:
        """
        Fetch basic repository metadata from the GitHub API.

        Args:
            owner: Repository owner.
            repo: Repository name.

        Returns:
            The decoded JSON body of ``GET /repos/{owner}/{repo}``.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url = f"{self.GITHUB_API_BASE}/repos/{owner}/{repo}"
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def _fetch_readme(self, owner: str, repo: str) -> Optional[str]:
        """
        Fetch the README text of a repository (best effort).

        Args:
            owner: Repository owner.
            repo: Repository name.

        Returns:
            The README decoded as UTF-8, or ``None`` when the request fails,
            the status is not 200, or the payload cannot be decoded.
        """
        url = f"{self.GITHUB_API_BASE}/repos/{owner}/{repo}/readme"
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                return None
            payload = response.json()
            # The API delivers the README body base64-encoded.
            return base64.b64decode(payload["content"]).decode("utf-8")
        except (requests.RequestException, KeyError, TypeError, ValueError):
            # The README is optional input: network errors, malformed JSON,
            # a missing "content" field or undecodable bytes all mean
            # "no README" rather than a failed analysis.
            return None

    def _extract_tech_stack_from_text(self, text: str) -> List[str]:
        """
        Find known technology keywords in a piece of text.

        Matching is case-insensitive and requires that the keyword is not
        directly surrounded by ASCII letters.

        Args:
            text: Text to scan; empty or ``None`` yields an empty list.

        Returns:
            Matched keywords in the order of the keyword table, each at most
            once.
        """
        if not text:
            return []
        return [
            tech for tech, pattern in self._TECH_PATTERNS if pattern.search(text)
        ]

    @staticmethod
    def _parse_llm_response(response: Any) -> Dict[str, Any]:
        """
        Extract the validated analysis fields from an LLM answer.

        The answer may wrap the JSON object in prose or a Markdown code
        fence; the span from the first ``{`` to the last ``}`` is parsed.

        Args:
            response: Value returned by the LLM. Expected to be text;
                anything else is stringified first (TODO confirm the exact
                return type of ``HelloAgentsLLM.invoke``).

        Returns:
            A dict holding only the well-formed fields among ``domain``,
            ``tech_stack``, ``prerequisites``, ``learning_difficulty`` and
            ``estimated_weeks``; empty when nothing usable was found.
        """
        text = response if isinstance(response, str) else str(response)
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            return {}
        try:
            data = json.loads(text[start : end + 1])
        except ValueError:
            return {}
        if not isinstance(data, dict):
            return {}

        cleaned: Dict[str, Any] = {}

        for key in ("domain", "learning_difficulty"):
            value = data.get(key)
            if isinstance(value, str) and value.strip():
                cleaned[key] = value.strip()

        for key in ("tech_stack", "prerequisites"):
            value = data.get(key)
            if isinstance(value, list):
                items = [
                    item.strip()
                    for item in value
                    if isinstance(item, str) and item.strip()
                ]
                if items:
                    cleaned[key] = items

        weeks = data.get("estimated_weeks")
        # bool is a subclass of int, so exclude it explicitly; floats are
        # accepted only when integral (this also rejects NaN / Infinity).
        if isinstance(weeks, float) and weeks.is_integer():
            weeks = int(weeks)
        if isinstance(weeks, int) and not isinstance(weeks, bool) and weeks > 0:
            cleaned["estimated_weeks"] = weeks

        return cleaned

    def _analyze_with_llm(
        self, repo_info: Dict, readme: Optional[str]
    ) -> Dict[str, Any]:
        """
        Run the LLM-backed analysis of a repository.

        A rule-based baseline is computed first; every field the LLM returns
        in a valid form overrides the corresponding baseline value. If the
        LLM call raises, the baseline is returned unchanged.

        Args:
            repo_info: Repository metadata as returned by the GitHub API.
            readme: README text, if available.

        Returns:
            Dict with ``domain``, ``tech_stack``, ``prerequisites``,
            ``learning_difficulty`` and ``estimated_weeks``.
        """
        # GitHub reports missing values as JSON null, so `or` is used instead
        # of a .get() default to normalise None as well as absent keys.
        repo_name = repo_info.get("name") or "unknown"
        description = repo_info.get("description") or ""
        language = repo_info.get("language") or ""
        topics = repo_info.get("topics") or []

        user_prompt = f"""请分析以下 GitHub 仓库并提取学习相关信息：

【仓库名称】
{repo_name}

【描述】
{description}

【主要语言】
{language}

【主题标签】
{', '.join(str(topic) for topic in topics) if topics else '无'}

"""

        if readme:
            # Truncate so that a large README cannot blow up the prompt.
            user_prompt += f"""
【README 内容】
{readme[:self.README_PROMPT_LIMIT]}
"""

        user_prompt += """
请提供以下信息（JSON格式）：
{
  "domain": "学习领域（如 web-development, data-science 等）",
  "tech_stack": ["技术1", "技术2", "..."],
  "prerequisites": ["前置知识1", "前置知识2", "..."],
  "learning_difficulty": "初级/中级/高级",
  "estimated_weeks": 学习所需周数（整数）
}
"""

        messages = [
            {
                "role": "system",
                "content": "你是一个技术教育专家，擅长分析开源项目并提取学习相关信息。",
            },
            {"role": "user", "content": user_prompt},
        ]

        # Rule-based baseline: primary language plus keywords found in the
        # description, de-duplicated while keeping first-seen order.
        baseline_stack = [language] if language else []
        baseline_stack.extend(self._extract_tech_stack_from_text(description))
        analysis: Dict[str, Any] = {
            "domain": repo_name.lower().replace("-", " "),
            "tech_stack": list(dict.fromkeys(baseline_stack)),
            "prerequisites": [],
            "learning_difficulty": "中级",
            "estimated_weeks": 4,
        }

        try:
            response = self.llm.invoke(messages)
        except Exception:
            # Deliberate degradation boundary: any LLM/client failure falls
            # back to the rule-based analysis instead of aborting.
            return analysis

        analysis.update(self._parse_llm_response(response))
        return analysis

    def analyze(self, github_url: str) -> Dict[str, Any]:
        """
        Analyse a GitHub repository.

        Args:
            github_url: URL of the GitHub repository.

        Returns:
            Dict containing:
            - domain: learning domain
            - tech_stack: primary language followed by README keywords
              (de-duplicated, stable order)
            - prerequisites: list of prerequisite topics
            - description: project description
            - language: primary language
            - stars: star count
            - learning_difficulty: difficulty label
            - estimated_weeks: estimated study duration in weeks

        Raises:
            ValueError: If ``github_url`` cannot be parsed.
            requests.HTTPError: If the repository metadata request fails.
        """
        owner, repo = self._extract_repo_info(github_url)
        repo_info = self._fetch_repo_info(owner, repo)
        readme = self._fetch_readme(owner, repo)

        # Null JSON fields arrive as None; normalise them to empty values.
        language = repo_info.get("language") or ""

        # Rule-based technology stack.
        candidates = [language] if language else []
        if readme:
            candidates.extend(self._extract_tech_stack_from_text(readme))
        # dict.fromkeys de-duplicates while preserving order (a set would
        # make the output order vary between runs).
        tech_stack = list(dict.fromkeys(candidates))

        llm_analysis = self._analyze_with_llm(repo_info, readme)

        return {
            "domain": llm_analysis.get("domain", repo.lower().replace("-", " ")),
            "tech_stack": tech_stack,
            "prerequisites": llm_analysis.get("prerequisites", []),
            "description": repo_info.get("description") or "",
            "language": language,
            "stars": repo_info.get("stargazers_count") or 0,
            "learning_difficulty": llm_analysis.get("learning_difficulty", "中级"),
            "estimated_weeks": llm_analysis.get("estimated_weeks", 4),
        }