# human_verification_ui.py

"""
人工验证界面
使用Gradio创建Web界面,用于人工验证生成的AIME题目
"""
import json
import os
import sys
from datetime import datetime
from typing import List, Dict, Any, Tuple

import gradio as gr
  10. class HumanVerificationUI:
  11. """人工验证界面"""
  12. def __init__(self, data_path: str):
  13. """
  14. 初始化验证界面
  15. Args:
  16. data_path: 生成数据的JSON文件路径
  17. """
  18. self.data_path = data_path
  19. self.problems = self._load_problems()
  20. self.current_index = 0
  21. self.verifications = self._load_verifications()
  22. def _load_problems(self) -> List[Dict[str, Any]]:
  23. """加载题目数据"""
  24. if not os.path.exists(self.data_path):
  25. raise FileNotFoundError(f"数据文件不存在: {self.data_path}")
  26. with open(self.data_path, 'r', encoding='utf-8') as f:
  27. return json.load(f)
  28. def _load_verifications(self) -> Dict[str, Any]:
  29. """加载已有的验证结果"""
  30. verification_path = self.data_path.replace(".json", "_verifications.json")
  31. if os.path.exists(verification_path):
  32. with open(verification_path, 'r', encoding='utf-8') as f:
  33. return json.load(f)
  34. return {}
  35. def _save_verifications(self):
  36. """保存验证结果"""
  37. verification_path = self.data_path.replace(".json", "_verifications.json")
  38. with open(verification_path, 'w', encoding='utf-8') as f:
  39. json.dump(self.verifications, f, ensure_ascii=False, indent=2)
  40. def get_current_problem(self) -> Tuple[str, str, str, str, str, str]:
  41. """获取当前题目信息"""
  42. if not self.problems:
  43. return "无题目", "", "", "", "", "0/0"
  44. problem = self.problems[self.current_index]
  45. problem_id = problem.get("id", "unknown")
  46. # 获取已有的验证信息
  47. verification = self.verifications.get(problem_id, {})
  48. return (
  49. f"题目 {self.current_index + 1}/{len(self.problems)}",
  50. problem.get("problem", ""),
  51. f"答案: {problem.get('answer', 'N/A')}",
  52. problem.get("solution", ""),
  53. f"主题: {problem.get('topic', 'N/A')}",
  54. verification.get("comments", "")
  55. )
  56. def verify_problem(
  57. self,
  58. correctness: int,
  59. clarity: int,
  60. difficulty_match: int,
  61. completeness: int,
  62. status: str,
  63. comments: str
  64. ) -> str:
  65. """
  66. 验证当前题目
  67. Args:
  68. correctness: 正确性评分 (1-5)
  69. clarity: 清晰度评分 (1-5)
  70. difficulty_match: 难度匹配评分 (1-5)
  71. completeness: 完整性评分 (1-5)
  72. status: 验证状态 (approved/rejected/needs_revision)
  73. comments: 评论
  74. Returns:
  75. 验证结果消息
  76. """
  77. if not self.problems:
  78. return "❌ 无题目可验证"
  79. problem = self.problems[self.current_index]
  80. problem_id = problem.get("id", "unknown")
  81. # 保存验证结果
  82. self.verifications[problem_id] = {
  83. "problem_id": problem_id,
  84. "scores": {
  85. "correctness": correctness,
  86. "clarity": clarity,
  87. "difficulty_match": difficulty_match,
  88. "completeness": completeness
  89. },
  90. "total_score": (correctness + clarity + difficulty_match + completeness) / 4,
  91. "status": status,
  92. "comments": comments,
  93. "verified_at": datetime.now().isoformat()
  94. }
  95. self._save_verifications()
  96. return f"✅ 题目 {problem_id} 验证完成!\n总分: {self.verifications[problem_id]['total_score']:.2f}/5.0"
  97. def next_problem(self) -> Tuple[str, str, str, str, str, str]:
  98. """下一个题目"""
  99. if self.current_index < len(self.problems) - 1:
  100. self.current_index += 1
  101. return self.get_current_problem()
  102. def prev_problem(self) -> Tuple[str, str, str, str, str, str]:
  103. """上一个题目"""
  104. if self.current_index > 0:
  105. self.current_index -= 1
  106. return self.get_current_problem()
  107. def get_statistics(self) -> str:
  108. """获取验证统计信息"""
  109. if not self.verifications:
  110. return "暂无验证数据"
  111. total = len(self.problems)
  112. verified = len(self.verifications)
  113. approved = sum(1 for v in self.verifications.values() if v["status"] == "approved")
  114. rejected = sum(1 for v in self.verifications.values() if v["status"] == "rejected")
  115. needs_revision = sum(1 for v in self.verifications.values() if v["status"] == "needs_revision")
  116. avg_score = sum(v["total_score"] for v in self.verifications.values()) / verified if verified > 0 else 0
  117. return f"""
  118. 📊 验证统计
  119. 总题目数: {total}
  120. 已验证: {verified} ({verified/total*100:.1f}%)
  121. 未验证: {total - verified}
  122. 验证结果:
  123. - ✅ 通过: {approved}
  124. - ❌ 拒绝: {rejected}
  125. - 🔄 需修改: {needs_revision}
  126. 平均评分: {avg_score:.2f}/5.0
  127. """
  128. def launch(self, share: bool = False):
  129. """启动Gradio界面"""
  130. with gr.Blocks(title="AIME题目人工验证") as demo:
  131. gr.Markdown("# 🎯 AIME题目人工验证系统")
  132. gr.Markdown(f"数据文件: `{self.data_path}`")
  133. with gr.Row():
  134. with gr.Column(scale=2):
  135. # 题目显示区域
  136. title = gr.Textbox(label="当前题目", interactive=False)
  137. problem_text = gr.Textbox(label="问题描述", lines=5, interactive=False)
  138. answer_text = gr.Textbox(label="答案", interactive=False)
  139. solution_text = gr.Textbox(label="解答过程", lines=10, interactive=False)
  140. metadata_text = gr.Textbox(label="元数据", interactive=False)
  141. with gr.Column(scale=1):
  142. # 评分区域
  143. gr.Markdown("### 📝 评分 (1-5分)")
  144. correctness_slider = gr.Slider(1, 5, value=3, step=1, label="正确性")
  145. clarity_slider = gr.Slider(1, 5, value=3, step=1, label="清晰度")
  146. difficulty_slider = gr.Slider(1, 5, value=3, step=1, label="难度匹配")
  147. completeness_slider = gr.Slider(1, 5, value=3, step=1, label="完整性")
  148. # 状态选择
  149. gr.Markdown("### ✅ 验证状态")
  150. status_radio = gr.Radio(
  151. choices=["approved", "rejected", "needs_revision"],
  152. value="approved",
  153. label="状态"
  154. )
  155. # 评论
  156. comments_text = gr.Textbox(label="评论", lines=3, placeholder="请输入评论...")
  157. # 验证按钮
  158. verify_btn = gr.Button("✅ 提交验证", variant="primary")
  159. verify_result = gr.Textbox(label="验证结果", interactive=False)
  160. # 导航按钮
  161. with gr.Row():
  162. prev_btn = gr.Button("⬅️ 上一题")
  163. next_btn = gr.Button("下一题 ➡️")
  164. # 统计信息
  165. with gr.Row():
  166. stats_text = gr.Textbox(label="验证统计", lines=10, interactive=False)
  167. refresh_stats_btn = gr.Button("🔄 刷新统计")
  168. # 加载初始题目
  169. demo.load(
  170. fn=self.get_current_problem,
  171. outputs=[title, problem_text, answer_text, solution_text, metadata_text, comments_text]
  172. )
  173. # 绑定事件
  174. verify_btn.click(
  175. fn=self.verify_problem,
  176. inputs=[correctness_slider, clarity_slider, difficulty_slider, completeness_slider, status_radio, comments_text],
  177. outputs=verify_result
  178. )
  179. next_btn.click(
  180. fn=self.next_problem,
  181. outputs=[title, problem_text, answer_text, solution_text, metadata_text, comments_text]
  182. )
  183. prev_btn.click(
  184. fn=self.prev_problem,
  185. outputs=[title, problem_text, answer_text, solution_text, metadata_text, comments_text]
  186. )
  187. refresh_stats_btn.click(
  188. fn=self.get_statistics,
  189. outputs=stats_text
  190. )
  191. demo.launch(share=share, server_name="127.0.0.1", server_port=7860)
  192. if __name__ == "__main__":
  193. import sys
  194. if len(sys.argv) < 2:
  195. print("用法: python human_verification_ui.py <data_path>")
  196. print("示例: python human_verification_ui.py generated_data/aime_generated_20250110_120000.json")
  197. sys.exit(1)
  198. data_path = sys.argv[1]
  199. ui = HumanVerificationUI(data_path)
  200. ui.launch(share=False)