1
0

data_context.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. """Dataset metadata extraction."""
  2. from __future__ import annotations
  3. import json
  4. from dataclasses import dataclass
  5. from pathlib import Path
  6. from typing import Any
  7. import pandas as pd
  8. @dataclass(frozen=True)
  9. class DataContextSummary:
  10. data_path: Path
  11. absolute_path: Path
  12. columns: list[str]
  13. dtypes: str
  14. shape: tuple[int, int]
  15. head_markdown: str
  16. sample_size_warning: str
  17. small_sample_warning: bool
  18. context_text: str
  19. input_kind: str = "tabular"
  20. background_literature_context: str = ""
  21. parsed_document_path: Path | None = None
  22. pdf_small_table_mode: bool = False
  23. candidate_table_count: int = 0
  24. selected_table_id: str = ""
  25. pdf_multi_table_mode: bool = False
  26. candidate_table_summaries_text: str = ""
  27. def _read_dataframe(data_path: Path) -> pd.DataFrame:
  28. suffix = data_path.suffix.lower()
  29. if suffix == ".csv":
  30. return pd.read_csv(data_path)
  31. if suffix in {".xls", ".xlsx"}:
  32. return pd.read_excel(data_path)
  33. raise ValueError(f"Unsupported data file format: {data_path.suffix}")
  34. def _normalize_background_text(text: str, *, limit: int = 2000) -> str:
  35. normalized = " ".join(str(text or "").split()).strip()
  36. return normalized[:limit]
  37. def _load_parsed_document_context(parsed_document_path: Path | None) -> tuple[str, Path | None, dict[str, object]]:
  38. if parsed_document_path is None or not parsed_document_path.exists():
  39. return "", None, {}
  40. try:
  41. payload = json.loads(parsed_document_path.read_text(encoding="utf-8"))
  42. except Exception:
  43. return "", parsed_document_path, {}
  44. if not isinstance(payload, dict):
  45. return "", parsed_document_path, {}
  46. background = payload.get("background_literature_context", "")
  47. if not background:
  48. background = payload.get("abstract", "")
  49. if not background:
  50. background = payload.get("text_excerpt", "")
  51. return _normalize_background_text(str(background or "")), parsed_document_path, payload
  52. def _extract_selected_table_metadata(
  53. parsed_payload: dict[str, object],
  54. ) -> tuple[int, str, tuple[int, int] | None, tuple[str, ...], tuple[str, ...], bool]:
  55. candidate_tables = parsed_payload.get("candidate_tables", [])
  56. selected_table_id = str(parsed_payload.get("selected_table_id", "") or "")
  57. pdf_multi_table_mode = bool(parsed_payload.get("pdf_multi_table_mode", False))
  58. if not isinstance(candidate_tables, list):
  59. return 0, selected_table_id, None, (), (), pdf_multi_table_mode
  60. selected_shape: tuple[int, int] | None = None
  61. selected_headers: tuple[str, ...] = ()
  62. selected_numeric_columns: tuple[str, ...] = ()
  63. for candidate in candidate_tables:
  64. if not isinstance(candidate, dict) or str(candidate.get("table_id", "") or "") != selected_table_id:
  65. continue
  66. shape = candidate.get("shape", [])
  67. if isinstance(shape, list) and len(shape) == 2:
  68. try:
  69. selected_shape = (int(shape[0]), int(shape[1]))
  70. except (TypeError, ValueError):
  71. selected_shape = None
  72. headers = candidate.get("headers", [])
  73. numeric_columns = candidate.get("numeric_columns", [])
  74. if isinstance(headers, list):
  75. selected_headers = tuple(str(item) for item in headers)
  76. if isinstance(numeric_columns, list):
  77. selected_numeric_columns = tuple(str(item) for item in numeric_columns)
  78. break
  79. return (
  80. len(candidate_tables),
  81. selected_table_id,
  82. selected_shape,
  83. selected_headers,
  84. selected_numeric_columns,
  85. pdf_multi_table_mode,
  86. )
  87. def _format_candidate_table_summaries(parsed_payload: dict[str, object], *, limit: int = 5) -> str:
  88. candidate_tables = parsed_payload.get("candidate_table_summaries", parsed_payload.get("candidate_tables", []))
  89. if not isinstance(candidate_tables, list) or not candidate_tables:
  90. return ""
  91. lines: list[str] = []
  92. for candidate in candidate_tables[:limit]:
  93. if not isinstance(candidate, dict):
  94. continue
  95. table_id = str(candidate.get("table_id", "") or "unknown")
  96. page_number = candidate.get("page_number", "?")
  97. shape = candidate.get("shape", [])
  98. headers = candidate.get("headers", [])
  99. numeric_columns = candidate.get("numeric_columns", [])
  100. content_hint = str(candidate.get("content_hint", "") or "").strip()
  101. selected = bool(candidate.get("selected_as_primary", False))
  102. shape_text = (
  103. f"{shape[0]} x {shape[1]}"
  104. if isinstance(shape, list) and len(shape) == 2
  105. else "unknown"
  106. )
  107. header_text = ", ".join(str(item) for item in headers[:6]) if isinstance(headers, list) else ""
  108. numeric_text = ", ".join(str(item) for item in numeric_columns[:6]) if isinstance(numeric_columns, list) else ""
  109. line = (
  110. f"- {table_id} | page={page_number} | shape={shape_text} | "
  111. f"headers={header_text or 'none'} | numeric_columns={numeric_text or 'none'} | "
  112. f"selected_as_primary={selected}"
  113. )
  114. if content_hint:
  115. line += f" | content_hint={content_hint}"
  116. lines.append(line)
  117. return "\n".join(lines)
  118. def _is_pdf_small_table(
  119. *,
  120. input_kind: str,
  121. selected_shape: tuple[int, int] | None,
  122. columns: list[str],
  123. selected_numeric_columns: tuple[str, ...],
  124. ) -> bool:
  125. if input_kind != "pdf" or selected_shape is None:
  126. return False
  127. rows, cols = selected_shape
  128. has_numeric = bool(selected_numeric_columns)
  129. has_text_label = len(columns) > len(selected_numeric_columns)
  130. return rows <= 30 and cols <= 10 and has_numeric and has_text_label
  131. def build_data_context(
  132. data_path: str | Path,
  133. *,
  134. input_kind: str = "tabular",
  135. parsed_document_path: str | Path | None = None,
  136. ) -> DataContextSummary:
  137. """Build a compact metadata-first prompt context for a local dataset."""
  138. path = Path(data_path)
  139. try:
  140. normalized_path = path.resolve().relative_to(Path.cwd().resolve())
  141. except ValueError:
  142. normalized_path = path
  143. df = _read_dataframe(path)
  144. absolute_path = path.resolve()
  145. columns = df.columns.tolist()
  146. dtypes = df.dtypes.to_string()
  147. shape = df.shape
  148. head_markdown = df.head().to_markdown(index=False)
  149. sample_size_warning = ""
  150. small_sample_warning = shape[0] < 30
  151. if small_sample_warning:
  152. sample_size_warning = (
  153. "WARNING / 红色警告:当前样本量极小 (N<30),强烈建议优先考虑非参数检验"
  154. "(如 Mann-Whitney U 检验),并对正态分布假设保持高度谨慎。"
  155. )
  156. literature_context, resolved_parsed_document, parsed_payload = _load_parsed_document_context(
  157. Path(parsed_document_path) if parsed_document_path is not None else None
  158. )
  159. (
  160. candidate_table_count,
  161. selected_table_id,
  162. selected_table_shape,
  163. _selected_table_headers,
  164. selected_table_numeric_columns,
  165. pdf_multi_table_mode,
  166. ) = _extract_selected_table_metadata(parsed_payload)
  167. candidate_table_summaries_text = _format_candidate_table_summaries(parsed_payload)
  168. pdf_small_table_mode = _is_pdf_small_table(
  169. input_kind=input_kind,
  170. selected_shape=selected_table_shape,
  171. columns=columns,
  172. selected_numeric_columns=selected_table_numeric_columns,
  173. )
  174. context_lines = [
  175. f"数据文件相对路径: {normalized_path.as_posix()}",
  176. f"数据文件绝对路径: {absolute_path.as_posix()}",
  177. f"输入类型: {input_kind}",
  178. f"数据列名: {columns}",
  179. f"数据类型:\n{dtypes}",
  180. f"数据规模: {shape}",
  181. ]
  182. if sample_size_warning:
  183. context_lines.append(sample_size_warning)
  184. if literature_context:
  185. context_lines.append(
  186. "<Background_Literature_Context>\n"
  187. f"{literature_context}\n"
  188. "</Background_Literature_Context>"
  189. )
  190. if candidate_table_summaries_text:
  191. context_lines.append(
  192. "<PDF_Candidate_Tables_Context>\n"
  193. f"candidate_table_count={candidate_table_count}\n"
  194. f"selected_table_id={selected_table_id or 'unknown'}\n"
  195. f"pdf_multi_table_mode={pdf_multi_table_mode}\n"
  196. f"{candidate_table_summaries_text}\n"
  197. "</PDF_Candidate_Tables_Context>"
  198. )
  199. if pdf_small_table_mode:
  200. context_lines.append(
  201. "<PDF_Small_Table_Mode>\n"
  202. "This is a PDF-derived small results table, often representing model comparison or compact experimental outcomes.\n"
  203. "Use a lightweight template: descriptive statistics, ranking, bootstrap confidence intervals, cautious correlation analysis, optional top-vs-bottom descriptive comparisons, and 2-4 light figures.\n"
  204. "The selected primary table is the only table for formal quantitative analysis. Other candidate tables are contextual evidence only and must not trigger extra significance testing by default.\n"
  205. "Do not run one-sample tests, do not treat distinct models as repeated observations from one population, and do not run group significance tests without repeated measurements or explicit experimental groups.\n"
  206. "</PDF_Small_Table_Mode>"
  207. )
  208. context_lines.append(f"前 5 行样本:\n{head_markdown}")
  209. context_text = "\n".join(context_lines).strip() + "\n"
  210. return DataContextSummary(
  211. data_path=normalized_path,
  212. absolute_path=absolute_path,
  213. columns=columns,
  214. dtypes=dtypes,
  215. shape=shape,
  216. head_markdown=head_markdown,
  217. sample_size_warning=sample_size_warning,
  218. small_sample_warning=small_sample_warning,
  219. context_text=context_text,
  220. input_kind=input_kind,
  221. background_literature_context=literature_context,
  222. parsed_document_path=resolved_parsed_document,
  223. pdf_small_table_mode=pdf_small_table_mode,
  224. candidate_table_count=candidate_table_count,
  225. selected_table_id=selected_table_id,
  226. pdf_multi_table_mode=pdf_multi_table_mode,
  227. candidate_table_summaries_text=candidate_table_summaries_text,
  228. )