text_truncation.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. """文本截断工具:在长度上限内优先在段落、换行、句号等处断开,减少半句截断。"""
  2. from __future__ import annotations
  3. def truncate_at_natural_boundary(text: str, max_chars: int, suffix: str = "…") -> str:
  4. """
  5. 将 text 截断至不超过 max_chars(含 suffix)。
  6. 优先级:双换行段落 > 单换行 > 句末标点(。!?;.!?,跳过疑似小数点)> 空格 > 硬截断。
  7. """
  8. if max_chars <= 0:
  9. return ""
  10. raw = text or ""
  11. if len(raw) <= max_chars:
  12. return raw
  13. suf = suffix or ""
  14. limit = max_chars - len(suf)
  15. if limit <= 0:
  16. return suf[:max_chars]
  17. window = raw[:limit]
  18. # 句末/空格截断时避免只剩极短前缀;段落边界(\n\n)不受此限,以免首段很短时无法断段
  19. min_pos = max(1, int(limit * 0.35))
  20. def _fits(head_end: int) -> bool:
  21. head = raw[:head_end].rstrip()
  22. return len(head) + len(suf) <= max_chars
  23. para = window.rfind("\n\n")
  24. if para >= 1 and _fits(para) and raw[:para].strip():
  25. return raw[:para].rstrip() + suf
  26. nl = window.rfind("\n")
  27. if nl >= min_pos and _fits(nl) and raw[:nl].strip():
  28. return raw[:nl].rstrip() + suf
  29. sentence_ends = "。!?;.!?"
  30. cut = -1
  31. for i in range(len(window) - 1, -1, -1):
  32. ch = window[i]
  33. if ch not in sentence_ends:
  34. continue
  35. if ch == ".":
  36. if i > 0 and window[i - 1].isdigit():
  37. continue
  38. if i + 1 < len(window) and window[i + 1].isdigit():
  39. continue
  40. cut = i + 1
  41. break
  42. if cut >= min_pos and _fits(cut):
  43. return raw[:cut].rstrip() + suf
  44. sp = window.rfind(" ")
  45. if sp >= min_pos and _fits(sp):
  46. return raw[:sp].rstrip() + suf
  47. return raw[:limit].rstrip() + suf