# Source listing: N_gram.py (1.4 KB)

  1. import collections
  2. # 示例语料库,与上方案例讲解中的语料库保持一致
  3. corpus = "datawhale agent learns datawhale agent works"
  4. tokens = corpus.split()
  5. total_tokens = len(tokens)
  6. # --- 第一步:计算 P(datawhale) ---
  7. count_datawhale = tokens.count('datawhale')
  8. p_datawhale = count_datawhale / total_tokens
  9. print(f"第一步: P(datawhale) = {count_datawhale}/{total_tokens} = {p_datawhale:.3f}")
  10. # --- 第二步:计算 P(agent|datawhale) ---
  11. # 先计算 bigrams 用于后续步骤
  12. bigrams = zip(tokens, tokens[1:])
  13. bigram_counts = collections.Counter(bigrams)
  14. count_datawhale_agent = bigram_counts[('datawhale', 'agent')]
  15. # count_datawhale 已在第一步计算
  16. p_agent_given_datawhale = count_datawhale_agent / count_datawhale
  17. print(f"第二步: P(agent|datawhale) = {count_datawhale_agent}/{count_datawhale} = {p_agent_given_datawhale:.3f}")
  18. # --- 第三步:计算 P(learns|agent) ---
  19. count_agent_learns = bigram_counts[('agent', 'learns')]
  20. count_agent = tokens.count('agent')
  21. p_learns_given_agent = count_agent_learns / count_agent
  22. print(f"第三步: P(learns|agent) = {count_agent_learns}/{count_agent} = {p_learns_given_agent:.3f}")
  23. # --- 最后:将概率连乘 ---
  24. p_sentence = p_datawhale * p_agent_given_datawhale * p_learns_given_agent
  25. print(f"最后: P('datawhale agent learns') ≈ {p_datawhale:.3f} * {p_agent_given_datawhale:.3f} * {p_learns_given_agent:.3f} = {p_sentence:.3f}")