| 123456789101112131415161718192021222324252627282930 |
# Worked example: estimate P('datawhale agent learns') with a bigram model.
# Every probability is a maximum-likelihood estimate, i.e. a ratio of raw
# counts taken from the tiny corpus below; no smoothing is applied.
import collections

# Sample corpus, kept consistent with the corpus used in the worked example above.
corpus = "datawhale agent learns datawhale agent works"
tokens = corpus.split()
total_tokens = len(tokens)

# --- Step 1: compute P(datawhale) ---
# Unigram estimate: occurrences of the word divided by the corpus length.
count_datawhale = tokens.count('datawhale')
p_datawhale = count_datawhale / total_tokens
print(f"第一步: P(datawhale) = {count_datawhale}/{total_tokens} = {p_datawhale:.3f}")

# --- Step 2: compute P(agent|datawhale) ---
# Build the bigram counts first; they are reused by the following steps.
# Pairing the token list with itself shifted by one yields adjacent word pairs.
# NOTE: `bigrams` is a one-shot zip iterator and is consumed by Counter below.
bigrams = zip(tokens, tokens[1:])
bigram_counts = collections.Counter(bigrams)
count_datawhale_agent = bigram_counts[('datawhale', 'agent')]
# count_datawhale was already computed in step 1.
p_agent_given_datawhale = count_datawhale_agent / count_datawhale
print(f"第二步: P(agent|datawhale) = {count_datawhale_agent}/{count_datawhale} = {p_agent_given_datawhale:.3f}")

# --- Step 3: compute P(learns|agent) ---
count_agent_learns = bigram_counts[('agent', 'learns')]
count_agent = tokens.count('agent')
p_learns_given_agent = count_agent_learns / count_agent
print(f"第三步: P(learns|agent) = {count_agent_learns}/{count_agent} = {p_learns_given_agent:.3f}")

# --- Finally: multiply the probabilities together ---
# Sentence probability under the bigram approximation:
# P(w1) * P(w2|w1) * P(w3|w2).
p_sentence = p_datawhale * p_agent_given_datawhale * p_learns_given_agent
print(f"最后: P('datawhale agent learns') ≈ {p_datawhale:.3f} * {p_agent_given_datawhale:.3f} * {p_learns_given_agent:.3f} = {p_sentence:.3f}")
|