data_analysis.py 53 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294
  1. # data_analysis.py
  2. import os
  3. import numpy as np
  4. import pandas as pd
  5. import matplotlib.pyplot as plt
  6. from scipy.stats import pearsonr, f_oneway
  7. from hello_agents import ToolRegistry
  8. # 读取数据集
  9. work_path = os.path.dirname(os.path.abspath(__file__))
  10. df = pd.read_csv(f"{work_path}/../data/shopping_behavior_updated.csv")
  11. # 创建年龄段分组
  12. def age_group(age):
  13. if age < 20:
  14. return 'Teen (<20)'
  15. elif age < 30:
  16. return '20s'
  17. elif age < 40:
  18. return '30s'
  19. elif age < 50:
  20. return '40s'
  21. elif age < 60:
  22. return '50s'
  23. else:
  24. return 'Senior (60+)'
  25. df['Age Group'] = df['Age'].apply(age_group)
  26. def analyze_gender_preferences(input: str) -> dict:
  27. """分析不同性别的购物偏好,返回可序列化的Python数据类型"""
  28. # 性别分布
  29. gender_counts_dict = df['Gender'].value_counts().to_dict()
  30. # 按性别统计平均消费金额
  31. gender_spending_series = df.groupby('Gender')['Purchase Amount (USD)'].mean()
  32. gender_spending_dict = gender_spending_series.to_dict()
  33. # 按性别统计最受欢迎的商品类别
  34. gender_category = df.groupby(['Gender', 'Category']).size().unstack(fill_value=0)
  35. gender_category_percent = gender_category.div(gender_category.sum(axis=1), axis=0)
  36. # 转换为嵌套字典
  37. gender_category_dict = gender_category_percent.to_dict('index')
  38. # 准备返回值 - 全部使用Python原生数据类型
  39. result = {
  40. 'gender_distribution': gender_counts_dict,
  41. 'average_spending_by_gender': gender_spending_dict,
  42. 'category_preference_by_gender': gender_category_dict
  43. }
  44. # 可视化图表
  45. visualization_urls = []
  46. # 性别分布图
  47. plt.figure(figsize=(8, 5))
  48. plt.bar(gender_counts_dict.keys(), gender_counts_dict.values(), color=['blue', 'pink'])
  49. plt.title('Gender Distribution')
  50. plt.xlabel('Gender')
  51. plt.ylabel('Count')
  52. gender_distribution_path = 'figures/gender_distribution.png'
  53. plt.savefig(os.path.join(work_path, '../out', gender_distribution_path))
  54. plt.close()
  55. visualization_urls.append(gender_distribution_path)
  56. # 平均消费金额图
  57. plt.figure(figsize=(8, 5))
  58. plt.bar(gender_spending_dict.keys(), gender_spending_dict.values(), color=['blue', 'pink'])
  59. plt.title('Average Spending by Gender')
  60. plt.xlabel('Gender')
  61. plt.ylabel('Average Spending (USD)')
  62. average_spending_path = 'figures/average_spending_by_gender.png'
  63. plt.savefig(os.path.join(work_path, '../out', average_spending_path))
  64. plt.close()
  65. visualization_urls.append(average_spending_path)
  66. # 商品类别偏好图
  67. gender_category.plot(kind='bar', stacked=True, figsize=(10, 6))
  68. plt.title('Category Preference by Gender')
  69. plt.xlabel('Gender')
  70. plt.ylabel('Count')
  71. category_preference_path = 'figures/category_preference_by_gender.png'
  72. plt.savefig(os.path.join(work_path, '../out', category_preference_path))
  73. plt.close()
  74. visualization_urls.append(category_preference_path)
  75. result['visualization_url'] = visualization_urls
  76. return result
  77. def analyze_age_preferences(input: str) -> dict:
  78. age_group_counts = df['Age Group'].value_counts().sort_index()
  79. age_group_counts_dict = age_group_counts.to_dict()
  80. # 按年龄段统计平均消费金额
  81. age_spending = df.groupby('Age Group')['Purchase Amount (USD)'].mean().sort_index()
  82. age_spending_dict = age_spending.to_dict()
  83. # 按年龄段统计最受欢迎的商品类别
  84. age_category = df.groupby(['Age Group', 'Category']).size().unstack(fill_value=0)
  85. age_category_percent = age_category.div(age_category.sum(axis=1), axis=0)
  86. age_category_percent = age_category_percent.to_dict('index')
  87. result = {
  88. 'age_group_distribution': age_group_counts_dict,
  89. 'average_spending_by_age_group': age_spending_dict,
  90. 'category_preference_by_age_group': age_category_percent
  91. }
  92. # 可视化图表
  93. visualization_urls = []
  94. # 年龄段分布图
  95. plt.figure(figsize=(8, 5))
  96. plt.bar(age_group_counts_dict.keys(), age_group_counts_dict.values(), color='skyblue')
  97. plt.title('Age Group Distribution')
  98. plt.xlabel('Age Group')
  99. plt.ylabel('Count')
  100. age_distribution_path = 'figures/age_group_distribution.png'
  101. plt.savefig(os.path.join(work_path, '../out', age_distribution_path))
  102. plt.close()
  103. visualization_urls.append(age_distribution_path)
  104. # 平均消费金额图
  105. plt.figure(figsize=(8, 5))
  106. plt.bar(age_spending_dict.keys(), age_spending_dict.values(), color='lightgreen')
  107. plt.title('Average Spending by Age Group')
  108. plt.xlabel('Age Group')
  109. plt.ylabel('Average Spending (USD)')
  110. average_spending_path = 'figures/average_spending_by_age_group.png'
  111. plt.savefig(os.path.join(work_path, '../out', average_spending_path))
  112. plt.close()
  113. visualization_urls.append(average_spending_path)
  114. # 商品类别偏好图
  115. age_category.plot(kind='bar', stacked=True, figsize=(10, 6))
  116. plt.title('Category Preference by Age Group')
  117. plt.xlabel('Age Group')
  118. plt.ylabel('Count')
  119. category_preference_path = 'figures/category_preference_by_age_group.png'
  120. plt.savefig(os.path.join(work_path, '../out', category_preference_path))
  121. plt.close()
  122. visualization_urls.append(category_preference_path)
  123. result['visualization_url'] = visualization_urls # 添加可视化图表路径到结果
  124. return result
  125. def analyze_spending_differences(input: str) -> dict:
  126. # 按性别和年龄段分组统计
  127. gender_age_spending = df.groupby(['Gender', 'Age Group'])['Purchase Amount (USD)'].mean().unstack()
  128. gender_age_spending_dict = gender_age_spending.to_dict()
  129. # 按商品类别和年龄段分组统计
  130. category_age_spending = df.groupby(['Category', 'Age Group'])['Purchase Amount (USD)'].mean().unstack()
  131. category_age_spending_dict = category_age_spending.to_dict()
  132. result = {
  133. 'spending_by_gender_and_age': gender_age_spending_dict,
  134. 'spending_by_category_and_age': category_age_spending_dict
  135. }
  136. # 可视化图表
  137. visualization_urls = []
  138. # 性别和年龄段消费差异图
  139. plt.figure(figsize=(10, 6))
  140. gender_age_spending.plot(kind='bar', figsize=(10, 6))
  141. plt.title('Average Spending by Gender and Age Group')
  142. plt.xlabel('Age Group')
  143. plt.ylabel('Average Spending (USD)')
  144. plt.xticks(rotation=0)
  145. gender_age_spending_path = 'figures/average_spending_by_gender_and_age.png'
  146. plt.savefig(os.path.join(work_path, '../out', gender_age_spending_path))
  147. plt.close()
  148. visualization_urls.append(gender_age_spending_path)
  149. # 商品类别和年龄段消费差异图
  150. plt.figure(figsize=(10, 6))
  151. category_age_spending.plot(kind='bar', figsize=(10, 6))
  152. plt.title('Average Spending by Category and Age Group')
  153. plt.xlabel('Age Group')
  154. plt.ylabel('Average Spending (USD)')
  155. plt.xticks(rotation=0)
  156. category_age_spending_path = 'figures/average_spending_by_category_and_age.png'
  157. plt.savefig(os.path.join(work_path, '../out', category_age_spending_path))
  158. plt.close()
  159. visualization_urls.append(category_age_spending_path)
  160. result['visualization_url'] = visualization_urls # 添加可视化图表路径到结果
  161. return result
  162. def analyze_subscription_impact(input: str) -> dict:
  163. """
  164. 分析订阅状态对消费的影响
  165. 返回包含所有分析结果的字典
  166. """
  167. # 确保有Subscription Status列
  168. if 'Subscription Status' not in df.columns:
  169. return {"error": "数据中缺少Subscription Status列"}
  170. # 标准化订阅状态(处理大小写不一致)
  171. df['Subscription Status'] = df['Subscription Status'].str.strip().str.title()
  172. # 1. 基础统计:订阅用户与非订阅用户数量
  173. subscription_counts = df['Subscription Status'].value_counts().to_dict()
  174. # 2. 平均购买金额对比
  175. avg_purchase_by_subscription = df.groupby('Subscription Status')['Purchase Amount (USD)'].agg(['mean', 'std', 'count']).round(2)
  176. avg_purchase_dict = avg_purchase_by_subscription.to_dict('index')
  177. # 3. 之前购买次数对比
  178. prev_purchases_by_subscription = df.groupby('Subscription Status')['Previous Purchases'].agg(['mean', 'std', 'count']).round(2)
  179. prev_purchases_dict = prev_purchases_by_subscription.to_dict('index')
  180. # 4. 复购频率差异(如果Frequency of Purchases是数值类型)
  181. frequency_analysis = {}
  182. if 'Frequency of Purchases' in df.columns:
  183. # 创建频率映射(如果是分类数据)
  184. frequency_mapping = {
  185. 'Weekly': 52,
  186. 'Fortnightly': 26,
  187. 'Bi-Weekly': 26,
  188. 'Monthly': 12,
  189. 'Quarterly': 4,
  190. 'Every 3 Months': 4,
  191. 'Annually': 1
  192. }
  193. # 转换为数值频率
  194. df['Purchase_Frequency_Numeric'] = df['Frequency of Purchases'].map(frequency_mapping)
  195. frequency_by_subscription = df.groupby('Subscription Status')['Purchase_Frequency_Numeric'].agg(['mean', 'std', 'count']).round(2)
  196. frequency_analysis = frequency_by_subscription.to_dict('index')
  197. # 5. 统计显著性检验
  198. significance_tests = {}
  199. # 分离订阅和非订阅用户数据
  200. subscribed = df[df['Subscription Status'] == 'Yes']
  201. not_subscribed = df[df['Subscription Status'] == 'No']
  202. # 6. 效应大小计算(Cohen's d)
  203. effect_sizes = {}
  204. if len(subscribed) > 0 and len(not_subscribed) > 0:
  205. # 购买金额的效应大小
  206. mean_diff_amount = subscribed['Purchase Amount (USD)'].mean() - not_subscribed['Purchase Amount (USD)'].mean()
  207. pooled_std_amount = np.sqrt(
  208. (subscribed['Purchase Amount (USD)'].std()**2 + not_subscribed['Purchase Amount (USD)'].std()**2) / 2
  209. )
  210. cohens_d_amount = mean_diff_amount / pooled_std_amount if pooled_std_amount > 0 else 0
  211. # 之前购买次数的效应大小
  212. mean_diff_prev = subscribed['Previous Purchases'].mean() - not_subscribed['Previous Purchases'].mean()
  213. pooled_std_prev = np.sqrt(
  214. (subscribed['Previous Purchases'].std()**2 + not_subscribed['Previous Purchases'].std()**2) / 2
  215. )
  216. cohens_d_prev = mean_diff_prev / pooled_std_prev if pooled_std_prev > 0 else 0
  217. effect_sizes = {
  218. 'purchase_amount_cohens_d': round(cohens_d_amount, 3),
  219. 'previous_purchases_cohens_d': round(cohens_d_prev, 3),
  220. 'interpretation': {
  221. 'small': 0.2,
  222. 'medium': 0.5,
  223. 'large': 0.8
  224. }
  225. }
  226. # 7. 按订阅状态分组的其他指标
  227. additional_metrics = {}
  228. # 购买金额的百分位数对比
  229. percentiles = [25, 50, 75, 90]
  230. for status in ['Yes', 'No']:
  231. status_data = df[df['Subscription Status'] == status]['Purchase Amount (USD)']
  232. percentile_dict = {}
  233. for p in percentiles:
  234. percentile_dict[f'p{p}'] = round(status_data.quantile(p/100), 2)
  235. additional_metrics[f'purchase_percentiles_{status.lower()}'] = percentile_dict
  236. # 8. 订阅用户的价值分析
  237. value_analysis = {}
  238. if 'Yes' in subscription_counts and 'No' in subscription_counts:
  239. total_revenue_subscribed = subscribed['Purchase Amount (USD)'].sum()
  240. total_revenue_not_subscribed = not_subscribed['Purchase Amount (USD)'].sum()
  241. avg_revenue_per_customer_subscribed = total_revenue_subscribed / len(subscribed)
  242. avg_revenue_per_customer_not_subscribed = total_revenue_not_subscribed / len(not_subscribed)
  243. value_analysis = {
  244. 'total_revenue': {
  245. 'subscribed': round(total_revenue_subscribed, 2),
  246. 'not_subscribed': round(total_revenue_not_subscribed, 2),
  247. 'ratio': round(total_revenue_subscribed / total_revenue_not_subscribed, 2) if total_revenue_not_subscribed > 0 else 'N/A'
  248. },
  249. 'avg_revenue_per_customer': {
  250. 'subscribed': round(avg_revenue_per_customer_subscribed, 2),
  251. 'not_subscribed': round(avg_revenue_per_customer_not_subscribed, 2),
  252. 'difference': round(avg_revenue_per_customer_subscribed - avg_revenue_per_customer_not_subscribed, 2)
  253. }
  254. }
  255. # 9. 类别购买差异分析(按订阅状态)
  256. category_analysis = {}
  257. category_by_subscription = df.groupby(['Subscription Status', 'Category']).size().unstack(fill_value=0)
  258. # 计算每个类别中订阅用户的占比
  259. for category in category_by_subscription.columns:
  260. total_category = category_by_subscription[category].sum()
  261. if total_category > 0:
  262. subscribed_pct = (category_by_subscription.loc['Yes', category] / total_category * 100) if 'Yes' in category_by_subscription.index else 0
  263. not_subscribed_pct = (category_by_subscription.loc['No', category] / total_category * 100) if 'No' in category_by_subscription.index else 0
  264. category_analysis[category] = {
  265. 'subscribed_pct': round(subscribed_pct, 1),
  266. 'not_subscribed_pct': round(not_subscribed_pct, 1),
  267. 'subscribed_count': int(category_by_subscription.loc['Yes', category]) if 'Yes' in category_by_subscription.index else 0,
  268. 'not_subscribed_count': int(category_by_subscription.loc['No', category]) if 'No' in category_by_subscription.index else 0
  269. }
  270. # 整合所有结果到一个字典
  271. results = {
  272. 'basic_stats': {
  273. 'subscription_counts': subscription_counts,
  274. 'subscribed_percentage': round(subscription_counts.get('Yes', 0) / len(df) * 100, 1) if len(df) > 0 else 0
  275. },
  276. 'purchase_amount_comparison': avg_purchase_dict,
  277. 'previous_purchases_comparison': prev_purchases_dict,
  278. 'purchase_frequency_analysis': frequency_analysis,
  279. 'statistical_significance': significance_tests,
  280. 'effect_sizes': effect_sizes,
  281. 'percentile_analysis': additional_metrics,
  282. 'customer_value_analysis': value_analysis,
  283. 'category_preference_by_subscription': category_analysis,
  284. 'summary': {
  285. 'total_customers': len(df),
  286. 'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
  287. 'data_columns_used': ['Subscription Status', 'Purchase Amount (USD)', 'Previous Purchases', 'Frequency of Purchases', 'Category']
  288. }
  289. }
  290. # 可视化图表
  291. visualization_urls = []
  292. # 订阅用户与非订阅用户的平均购买金额对比图
  293. plt.figure(figsize=(8, 5))
  294. avg_purchase_by_subscription['mean'].plot(kind='bar', color=['blue', 'orange'])
  295. plt.title('Average Purchase Amount by Subscription Status')
  296. plt.xlabel('Subscription Status')
  297. plt.ylabel('Average Purchase Amount (USD)')
  298. purchase_amount_path = 'figures/average_purchase_by_subscription.png'
  299. plt.savefig(os.path.join(work_path, '../out', purchase_amount_path))
  300. plt.close()
  301. visualization_urls.append(purchase_amount_path)
  302. # 订阅用户与非订阅用户的之前购买次数对比图
  303. plt.figure(figsize=(8, 5))
  304. prev_purchases_by_subscription['mean'].plot(kind='bar', color=['blue', 'orange'])
  305. plt.title('Average Previous Purchases by Subscription Status')
  306. plt.xlabel('Subscription Status')
  307. plt.ylabel('Average Previous Purchases')
  308. previous_purchases_path = 'figures/average_previous_purchases_by_subscription.png'
  309. plt.savefig(os.path.join(work_path, '../out', previous_purchases_path))
  310. plt.close()
  311. visualization_urls.append(previous_purchases_path)
  312. results['visualization_url'] = visualization_urls # 添加可视化图表路径到结果
  313. return results
  314. def analyze_seasonal_preferences(input: str) -> dict:
  315. """
  316. 季节性商品偏好分析
  317. 按季节统计各商品类别的购买量及平均金额,找出各季节热销品类
  318. 参数:
  319. 返回:
  320. dict: 包含所有分析结果的字典
  321. """
  322. # 1. 数据预处理和验证
  323. required_columns = ['Season', 'Category', 'Purchase Amount (USD)']
  324. for col in required_columns:
  325. if col not in df.columns:
  326. return {"error": f"数据中缺少必要的列: {col}"}
  327. # 标准化季节名称
  328. season_mapping = {
  329. 'spring': 'Spring',
  330. 'summer': 'Summer',
  331. 'fall': 'Fall',
  332. 'winter': 'Winter',
  333. 'Spring': 'Spring',
  334. 'Summer': 'Summer',
  335. 'Fall': 'Fall',
  336. 'Winter': 'Winter'
  337. }
  338. df['Season'] = df['Season'].astype(str).str.strip().str.lower().map(lambda x: season_mapping.get(x, x))
  339. # 只保留有效的季节
  340. valid_seasons = ['Spring', 'Summer', 'Fall', 'Winter']
  341. # 2. 基础统计:各季节购买量分布
  342. seasonal_counts = df['Season'].value_counts().to_dict()
  343. total_purchases = len(df)
  344. # 3. 按季节和类别统计购买量和平均金额
  345. seasonal_analysis = {}
  346. for season in valid_seasons:
  347. season_data = df[df['Season'] == season]
  348. # 该季节的总购买量
  349. season_total = len(season_data)
  350. # 按类别统计
  351. category_stats = season_data.groupby('Category').agg({
  352. 'Purchase Amount (USD)': ['count', 'mean', 'sum', 'std']
  353. }).round(2)
  354. # 重命名列
  355. category_stats.columns = ['count', 'avg_amount', 'total_amount', 'std_amount']
  356. category_stats = category_stats.reset_index()
  357. # 转换为字典格式
  358. category_dict = {}
  359. for _, row in category_stats.iterrows():
  360. category = row['Category']
  361. category_dict[category] = {
  362. 'count': int(row['count']),
  363. 'percentage': round(row['count'] / season_total * 100, 1),
  364. 'avg_amount': float(row['avg_amount']),
  365. 'total_amount': float(row['total_amount']),
  366. 'std_amount': float(row['std_amount'])
  367. }
  368. # 找出该季节的热销品类(按购买量)
  369. top_categories_by_count = category_stats.nlargest(3, 'count')[['Category', 'count']].to_dict('records')
  370. top_categories_by_revenue = category_stats.nlargest(3, 'total_amount')[['Category', 'total_amount']].to_dict('records')
  371. # 季节特征分析
  372. season_summary = {
  373. 'total_purchases': int(season_total),
  374. 'percentage_of_total': round(season_total / total_purchases * 100, 1),
  375. 'total_revenue': float(season_data['Purchase Amount (USD)'].sum()),
  376. 'avg_transaction_value': float(season_data['Purchase Amount (USD)'].mean()),
  377. 'top_categories_by_count': top_categories_by_count,
  378. 'top_categories_by_revenue': top_categories_by_revenue,
  379. 'category_details': category_dict
  380. }
  381. seasonal_analysis[season] = season_summary
  382. # 4. 季节性趋势分析(跨季节对比)
  383. seasonal_trends = {}
  384. # 计算每个类别在不同季节的表现
  385. all_categories = df['Category'].unique()
  386. for category in all_categories:
  387. category_data = df[df['Category'] == category]
  388. category_season_stats = []
  389. for season in valid_seasons:
  390. season_cat_data = category_data[category_data['Season'] == season]
  391. if len(season_cat_data) > 0:
  392. stats = {
  393. 'season': season,
  394. 'count': len(season_cat_data),
  395. 'avg_amount': float(season_cat_data['Purchase Amount (USD)'].mean()),
  396. 'total_amount': float(season_cat_data['Purchase Amount (USD)'].sum()),
  397. 'percentage': round(len(season_cat_data) / len(category_data) * 100, 1)
  398. }
  399. category_season_stats.append(stats)
  400. # 找出该类别的最佳销售季节
  401. if category_season_stats:
  402. best_by_count = max(category_season_stats, key=lambda x: x['count'])
  403. best_by_revenue = max(category_season_stats, key=lambda x: x['total_amount'])
  404. seasonal_trends[category] = {
  405. 'total_purchases': len(category_data),
  406. 'seasonal_distribution': category_season_stats,
  407. 'best_season_by_count': {
  408. 'season': best_by_count['season'],
  409. 'count': best_by_count['count'],
  410. 'percentage': best_by_count['percentage']
  411. },
  412. 'best_season_by_revenue': {
  413. 'season': best_by_revenue['season'],
  414. 'total_amount': best_by_revenue['total_amount']
  415. },
  416. 'seasonality_index': calculate_seasonality_index(category_season_stats)
  417. }
  418. # 5. 季节性热点分析(找出具有明显季节性的品类)
  419. highly_seasonal_categories = []
  420. for category, trend in seasonal_trends.items():
  421. distribution = trend['seasonal_distribution']
  422. if len(distribution) >= 2: # 至少有两个季节的数据
  423. counts = [d['count'] for d in distribution]
  424. max_count = max(counts)
  425. min_count = min(counts)
  426. if min_count > 0: # 避免除以零
  427. seasonality_ratio = max_count / min_count
  428. if seasonality_ratio >= 2.0: # 季节性差异显著(最高季节是最低季节的2倍以上)
  429. highly_seasonal_categories.append({
  430. 'category': category,
  431. 'seasonality_ratio': round(seasonality_ratio, 2),
  432. 'peak_season': trend['best_season_by_count']['season'],
  433. 'peak_count': trend['best_season_by_count']['count']
  434. })
  435. # 按季节性比例排序
  436. highly_seasonal_categories.sort(key=lambda x: x['seasonality_ratio'], reverse=True)
  437. # 6. 跨季节对比:整体数据
  438. cross_season_comparison = {}
  439. # 按季节的整体表现
  440. seasonal_performance = []
  441. for season in valid_seasons:
  442. if season in seasonal_analysis:
  443. season_data = seasonal_analysis[season]
  444. seasonal_performance.append({
  445. 'season': season,
  446. 'total_purchases': season_data['total_purchases'],
  447. 'total_revenue': season_data['total_revenue'],
  448. 'avg_transaction_value': season_data['avg_transaction_value'],
  449. 'purchase_density': round(season_data['total_purchases'] / len(df[df['Season'] == season].index.unique()) if len(df[df['Season'] == season]) > 0 else 0, 2)
  450. })
  451. # 找出最高和最低销售季节
  452. if seasonal_performance:
  453. peak_season = max(seasonal_performance, key=lambda x: x['total_revenue'])
  454. low_season = min(seasonal_performance, key=lambda x: x['total_revenue'])
  455. cross_season_comparison = {
  456. 'seasonal_performance': seasonal_performance,
  457. 'peak_season': {
  458. 'season': peak_season['season'],
  459. 'total_revenue': peak_season['total_revenue'],
  460. 'reason': analyze_peak_season_reason(seasonal_analysis[peak_season['season']])
  461. },
  462. 'low_season': {
  463. 'season': low_season['season'],
  464. 'total_revenue': low_season['total_revenue']
  465. },
  466. 'revenue_variation': round((peak_season['total_revenue'] - low_season['total_revenue']) / low_season['total_revenue'] * 100, 1) if low_season['total_revenue'] > 0 else 0
  467. }
  468. # 7. 季节性营销建议
  469. marketing_recommendations = generate_seasonal_recommendations(seasonal_analysis, seasonal_trends, highly_seasonal_categories)
  470. # 8. 汇总结果
  471. results = {
  472. 'basic_stats': {
  473. 'total_purchases': int(total_purchases),
  474. 'seasons_covered': valid_seasons,
  475. 'purchases_by_season': seasonal_counts,
  476. 'categories_analyzed': list(all_categories)
  477. },
  478. 'seasonal_analysis': seasonal_analysis,
  479. 'category_seasonal_trends': seasonal_trends,
  480. 'highly_seasonal_categories': highly_seasonal_categories[:10], # 只返回前10个
  481. 'cross_season_comparison': cross_season_comparison,
  482. 'marketing_recommendations': marketing_recommendations,
  483. 'summary': {
  484. 'peak_season': cross_season_comparison.get('peak_season', {}).get('season', 'Unknown'),
  485. 'most_consistent_category': find_most_consistent_category(seasonal_trends),
  486. 'most_seasonal_category': highly_seasonal_categories[0]['category'] if highly_seasonal_categories else 'None',
  487. 'highest_avg_transaction_season': max(seasonal_performance, key=lambda x: x['avg_transaction_value'])['season'] if seasonal_performance else 'Unknown',
  488. 'analysis_timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
  489. }
  490. }
  491. # 可视化图表并保存路径到 results
  492. visualization_urls = []
  493. figures_dir = os.path.join(work_path, '../out', 'figures')
  494. os.makedirs(figures_dir, exist_ok=True)
  495. try:
  496. # 1) 各季节购买量柱状图
  497. plt.figure(figsize=(8,5))
  498. seasons = valid_seasons
  499. counts = [seasonal_counts.get(s, 0) for s in seasons]
  500. plt.bar(seasons, counts, color=['#66c2a5','#fc8d62','#8da0cb','#e78ac3'])
  501. plt.title('Purchases by Season')
  502. plt.ylabel('Purchases')
  503. path1 = 'figures/purchases_by_season.png'
  504. plt.savefig(os.path.join(work_path, '../out', path1), bbox_inches='tight')
  505. plt.close()
  506. visualization_urls.append(path1)
  507. except Exception:
  508. pass
  509. try:
  510. # 2) 季节性总体表现(总收入)柱状图
  511. if seasonal_performance:
  512. plt.figure(figsize=(8,5))
  513. seasons_perf = [s['season'] for s in seasonal_performance]
  514. revenues = [s['total_revenue'] for s in seasonal_performance]
  515. plt.bar(seasons_perf, revenues, color='steelblue')
  516. plt.title('Total Revenue by Season')
  517. plt.ylabel('Total Revenue (USD)')
  518. path2 = 'figures/total_revenue_by_season.png'
  519. plt.savefig(os.path.join(work_path, '../out', path2), bbox_inches='tight')
  520. plt.close()
  521. visualization_urls.append(path2)
  522. except Exception:
  523. pass
  524. try:
  525. # 3) 高季节性品类条形图(前10)
  526. if highly_seasonal_categories:
  527. top_seasonal = highly_seasonal_categories[:10]
  528. cats = [c['category'] for c in top_seasonal]
  529. ratios = [c['seasonality_ratio'] for c in top_seasonal]
  530. plt.figure(figsize=(10,5))
  531. plt.barh(cats[::-1], ratios[::-1], color='darkorange')
  532. plt.title('Top Highly Seasonal Categories (seasonality ratio)')
  533. plt.xlabel('Seasonality Ratio')
  534. path3 = 'figures/highly_seasonal_categories.png'
  535. plt.savefig(os.path.join(work_path, '../out', path3), bbox_inches='tight')
  536. plt.close()
  537. visualization_urls.append(path3)
  538. except Exception:
  539. pass
  540. try:
  541. # 4) 部分类目跨季节堆叠柱状图(取出现频率较高的前8类)
  542. sample_cats = list(seasonal_trends.keys())[:8]
  543. if sample_cats:
  544. matrix = {s: {season:0 for season in valid_seasons} for s in sample_cats}
  545. for cat in sample_cats:
  546. dist = seasonal_trends.get(cat, {}).get('seasonal_distribution', [])
  547. for d in dist:
  548. season = d.get('season')
  549. count = d.get('count', 0)
  550. if season in valid_seasons:
  551. matrix[cat][season] = count
  552. df_matrix = pd.DataFrame.from_dict(matrix, orient='index')[valid_seasons]
  553. plt.figure(figsize=(10,6))
  554. df_matrix.plot(kind='bar', stacked=True, figsize=(10,6), colormap='tab20')
  555. plt.title('Seasonal Distribution for Sample Categories')
  556. plt.xlabel('Category')
  557. plt.ylabel('Purchase Count')
  558. plt.xticks(rotation=45, ha='right')
  559. path4 = 'figures/sample_categories_seasonal_distribution.png'
  560. plt.savefig(os.path.join(work_path, '../out', path4), bbox_inches='tight')
  561. plt.close()
  562. visualization_urls.append(path4)
  563. except Exception:
  564. pass
  565. # 将图表路径加入结果字典(相对 out/ 下的路径列表)
  566. results['visualization_url'] = visualization_urls
  567. return results
  568. def calculate_seasonality_index(seasonal_stats):
  569. """计算季节性指数"""
  570. if not seasonal_stats:
  571. return 0
  572. counts = [s['count'] for s in seasonal_stats]
  573. avg_count = sum(counts) / len(counts)
  574. if avg_count == 0:
  575. return 0
  576. # 计算变异系数作为季节性指数
  577. variance = sum((c - avg_count) ** 2 for c in counts) / len(counts)
  578. std_dev = variance ** 0.5
  579. seasonality_index = std_dev / avg_count if avg_count > 0 else 0
  580. return round(seasonality_index, 3)
  581. def analyze_peak_season_reason(season_data):
  582. """分析高峰季节的原因"""
  583. top_categories = season_data['top_categories_by_count'][:2]
  584. reasons = []
  585. for cat in top_categories:
  586. category_name = cat['Category']
  587. category_details = season_data['category_details'].get(category_name, {})
  588. reasons.append(f"{category_name} ({cat['count']}次购买,占总数的{category_details.get('percentage', 0)}%)")
  589. return f"主要贡献品类: {', '.join(reasons)}"
  590. def analyze_monthly_trends():
  591. """分析月度趋势(如果有月份数据)"""
  592. monthly_insights = {}
  593. # 尝试从现有列中提取月份信息
  594. month_col = None
  595. for col in df.columns:
  596. if col.lower() in ['month', 'purchase_month', 'order_month']:
  597. month_col = col
  598. break
  599. if month_col:
  600. monthly_stats = df.groupby(month_col).agg({
  601. 'Purchase Amount (USD)': ['count', 'mean', 'sum']
  602. }).round(2)
  603. monthly_stats.columns = ['count', 'avg_amount', 'total_amount']
  604. monthly_stats = monthly_stats.reset_index()
  605. monthly_insights = monthly_stats.to_dict('records')
  606. return monthly_insights
  607. def generate_seasonal_recommendations(seasonal_analysis, seasonal_trends, highly_seasonal_categories):
  608. """生成季节性营销建议"""
  609. recommendations = []
  610. # 1. 库存管理建议
  611. for season, data in seasonal_analysis.items():
  612. top_categories = data['top_categories_by_count'][:3]
  613. if top_categories:
  614. categories_str = ', '.join([cat['Category'] for cat in top_categories])
  615. recommendations.append({
  616. 'season': season,
  617. 'type': '库存管理',
  618. 'recommendation': f"增加 {categories_str} 品类的库存",
  619. 'reason': f"该季节最受欢迎的品类,占总购买的{sum(data['category_details'][cat['Category']]['percentage'] for cat in top_categories if cat['Category'] in data['category_details']):.1f}%"
  620. })
  621. # 2. 促销活动建议
  622. for item in highly_seasonal_categories[:3]:
  623. recommendations.append({
  624. 'category': item['category'],
  625. 'type': '促销活动',
  626. 'recommendation': f"在{item['peak_season']}季节进行重点促销",
  627. 'reason': f"该品类在{item['peak_season']}季节的销量是其他季节的{item['seasonality_ratio']:.1f}倍"
  628. })
  629. # 3. 定价策略建议
  630. for season, data in seasonal_analysis.items():
  631. if data['avg_transaction_value'] > 0:
  632. # 找出该季节高价值品类
  633. high_value_categories = []
  634. for category, details in data['category_details'].items():
  635. if details['avg_amount'] > data['avg_transaction_value'] * 1.2: # 高于平均20%
  636. high_value_categories.append(category)
  637. if high_value_categories:
  638. recommendations.append({
  639. 'season': season,
  640. 'type': '定价策略',
  641. 'recommendation': f"对{', '.join(high_value_categories[:3])}品类进行溢价定价",
  642. 'reason': f"这些品类在该季节的平均交易价值较高 (${data['avg_transaction_value']:.2f}+)"
  643. })
  644. return recommendations
  645. def find_most_consistent_category(seasonal_trends):
  646. """找出最稳定的品类(季节性差异最小)"""
  647. if not seasonal_trends:
  648. return "None"
  649. most_consistent = None
  650. min_seasonality = float('inf')
  651. for category, trend in seasonal_trends.items():
  652. seasonality = trend.get('seasonality_index', 1.0)
  653. if seasonality < min_seasonality:
  654. min_seasonality = seasonality
  655. most_consistent = category
  656. return most_consistent
  657. def analyze_review_rating_impact(input: str) -> dict:
  658. """
  659. 评论评分与消费关联分析
  660. 参数:
  661. 返回:
  662. dict: 包含最重要分析结果的字典
  663. """
  664. # 1. 数据预处理和验证
  665. required_columns = ['Review Rating', 'Purchase Amount (USD)', 'Previous Purchases']
  666. for col in required_columns:
  667. if col not in df.columns:
  668. return {"error": f"数据中缺少必要的列: {col}"}
  669. # 数据清洗
  670. df_clean = df.copy()
  671. df_clean['Review Rating'] = pd.to_numeric(df_clean['Review Rating'], errors='coerce')
  672. df_clean = df_clean.dropna(subset=['Review Rating'])
  673. df_clean = df_clean[(df_clean['Review Rating'] >= 1) & (df_clean['Review Rating'] <= 5)]
  674. if len(df_clean) == 0:
  675. return {"error": "清洗后无有效数据"}
  676. # 2. 核心结果:评分组对比分析
  677. # 创建简化的评分区间
  678. def create_simple_rating_groups(rating):
  679. if rating >= 4.0:
  680. return 'High (4.0-5.0)'
  681. elif rating >= 3.0:
  682. return 'Medium (3.0-3.99)'
  683. else:
  684. return 'Low (1.0-2.99)'
  685. df_clean['Rating Group'] = df_clean['Review Rating'].apply(create_simple_rating_groups)
  686. # 评分组分析
  687. rating_group_analysis = {}
  688. for group in ['High (4.0-5.0)', 'Medium (3.0-3.99)', 'Low (1.0-2.99)']:
  689. if group in df_clean['Rating Group'].unique():
  690. group_data = df_clean[df_clean['Rating Group'] == group]
  691. rating_group_analysis[group] = {
  692. 'customer_count': int(len(group_data)),
  693. 'percentage': round(len(group_data) / len(df_clean) * 100, 1),
  694. 'avg_purchase_amount': round(float(group_data['Purchase Amount (USD)'].mean()), 2),
  695. 'avg_previous_purchases': round(float(group_data['Previous Purchases'].mean()), 1),
  696. 'total_revenue': round(float(group_data['Purchase Amount (USD)'].sum()), 2)
  697. }
  698. # 3. 核心结果:相关性分析
  699. correlation_results = {}
  700. if len(df_clean) >= 10:
  701. try:
  702. # 评分与购买金额的相关性
  703. corr_amount, p_value_amount = pearsonr(
  704. df_clean['Review Rating'],
  705. df_clean['Purchase Amount (USD)']
  706. )
  707. # 评分与之前购买次数的相关性
  708. corr_prev, p_value_prev = pearsonr(
  709. df_clean['Review Rating'],
  710. df_clean['Previous Purchases']
  711. )
  712. correlation_results = {
  713. 'rating_vs_purchase_amount': {
  714. 'correlation': round(corr_amount, 3),
  715. 'p_value': round(p_value_amount, 4),
  716. 'significant': p_value_amount < 0.05,
  717. 'strength': '强' if abs(corr_amount) >= 0.5 else '中' if abs(corr_amount) >= 0.3 else '弱'
  718. },
  719. 'rating_vs_previous_purchases': {
  720. 'correlation': round(corr_prev, 3),
  721. 'p_value': round(p_value_prev, 4),
  722. 'significant': p_value_prev < 0.05
  723. }
  724. }
  725. except:
  726. correlation_results = {'error': '相关性计算失败'}
  727. # 4. 核心结果:关键指标对比
  728. # 找出最高和最低评分组的差异
  729. key_comparisons = {}
  730. if len(rating_group_analysis) >= 2:
  731. high_group = rating_group_analysis.get('High (4.0-5.0)', {})
  732. low_group = rating_group_analysis.get('Low (1.0-2.99)', {})
  733. if high_group and low_group:
  734. amount_diff = high_group['avg_purchase_amount'] - low_group['avg_purchase_amount']
  735. prev_diff = high_group['avg_previous_purchases'] - low_group['avg_previous_purchases']
  736. key_comparisons = {
  737. 'high_vs_low_rating': {
  738. 'purchase_amount_difference': round(amount_diff, 2),
  739. 'purchase_amount_percentage_diff': round(amount_diff / low_group['avg_purchase_amount'] * 100, 1) if low_group['avg_purchase_amount'] > 0 else 0,
  740. 'previous_purchases_difference': round(prev_diff, 1),
  741. 'revenue_contribution_ratio': round(high_group['total_revenue'] / low_group['total_revenue'], 1) if low_group['total_revenue'] > 0 else 'N/A'
  742. }
  743. }
  744. # 5. 核心结果:业务洞察摘要
  745. insights = []
  746. # 评分分布洞察
  747. high_rating_percentage = rating_group_analysis.get('High (4.0-5.0)', {}).get('percentage', 0)
  748. if high_rating_percentage > 50:
  749. insights.append("超过一半的客户给出高评分(4.0+),表明总体满意度较高")
  750. elif high_rating_percentage < 30:
  751. insights.append("高评分客户比例较低,需要关注服务质量提升")
  752. # 消费差异洞察
  753. if key_comparisons and 'high_vs_low_rating' in key_comparisons:
  754. diff_info = key_comparisons['high_vs_low_rating']
  755. insights.append(f"高评分客户比低评分客户平均多消费${diff_info['purchase_amount_difference']:.2f} ({diff_info['purchase_amount_percentage_diff']:.1f}%)")
  756. # 相关性洞察
  757. if correlation_results and 'rating_vs_purchase_amount' in correlation_results:
  758. corr_info = correlation_results['rating_vs_purchase_amount']
  759. if corr_info['significant']:
  760. direction = "正" if corr_info['correlation'] > 0 else "负"
  761. insights.append(f"评分与消费金额存在{direction}相关关系({corr_info['strength']}相关,r={corr_info['correlation']:.2f})")
  762. # 6. 整合最重要的结果
  763. results = {
  764. 'overall_summary': {
  765. 'total_customers': int(len(df_clean)),
  766. 'avg_rating': round(float(df_clean['Review Rating'].mean()), 2),
  767. 'avg_purchase_amount': round(float(df_clean['Purchase Amount (USD)'].mean()), 2),
  768. 'avg_previous_purchases': round(float(df_clean['Previous Purchases'].mean()), 1)
  769. },
  770. 'rating_distribution': {
  771. 'high_rating_percentage': high_rating_percentage,
  772. 'rating_groups_summary': {
  773. group: {
  774. 'customer_count': data['customer_count'],
  775. 'percentage': data['percentage']
  776. }
  777. for group, data in rating_group_analysis.items()
  778. }
  779. },
  780. 'key_metrics_by_rating': {
  781. group: {
  782. 'avg_purchase_amount': data['avg_purchase_amount'],
  783. 'avg_previous_purchases': data['avg_previous_purchases'],
  784. 'total_revenue': data['total_revenue']
  785. }
  786. for group, data in rating_group_analysis.items()
  787. },
  788. 'correlation_analysis': correlation_results,
  789. 'key_comparisons': key_comparisons,
  790. 'top_insights': insights[:3] if insights else ["数据不足或无明显模式"],
  791. 'analysis_timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
  792. }
  793. # 可视化图表并保存路径到 results
  794. visualization_urls = []
  795. figures_dir = os.path.join(work_path, '../out', 'figures')
  796. os.makedirs(figures_dir, exist_ok=True)
  797. try:
  798. # 1) 评分分布柱状图(High/Med/Low)
  799. groups = ['High (4.0-5.0)', 'Medium (3.0-3.99)', 'Low (1.0-2.99)']
  800. counts = [rating_group_analysis.get(g, {}).get('customer_count', 0) for g in groups]
  801. plt.figure(figsize=(7,4))
  802. plt.bar(groups, counts, color=['#4CAF50','#FFD54F','#EF5350'])
  803. plt.title('Rating Group Distribution')
  804. plt.ylabel('Customer Count')
  805. path1 = 'figures/rating_group_distribution.png'
  806. plt.savefig(os.path.join(work_path, '../out', path1), bbox_inches='tight')
  807. plt.close()
  808. visualization_urls.append(path1)
  809. except Exception:
  810. pass
  811. try:
  812. # 2) 各评分组平均消费金额柱状图
  813. avg_amounts = [rating_group_analysis.get(g, {}).get('avg_purchase_amount', 0) for g in groups]
  814. plt.figure(figsize=(7,4))
  815. plt.bar(groups, avg_amounts, color=['#2E7D32','#F9A825','#C62828'])
  816. plt.title('Average Purchase Amount by Rating Group')
  817. plt.ylabel('Average Purchase Amount (USD)')
  818. path2 = 'figures/avg_purchase_by_rating_group.png'
  819. plt.savefig(os.path.join(work_path, '../out', path2), bbox_inches='tight')
  820. plt.close()
  821. visualization_urls.append(path2)
  822. except Exception:
  823. pass
  824. try:
  825. # 3) 评分 vs 购买金额 散点图(样本点)
  826. plt.figure(figsize=(7,5))
  827. plt.scatter(df_clean['Review Rating'], df_clean['Purchase Amount (USD)'], alpha=0.6, s=20)
  828. plt.xlabel('Review Rating')
  829. plt.ylabel('Purchase Amount (USD)')
  830. plt.title('Rating vs Purchase Amount')
  831. path3 = 'figures/rating_vs_purchase_scatter.png'
  832. plt.savefig(os.path.join(work_path, '../out', path3), bbox_inches='tight')
  833. plt.close()
  834. visualization_urls.append(path3)
  835. except Exception:
  836. pass
  837. try:
  838. # 4) 各评分组总收入柱状图
  839. totals = [rating_group_analysis.get(g, {}).get('total_revenue', 0) for g in groups]
  840. plt.figure(figsize=(7,4))
  841. plt.bar(groups, totals, color=['#66BB6A','#FFCA28','#EF5350'])
  842. plt.title('Total Revenue by Rating Group')
  843. plt.ylabel('Total Revenue (USD)')
  844. path4 = 'figures/total_revenue_by_rating_group.png'
  845. plt.savefig(os.path.join(work_path, '../out', path4), bbox_inches='tight')
  846. plt.close()
  847. visualization_urls.append(path4)
  848. except Exception:
  849. pass
  850. # 将图表路径加入结果字典(相对 out/ 路径列表)
  851. results['visualization_url'] = visualization_urls
  852. return results
  853. def analyze_payment_method_impact(input: str) -> dict:
  854. """
  855. 支付方式对购买金额的影响分析
  856. 参数:
  857. 返回:
  858. dict: 包含分析结果的字典
  859. """
  860. # 1. 数据验证
  861. required_columns = ['Payment Method', 'Purchase Amount (USD)']
  862. for col in required_columns:
  863. if col not in df.columns:
  864. return {"error": f"数据中缺少必要的列: {col}"}
  865. # 2. 数据清洗
  866. df_clean = df.copy()
  867. df_clean['Payment Method'] = df_clean['Payment Method'].astype(str).str.strip()
  868. # 过滤无效数据
  869. df_clean = df_clean[df_clean['Purchase Amount (USD)'] > 0]
  870. if len(df_clean) == 0:
  871. return {"error": "清洗后无有效数据"}
  872. # 3. 基础统计分析
  873. # 支付方式分布
  874. payment_counts = df_clean['Payment Method'].value_counts().to_dict()
  875. total_transactions = len(df_clean)
  876. # 按支付方式的统计
  877. payment_stats = {}
  878. for method, group in df_clean.groupby('Payment Method'):
  879. payment_stats[method] = {
  880. 'transaction_count': int(len(group)),
  881. 'percentage': round(len(group) / total_transactions * 100, 1),
  882. 'total_amount': round(float(group['Purchase Amount (USD)'].sum()), 2),
  883. 'avg_amount': round(float(group['Purchase Amount (USD)'].mean()), 2),
  884. 'median_amount': round(float(group['Purchase Amount (USD)'].median()), 2),
  885. 'std_amount': round(float(group['Purchase Amount (USD)'].std()), 2),
  886. 'min_amount': round(float(group['Purchase Amount (USD)'].min()), 2),
  887. 'max_amount': round(float(group['Purchase Amount (USD)'].max()), 2)
  888. }
  889. # 4. 关键对比:最高和最低平均金额
  890. avg_amounts = {method: stats['avg_amount'] for method, stats in payment_stats.items()}
  891. if avg_amounts:
  892. max_avg_method = max(avg_amounts, key=avg_amounts.get)
  893. min_avg_method = min(avg_amounts, key=avg_amounts.get)
  894. key_comparisons = {
  895. 'highest_avg_payment': {
  896. 'method': max_avg_method,
  897. 'amount': avg_amounts[max_avg_method],
  898. 'details': payment_stats[max_avg_method]
  899. },
  900. 'lowest_avg_payment': {
  901. 'method': min_avg_method,
  902. 'amount': avg_amounts[min_avg_method],
  903. 'details': payment_stats[min_avg_method]
  904. },
  905. 'difference': {
  906. 'amount_diff': round(avg_amounts[max_avg_method] - avg_amounts[min_avg_method], 2),
  907. 'percentage_diff': round((avg_amounts[max_avg_method] - avg_amounts[min_avg_method]) / avg_amounts[min_avg_method] * 100, 1) if avg_amounts[min_avg_method] > 0 else 0
  908. }
  909. }
  910. else:
  911. key_comparisons = {}
  912. # 5. 统计分析:ANOVA检验
  913. anova_results = {}
  914. if len(payment_stats) >= 2:
  915. try:
  916. # 准备各组数据
  917. groups = []
  918. for method in payment_stats.keys():
  919. group_data = df_clean[df_clean['Payment Method'] == method]['Purchase Amount (USD)'].values
  920. if len(group_data) >= 2: # 至少2个样本
  921. groups.append(group_data)
  922. if len(groups) >= 2:
  923. # 进行ANOVA检验
  924. f_stat, p_value = f_oneway(*groups)
  925. anova_results = {
  926. 'f_statistic': round(f_stat, 4),
  927. 'p_value': round(p_value, 6),
  928. 'significant': p_value < 0.05,
  929. 'interpretation': '不同支付方式间的购买金额存在显著差异' if p_value < 0.05 else '不同支付方式间的购买金额无显著差异'
  930. }
  931. except Exception as e:
  932. anova_results = {'error': f'ANOVA检验失败: {str(e)}'}
  933. # 6. 市场份额与金额贡献对比
  934. contribution_analysis = {}
  935. for method, stats in payment_stats.items():
  936. contribution_analysis[method] = {
  937. 'transaction_share': stats['percentage'],
  938. 'revenue_share': round(stats['total_amount'] / df_clean['Purchase Amount (USD)'].sum() * 100, 1),
  939. 'avg_transaction_value': stats['avg_amount']
  940. }
  941. # 7. 业务洞察
  942. insights = []
  943. # 支付方式偏好洞察
  944. max_transactions = max(payment_counts.values())
  945. most_popular = [m for m, c in payment_counts.items() if c == max_transactions][0]
  946. insights.append(f"最常用的支付方式: {most_popular} ({payment_counts[most_popular]}笔交易)")
  947. # 金额差异洞察
  948. if key_comparisons:
  949. diff = key_comparisons['difference']
  950. insights.append(f"{key_comparisons['highest_avg_payment']['method']}的平均交易额比{key_comparisons['lowest_avg_payment']['method']}高{diff['percentage_diff']:.1f}%")
  951. # 统计显著性洞察
  952. if anova_results and 'significant' in anova_results:
  953. if anova_results['significant']:
  954. insights.append("不同支付方式间的消费金额存在统计显著差异")
  955. else:
  956. insights.append("不同支付方式间的消费金额无显著差异")
  957. # 高价值支付方式识别
  958. for method, contrib in contribution_analysis.items():
  959. if contrib['revenue_share'] > contrib['transaction_share'] + 10: # 收入占比明显高于交易占比
  960. insights.append(f"{method}是高价值支付方式:贡献{contrib['revenue_share']}%的收入,但只占{contrib['transaction_share']}%的交易")
  961. # 8. 整合结果
  962. results = {
  963. 'overall_summary': {
  964. 'total_transactions': total_transactions,
  965. 'total_revenue': round(float(df_clean['Purchase Amount (USD)'].sum()), 2),
  966. 'avg_transaction_value': round(float(df_clean['Purchase Amount (USD)'].mean()), 2),
  967. 'unique_payment_methods': len(payment_stats)
  968. },
  969. 'payment_method_distribution': {
  970. 'transaction_counts': payment_counts,
  971. 'percentage_breakdown': {method: stats['percentage'] for method, stats in payment_stats.items()}
  972. },
  973. 'performance_by_payment_method': payment_stats,
  974. 'contribution_analysis': contribution_analysis,
  975. 'key_comparisons': key_comparisons,
  976. 'statistical_analysis': anova_results,
  977. 'business_insights': insights[:5]
  978. }
  979. # 可视化图表并保存路径到 results
  980. visualization_urls = []
  981. figures_dir = os.path.join(work_path, '../out', 'figures')
  982. os.makedirs(figures_dir, exist_ok=True)
  983. try:
  984. # 1) 支付方式交易次数柱状图
  985. plt.figure(figsize=(8,5))
  986. methods = list(payment_counts.keys())
  987. counts = [payment_counts[m] for m in methods]
  988. plt.bar(methods, counts, color='skyblue')
  989. plt.title('Transaction Counts by Payment Method')
  990. plt.xlabel('Payment Method')
  991. plt.ylabel('Transaction Count')
  992. plt.xticks(rotation=45, ha='right')
  993. path_a = 'figures/payment_method_transaction_counts.png'
  994. plt.savefig(os.path.join(work_path, '../out', path_a), bbox_inches='tight')
  995. plt.close()
  996. visualization_urls.append(path_a)
  997. except Exception:
  998. pass
  999. try:
  1000. # 2) 支付方式占比饼图
  1001. plt.figure(figsize=(6,6))
  1002. series_counts = pd.Series(payment_counts)
  1003. series_counts = series_counts.sort_values(ascending=False)
  1004. series_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, pctdistance=0.75)
  1005. plt.ylabel('')
  1006. plt.title('Payment Method Share')
  1007. path_b = 'figures/payment_method_share_pie.png'
  1008. plt.savefig(os.path.join(work_path, '../out', path_b), bbox_inches='tight')
  1009. plt.close()
  1010. visualization_urls.append(path_b)
  1011. except Exception:
  1012. pass
  1013. try:
  1014. # 3) 各支付方式平均交易额柱状图
  1015. plt.figure(figsize=(8,5))
  1016. methods_avg = list(avg_amounts.keys()) if 'avg_amounts' in locals() else list(payment_stats.keys())
  1017. avg_vals = [payment_stats[m]['avg_amount'] if m in payment_stats else 0 for m in methods_avg]
  1018. plt.bar(methods_avg, avg_vals, color='seagreen')
  1019. plt.title('Average Transaction Value by Payment Method')
  1020. plt.xlabel('Payment Method')
  1021. plt.ylabel('Average Amount (USD)')
  1022. plt.xticks(rotation=45, ha='right')
  1023. path_c = 'figures/avg_transaction_value_by_payment_method.png'
  1024. plt.savefig(os.path.join(work_path, '../out', path_c), bbox_inches='tight')
  1025. plt.close()
  1026. visualization_urls.append(path_c)
  1027. except Exception:
  1028. pass
  1029. try:
  1030. # 4) 支付方式金额分布箱线图(若样本量允许)
  1031. grouped = []
  1032. labels = []
  1033. for method in payment_stats.keys():
  1034. vals = df_clean[df_clean['Payment Method'] == method]['Purchase Amount (USD)'].dropna().values
  1035. if len(vals) >= 3:
  1036. grouped.append(vals)
  1037. labels.append(method)
  1038. if grouped:
  1039. plt.figure(figsize=(10,6))
  1040. plt.boxplot(grouped, tick_labels=labels, vert=True, patch_artist=True)
  1041. plt.title('Purchase Amount Distribution by Payment Method')
  1042. plt.ylabel('Purchase Amount (USD)')
  1043. plt.xticks(rotation=45, ha='right')
  1044. path_d = 'figures/purchase_amount_boxplot_by_payment_method.png'
  1045. plt.savefig(os.path.join(work_path, '../out', path_d), bbox_inches='tight')
  1046. plt.close()
  1047. visualization_urls.append(path_d)
  1048. except Exception:
  1049. pass
  1050. # 将图表路径加入结果字典(相对 out/ 下的路径列表)
  1051. results['visualization_url'] = visualization_urls
  1052. return results
  1053. def create_data_analysis_registry():
  1054. """创建数据分析工具注册表"""
  1055. tool_registry = ToolRegistry()
  1056. # 注册数据分析工具
  1057. tool_registry.register_function(
  1058. name="Gender Preference Analysis",
  1059. func=analyze_gender_preferences,
  1060. description="分析不同性别的购物偏好,包括消费金额和商品类别偏好。"
  1061. )
  1062. tool_registry.register_function(
  1063. name="Age Preference Analysis",
  1064. func=analyze_age_preferences,
  1065. description="分析不同年龄段的购物偏好,包括消费金额和商品类别偏好。"
  1066. )
  1067. tool_registry.register_function(
  1068. name="Spending Differences Analysis",
  1069. func=analyze_spending_differences,
  1070. description="分析不同性别和年龄段在各商品类别上的消费差异。"
  1071. )
  1072. tool_registry.register_function(
  1073. name="Subscription Impact Analysis",
  1074. func=analyze_subscription_impact,
  1075. description="分析订阅状态对用户购买行为和消费金额的影响。"
  1076. )
  1077. tool_registry.register_function(
  1078. name="Seasonal Preference Analysis",
  1079. func=analyze_seasonal_preferences,
  1080. description="分析不同季节的商品购买偏好,找出各季节热销品类。"
  1081. )
  1082. tool_registry.register_function(
  1083. name="Review Rating Impact Analysis",
  1084. func=analyze_review_rating_impact,
  1085. description="分析评论评分对用户购买金额和购买频率的影响。"
  1086. )
  1087. tool_registry.register_function(
  1088. name="Payment Method Impact Analysis",
  1089. func=analyze_payment_method_impact,
  1090. description="分析不同支付方式对用户购买金额的影响。"
  1091. )
  1092. return tool_registry
  1093. if __name__ == "__main__":
  1094. registry = create_data_analysis_registry()
  1095. result = registry.execute_tool("Payment Method Impact Analysis", input_text=None)
  1096. print(f"\n分析结果:{result}")