import pandas as pd
import numpy as np
import plotly.express as px
import mytools
#| echo: false
import plotly.io as pio
# Make static SVG the default renderer for every Plotly figure in this document.
pio.renderers.default = "svg"
网站首屏设计AB测试
研究方法
# Load the A/B-test action log from Excel.
# A forward-slash path resolves on both Windows and POSIX systems.
df1 = pd.read_excel("data/ABtest_actions_full.xlsx")
解释性研究是关于现象或事物之间因果关系的研究。解释性研究是在描述性研究的基础上,进一步探寻“为什么”。解释性研究要在描述性研究的基础上对变量之间的关系进行分析,以确定它们之间是否存在相关,并进而判断它们之间是否存在因果关系。
在定量研究方法中,解释性研究通常是首先提出研究假设,然后从理论假设出发,设计出调查方案(收集资料的方案)并采用各种调查方法去收集经验材料,最后通过对资料的分析来验证假设,达到对社会现象进行理论解释的目的。
明确数据分析目标
本案例属于解释性研究,也可以认为是评价性研究。
某在线教育机构,需要评价网站首页改版是否有效,进行了在线AB测试,在一段时间内,将访问用户随机分成2部分,分别访问原首页和改版后首页,搜集用户的浏览、点击和报名行为。
研究假设:改版后报名比例高于改版前的报名比例。 原假设:改版后报名比例小于等于改版前的报名比例。
数据获取
本研究采用在线AB实验法,在XXX与XX期间,对访问网站首页的用户进行随机分组分流,50%的用户访问原首页,50%的用户访问新首页,共获得样本数据XXX个。
数据清理
查看所有空白值
# Select the rows that contain at least one missing value, then display them.
temp = df1[df1.isnull().any(axis=1)]
temp
timestamp | id | group | action | duration | |
---|---|---|---|---|---|
12 | 2016-09-24 22:43:35.120 | 701620 | control | view | NaN |
227 | 2016-09-30 11:57:44.059 | 802405 | control | view | NaN |
236 | 2016-09-30 17:17:07.709 | 524941 | control | view | NaN |
261 | 2016-10-01 08:40:31.380 | 381758 | control | view | NaN |
367 | 2016-10-03 20:08:57.011 | 757165 | control | enroll | NaN |
605 | 2016-10-10 11:26:20.715 | 678491 | control | view | NaN |
731 | 2016-10-14 04:19:46.021 | 878418 | control | view | NaN |
1145 | 2016-10-27 04:48:21.083 | 767153 | control | view | NaN |
1212 | 2016-10-29 09:37:41.649 | 196115 | control | enroll | NaN |
1238 | 2016-10-29 23:41:44.643 | 542999 | control | view | NaN |
1311 | 2016-11-01 05:42:50.439 | 599303 | control | view | NaN |
1322 | 2016-11-01 10:07:53.009 | 281907 | control | view | NaN |
1663 | 2016-11-11 13:56:17.708 | 742859 | control | view | NaN |
1768 | 2016-11-14 12:04:15.515 | 860051 | control | view | NaN |
1966 | 2016-11-20 09:06:10.553 | 864971 | control | view | NaN |
2457 | 2016-12-04 22:55:30.055 | 443391 | control | enroll | NaN |
2576 | 2016-12-08 11:34:10.808 | 928935 | control | view | NaN |
2700 | 2016-12-11 18:23:52.131 | 417569 | control | view | NaN |
2766 | 2016-12-13 10:17:36.711 | 496865 | control | view | NaN |
2834 | 2016-12-14 17:24:26.749 | 429709 | control | enroll | NaN |
3660 | 2017-01-07 08:14:30.071 | 631083 | control | enroll | NaN |
3754 | 2017-01-10 01:45:10.342 | 403964 | control | view | NaN |
3758 | 2017-01-10 03:04:52.819 | 883792 | control | view | NaN |
3798 | 2017-01-11 05:16:49.001 | 910942 | control | view | NaN |
3965 | 2017-01-15 20:17:33.700 | 508697 | control | view | NaN |
## Drop every row that has a missing value
df2 = df1.dropna()
## Inspect duplicates: rows that repeat an earlier (id, group, action) combination
df2[df2.duplicated(subset=['id', 'group', 'action'], keep='first')][['id', 'group', 'action']]
id | group | action | |
---|---|---|---|
5 | 261869 | experiment | view |
6 | 226546 | experiment | view |
7 | 286353 | experiment | view |
8 | 842279 | experiment | view |
142 | 711838 | experiment | view |
971 | 724590 | experiment | view |
1013 | 314669 | experiment | view |
1351 | 381744 | experiment | view |
1765 | 831767 | experiment | view |
1902 | 655009 | experiment | view |
2293 | 645047 | experiment | view |
2704 | 510055 | experiment | view |
2919 | 661526 | experiment | view |
2952 | 885859 | experiment | view |
3559 | 661528 | experiment | view |
3653 | 191559 | experiment | view |
# Remove duplicates: keep a single row per id, retaining its last occurrence.
# NOTE(review): this de-duplicates on 'id' alone, whereas the duplicate
# inspection earlier in this file used (id, group, action) — confirm that
# keeping only each user's last recorded row is the intended rule.
df3 = df2.drop_duplicates(subset=['id'], keep='last')
查看变量类型
# Display the dtype of every column as a one-column DataFrame.
df3.dtypes.to_frame()
0 | |
---|---|
timestamp | datetime64[ns] |
id | int64 |
group | object |
action | object |
duration | float64 |
# Cast the two label columns to the categorical dtype, then show the resulting dtypes.
df4 = df3.astype({
    'group': 'category',
    'action': 'category',
})
df4.dtypes.to_frame()
0 | |
---|---|
timestamp | datetime64[ns] |
id | int64 |
group | category |
action | category |
duration | float64 |
## Look for outliers: summary statistics of the duration column
df4['duration'].describe()
count 4028.000000
mean 123.402057
std 72.587800
min 0.013856
25% 67.080495
50% 118.487843
75% 172.580542
max 421.567520
Name: duration, dtype: float64
# Box plot of duration to reveal extreme values.
fig = px.box(df4, y="duration")
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Histogram of duration to show the shape of its distribution.
fig = px.histogram(df4, x="duration")
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Data cleaning is finished; the analysis continues on an independent copy.
df = df4.copy()
数据分析
描述统计
先描述样本背景,对样本质量进行评价。再描述样本特征信息、样本基本现状,最后描述样本基本态度及其他维度。
描述统计分析也应该有理论依据或概念合理的分类。
推论统计
可进行一些相关性、差异性分析以及回归分析。合理分析变量之间的相关性。
## Sample size: number of rows left after cleaning
N = df.shape[0]
print(N)
4028
# Count/percentage table for the 'group' column, built by the project-local mytools helper.
mytools.gen_percent_table(df, 'group')
group | 个数 | 百分比 | |
---|---|---|---|
0 | experiment | 2079 | 51.61 |
1 | control | 1949 | 48.39 |
2 | 总和 | 4028 | 100.00 |
# Build the plotting table: number of rows for every (group, action) pair,
# stored in the '频数' (frequency) column.
sun_df = df.groupby(["group", 'action']).size().reset_index(name='频数')
sun_df
group | action | 频数 | |
---|---|---|---|
0 | control | enroll | 370 |
1 | control | view | 1579 |
2 | experiment | enroll | 438 |
3 | experiment | view | 1641 |
# Index by (group, action) so the counts can be divided by their group totals.
temp = sun_df.set_index(['group', 'action'])
# Share of each action within its own group, as a percentage.
temp['%'] = 100 * (temp / temp.groupby('group').sum())
temp.round(2)
频数 | % | ||
---|---|---|---|
group | action | ||
control | enroll | 370 | 18.98 |
view | 1579 | 81.02 | |
experiment | enroll | 438 | 21.07 |
view | 1641 | 78.93 |
# Turn the (group, action) index back into ordinary columns for plotting.
sun_df = temp.reset_index()
sun_df
group | action | 频数 | % | |
---|---|---|---|---|
0 | control | enroll | 370 | 18.984094 |
1 | control | view | 1579 | 81.015906 |
2 | experiment | enroll | 438 | 21.067821 |
3 | experiment | view | 1641 | 78.932179 |
# Bar chart of the within-group percentages, one facet per action.
fig = px.bar(
    sun_df,  # data to plot
    x="group",  # x axis
    y="%",  # y axis
    color="group",
    facet_col="action",  # one facet column per action
)
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Pie chart of the overall split between actions.
fig = px.pie(df, names="action")
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Sunburst chart: group on the inner ring, action on the outer ring,
# sector size given by the within-group percentage.
fig = px.sunburst(
    sun_df,
    path=['group', 'action'],
    values='%',
)
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
### Bivariate analysis
# Cross-tabulate action by group. normalize='columns' makes every column sum
# to 1, so multiplying by 100 gives the percentage of each action within a
# group; the margin column '合计' holds the overall percentages.
result = pd.crosstab(
    df['action'],
    df['group'],
    normalize='columns',
    margins=True,
    margins_name='合计',
) * 100
result.round(2)
group | control | experiment | 合计 |
---|---|---|---|
action | |||
enroll | 18.98 | 21.07 | 20.06 |
view | 81.02 | 78.93 | 79.94 |
可通过Z检验,对两个比例值是否存在统计显著性差异进行检验。
# Group sizes.
n1 = df.query('group =="control"').shape[0]
n2 = df.query('group =="experiment"').shape[0]
# Enrollment proportion within each group.
p1 = df.query('group =="control" and action =="enroll"').shape[0] / n1
p2 = df.query('group =="experiment" and action =="enroll"').shape[0] / n2
# Two-proportion z-test via the project-local mytools helper.
# NOTE(review): the printed p-value matches a one-sided normal tail for the
# printed z — confirm against the implementation of two_prop_equal_test.
z, p_value = mytools.two_prop_equal_test(n1, n2, p1, p2)
print(z, p_value)
-1.6503840862221497 0.049432201929688536
通过对AB测试的结果进行分析,发现实验组的报名比例(21.07%)较控制组(18.98%)有所增高。单侧Z检验结果为 z=-1.65,p=0.049,在0.05的显著性水平下具有统计显著性,因此拒绝原假设,接受研究假设,即:改版后的首页较原版有利于提高学员报名率。