This analysis seeks two objectives:
First, this analysis recommends that the light button be selected for the complete website rollout.
Second, this analysis recommends segmenting users into the following three categories:
These segmentations were supported by K-Means clustering.
This analysis makes these segmentations with a business situation and use-case in mind:
Credit to God, my Mother, family and friends.
All errors are my own.
Best,
George John Jordan Thomas Aquinas Hayward, Optimist
The Dark Button Got Slightly More Clicks
The Light Button Got More Absolute Message Sends
The Light Button Dominated The Dark Button In Terms of Message Send Conversion Rate
Watch The K-Means Clustering Classification Progress As We Select For More and More Active Users.
Users seem to have an overall 'either/or' preference for calling versus going online.
Could New Yorkers use the service more than anyone else?
Do people on exchanges use the service more than people who are not on exchanges?
Do seniors use the service less frequently than other age groups?
#for part 1
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import mysql.connector as mysql
import numpy as np
import pandas as pd
import missingno as msno
import statsmodels
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.proportion import proportions_ztest
from scipy import stats
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
#added in for part 2
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import KMeans
Background: Oscar has run a two-armed experiment to test a new design for the messaging button that appears on the homepage of the member website. This homepage button is one of several possible entry points to the messaging feature. Our messaging feature allows members to message a dedicated care team with any benefits or plan related questions.
The goals of this experiment were to improve discoverability and increase usage of the messaging feature. The control design (dark) was shown to 50% of user traffic for the duration of the experiment period; the other 50% of traffic was shown the experimental design (light).
# Load the experiment-subjects roster and clean it: drop duplicate and
# null user_ids, then parse the enrollment timestamp into datetime64.
experiment_subjects = pd.read_csv('experiment_subjects.csv')
# first look at the raw data
experiment_subjects.describe()
experiment_subjects = experiment_subjects.drop_duplicates(['user_id'])
experiment_subjects.describe()
experiment_subjects = experiment_subjects.dropna(subset=['user_id'])
experiment_subjects.describe()
experiment_subjects.enrolled_at.dtype
# enrolled_at arrives as a plain object (string) column; convert to datetime
experiment_subjects['enrolled_at'] = pd.to_datetime(
    experiment_subjects['enrolled_at'], format='%Y-%m-%d %H:%M:%S')
experiment_subjects.enrolled_at.dtype
# sanity check: 'datetime64[ns]' and '<M8[ns]' name the same numpy dtype
np.dtype('datetime64[ns]') == np.dtype('<M8[ns]')
experiment_subjects.head()
# Load the user-action event log and normalize it like the subjects table:
# parse timestamps, then keep only the FIRST occurrence of each
# (user_id, action) pair so repeat events don't inflate counts.
experiment_actions = pd.read_csv('experiment_actions.csv')
#let's check it out
experiment_actions.describe()
#similar to the above situation, let's get timestamp into datetime format
experiment_actions['timestamp'] = pd.to_datetime(experiment_actions['timestamp'],
                                                 format='%Y-%m-%d %H:%M:%S')
experiment_actions = (experiment_actions
                      .sort_values('timestamp')
                      .drop_duplicates(subset=['user_id', 'action'], keep='first'))
experiment_actions.describe()
# Left-join subjects to actions so every enrolled user is kept even if they
# never acted; then drop actions logged BEFORE enrollment while retaining
# the no-action rows (whose timestamp is NaT).  Note: `pd.isnull(x) == True`
# was a redundant comparison — `.isna()` expresses the same condition.
ab_test = experiment_subjects.merge(experiment_actions, left_on='user_id', right_on='user_id',
                                    how='left')
ab_test = ab_test[(ab_test.enrolled_at <= ab_test.timestamp) | ab_test.timestamp.isna()]
ab_test.describe()
# Push the merged A/B-test frame into MySQL so the metrics below can be
# computed with SQL window functions and CTEs.
engine = create_engine('mysql+mysqlconnector://newuser:data@localhost:3306/sys', echo=False)
ab_test.to_sql(name='ab_test_os', con=engine, if_exists='replace', index=False)
# open a direct connector session for the read queries below
db = mysql.connect(host="localhost",
                   user="newuser",
                   passwd="data",
                   auth_plugin='mysql_native_password',
                   database='sys')
# Query: one row per experiment subject with a 0/1 flag for whether the
# user EVER clicked the messaging button (the discoverability metric).
discoverability = pd.read_sql("""
with experiment_subjects_distinct as(
select distinct
user_id,
audience_name
from sys.ab_test_os
),
clickers as (
select
user_id,
audience_name,
case when action = 'Clicked Messaging Button' then 1 end as clicked_yn
from sys.ab_test_os
where action = 'Clicked Messaging Button'
)
select
esd.user_id,
esd.audience_name as button,
coalesce(c.clicked_yn, 0) as clicked_yn
from experiment_subjects_distinct esd
left join clickers c on c.user_id = esd.user_id;
""", con=db)
# BUG FIX: the original `discoverability.reset_index` was a bare attribute
# access — no call, no assignment — and therefore a silent no-op.  Actually
# reset the index (drop=True keeps the column set unchanged).
discoverability = discoverability.reset_index(drop=True)
discoverability.sample(5)
discoverability.describe()
# Query: did a user whose FIRST recorded action was clicking the messaging
# button go on to send a message afterwards?  Returns one row per
# experiment subject with a 0/1 sent_message_yn flag:
#   - ab_test_with_action_order ranks each user's events by timestamp
#   - first_clickers keeps users whose rank-1 event was the button click
#   - sent_message_after_clicking_button_first keeps those users' later
#     'Sent Message' events (rank > 1)
# NOTE(review): dense_rank gives tied timestamps the same rank, so two
# simultaneous first events could both count as order 1 — confirm
# timestamps are unique per user.
message_usage = pd.read_sql("""
with experiment_subjects_distinct as(
select distinct
user_id,
audience_name
from sys.ab_test_os),
ab_test_with_action_order as (
select
*,
dense_rank() over(partition by user_id order by timestamp asc) as user_event_order
from sys.ab_test_os
),
first_clickers as (
select distinct
user_id
from ab_test_with_action_order
where action = 'Clicked Messaging Button' and user_event_order = 1
),
sent_message_after_clicking_button_first as (
select
ab.user_id,
case when action = 'Sent Message' then 1 end as sent_message_yn
from ab_test_with_action_order ab
join first_clickers fc on fc.user_id = ab.user_id
where action = 'Sent Message' and user_event_order > 1
)
select
esd.user_id,
esd.audience_name as button,
coalesce(sm.sent_message_yn, 0) as sent_message_yn
from experiment_subjects_distinct esd
left join sent_message_after_clicking_button_first sm on sm.user_id = esd.user_id;
""", con=db)
message_usage.sample(5)
message_usage.describe()
# Query: one row per experiment subject with a 0/1 flag for whether the
# user's FIRST recorded action (dense_rank over timestamp) was clicking
# the messaging button.  This feeds the conversion-funnel denominator.
# NOTE(review): as above, dense_rank assigns tied timestamps the same
# rank — confirm timestamps are unique per user.
first_clickers = pd.read_sql("""
with experiment_subjects_distinct as (
select distinct
user_id,
audience_name
from sys.ab_test_os
),
ab_test_with_action_order as (
select
*,
dense_rank() over(partition by user_id order by timestamp asc) as user_event_order
from sys.ab_test_os
),
first_clickers as (
select
user_id,
case when action = 'Clicked Messaging Button' then 1 end as first_action_was_click_yn
from ab_test_with_action_order
where action = 'Clicked Messaging Button' and user_event_order = 1
)
select
esd.user_id,
esd.audience_name as button,
coalesce(fc.first_action_was_click_yn, 0) as first_action_was_click_yn
from experiment_subjects_distinct esd
left join first_clickers fc on fc.user_id = esd.user_id;
""", con=db)
first_clickers.sample(5)
first_clickers.describe()
#helpful to peek at the data again.
discoverability.sample(3)
# Pooled (both arms combined) click rate with a 95% normal-approximation
# confidence interval.
count_clicks = discoverability.clicked_yn.sum()
users_clicks = discoverability.clicked_yn.count()
print('95-Percent Confidence Interval: \n Lower Bound = %.4f, Upper Bound = %.4f' % \
      proportion_confint(count_clicks, users_clicks, alpha=0.05, method='normal'))
# Split the 0/1 click flags by experiment arm for the two-sample tests.
clicked_light_button = discoverability[discoverability["button"]=='light']['clicked_yn']
clicked_dark_button = discoverability[discoverability["button"]=='dark']['clicked_yn']
# Bernoulli setup: sum() = number of successes, count() = number of trials.
click_button_ab_test = pd.DataFrame({
    "count": [clicked_light_button.sum(), clicked_dark_button.sum()],
    "users": [clicked_light_button.count(), clicked_dark_button.count()]
}, index=['light_button_clicks', 'dark_button_clicks'])
# Two-proportion z-test.  Bracket access is required: `.count` is the
# DataFrame *method*, so attribute access would not return the column.
# (The original comments referenced `gender_polls`, a copy-paste leftover
# from another analysis; proportions_ztest/proportion_confint are already
# imported directly at the top, so the long qualified names are dropped.)
print('Z-Score = %.3f, p-Value = %.10f' % proportions_ztest(
    click_button_ab_test['count'], click_button_ab_test['users']))
# Independent two-sample t-test on the raw 0/1 flags as a sanity check.
print('T-Score = %.3f, p-Value = %.10f' % stats.ttest_ind(clicked_light_button, clicked_dark_button))
click_button_ab_test['proportion'] = round(click_button_ab_test['count'] / click_button_ab_test['users'],2)
click_button_ab_test
#helpful to peek at the data again.
message_usage.sample(3)
# Pooled (both arms combined) send rate with a 95% normal-approximation
# confidence interval.
count_sends = message_usage.sent_message_yn.sum()
users_sends = message_usage.sent_message_yn.count()
print('95-Percent Confidence Interval: \n Lower Bound = %.4f, Upper Bound = %.4f' % \
      proportion_confint(count_sends, users_sends, alpha=0.05, method='normal'))
# Split the 0/1 send flags by experiment arm for the two-sample tests.
sent_light_button = message_usage[message_usage["button"]=='light']['sent_message_yn']
sent_dark_button = message_usage[message_usage["button"]=='dark']['sent_message_yn']
# Bernoulli setup: sum() = number of successes, count() = number of trials.
send_button_ab_test = pd.DataFrame({
    "count": [sent_light_button.sum(), sent_dark_button.sum()],
    "users": [sent_light_button.count(), sent_dark_button.count()]
}, index=['light_button_sends', 'dark_button_sends'])
# Two-proportion z-test.  Bracket access is required: `.count` is the
# DataFrame *method*, so attribute access would not return the column.
# (Stale `gender_polls` copy-paste comments removed; the directly imported
# statsmodels names replace the long qualified calls.)
print('Z-Score = %.3f, p-Value = %.10f' % proportions_ztest(
    send_button_ab_test['count'], send_button_ab_test['users']))
# Independent two-sample t-test on the raw 0/1 flags as a sanity check.
print('T-Score = %.3f, p-Value = %.10f' % stats.ttest_ind(sent_light_button, sent_dark_button))
send_button_ab_test['proportion'] = round(send_button_ab_test['count'] / send_button_ab_test['users'],2)
send_button_ab_test
#take a peek
first_clickers.sample(3)
# Per-arm 0/1 flags for "first recorded action was the button click".
first_clicked_light_button = first_clickers[first_clickers["button"]=='light']['first_action_was_click_yn']
first_clicked_dark_button = first_clickers[first_clickers["button"]=='dark']['first_action_was_click_yn']
# Bernoulli setup: sum() = number of successes, count() = number of trials.
first_clicks_button_ab_test = pd.DataFrame({
    "count": [first_clicked_light_button.sum(), first_clicked_dark_button.sum()],
    "users": [first_clicked_light_button.count(), first_clicked_dark_button.count()]
}, index=['light_button_first_clicks', 'dark_button_first_clicks'])
first_clicks_button_ab_test
send_button_ab_test
# Funnel: of the users whose first action was a click, how many then sent
# a message?  conversion = sending users / first-clicking users.
click_to_send_conversions_funnel = pd.DataFrame({
    "users": [send_button_ab_test.users['light_button_sends'], send_button_ab_test.users['dark_button_sends']],
    "first-clicking users": [first_clicks_button_ab_test['count']['light_button_first_clicks'],
                             first_clicks_button_ab_test['count']['dark_button_first_clicks']],
    "sending users": [send_button_ab_test['count']['light_button_sends'],
                      send_button_ab_test['count']['dark_button_sends']]
}, index=['light_button', 'dark_button'])
click_to_send_conversions_funnel['conversion'] \
    = round(click_to_send_conversions_funnel['sending users'] /
            click_to_send_conversions_funnel['first-clicking users'],2)
click_to_send_conversions_funnel
# Two-proportion z-test on the conversion rates.  Bracket access avoids
# colliding with DataFrame methods.  (Stale `gender_polls` copy-paste
# comments removed; the directly imported proportions_ztest replaces the
# long qualified call.)
print('Z-Score = %.3f, p-Value = %.10f' % proportions_ztest(
    click_to_send_conversions_funnel['sending users'], click_to_send_conversions_funnel['first-clicking users']))
click_button_ab_test
# Side-by-side pie charts of anytime-click share per arm.  Use .iloc for
# positional access: the index is string-labeled, and integer keys in
# Series[...] (positional fallback) are deprecated/removed in modern pandas.
labels = ['Clicking Users', '']
light_button_clicks = [click_button_ab_test['count'].iloc[0],
                       click_button_ab_test['users'].iloc[0] - click_button_ab_test['count'].iloc[0]]
dark_button_clicks = [click_button_ab_test['count'].iloc[1],
                      click_button_ab_test['users'].iloc[1] - click_button_ab_test['count'].iloc[1]]
adg_font = {'fontname':'Adobe Garamond Pro'}
fig, axs = plt.subplots(1, 2, figsize=(11, 6))
colors_lb = ['darkgray','gainsboro']
colors_db = ['cornflowerblue','lightsteelblue']
axs[0].pie(light_button_clicks, labels=labels, autopct='%1.2f%%', shadow=False, colors=colors_lb,
           explode=(0, 0.08), startangle=6)
axs[0].set_title('Light Button', fontsize=19, **adg_font)
axs[1].pie(dark_button_clicks, labels=labels, autopct='%1.2f%%', shadow=False, colors=colors_db, explode=(0, 0.08))
axs[1].set_title('Dark Button', fontsize=19, **adg_font)
plt.subplots_adjust(wspace=0.3, hspace=1)
plt.suptitle(" Anytime Clicks, By Button Type", fontsize=24, fontweight='bold', **adg_font)
plt.savefig("ex_A_clicks.png", dpi=300, bbox_inches='tight')
plt.show()
send_button_ab_test
# Side-by-side pie charts of send-after-click share per arm.  Use .iloc for
# positional access: the index is string-labeled, and integer keys in
# Series[...] (positional fallback) are deprecated/removed in modern pandas.
labels = ['Sending Users', '']
light_button_sends = [send_button_ab_test['count'].iloc[0],
                      send_button_ab_test['users'].iloc[0] - send_button_ab_test['count'].iloc[0]]
dark_button_sends = [send_button_ab_test['count'].iloc[1],
                     send_button_ab_test['users'].iloc[1] - send_button_ab_test['count'].iloc[1]]
adg_font = {'fontname':'Adobe Garamond Pro'}
fig, axs = plt.subplots(1, 2, figsize=(11, 6))
colors_lb = ['darkgray','gainsboro']
colors_db = ['cornflowerblue','lightsteelblue']
axs[0].pie(light_button_sends, labels=labels, autopct='%1.2f%%', shadow=False, colors=colors_lb, explode=(0, 0.08))
axs[0].set_title('Light Button', fontsize=19, **adg_font)
axs[1].pie(dark_button_sends, labels=labels, autopct='%1.2f%%', shadow=False, colors=colors_db, explode=(0, 0.08))
axs[1].set_title('Dark Button', fontsize=19, **adg_font)
plt.subplots_adjust(wspace=0.3, hspace=1)
plt.suptitle(" Sends After Clicks, By Button Type", fontsize=24, fontweight='bold', **adg_font)
plt.savefig("ex_B_send_it.png", dpi=300, bbox_inches='tight')
plt.show()
click_to_send_conversions_funnel
# Side-by-side pie charts of click→send conversion per arm.  Use .iloc for
# positional access: the index is string-labeled, and integer keys in
# Series[...] (positional fallback) are deprecated/removed in modern pandas.
labels = ['Converted Users', '']
light_button_converted_sends = [
    click_to_send_conversions_funnel['sending users'].iloc[0],
    click_to_send_conversions_funnel['first-clicking users'].iloc[0]
    - click_to_send_conversions_funnel['sending users'].iloc[0]]
dark_button_converted_sends = [
    click_to_send_conversions_funnel['sending users'].iloc[1],
    click_to_send_conversions_funnel['first-clicking users'].iloc[1]
    - click_to_send_conversions_funnel['sending users'].iloc[1]]
adg_font = {'fontname':'Adobe Garamond Pro'}
fig, axs = plt.subplots(1, 2, figsize=(11, 6))
colors_lb = ['darkgray','gainsboro']
colors_db = ['cornflowerblue','lightsteelblue']
axs[0].pie(light_button_converted_sends, labels=labels, autopct='%1.2f%%', shadow=False,
           colors=colors_lb, explode=(0, 0.08), startangle=-43)
axs[0].set_title('Light Button', fontsize=19, **adg_font)
axs[1].pie(dark_button_converted_sends, labels=labels, autopct='%1.2f%%', shadow=False,
           colors=colors_db, explode=(0, 0.08))
axs[1].set_title('Dark Button', fontsize=19, **adg_font)
plt.subplots_adjust(wspace=0.3, hspace=1)
plt.suptitle(" Conversion Rate, By Button Type", fontsize=24, fontweight='bold', **adg_font)
plt.savefig("ex_C_converted_sends.png", dpi=300, bbox_inches='tight')
plt.show()
My ultimate conclusion is that segmentation should be done as follows:
(3) - active users who are high-call, low-online.
From my analysis, it appears that very few users, if any, are both high-call and high-online or both low-call and low-online. It seems people really have a preference for how they communicate. If it turned out that there were large categories like this, we could segment into them.
# Load the member-engagement dataset that drives the segmentation analysis
member_engagement = pd.read_csv('member_engagement.csv')
# quick structural checks: summary stats, dtypes, and categorical levels
member_engagement.describe()
member_engagement.dtypes
member_engagement['enrollment_type'].unique()
member_engagement['policy_relation'].unique()
member_engagement['region'].unique()
member_engagement['month'].unique()
member_engagement.sample(4)
def k_means_think_binary(sigma):
    """Cluster unusually active members into two engagement personas.

    Filters the module-level ``member_engagement`` DataFrame to users whose
    login_count OR ib_call_count is at least ``sigma`` standard deviations
    from its mean, runs 2-cluster K-Means on those two features, labels the
    clusters as caller-vs-online personas, and saves/shows a scatter plot.

    Parameters
    ----------
    sigma : float
        Outlier threshold in standard deviations; 0 keeps every member.
    """
    # keep users who are extreme on EITHER activity axis
    login_dev = np.abs(member_engagement.login_count
                       - member_engagement.login_count.mean())
    call_dev = np.abs(member_engagement.ib_call_count
                      - member_engagement.ib_call_count.mean())
    super_peeps = member_engagement[
        (login_dev >= sigma * member_engagement.login_count.std())
        | (call_dev >= sigma * member_engagement.ib_call_count.std())
    ]
    super_peeps = super_peeps[['login_count', 'ib_call_count']]
    kmeans = KMeans(n_clusters=2)
    kmeans.fit(super_peeps)
    y_kmeans = kmeans.predict(super_peeps)
    # BUG FIX: KMeans numbers its clusters arbitrarily from run to run, so
    # the original hard-coded which id meant which persona via a fragile
    # sigma-threshold hack (plus dead commented-out variants).  Instead,
    # map ids deterministically from the fitted centers: the cluster whose
    # center has the higher login_count (feature column 0) is the online
    # persona.
    online_cluster = int(np.argmax(kmeans.cluster_centers_[:, 0]))
    y_kmeans = ['Always Online, Never Calls' if x == online_cluster
                else 'Loves To Call, But Not Online' for x in y_kmeans]
    # preserve the original palette choice per threshold band
    if sigma < 3:
        sns.set_palette("husl", 8)
    else:
        flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
        sns.set_palette(flatui)
    super_peeps['Classification Concept'] = y_kmeans
    sns.lmplot(x="login_count", y="ib_call_count", data=super_peeps,
               fit_reg=False, hue="Classification Concept",
               legend=True, legend_out=True)
    user_count = super_peeps.login_count.count()
    plt.ylabel('Number of Inbound Calls Per Month')
    plt.xlabel('Number of Customer Logins Per Month')
    plt.suptitle('For Users With Inbound Calls or Logins Above {}σ'.format(sigma),
                 fontweight = 'bold', y=1.05)
    plt.title(' {:,} customers'.format(user_count))
    plt.savefig('analyzer_{}-std-kmeans.png'.format(sigma), dpi=400, bbox_inches='tight')
    plt.ylim(top=30, bottom=0)
    plt.xlim(right=80, left=0)
    plt.show()
# Re-run the clustering at progressively stricter outlier thresholds
for sigma_cut in (0, 0.5, 0.75, 1, 2, 3, 15, 20):
    k_means_think_binary(sigma_cut)
def the_analyzer(num, groupby):
    """Scatter-plot login vs. inbound-call counts for outlier members,
    colored by the categorical column named in ``groupby``.

    ``num`` is the outlier threshold in standard deviations, applied to
    either login_count or ib_call_count of the module-level
    ``member_engagement`` DataFrame; the figure is saved and shown.
    """
    logins = member_engagement.login_count
    calls = member_engagement.ib_call_count
    # outlier on EITHER activity axis
    is_outlier = (
        (np.abs(logins - logins.mean()) >= num * logins.std())
        | (np.abs(calls - calls.mean()) >= num * calls.std())
    )
    super_peeps = member_engagement[is_outlier]
    sns.set_palette("Dark2")
    sns.lmplot(x="login_count", y="ib_call_count", data=super_peeps,
               fit_reg=False, hue=groupby, legend=True)
    user_count = super_peeps.member_id.count()
    plt.ylabel('Number of Inbound Calls Per Month')
    plt.xlabel('Number of Customer Logins Per Month')
    plt.suptitle(' For Users With Inbound Calls or Logins Above {}σ'.format(num),
                 fontweight = 'bold', y=1.05)
    plt.title(' {:,} customers'.format(user_count))
    plt.savefig('analyzer_{}-std_{}-groupby.png'.format(num, groupby),
                dpi=400, bbox_inches='tight')
    plt.ylim(top=30, bottom=0)
    plt.xlim(right=80, left=0)
    plt.show()
# Repeat the outlier scatter for every threshold × grouping dimension
for threshold in (0, 0.5, 0.75, 1, 2, 3, 10, 15, 20):
    for dimension in ('enrollment_type', 'age_group', 'region'):
        the_analyzer(threshold, dimension)