Files
2026-02-01_churn/churn-analysis.ipynb
2026-02-02 16:26:10 -05:00

184 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"import numpy as np\n",
"\n",
"# File name of the churn export, resolved against the kernel's working directory.\n",
"# Kept as a named constant so the data source is changed in one place.\n",
"CHURN_CSV_PATH = 'churn.csv'\n",
"\n",
"# Load the export into the notebook-wide DataFrame that the later cells read as `df`\n",
"df = pd.read_csv(CHURN_CSV_PATH)\n",
"\n",
"# Display label for each churn category key. Insertion order matters: the\n",
"# charts take their x-axis order from it.\n",
"category_labels = dict([\n",
"    ('quick-exit', 'Free trial only'),\n",
"    ('fair-trial', '74 day churn'),\n",
"    ('short-termer', '6 month churn'),\n",
"    ('active-user', 'No churn'),\n",
"])\n",
"\n",
"# Display label for each usage window, keyed by the window name written with a space.\n",
"timeframe_labels = dict([\n",
"    ('short term', 'During free trial'),\n",
"    ('medium term', 'After trial, before 90 days'),\n",
"    ('long term', 'After 90 days, first 6 months'),\n",
"])\n",
"\n",
"def fix_dataset_label(k):\n",
"    \"\"\"Turn a metric column name into a readable '<Name>: <Timeframe>' label.\n",
"\n",
"    A name ending in '_<window>_term' (for example 'booking_forms_short_term')\n",
"    is labelled with the matching timeframe_labels entry; any other name is\n",
"    labelled 'Lifetime'.\n",
"\n",
"    Args:\n",
"        k: Underscore-separated column name.\n",
"\n",
"    Returns:\n",
"        The title-cased metric name, a colon, and the timeframe description.\n",
"\n",
"    Raises:\n",
"        KeyError: If the name ends in '_term' but its last two words are not a\n",
"            key of timeframe_labels.\n",
"    \"\"\"\n",
"    parts = k.split('_')\n",
"    # Only a trailing 'term' marks a timeframe suffix; 'term' anywhere else in\n",
"    # the name belongs to the metric itself.\n",
"    if parts[-1] == \"term\":\n",
"        # str.join is called on the separator; a list has no join method.\n",
"        timeframe = timeframe_labels[\" \".join(parts[-2:])]\n",
"        name = \" \".join(parts[:-2]).title()\n",
"    else:\n",
"        timeframe = 'Lifetime'\n",
"        name = \" \".join(parts).title()\n",
"    return f\"{name}: {timeframe}\"\n",
"\n",
"def x_labels(categories):\n",
"    \"\"\"Return the display label for each category key, in the order given.\n",
"\n",
"    Raises KeyError for a key that category_labels does not contain.\n",
"    \"\"\"\n",
"    labels = []\n",
"    for key in categories:\n",
"        labels.append(category_labels[key])\n",
"    return labels\n",
"\n",
"def dataset_labels(columns):\n",
"    \"\"\"Return fix_dataset_label(col) for every column name, in the order given.\"\"\"\n",
"    labels = []\n",
"    for column in columns:\n",
"        labels.append(fix_dataset_label(column))\n",
"    return labels"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>booking_forms_short_term</th>\n",
" <th>booking_forms_medium_term</th>\n",
" <th>booking_forms_long_term</th>\n",
" </tr>\n",
" <tr>\n",
" <th>category</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>active-user</th>\n",
" <td>0.046701</td>\n",
" <td>0.041571</td>\n",
" <td>0.040686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fair-trial</th>\n",
" <td>0.268002</td>\n",
" <td>0.020303</td>\n",
" <td>0.000271</td>\n",
" </tr>\n",
" <tr>\n",
" <th>quick-exit</th>\n",
" <td>0.232673</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>short-termer</th>\n",
" <td>0.290954</td>\n",
" <td>0.082213</td>\n",
" <td>0.029034</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" booking_forms_short_term booking_forms_medium_term \\\n",
"category \n",
"active-user 0.046701 0.041571 \n",
"fair-trial 0.268002 0.020303 \n",
"quick-exit 0.232673 0.000000 \n",
"short-termer 0.290954 0.082213 \n",
"\n",
" booking_forms_long_term \n",
"category \n",
"active-user 0.040686 \n",
"fair-trial 0.000271 \n",
"quick-exit 0.000000 \n",
"short-termer 0.029034 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = ['booking_forms_short_term', 'booking_forms_medium_term', 'booking_forms_long_term']\n",
"categories = category_labels.keys() # All\n",
"\n",
"# Average each booking-forms column per category. groupby returns the rows in\n",
"# sorted key order, so reindex them into the order category_labels declares.\n",
"# A category key with no rows in df shows up as a NaN row.\n",
"df.groupby('category')[columns].mean().reindex(list(categories))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Display labels in the order category_labels declares them; the charts pass this\n",
"# list to category_orders to pin the x-axis order.\n",
"category_order = list(category_labels.values())"
]
},
{
"cell_type": "code",
"source": [
"# Generate charts for all metrics\n",
"metrics = [\n",
"    'booking_forms', 'bookings', 'employees',\n",
"    'contacts', 'emails', 'sms',\n",
"    'appointments', 'checklists', 'checklists_filled',\n",
"]\n",
"\n",
"# Number of rows per category; the denominator for the percentage charts\n",
"category_totals = df.groupby('category').size()\n",
"\n",
"\n",
"def create_charts_for_metric(metric_name):\n",
"    \"\"\"Show two grouped bar charts for one metric, split by churn category.\n",
"\n",
"    The first chart plots the mean of each '<metric>_<window>_term' column per\n",
"    category. The second plots the percentage of rows in each category whose\n",
"    value in that column is greater than zero. Reads the notebook globals df,\n",
"    category_labels, category_order and category_totals.\n",
"\n",
"    Args:\n",
"        metric_name: Column prefix, e.g. 'booking_forms'.\n",
"    \"\"\"\n",
"    cols = [f'{metric_name}_{window}_term' for window in ('short', 'medium', 'long')]\n",
"    title_name = metric_name.replace('_', ' ').title()\n",
"\n",
"    def show_grouped_bars(frame, y_column, title, y_title):\n",
"        # Both charts differ only in their data, y column and titles\n",
"        figure = px.bar(\n",
"            frame,\n",
"            x='category_label',\n",
"            y=y_column,\n",
"            color='timeframe',\n",
"            barmode='group',\n",
"            title=title,\n",
"            category_orders={'category_label': category_order},\n",
"        )\n",
"        figure.update_layout(\n",
"            xaxis_title='Category',\n",
"            yaxis_title=y_title,\n",
"            legend_title='Timeframe',\n",
"        )\n",
"        figure.show()\n",
"\n",
"    def count_positive(group):\n",
"        # Per column, how many rows of the group have a value above zero\n",
"        return (group > 0).sum()\n",
"\n",
"    # Chart 1: Average usage\n",
"    averages = (\n",
"        df.groupby('category')[cols]\n",
"        .mean()\n",
"        .reset_index()\n",
"        .melt(id_vars='category', var_name='timeframe', value_name='value')\n",
"    )\n",
"    averages['category_label'] = averages['category'].map(category_labels)\n",
"    show_grouped_bars(\n",
"        averages,\n",
"        'value',\n",
"        f'{title_name} Usage by Churn Category and Timeframe',\n",
"        'Average Usage',\n",
"    )\n",
"\n",
"    # Chart 2: Percentage of users with at least one\n",
"    users = (\n",
"        df.groupby('category')[cols]\n",
"        .apply(count_positive)\n",
"        .reset_index()\n",
"        .melt(id_vars='category', var_name='timeframe', value_name='count')\n",
"    )\n",
"    users['total'] = users['category'].map(category_totals)\n",
"    users['percentage'] = (users['count'] / users['total']) * 100\n",
"    users['category_label'] = users['category'].map(category_labels)\n",
"    show_grouped_bars(\n",
"        users,\n",
"        'percentage',\n",
"        f'% of Users with At Least One {title_name} by Churn Category and Timeframe',\n",
"        '% of Users',\n",
"    )\n",
"\n",
"\n",
"# Generate all charts\n",
"for metric in metrics:\n",
"    create_charts_for_metric(metric)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Generate cumulative charts for all metrics\n",
"def create_cumulative_charts_for_metric(metric_name):\n",
"    \"\"\"Show two cumulative grouped bar charts for one metric, split by churn category.\n",
"\n",
"    The first chart plots the running sum of the per-category means of the\n",
"    short-, medium- and long-term columns. The second plots the percentage of\n",
"    rows in each category with a value above zero in any window up to and\n",
"    including each one. Reads the notebook globals df, category_labels,\n",
"    category_order and category_totals, and adds no columns to df.\n",
"\n",
"    Args:\n",
"        metric_name: Column prefix, e.g. 'booking_forms'.\n",
"    \"\"\"\n",
"    cols = [f'{metric_name}_short_term', f'{metric_name}_medium_term', f'{metric_name}_long_term']\n",
"    title_name = metric_name.replace('_', ' ').title()\n",
"\n",
"    cumulative_cols = ['cumulative_short', 'cumulative_medium', 'cumulative_long']\n",
"    timeframe_labels_cumulative = {\n",
"        'cumulative_short': 'Through trial',\n",
"        'cumulative_medium': 'Through 90 days',\n",
"        'cumulative_long': 'Through 6 months'\n",
"    }\n",
"    # Pin both the x-axis order and the legend order\n",
"    chart_orders = {\n",
"        'category_label': category_order,\n",
"        'timeframe_label': ['Through trial', 'Through 90 days', 'Through 6 months']\n",
"    }\n",
"\n",
"    # Chart 1: Cumulative average usage\n",
"    grouped = df.groupby('category')[cols].mean().reset_index()\n",
"    # Make cumulative\n",
"    grouped['cumulative_short'] = grouped[cols[0]]\n",
"    grouped['cumulative_medium'] = grouped[cols[0]] + grouped[cols[1]]\n",
"    grouped['cumulative_long'] = grouped[cols[0]] + grouped[cols[1]] + grouped[cols[2]]\n",
"\n",
"    melted = grouped.melt(id_vars='category', value_vars=cumulative_cols, var_name='timeframe', value_name='value')\n",
"    melted['category_label'] = melted['category'].map(category_labels)\n",
"    melted['timeframe_label'] = melted['timeframe'].map(timeframe_labels_cumulative)\n",
"\n",
"    fig1 = px.bar(\n",
"        melted,\n",
"        x='category_label',\n",
"        y='value',\n",
"        color='timeframe_label',\n",
"        barmode='group',\n",
"        title=f'{title_name} Cumulative Usage by Churn Category',\n",
"        category_orders=chart_orders\n",
"    )\n",
"    fig1.update_layout(\n",
"        xaxis_title='Category',\n",
"        yaxis_title='Cumulative Average Usage',\n",
"        legend_title='Timeframe'\n",
"    )\n",
"    fig1.show()\n",
"\n",
"    # Chart 2: Cumulative percentage of users with at least one\n",
"    # A row counts once it has used the feature in ANY window so far, so the\n",
"    # flags are OR-ed together rather than summing the per-window counts.\n",
"    used_short = df[cols[0]] > 0\n",
"    used_medium = used_short | (df[cols[1]] > 0)\n",
"    used_long = used_medium | (df[cols[2]] > 0)\n",
"    # Build the flags in a local frame so the shared df gains no helper columns\n",
"    usage_flags = pd.DataFrame({\n",
"        'category': df['category'],\n",
"        'cumulative_short': used_short,\n",
"        'cumulative_medium': used_medium,\n",
"        'cumulative_long': used_long\n",
"    })\n",
"    counts = usage_flags.groupby('category')[cumulative_cols].sum().reset_index()\n",
"\n",
"    counts_melted = counts.melt(id_vars='category', value_vars=cumulative_cols, var_name='timeframe', value_name='count')\n",
"    counts_melted['total'] = counts_melted['category'].map(category_totals)\n",
"    counts_melted['percentage'] = (counts_melted['count'] / counts_melted['total']) * 100\n",
"    counts_melted['category_label'] = counts_melted['category'].map(category_labels)\n",
"    counts_melted['timeframe_label'] = counts_melted['timeframe'].map(timeframe_labels_cumulative)\n",
"\n",
"    fig2 = px.bar(\n",
"        counts_melted,\n",
"        x='category_label',\n",
"        y='percentage',\n",
"        color='timeframe_label',\n",
"        barmode='group',\n",
"        title=f'Cumulative % of Users with At Least One {title_name} by Churn Category',\n",
"        category_orders=chart_orders\n",
"    )\n",
"    fig2.update_layout(\n",
"        xaxis_title='Category',\n",
"        yaxis_title='Cumulative % of Users',\n",
"        legend_title='Timeframe'\n",
"    )\n",
"    fig2.show()\n",
"\n",
"\n",
"# Generate all cumulative charts\n",
"for metric in metrics:\n",
"    create_cumulative_charts_for_metric(metric)"
],
"metadata": {},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}