First round of fiddling

This commit is contained in:
2026-02-02 16:26:10 -05:00
commit a253955528
9 changed files with 16042 additions and 0 deletions

46
.gitignore vendored Normal file
View File

@ -0,0 +1,46 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual environments
.venv/
venv/
ENV/
# Jupyter
.ipynb_checkpoints/
# IDE
.idea/
.vscode/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db
# Environment
.env
.python-version
# Data (uncomment if you want to ignore data files)
# *.csv

View File

@ -0,0 +1,146 @@
-- Flatten per-user feature usage (booking forms, bookings, employees, contacts)
-- by churn category and by "era" of the user's lifetime, for export/analysis.
-- First establish some base parameters for our users.
WITH config AS (
    SELECT
        14 AS short_term,    -- days; boundary of the first usage era
        90 AS medium_term,   -- days
        360 AS long_term,    -- days; analysis horizon
        CAST('2023-01-01' AS timestamp) AS date_cutoff
),
-- Then establish a standard core set of user data.
users_with_churn_stats AS (
    SELECT
        users.id,
        users.active,
        users.billing_state,
        DATE_TRUNC('month', users.created_at) AS month_created,
        users.created_at,
        users.free_trial_ends_at,
        users.updated_at,
        -- Days between creation and last update, used below as a lifespan proxy.
        users.updated_at::date - users.created_at::date AS lifespan
    FROM users
    CROSS JOIN config
    -- We'll only look at newish users.
    WHERE users.created_at >= config.date_cutoff
),
-- Next use that core data to extract the records we want to analyze
-- and categorize them by date and status. The four branches are mutually
-- exclusive (disjoint lifespan ranges / billing states), so UNION ALL is
-- used instead of UNION to skip a pointless sort-and-dedup pass.
users_to_examine AS (
    -- First the users who exited just after the trial.
    SELECT 'quick-exit' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan <= config.short_term
    UNION ALL
    -- Next, users who gave the tools some time but still left.
    SELECT 'fair-trial' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan > config.short_term
      AND uwcs.lifespan < config.medium_term
    UNION ALL
    -- Then some longer-time users who still churned.
    -- We'll cut this off at long_term days since afterwards
    -- company closures, etc. could factor in.
    SELECT 'short-termer' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan >= config.medium_term
      AND uwcs.lifespan < config.long_term
    UNION ALL
    -- Finally, people who are still kicking.
    -- This cohort is going to be a little wrong for older
    -- dates since it only includes those who are STILL around.
    SELECT 'active-user' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state != 'CHURNED'
      AND uwcs.lifespan >= config.long_term
),
-- We'll add booking forms created by era. created_age is the age of the
-- user's account, in days, when the record was created.
booking_form_counts AS (
    SELECT
        bf.user_id,
        SUM(CASE WHEN bf.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN bf.created_age >= config.short_term AND bf.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN bf.created_age >= config.medium_term AND bf.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT booking_forms.user_id,
               booking_forms.created_at::date - users.created_at::date AS created_age
        FROM booking_forms
        JOIN users ON booking_forms.user_id = users.id
        CROSS JOIN config
        WHERE booking_forms.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) bf
    CROSS JOIN config
    GROUP BY bf.user_id
),
-- Bookings made by era (bookings reach users via their booking form).
booking_counts AS (
    SELECT
        bk.user_id,
        SUM(CASE WHEN bk.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN bk.created_age >= config.short_term AND bk.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN bk.created_age >= config.medium_term AND bk.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT booking_forms.user_id,
               bookings.created_at::date - users.created_at::date AS created_age
        FROM booking_forms
        JOIN users ON booking_forms.user_id = users.id
        JOIN bookings ON bookings.booking_form_id = booking_forms.id
        CROSS JOIN config
        WHERE booking_forms.created_at >= config.date_cutoff
          AND bookings.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) bk
    CROSS JOIN config
    GROUP BY bk.user_id
),
-- Employees added by era.
employee_counts AS (
    SELECT
        empl.user_id,
        SUM(CASE WHEN empl.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN empl.created_age >= config.short_term AND empl.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN empl.created_age >= config.medium_term AND empl.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT employees.user_id,
               employees.created_at::date - users.created_at::date AS created_age
        FROM employees
        JOIN users ON employees.user_id = users.id
        CROSS JOIN config
        WHERE employees.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) empl
    CROSS JOIN config
    GROUP BY empl.user_id
),
-- Contacts (customers) created by era.
contact_counts AS (
    SELECT
        cust.user_id,
        SUM(CASE WHEN cust.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN cust.created_age >= config.short_term AND cust.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN cust.created_age >= config.medium_term AND cust.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT customers.user_id,
               customers.created_at::date - users.created_at::date AS created_age
        FROM customers
        JOIN users ON customers.user_id = users.id
        CROSS JOIN config
        WHERE customers.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) cust
    CROSS JOIN config
    GROUP BY cust.user_id
)
-- Finally, we'll flatten it all out into a repurposable flat table.
-- (The original had a stray comma after the last CTE, which is a syntax
-- error before the final SELECT.)
-- LEFT JOIN + COALESCE keeps users with zero activity as explicit zeros.
SELECT
    users_to_examine.id,
    users_to_examine.category,
    COALESCE(booking_form_counts.short_term, 0) AS booking_forms_short_term,
    COALESCE(booking_form_counts.medium_term, 0) AS booking_forms_medium_term,
    COALESCE(booking_form_counts.long_term, 0) AS booking_forms_long_term,
    COALESCE(booking_form_counts.total, 0) AS booking_forms,
    COALESCE(booking_counts.short_term, 0) AS bookings_short_term,
    COALESCE(booking_counts.medium_term, 0) AS bookings_medium_term,
    COALESCE(booking_counts.long_term, 0) AS bookings_long_term,
    COALESCE(booking_counts.total, 0) AS bookings,
    COALESCE(employee_counts.short_term, 0) AS employees_short_term,
    COALESCE(employee_counts.medium_term, 0) AS employees_medium_term,
    COALESCE(employee_counts.long_term, 0) AS employees_long_term,
    COALESCE(employee_counts.total, 0) AS employees,
    COALESCE(contact_counts.short_term, 0) AS contacts_short_term,
    COALESCE(contact_counts.medium_term, 0) AS contacts_medium_term,
    COALESCE(contact_counts.long_term, 0) AS contacts_long_term,
    COALESCE(contact_counts.total, 0) AS contacts
FROM users_to_examine
LEFT JOIN booking_form_counts ON booking_form_counts.user_id = users_to_examine.id
LEFT JOIN booking_counts ON booking_counts.user_id = users_to_examine.id
LEFT JOIN employee_counts ON employee_counts.user_id = users_to_examine.id
LEFT JOIN contact_counts ON contact_counts.user_id = users_to_examine.id
;

View File

@ -0,0 +1,83 @@
-- Flatten per-user sent-email counts by churn category and by "era" of the
-- user's lifetime, for export/analysis.
-- First establish some base parameters for our users.
WITH config AS (
    SELECT
        14 AS short_term,    -- days; boundary of the first usage era
        90 AS medium_term,   -- days
        360 AS long_term,    -- days; analysis horizon
        CAST('2023-01-01' AS timestamp) AS date_cutoff
),
-- Then establish a standard core set of user data.
users_with_churn_stats AS (
    SELECT
        users.id,
        users.active,
        users.billing_state,
        DATE_TRUNC('month', users.created_at) AS month_created,
        users.created_at,
        users.free_trial_ends_at,
        users.updated_at,
        -- Days between creation and last update, used below as a lifespan proxy.
        users.updated_at::date - users.created_at::date AS lifespan
    FROM users
    CROSS JOIN config
    -- We'll only look at newish users.
    WHERE users.created_at >= config.date_cutoff
),
-- Next use that core data to extract the records we want to analyze
-- and categorize them by date and status. The four branches are mutually
-- exclusive (disjoint lifespan ranges / billing states), so UNION ALL is
-- used instead of UNION to skip a pointless sort-and-dedup pass.
users_to_examine AS (
    -- First the users who exited just after the trial.
    SELECT 'quick-exit' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan <= config.short_term
    UNION ALL
    -- Next, users who gave the tools some time but still left.
    SELECT 'fair-trial' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan > config.short_term
      AND uwcs.lifespan < config.medium_term
    UNION ALL
    -- Then some longer-time users who still churned.
    -- We'll cut this off at long_term days since afterwards
    -- company closures, etc. could factor in.
    SELECT 'short-termer' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan >= config.medium_term
      AND uwcs.lifespan < config.long_term
    UNION ALL
    -- Finally, people who are still kicking.
    -- This cohort is going to be a little wrong for older
    -- dates since it only includes those who are STILL around.
    SELECT 'active-user' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state != 'CHURNED'
      AND uwcs.lifespan >= config.long_term
),
-- Emails sent by era. created_age is the age of the user's account,
-- in days, when the email was sent.
email_counts AS (
    SELECT
        email.user_id,
        SUM(CASE WHEN email.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN email.created_age >= config.short_term AND email.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN email.created_age >= config.medium_term AND email.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT sent_emails.user_id,
               sent_emails.created_at::date - users.created_at::date AS created_age
        FROM sent_emails
        JOIN users ON sent_emails.user_id = users.id
        CROSS JOIN config
        WHERE sent_emails.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) email
    CROSS JOIN config
    GROUP BY email.user_id
)
-- Finally, we'll flatten it all out into a repurposable flat table.
-- LEFT JOIN + COALESCE keeps users with zero activity as explicit zeros.
SELECT
    users_to_examine.id,
    users_to_examine.category,
    COALESCE(email_counts.short_term, 0) AS emails_short_term,
    COALESCE(email_counts.medium_term, 0) AS emails_medium_term,
    COALESCE(email_counts.long_term, 0) AS emails_long_term,
    COALESCE(email_counts.total, 0) AS emails
FROM users_to_examine
LEFT JOIN email_counts ON email_counts.user_id = users_to_examine.id
;

View File

@ -0,0 +1,151 @@
-- Flatten per-user engagement counts (SMS, appointments, checklists,
-- checklist fills) by churn category and by "era" of the user's lifetime.
-- First establish some base parameters for our users.
WITH config AS (
    SELECT
        14 AS short_term,    -- days; boundary of the first usage era
        90 AS medium_term,   -- days
        360 AS long_term,    -- days; analysis horizon
        CAST('2023-01-01' AS timestamp) AS date_cutoff
),
-- Then establish a standard core set of user data.
users_with_churn_stats AS (
    SELECT
        users.id,
        users.active,
        users.billing_state,
        DATE_TRUNC('month', users.created_at) AS month_created,
        users.created_at,
        users.free_trial_ends_at,
        users.updated_at,
        -- Days between creation and last update, used below as a lifespan proxy.
        users.updated_at::date - users.created_at::date AS lifespan
    FROM users
    CROSS JOIN config
    -- We'll only look at newish users.
    WHERE users.created_at >= config.date_cutoff
),
-- Next use that core data to extract the records we want to analyze
-- and categorize them by date and status. The four branches are mutually
-- exclusive (disjoint lifespan ranges / billing states), so UNION ALL is
-- used instead of UNION to skip a pointless sort-and-dedup pass.
users_to_examine AS (
    -- First the users who exited just after the trial.
    SELECT 'quick-exit' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan <= config.short_term
    UNION ALL
    -- Next, users who gave the tools some time but still left.
    SELECT 'fair-trial' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan > config.short_term
      AND uwcs.lifespan < config.medium_term
    UNION ALL
    -- Then some longer-time users who still churned.
    -- We'll cut this off at long_term days since afterwards
    -- company closures, etc. could factor in.
    SELECT 'short-termer' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state = 'CHURNED'
      AND uwcs.lifespan >= config.medium_term
      AND uwcs.lifespan < config.long_term
    UNION ALL
    -- Finally, people who are still kicking.
    -- This cohort is going to be a little wrong for older
    -- dates since it only includes those who are STILL around.
    SELECT 'active-user' AS category, uwcs.*
    FROM users_with_churn_stats uwcs
    CROSS JOIN config
    WHERE uwcs.billing_state != 'CHURNED'
      AND uwcs.lifespan >= config.long_term
),
-- Messages sent by era. created_age is the age of the user's account,
-- in days, when the record was created.
sms_counts AS (
    SELECT
        texts.user_id,
        SUM(CASE WHEN texts.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN texts.created_age >= config.short_term AND texts.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN texts.created_age >= config.medium_term AND texts.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT sent_texts.user_id,
               sent_texts.created_at::date - users.created_at::date AS created_age
        FROM sent_texts
        JOIN users ON sent_texts.user_id = users.id
        CROSS JOIN config
        WHERE sent_texts.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) texts
    CROSS JOIN config
    GROUP BY texts.user_id
),
-- Appointments by era.
appointment_counts AS (
    SELECT
        appts.user_id,
        SUM(CASE WHEN appts.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN appts.created_age >= config.short_term AND appts.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN appts.created_age >= config.medium_term AND appts.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT appointments.user_id,
               appointments.created_at::date - users.created_at::date AS created_age
        FROM appointments
        JOIN users ON appointments.user_id = users.id
        CROSS JOIN config
        WHERE appointments.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) appts
    CROSS JOIN config
    GROUP BY appts.user_id
),
-- Checklists created by era (checklists reach users via their template).
checklist_counts AS (
    SELECT
        checks.user_id,
        SUM(CASE WHEN checks.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN checks.created_age >= config.short_term AND checks.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN checks.created_age >= config.medium_term AND checks.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT checklist_templates.user_id,
               checklists.created_at::date - users.created_at::date AS created_age
        FROM checklist_templates
        JOIN users ON checklist_templates.user_id = users.id
        JOIN checklists ON checklists.checklist_template_id = checklist_templates.id
        CROSS JOIN config
        WHERE checklist_templates.created_at >= config.date_cutoff
          AND checklists.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) checks
    CROSS JOIN config
    GROUP BY checks.user_id
),
-- Checklist events created by era (events reach users via the employee).
checklist_filled_counts AS (
    SELECT
        events.user_id,
        SUM(CASE WHEN events.created_age < config.short_term THEN 1 ELSE 0 END) AS short_term,
        SUM(CASE WHEN events.created_age >= config.short_term AND events.created_age < config.medium_term THEN 1 ELSE 0 END) AS medium_term,
        SUM(CASE WHEN events.created_age >= config.medium_term AND events.created_age < config.long_term THEN 1 ELSE 0 END) AS long_term,
        COUNT(*) AS total
    FROM (
        SELECT employees.user_id,
               checklist_item_events.created_at::date - users.created_at::date AS created_age
        FROM employees
        JOIN users ON employees.user_id = users.id
        JOIN checklist_item_events ON checklist_item_events.employee_id = employees.id
        CROSS JOIN config
        WHERE checklist_item_events.created_at >= config.date_cutoff
          -- NOTE(review): the original duplicated the checklist_item_events
          -- cutoff filter here; filtering employees matches the sibling CTEs,
          -- which apply the cutoff to every joined table. Confirm intent.
          AND employees.created_at >= config.date_cutoff
          AND users.created_at >= config.date_cutoff
    ) events
    CROSS JOIN config
    GROUP BY events.user_id
)
-- Finally, we'll flatten it all out into a repurposable flat table.
-- LEFT JOIN + COALESCE keeps users with zero activity as explicit zeros.
SELECT
    users_to_examine.id,
    users_to_examine.category,
    users_to_examine.month_created,
    users_to_examine.lifespan,
    COALESCE(sms_counts.short_term, 0) AS sms_short_term,
    COALESCE(sms_counts.medium_term, 0) AS sms_medium_term,
    COALESCE(sms_counts.long_term, 0) AS sms_long_term,
    COALESCE(sms_counts.total, 0) AS sms,
    COALESCE(appointment_counts.short_term, 0) AS appointments_short_term,
    COALESCE(appointment_counts.medium_term, 0) AS appointments_medium_term,
    COALESCE(appointment_counts.long_term, 0) AS appointments_long_term,
    COALESCE(appointment_counts.total, 0) AS appointments,
    COALESCE(checklist_counts.short_term, 0) AS checklists_short_term,
    COALESCE(checklist_counts.medium_term, 0) AS checklists_medium_term,
    COALESCE(checklist_counts.long_term, 0) AS checklists_long_term,
    COALESCE(checklist_counts.total, 0) AS checklists,
    COALESCE(checklist_filled_counts.short_term, 0) AS checklists_filled_short_term,
    COALESCE(checklist_filled_counts.medium_term, 0) AS checklists_filled_medium_term,
    COALESCE(checklist_filled_counts.long_term, 0) AS checklists_filled_long_term,
    COALESCE(checklist_filled_counts.total, 0) AS checklists_filled
FROM users_to_examine
LEFT JOIN sms_counts ON sms_counts.user_id = users_to_examine.id
LEFT JOIN appointment_counts ON appointment_counts.user_id = users_to_examine.id
LEFT JOIN checklist_counts ON checklist_counts.user_id = users_to_examine.id
LEFT JOIN checklist_filled_counts ON checklist_filled_counts.user_id = users_to_examine.id
;

15
README.md Normal file
View File

@ -0,0 +1,15 @@
# Jupyter Analysis
## Setup
```bash
poetry install
```
## Run
```bash
poetry run jupyter notebook
# or
poetry run jupyter lab
```

184
churn-analysis.ipynb Normal file
View File

@ -0,0 +1,184 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"import numpy as np\n",
"\n",
"# Load CSV file\n",
"df = pd.read_csv('churn.csv')\n",
"\n",
"category_labels = {\n",
" 'quick-exit': 'Free trial only',\n",
" 'fair-trial': '74 day churn',\n",
" 'short-termer': '6 month churn',\n",
" 'active-user': 'No churn'\n",
"}\n",
"timeframe_labels = {\n",
" 'short term': 'During free trial',\n",
" 'medium term': 'After trial, before 90 days',\n",
" 'long term': 'After 90 days, first 6 months'\n",
"}\n",
"\n",
"def fix_dataset_label(k):\n",
" parts = k.split('_')\n",
" if \"term\" in parts:\n",
"        timeframe = timeframe_labels[\" \".join(parts[-2:])]\n",
"        name = \" \".join(parts[0:-2]).title()\n",
" else:\n",
" timeframe = 'Lifetime'\n",
"        name = \" \".join(parts).title()\n",
" return f\"{name}: {timeframe}\"\n",
"\n",
"def x_labels(categories):\n",
" return [category_labels[cat] for cat in categories]\n",
"\n",
"def dataset_labels(columns):\n",
" return [fix_dataset_label(col) for col in columns]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>booking_forms_short_term</th>\n",
" <th>booking_forms_medium_term</th>\n",
" <th>booking_forms_long_term</th>\n",
" </tr>\n",
" <tr>\n",
" <th>category</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>active-user</th>\n",
" <td>0.046701</td>\n",
" <td>0.041571</td>\n",
" <td>0.040686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>fair-trial</th>\n",
" <td>0.268002</td>\n",
" <td>0.020303</td>\n",
" <td>0.000271</td>\n",
" </tr>\n",
" <tr>\n",
" <th>quick-exit</th>\n",
" <td>0.232673</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>short-termer</th>\n",
" <td>0.290954</td>\n",
" <td>0.082213</td>\n",
" <td>0.029034</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" booking_forms_short_term booking_forms_medium_term \\\n",
"category \n",
"active-user 0.046701 0.041571 \n",
"fair-trial 0.268002 0.020303 \n",
"quick-exit 0.232673 0.000000 \n",
"short-termer 0.290954 0.082213 \n",
"\n",
" booking_forms_long_term \n",
"category \n",
"active-user 0.040686 \n",
"fair-trial 0.000271 \n",
"quick-exit 0.000000 \n",
"short-termer 0.029034 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = ['booking_forms_short_term', 'booking_forms_medium_term', 'booking_forms_long_term']\n",
"categories = category_labels.keys() # All\n",
"\n",
"# Group by category and make sure they're in the correct order\n",
"df.groupby('category')[columns].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "# Order categories as defined in category_labels\ncategory_order = [category_labels[k] for k in category_labels.keys()]"
},
{
"cell_type": "code",
"source": "# Generate charts for all metrics\nmetrics = [\n    'booking_forms',\n    'bookings',\n    'employees', \n    'contacts',\n    'emails',\n    'sms',\n    'appointments',\n    'checklists',\n    'checklists_filled'\n]\n\n# Get total users per category for percentage calculations\ncategory_totals = df.groupby('category').size()\n\ndef create_charts_for_metric(metric_name):\n    cols = [f'{metric_name}_short_term', f'{metric_name}_medium_term', f'{metric_name}_long_term']\n    title_name = metric_name.replace('_', ' ').title()\n    \n    # Chart 1: Average usage\n    grouped = df.groupby('category')[cols].mean().reset_index()\n    melted = grouped.melt(id_vars='category', var_name='timeframe', value_name='value')\n    melted['category_label'] = melted['category'].map(category_labels)\n    \n    fig1 = px.bar(\n        melted,\n        x='category_label',\n        y='value',\n        color='timeframe',\n        barmode='group',\n        title=f'{title_name} Usage by Churn Category and Timeframe',\n        category_orders={'category_label': category_order}\n    )\n    fig1.update_layout(\n        xaxis_title='Category',\n        yaxis_title='Average Usage',\n        legend_title='Timeframe'\n    )\n    fig1.show()\n    \n    # Chart 2: Percentage of users with at least one\n    counts = df.groupby('category')[cols].apply(lambda x: (x > 0).sum()).reset_index()\n    counts_melted = counts.melt(id_vars='category', var_name='timeframe', value_name='count')\n    counts_melted['total'] = counts_melted['category'].map(category_totals)\n    counts_melted['percentage'] = (counts_melted['count'] / counts_melted['total']) * 100\n    counts_melted['category_label'] = counts_melted['category'].map(category_labels)\n    \n    fig2 = px.bar(\n        counts_melted,\n        x='category_label',\n        y='percentage',\n        color='timeframe',\n        barmode='group',\n        title=f'% of Users with At Least One {title_name} by Churn Category and Timeframe',\n        category_orders={'category_label': category_order}\n    )\n    fig2.update_layout(\n        xaxis_title='Category',\n        yaxis_title='% of Users',\n        legend_title='Timeframe'\n    )\n    fig2.show()\n\n# Generate all charts\nfor metric in metrics:\n    create_charts_for_metric(metric)",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "# Generate cumulative charts for all metrics\ndef create_cumulative_charts_for_metric(metric_name):\n    cols = [f'{metric_name}_short_term', f'{metric_name}_medium_term', f'{metric_name}_long_term']\n    title_name = metric_name.replace('_', ' ').title()\n    \n    # Chart 1: Cumulative average usage\n    grouped = df.groupby('category')[cols].mean().reset_index()\n    # Make cumulative\n    grouped['cumulative_short'] = grouped[cols[0]]\n    grouped['cumulative_medium'] = grouped[cols[0]] + grouped[cols[1]]\n    grouped['cumulative_long'] = grouped[cols[0]] + grouped[cols[1]] + grouped[cols[2]]\n    \n    cumulative_cols = ['cumulative_short', 'cumulative_medium', 'cumulative_long']\n    melted = grouped.melt(id_vars='category', value_vars=cumulative_cols, var_name='timeframe', value_name='value')\n    melted['category_label'] = melted['category'].map(category_labels)\n    \n    timeframe_labels_cumulative = {\n        'cumulative_short': 'Through trial',\n        'cumulative_medium': 'Through 90 days',\n        'cumulative_long': 'Through 6 months'\n    }\n    melted['timeframe_label'] = melted['timeframe'].map(timeframe_labels_cumulative)\n    \n    fig1 = px.bar(\n        melted,\n        x='category_label',\n        y='value',\n        color='timeframe_label',\n        barmode='group',\n        title=f'{title_name} Cumulative Usage by Churn Category',\n        category_orders={\n            'category_label': category_order,\n            'timeframe_label': ['Through trial', 'Through 90 days', 'Through 6 months']\n        }\n    )\n    fig1.update_layout(\n        xaxis_title='Category',\n        yaxis_title='Cumulative Average Usage',\n        legend_title='Timeframe'\n    )\n    fig1.show()\n    \n    # Chart 2: Cumulative percentage of users with at least one\n    # Check if user used feature in ANY period up to that point (not sum of counts)\n    df['_cumul_short'] = df[cols[0]] > 0\n    df['_cumul_medium'] = (df[cols[0]] > 0) | (df[cols[1]] > 0)\n    df['_cumul_long'] = (df[cols[0]] > 0) | (df[cols[1]] > 0) | (df[cols[2]] > 0)\n    \n    counts = df.groupby('category')[['_cumul_short', '_cumul_medium', '_cumul_long']].sum().reset_index()\n    counts.columns = ['category', 'cumulative_short', 'cumulative_medium', 'cumulative_long']\n    \n    counts_melted = counts.melt(id_vars='category', value_vars=cumulative_cols, var_name='timeframe', value_name='count')\n    counts_melted['total'] = counts_melted['category'].map(category_totals)\n    counts_melted['percentage'] = (counts_melted['count'] / counts_melted['total']) * 100\n    counts_melted['category_label'] = counts_melted['category'].map(category_labels)\n    counts_melted['timeframe_label'] = counts_melted['timeframe'].map(timeframe_labels_cumulative)\n    \n    fig2 = px.bar(\n        counts_melted,\n        x='category_label',\n        y='percentage',\n        color='timeframe_label',\n        barmode='group',\n        title=f'Cumulative % of Users with At Least One {title_name} by Churn Category',\n        category_orders={\n            'category_label': category_order,\n            'timeframe_label': ['Through trial', 'Through 90 days', 'Through 6 months']\n        }\n    )\n    fig2.update_layout(\n        xaxis_title='Category',\n        yaxis_title='Cumulative % of Users',\n        legend_title='Timeframe'\n    )\n    fig2.show()\n\n# Generate all cumulative charts\nfor metric in metrics:\n    create_cumulative_charts_for_metric(metric)",
"metadata": {},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

12822
churn.csv Normal file

File diff suppressed because it is too large Load Diff

2579
poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

16
pyproject.toml Normal file
View File

@ -0,0 +1,16 @@
[tool.poetry]
name = "jupyter-analysis"
version = "0.1.0"
description = "Jupyter notebook project for data analysis and charting"
authors = ["Justin"]
package-mode = false
[tool.poetry.dependencies]
python = "^3.13"
jupyter = "^1.1.1"
pandas = "^3.0.0"
plotly = "^6.0.0"
[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"