Context ────────
• Metric: registration_success (binary 0/1, first-time users only)
• Baseline conversion rate: 15 %
• Target lift (MDE): +20 % relative (18 % vs 15 %)
• Power / α: 80 %, two-sided 5 %
• Fixed-sample size: 2 276 users per variant
→ ≈ 39 days per variant at ~59 new users / variant / day (≈ 77 days if the ~59 new users / day are actually split across both variants), and traffic may drop during the test.
Constraint
──────────
No pre-period data (all users are new), so CUPED/CUPAC is off the table.
What I’ve explored so far
─────────────────────────
I’m looking into post-stratification (device, U.S. state, hour-of-day).
For continuous metrics I have code that works; adapting it to a binary endpoint hasn’t produced an obvious power gain yet (I’ll include the code snippet in the post).
Question
────────
• Which practical methods—variance-reduction, sequential, Bayesian, or other—can realistically shave double-digit days off this test without pre-period data and with only ≈ 60 users / variant / day?
A worked example (or simulation) comparing achieved power vs. a fixed-sample z-test would be hugely appreciated.
I first wrote this simulator for continuous metrics, where adding covariates / post-stratification noticeably boosted power. After switching the outcome to binary (registration = 0/1), the same variance-reduction ideas didn’t move the needle. I’d really appreciate it if someone could post a simulation (or a worked example) showing a variant that does cut run-time for a low-traffic binary metric like this.
# Re-run after reset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
# ---------- functions ----------
def generate_data(
    sample_size=1000,
    effect=0,
    min_base=1000,
    max_base=3000,
    delta_strat=300,
    noise_std=300,
    seed=None,
):
    """
    Generate a simulated A/B dataset with a continuous metric and three strata.

    (NB: this is still continuous; adjust for binary if needed.)

    Parameters
    ----------
    sample_size : int
        Users per variant; the returned frame has ``2 * sample_size`` rows.
    effect : float
        Additive treatment lift applied to ``value`` for the treatment group.
    min_base, max_base : float
        Range of the uniform per-user baseline.
    delta_strat : float
        Stratum effect; stratum ``s`` shifts the baseline by ``s**2 * delta_strat``.
    noise_std : float
        Std of the i.i.d. Gaussian noise added to both metric columns.
    seed : int or None
        If given, draw from a dedicated ``np.random.RandomState(seed)`` for
        reproducibility; otherwise use the global NumPy random state
        (backward-compatible default behaviour).

    Returns
    -------
    pd.DataFrame
        Columns: ``group`` (0=control, 1=treatment), ``strat`` (0/1/2),
        ``value`` (post-period metric incl. effect), ``value_before``
        (pre-period metric, independent noise, no effect).
    """
    # RandomState mirrors the np.random module API (randint/uniform/normal),
    # so seeding changes nothing except determinism.
    rng = np.random if seed is None else np.random.RandomState(seed)
    strat = rng.randint(0, 3, sample_size * 2)
    base = rng.uniform(min_base, max_base, sample_size * 2)
    base_with_strat = base + strat**2 * delta_strat
    noises = rng.normal(0, noise_std, (2, sample_size * 2))
    group = np.hstack([np.zeros(sample_size), np.ones(sample_size)])
    df = pd.DataFrame(
        {
            "group": group,
            "strat": strat,
            "value": base_with_strat + noises[0] + effect * group,
            "value_before": base_with_strat + noises[1],
        }
    )
    return df
def check_ttest(a, b):
    """Compare the raw metric between the two groups with a plain t-test.

    Parameters are DataFrames with a ``value`` column; returns the
    two-sided p-value.
    """
    _, pvalue = stats.ttest_ind(a["value"], b["value"])
    return pvalue
def calc_strat_mean(df, weights):
    """Population-weighted mean of ``value`` across strata.

    ``weights`` is a Series indexed by stratum; alignment is by index.
    """
    per_stratum = df.groupby("strat")["value"].mean()
    return per_stratum.mul(weights).sum()
def calc_strat_var(df, weights):
    """Population-weighted within-stratum sample variance of ``value``.

    ``weights`` is a Series indexed by stratum; alignment is by index.
    """
    per_stratum = df.groupby("strat")["value"].var(ddof=1)
    return per_stratum.mul(weights).sum()
def check_strat(a, b, weights):
    """Post-stratified two-sample test (Wald approximation).

    Stratum means and sample variances are combined with the supplied
    population ``weights`` (Series indexed by stratum); a Welch-style
    effective degrees of freedom is used for the reference t distribution.
    Returns the two-sided p-value.
    """
    n_a, n_b = len(a), len(b)
    grp_a = a.groupby("strat")["value"]
    grp_b = b.groupby("strat")["value"]
    # weighted stratum means and variances (helpers inlined)
    mu_a = (grp_a.mean() * weights).sum()
    mu_b = (grp_b.mean() * weights).sum()
    v_a = (grp_a.var(ddof=1) * weights).sum()
    v_b = (grp_b.var(ddof=1) * weights).sum()
    t_stat = (mu_b - mu_a) / np.sqrt(v_a / n_a + v_b / n_b)
    # Welch–Satterthwaite-style effective degrees of freedom
    df_num = (v_a / n_a + v_b / n_b) ** 2
    df_den = (v_a**2) / (n_a**2 * (n_a - 1)) + (v_b**2) / (n_b**2 * (n_b - 1))
    return 2 * (1 - stats.t.cdf(abs(t_stat), df=df_num / df_den))
def plot_pvalue_distribution(pvals_dict):
    """Plot empirical p-value CDFs; the height at x=alpha is the power.

    ``pvals_dict`` maps a label to an array of simulated p-values.
    """
    grid = np.linspace(0, 1, 1000)
    for label, pvalues in pvals_dict.items():
        cdf = [(pvalues < threshold).mean() for threshold in grid]
        plt.plot(grid, cdf, label=label)
    # diagonal: CDF of a Uniform(0,1) p-value, i.e. a valid test under the null
    plt.plot([0, 1], [0, 1], "k--", alpha=0.8)
    plt.title("Empirical p-value CDF")
    plt.xlabel("p-value")
    plt.legend()
    plt.grid()
    plt.show()
def show_power_ci(pvals_dict, alpha=0.05):
    """Print empirical power (share of p-values below ``alpha``) with a 95% CI.

    The CI is a normal approximation over the simulated rejection indicators.
    """
    for label, pvalues in pvals_dict.items():
        rejections = (pvalues < alpha).astype(float)
        power = rejections.mean()
        stderr = rejections.std(ddof=1) / np.sqrt(len(rejections))
        lo, hi = power - 1.96 * stderr, power + 1.96 * stderr
        print(f"{label:<12} power={power:0.3f}, 95% CI [{lo:0.3f}, {hi:0.3f}]")
def describe_pvalues(pvals_dict):
    """Summarise simulation results: print power estimates, then plot p-value CDFs."""
    show_power_ci(pvals_dict)
    plot_pvalue_distribution(pvals_dict)
# ---------- simulation ----------
n_simulations = 10_000
effect = 100  # treatment lift used in this *continuous* example
sample_size = 1_000
ttest_pvals = []
strat_pvals = []
for _ in range(n_simulations):
    data = generate_data(sample_size=sample_size, effect=effect)
    control = data[data.group == 0]
    treatment = data[data.group == 1]
    # stratum weights estimated from the pooled sample
    stratum_weights = data["strat"].value_counts(normalize=True).sort_index()
    ttest_pvals.append(check_ttest(control, treatment))
    strat_pvals.append(check_strat(control, treatment, stratum_weights))
results = {
    "T-test": np.array(ttest_pvals),
    "Stratified": np.array(strat_pvals),
}
describe_pvalues(results)