I am creating a Multiple Correspondence Analysis (MCA) plot in R using FactoMineR, factoextra, and ggplot2. The goal is to add confidence ellipses around the archetype categories in the MCA space.
The ellipses produced by stat_ellipse() do not match the distribution of the points:
- For some groups, the ellipse is much larger than the point cloud.
- For others, the ellipse fails to cover most of the actual points.
How can I generate ellipses in an MCA plot that accurately reflect the distribution of the points?
Code:
pacman::p_load(FactoMineR, factoextra, dplyr, gridExtra, tidyr)
# MCA with template as supplementary
mca_input <- all_df |> select(sector, type, template)
mca_res <- MCA(mca_input, quali.sup = 3, graph = FALSE)
# Extract coordinates
mca_coords <- as.data.frame(mca_res$ind$coord)
mca_coords$archetype <- all_df$template
# Test 1: Original variable associations (Fisher)
fish_type <- fisher.test(table(all_df$template, all_df$type), simulate.p.value = TRUE)
fish_sector <- fisher.test(table(all_df$template, all_df$sector), simulate.p.value = TRUE)
# Test 2: MCA dimensional separation (Kruskal-Wallis)
kw_dim1 <- kruskal.test(`Dim 1` ~ archetype, data = mca_coords)
kw_dim2 <- kruskal.test(`Dim 2` ~ archetype, data = mca_coords)
# Plot 1: MCA biplot
p1 <- ggplot() +
geom_hline(yintercept = 0, color = "grey50", linewidth = 0.5, linetype = "dashed") +
geom_vline(xintercept = 0, color = "grey50", linewidth = 0.5, linetype = "dashed") +
geom_jitter(data = mca_coords,
aes(x = `Dim 1`, y = `Dim 2`, color = archetype),
size = 3, alpha = 0.6, width = 0.03, height = 0.03) +
stat_ellipse(data = mca_coords,
aes(x = `Dim 1`, y = `Dim 2`, color = archetype),
level = 0.68, linewidth = 0.7) +
labs(title = "(A) Archetype Clustering in Feature Space",
x = paste0("Dim 1: Essential ↔ Non-essential (", round(mca_res$eig[1,2], 1), "%)"),
y = paste0("Dim 2: Retail/Commercial ↔ Industrial (", round(mca_res$eig[2,2], 1), "%)"),
color = "Archetype") +
theme_minimal() +
theme(panel.grid = element_blank(),
legend.position = "bottom")
p1
Dataset:
> dput(all_df)
structure(list(city = c("amsterdam", "ba", "berlin", "brisbane",
"cairo", "caracas", "dallas", "delhi", "dubai", "frankfurt",
"guangzhou", "istanbul", "johannesburg", "la", "lima", "london",
"madrid", "manchester", "melbourne", "milan", "mumbai", "munich",
"nairobi", "paris", "pune", "rio", "rome", "santiago", "shanghai",
"shenzhen", "sydney", "vienna", "almaty", "amsterdam", "ba",
"baku", "caracas", "chicago", "dallas", "johannesburg", "la",
"lima", "madrid", "manchester", "melbourne", "mexico", "milan",
"ny", "paris", "abu", "almaty", "amsterdam", "athens", "ba",
"baku", "beijing", "berlin", "brisbane", "cairo", "cape", "caracas",
"chicago", "dallas", "delhi", "dubai", "frankfurt", "guangzhou",
"hk", "istanbul", "jeddah", "johannesburg", "la", "lahore", "lima",
"london", "madrid", "manchester", "melbourne", "mexico", "milan",
"mumbai", "munich", "nairobi", "ny", "paris", "pune", "rio",
"riyadh", "rome", "santiago", "shanghai", "shenzhen", "sp", "sydney",
"vienna", "wash", "wuhan"), template = c("Chronic decline", "Resilient",
"Chronic decline", "Resilient", "Full recovery", "Resilient",
"Resilient", "Full recovery", "Full recovery", "Chronic decline",
"Partial recovery", "Chronic decline", "Chronic decline", "Full recovery",
"Resilient", "Chronic decline", "Full recovery", "Chronic decline",
"Partial recovery", "Chronic decline", "Full recovery", "Chronic decline",
"Full recovery", "Chronic decline", "Resilient", "Full recovery",
"Chronic decline", "Resilient", "Chronic decline", "Resilient",
"Partial recovery", "Chronic decline", "Resilient", "Chronic decline",
"Resilient", "Resilient", "Resilient", "Full recovery", "Resilient",
"Chronic decline", "Resilient", "Resilient", "Full recovery",
"Chronic decline", "Partial recovery", "Full recovery", "Chronic decline",
"Resilient", "Chronic decline", "Chronic decline", "Partial recovery",
"Chronic decline", "Full recovery", "Resilient", "Resilient",
"Resilient", "Chronic decline", "Resilient", "Partial recovery",
"Chronic decline", "Resilient", "Partial recovery", "Resilient",
"Full recovery", "Full recovery", "Chronic decline", "Partial recovery",
"Full recovery", "Chronic decline", "Chronic decline", "Chronic decline",
"Partial recovery", "Partial recovery", "Resilient", "Chronic decline",
"Full recovery", "Chronic decline", "Full recovery", "Full recovery",
"Chronic decline", "Resilient", "Chronic decline", "Partial recovery",
"Resilient", "Chronic decline", "Resilient", "Full recovery",
"Full recovery", "Full recovery", "Resilient", "Chronic decline",
"Resilient", "Resilient", "Partial recovery", "Chronic decline",
"Partial recovery", "Resilient"), type = c("non-essential", "mix",
"non-essential", "mix", "mix", "mix", "mix", "mix", "non-essential",
"non-essential", "non-essential", "non-essential", "mix", "mix",
"non-essential", "non-essential", "mix", "non-essential", "mix",
"non-essential", "non-essential", "non-essential", "mix", "non-essential",
"non-essential", "mix", "non-essential", "mix", "non-essential",
"non-essential", "mix", "non-essential", "essential", "non-essential",
"mix", "essential", "mix", "mix", "mix", "non-essential", "mix",
"essential", "mix", "non-essential", "mix", "non-essential",
"non-essential", "mix", "non-essential", "mix", "mix", "non-essential",
"mix", "mix", "mix", "essential", "non-essential", "mix", "non-essential",
"non-essential", "essential", "mix", "mix", "mix", "non-essential",
"non-essential", "non-essential", "mix", "non-essential", "non-essential",
"non-essential", "mix", "mix", "mix", "non-essential", "mix",
"non-essential", "mix", "mix", "non-essential", "mix", "non-essential",
"non-essential", "non-essential", "mix", "mix", "mix", "non-essential",
"mix", "essential", "non-essential", "non-essential", "mix",
"non-essential", "non-essential", "non-essential", "mix"), sector = c("Commercial",
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial",
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial",
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial",
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial",
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial",
"Commercial", "Commercial", "Commercial", "Commercial", "Commercial",
"Commercial", "Retail", "Retail", "Retail", "Retail", "Retail",
"Retail", "Retail", "Retail", "Retail", "Retail", "Retail", "Retail",
"Retail", "Retail", "Retail", "Retail", "Retail", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial", "Industrial", "Industrial", "Industrial",
"Industrial", "Industrial")), class = "data.frame", row.names = c(NA,
-97L))
Session Info:
R version 4.5.2 (2025-10-31 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26200)
Matrix products: default
LAPACK version 3.12.1
locale:
[1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 LC_MONETARY=English_United States.utf8
[4] LC_NUMERIC=C LC_TIME=English_United States.utf8
time zone: Europe/Bucharest
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tidyr_1.3.1 gridExtra_2.3 dplyr_1.1.4 factoextra_1.0.7 ggplot2_4.0.1 FactoMineR_2.12
loaded via a namespace (and not attached):
[1] utf8_1.2.6 sandwich_3.1-1 generics_0.1.4 lattice_0.22-7 digest_0.6.38 magrittr_2.0.4
[7] grid_4.5.2 estimability_1.5.1 RColorBrewer_1.1-3 mvtnorm_1.3-3 fastmap_1.2.0 Matrix_1.7-4
[13] ggrepel_0.9.6 Formula_1.2-5 survival_3.8-3 multcomp_1.4-29 purrr_1.2.0 scales_1.4.0
[19] TH.data_1.1-5 isoband_0.2.7 codetools_0.2-20 abind_1.4-8 cli_3.6.5 rlang_1.1.6
[25] scatterplot3d_0.3-44 splines_4.5.2 leaps_3.2 withr_3.0.2 tools_4.5.2 multcompView_0.1-10
[31] coda_0.19-4.1 DT_0.34.0 flashClust_1.01-2 vctrs_0.6.5 R6_2.6.1 zoo_1.8-14
[37] lifecycle_1.0.4 emmeans_2.0.0 car_3.1-3 htmlwidgets_1.6.4 MASS_7.3-65 cluster_2.1.8.1
[43] pkgconfig_2.0.3 pillar_1.11.1 gtable_0.3.6 glue_1.8.0 Rcpp_1.1.0 tibble_3.3.0
[49] tidyselect_1.2.1 rstudioapi_0.17.1 dichromat_2.0-0.1 farver_2.1.2 xtable_1.8-4 htmltools_0.5.8.1
[55] carData_3.0-5 labeling_0.4.3 compiler_4.5.2 S7_0.2.1


fviz_mca_ind(mca_res, addEllipses = TRUE, habillage = "template")then add the argumenttype = "norm"tostat_ellipse.