Factor analysis of mixed data

Resources

Wikipedia article
Escofier, B. (1979). Traitement simultané de variables qualitatives et quantitatives en analyse factorielle. Les cahiers de l’analyse des données, 4(2), 137-146.

Data

Factor analysis of mixed data (FAMD) is a general-purpose method that supports both numeric and categorical data.

import prince

# Load the beers dataset bundled with prince and keep only its first 1000 rows.
dataset = prince.datasets.load_beers().head(1000)
# Preview the first rows of the resulting table.
dataset.head()

	is_organic	style	alcohol_by_volume	international_bitterness_units	standard_reference_method	final_gravity
name
Lightshine Radler	False	Blonde	4.50	20.0	5.0	1.012
LightSwitch Lager	False	American Light Lager	3.95	7.5	3.0	1.005
Lightwave Belgian Pale	False	Belgian Pale	5.00	25.0	9.0	1.011
Like Weisse	False	Berlinerweisse	3.10	4.5	3.0	1.005
Lil Heaven Session IPA	False	Session	4.55	20.0	2.0	1.007

Fitting

The rescale_with_mean and rescale_with_std parameters control whether centering and standardization are applied to the underlying PCA. By default, rescale_with_mean is True and rescale_with_std is False.

# Every constructor argument is spelled out explicitly, and the estimator is
# built and fitted on the beers data in one expression (fit returns the
# fitted estimator).
famd = prince.FAMD(
    n_components=2,
    n_iter=3,
    rescale_with_mean=True,
    rescale_with_std=False,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    handle_unknown="error",  # same parameter as sklearn.preprocessing.OneHotEncoder
).fit(dataset)

Eigenvalues

# Eigenvalue, share of variance and cumulative share of variance per component.
famd.eigenvalues_summary

	eigenvalue	% of variance	% of variance (cumulative)
component
0	3.735	3.70%	3.70%
1	1.662	1.65%	5.34%

Coordinates

# Coordinates of the dataset's rows on the fitted components (first rows only).
famd.row_coordinates(dataset).head()

component	0	1
name
Lightshine Radler	-1.795872	-0.316854
LightSwitch Lager	-3.351119	-0.193896
Lightwave Belgian Pale	-1.429076	-0.083288
Like Weisse	-3.774585	-0.255144
Lil Heaven Session IPA	-2.570021	-0.069867

# Coordinates of every input variable, numeric and categorical, on each component.
famd.column_coordinates_

component	0	1
variable
alcohol_by_volume	8.474727e-01	0.024329
international_bitterness_units	6.648504e-01	0.224303
standard_reference_method	3.828369e-01	0.386307
final_gravity	8.401140e-01	0.025106
is_organic	6.361371e-07	0.002685
style	9.994811e-01	0.999218

Visualization

# Plot the dataset with component 0 on the x-axis and component 1 on the y-axis.
famd.plot(dataset, x_component=0, y_component=1)

Contributions

# The five rows that contribute most to component 0, rendered as percentages
# with three decimals.
(
    famd.row_contributions_
    .sort_values(0, ascending=False)
    .head(5)
    .style.format("{:.3%}")
)

component	0	1
name
Agamemnon	0.536%	0.255%
Epic Blackout Stout	0.536%	0.202%
Entire Wood Aged Stout	0.536%	0.202%
Agamemnon Monster Shake	0.536%	0.202%
After Midnight Imperial Stout	0.536%	0.202%

# Contribution of each variable to each component, rendered as whole percentages.
famd.column_contributions_.style.format("{:.0%}")

component	0	1
variable
alcohol_by_volume	23%	1%
international_bitterness_units	18%	13%
standard_reference_method	10%	23%
final_gravity	22%	2%
is_organic	0%	0%
style	27%	60%

Wikipedia example

The Wikipedia article on FAMD uses a small dataset with 6 individuals, 3 quantitative variables (k₁, k₂, k₃) and 3 qualitative variables (q₁, q₂, q₃). Let’s reproduce it.

import pandas as pd

# Toy dataset from the Wikipedia FAMD article: six individuals (rows i1..i6),
# three quantitative variables (k1..k3) and three qualitative ones (q1..q3).
wiki = pd.DataFrame(
    data=dict(
        k1=[2.0, 5.0, 3.0, 4.0, 1.0, 6.0],
        k2=[4.5, 4.5, 1.0, 1.0, 1.0, 1.0],
        k3=[4.0, 4.0, 2.0, 2.0, 1.0, 2.0],
        q1=["A", "C", "B", "B", "A", "C"],
        q2=["B", "B", "B", "B", "A", "A"],
        q3=["C", "C", "B", "B", "A", "A"],
    ),
    index=["i1", "i2", "i3", "i4", "i5", "i6"],
)
wiki

	k1	k2	k3	q1	q2	q3
i1	2.0	4.5	4.0	A	B	C
i2	5.0	4.5	4.0	C	B	C
i3	3.0	1.0	2.0	B	B	B
i4	4.0	1.0	2.0	B	B	B
i5	1.0	1.0	1.0	A	A	A
i6	6.0	1.0	2.0	C	A	A

# Fit a five-component FAMD on the Wikipedia dataset using the scipy engine,
# then show the eigenvalue summary of the fitted model.
wiki_famd = prince.FAMD(n_components=5, engine="scipy").fit(wiki)
wiki_famd.eigenvalues_summary

	eigenvalue	% of variance	% of variance (cumulative)
component
0	3.467	43.34%	43.34%
1	2.500	31.25%	74.59%
2	1.971	24.64%	99.22%
3	0.056	0.70%	99.92%
4	0.006	0.08%	100.00%

import altair as alt

# Coordinates of the six individuals on the fitted components.
rc = wiki_famd.row_coordinates(wiki)
# Rename the numeric component labels (0, 1, ...) to F1, F2, ...
rc.columns = [f"F{c + 1}" for c in rc.columns]
# Copy the individual names into a regular column so they are still available
# as text labels after the index is dropped below.
rc["label"] = rc.index

# Scatter of text marks: each individual's name drawn at its (F1, F2) position.
alt.Chart(rc.reset_index(drop=True)).mark_text(fontSize=14).encode(
    x=alt.X("F1:Q", title="F1"),
    y=alt.Y("F2:Q", title="F2"),
    text="label:N",
).properties(title="Figure 1 — Individuals", width=400, height=300)

# Coordinates of every variable on the first two components; they are plotted
# below on axes labelled r² / η².
cc = wiki_famd.column_coordinates_.iloc[:, :2].copy()
cc.columns = ["F1", "F2"]
cc["variable"] = cc.index
# Derive each variable's type from the dtype of its source column instead of
# from its position, so the labels stay correct whatever the row order of
# column_coordinates_ is.
cc["type"] = [
    "quanti" if pd.api.types.is_numeric_dtype(wiki[name]) else "quali"
    for name in cc.index
]

# Text marks on a [0, 1.05] x [0, 1.05] square, coloured by variable type.
points = (
    alt.Chart(cc.reset_index(drop=True))
    .mark_text(fontSize=14)
    .encode(
        x=alt.X("F1:Q", title="F1 (r² / η²)", scale=alt.Scale(domain=[0, 1.05])),
        y=alt.Y("F2:Q", title="F2 (r² / η²)", scale=alt.Scale(domain=[0, 1.05])),
        text="variable:N",
        color=alt.Color("type:N", legend=alt.Legend(title="Variable type")),
    )
)

points.properties(title="Figure 2 — Relationship square", width=400, height=400)

import numpy as np

# Names of the quantitative and qualitative variables of the Wikipedia dataset.
num_cols = ["k1", "k2", "k3"]
cat_cols = ["q1", "q2", "q3"]

# Column correlations of the quantitative variables, restricted to the first
# two components and relabelled F1 / F2.
corr_circle = wiki_famd.column_correlations.loc[num_cols].iloc[:, :2].copy()
corr_circle.columns = ["F1", "F2"]
corr_circle["variable"] = corr_circle.index

# One arrow per variable, running from the origin (x, y) = (0, 0) to the
# variable's (F1, F2) point; the scalar zeros are broadcast to every row.
arrows = pd.DataFrame(
    {
        "x": 0,
        "y": 0,
        "x2": corr_circle["F1"].to_numpy(),
        "y2": corr_circle["F2"].to_numpy(),
        "variable": corr_circle["variable"].to_numpy(),
    }
)

# 100 evenly spaced angles covering one full turn, and the matching points of
# the unit circle; "order" records the drawing order of the points.
circle_theta = np.linspace(0, 2 * np.pi, num=100)
unit_circle = pd.DataFrame(
    dict(
        x=np.cos(circle_theta),
        y=np.sin(circle_theta),
        order=range(circle_theta.size),
    )
)

# Layer 1: the unit circle as a dashed gray line; "order" fixes the order in
# which the sampled points are connected.
c_circle = (
    alt.Chart(unit_circle)
    .mark_line(color="gray", strokeDash=[4, 4])
    .encode(
        x=alt.X("x:Q", title="F1", scale=alt.Scale(domain=[-1.2, 1.2])),
        y=alt.Y("y:Q", title="F2", scale=alt.Scale(domain=[-1.2, 1.2])),
        order="order:O",
    )
)
# Layer 2: one rule per variable from (x, y) to (x2, y2), coloured by variable.
c_arrows = (
    alt.Chart(arrows)
    .mark_rule(strokeWidth=2)
    .encode(
        x="x:Q",
        y="y:Q",
        x2="x2:Q",
        y2="y2:Q",
        color="variable:N",
    )
)
# Layer 3: the variable names, shifted horizontally (dx=15) from each
# variable's (F1, F2) point.
c_labels = (
    alt.Chart(corr_circle.reset_index(drop=True))
    .mark_text(fontSize=14, dx=15)
    .encode(x="F1:Q", y="F2:Q", text="variable:N")
)

# Overlay the three layers into a single square chart.
(c_circle + c_arrows + c_labels).properties(
    title="Figure 3 — Correlation circle", width=400, height=400
)

# Each category is placed at the mean (F1, F2) position of the individuals
# that take it; categories are visited variable by variable, in sorted order.
cat_coords = [
    {**rc.loc[wiki[q] == cat, ["F1", "F2"]].mean(), "label": f"{q}={cat}"}
    for q in cat_cols
    for cat in sorted(wiki[q].unique())
]

cat_df = pd.DataFrame(cat_coords)

# Text marks: every "variable=category" label drawn at its mean position.
alt.Chart(cat_df).mark_text(fontSize=14).encode(
    x=alt.X("F1:Q", title="F1"),
    y=alt.Y("F2:Q", title="F2"),
    text="label:N",
).properties(title="Figure 4 — Categories", width=400, height=300)

Prince foo