Prince

Factor analysis of mixed data

Table of contents

Resources

Data

Factor analysis of mixed data (FAMD) is a general-purpose method that supports both numeric and categorical data.

import prince

# Keep only the first 1,000 rows of the beers dataset loaded through prince.
dataset = prince.datasets.load_beers().head(1000)
# Preview the first rows.
dataset.head()

| name | is_organic | style | alcohol_by_volume | international_bitterness_units | standard_reference_method | final_gravity |
|---|---|---|---|---|---|---|
| Lightshine Radler | False | Blonde | 4.50 | 20.0 | 5.0 | 1.012 |
| LightSwitch Lager | False | American Light Lager | 3.95 | 7.5 | 3.0 | 1.005 |
| Lightwave Belgian Pale | False | Belgian Pale | 5.00 | 25.0 | 9.0 | 1.011 |
| Like Weisse | False | Berlinerweisse | 3.10 | 4.5 | 3.0 | 1.005 |
| Lil Heaven Session IPA | False | Session | 4.55 | 20.0 | 2.0 | 1.007 |

Fitting

The rescale_with_mean and rescale_with_std parameters control whether centering and standardization are applied to the underlying PCA. By default, rescale_with_mean is True and rescale_with_std is False.

# Two-component FAMD. The rescale_* flags configure the underlying PCA:
# centering is enabled and standardization is disabled here.
famd = prince.FAMD(
    n_components=2,
    n_iter=3,
    rescale_with_mean=True,
    rescale_with_std=False,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    handle_unknown="error",  # same parameter as sklearn.preprocessing.OneHotEncoder
)
# The value returned by fit() is re-bound to the same name and used as the
# fitted model in the cells that follow.
famd = famd.fit(dataset)

Eigenvalues

# Eigenvalue and share of variance (per component and cumulative).
famd.eigenvalues_summary

| component | eigenvalue | % of variance | % of variance (cumulative) |
|---|---|---|---|
| 0 | 3.735 | 3.70% | 3.70% |
| 1 | 1.662 | 1.65% | 5.34% |

Coordinates

# Coordinates of the dataset's rows on the fitted components (first rows only).
famd.row_coordinates(dataset).head()

| name | component 0 | component 1 |
|---|---|---|
| Lightshine Radler | -1.795872 | -0.316854 |
| LightSwitch Lager | -3.351119 | -0.193896 |
| Lightwave Belgian Pale | -1.429076 | -0.083288 |
| Like Weisse | -3.774585 | -0.255144 |
| Lil Heaven Session IPA | -2.570021 | -0.069867 |
# Coordinates of each input variable on the fitted components.
famd.column_coordinates_

| variable | component 0 | component 1 |
|---|---|---|
| alcohol_by_volume | 8.474727e-01 | 0.024329 |
| international_bitterness_units | 6.648504e-01 | 0.224303 |
| standard_reference_method | 3.828369e-01 | 0.386307 |
| final_gravity | 8.401140e-01 | 0.025106 |
| is_organic | 6.361371e-07 | 0.002685 |
| style | 9.994811e-01 | 0.999218 |

Visualization

# Plot the dataset using component 0 as the x component and component 1 as the y component.
famd.plot(dataset, x_component=0, y_component=1)

Contributions

# Five rows with the largest contribution to component 0, rendered as percentages.
(
    famd.row_contributions_
    .sort_values(by=0, ascending=False)
    .head(5)
    .style.format("{:.3%}")
)
| name | component 0 | component 1 |
|---|---|---|
| Agamemnon | 0.536% | 0.255% |
| Epic Blackout Stout | 0.536% | 0.202% |
| Entire Wood Aged Stout | 0.536% | 0.202% |
| Agamemnon Monster Shake | 0.536% | 0.202% |
| After Midnight Imperial Stout | 0.536% | 0.202% |
# Contribution of each variable to each component, formatted as whole percentages.
famd.column_contributions_.style.format("{:.0%}")
| variable | component 0 | component 1 |
|---|---|---|
| alcohol_by_volume | 23% | 1% |
| international_bitterness_units | 18% | 13% |
| standard_reference_method | 10% | 23% |
| final_gravity | 22% | 2% |
| is_organic | 0% | 0% |
| style | 27% | 60% |

Wikipedia example

The Wikipedia article on FAMD uses a small dataset with 6 individuals, 3 quantitative variables (k₁, k₂, k₃) and 3 qualitative variables (q₁, q₂, q₃). Let’s reproduce it.

import pandas as pd

# Toy dataset from the Wikipedia FAMD article: six individuals (i1..i6),
# three quantitative columns (k1-k3) and three qualitative columns (q1-q3).
wiki = pd.DataFrame(
    data=dict(
        k1=[2.0, 5.0, 3.0, 4.0, 1.0, 6.0],
        k2=[4.5, 4.5, 1.0, 1.0, 1.0, 1.0],
        k3=[4.0, 4.0, 2.0, 2.0, 1.0, 2.0],
        q1=["A", "C", "B", "B", "A", "C"],
        q2=["B", "B", "B", "B", "A", "A"],
        q3=["C", "C", "B", "B", "A", "A"],
    ),
    index=["i1", "i2", "i3", "i4", "i5", "i6"],
)
wiki

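|  | k1 | k2 | k3 | q1 | q2 | q3 |
|---|---|---|---|---|---|---|
| i1 | 2.0 | 4.5 | 4.0 | A | B | C |
| i2 | 5.0 | 4.5 | 4.0 | C | B | C |
| i3 | 3.0 | 1.0 | 2.0 | B | B | B |
| i4 | 4.0 | 1.0 | 2.0 | B | B | B |
| i5 | 1.0 | 1.0 | 1.0 | A | A | A |
| i6 | 6.0 | 1.0 | 2.0 | C | A | A |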
# Fit a five-component FAMD on the toy dataset using the "scipy" engine,
# then show the eigenvalue summary.
wiki_famd = prince.FAMD(n_components=5, engine="scipy")
wiki_famd = wiki_famd.fit(wiki)
wiki_famd.eigenvalues_summary

| component | eigenvalue | % of variance | % of variance (cumulative) |
|---|---|---|---|
| 0 | 3.467 | 43.34% | 43.34% |
| 1 | 2.500 | 31.25% | 74.59% |
| 2 | 1.971 | 24.64% | 99.22% |
| 3 | 0.056 | 0.70% | 99.92% |
| 4 | 0.006 | 0.08% | 100.00% |
import altair as alt

# Row coordinates of the individuals. Component labels are shifted by one so
# that component 0 becomes "F1", component 1 becomes "F2", and so on.
rc = wiki_famd.row_coordinates(wiki)
rc.columns = [f"F{c + 1}" for c in rc.columns]
# Copy the index into a regular column so it can be drawn as the text mark.
rc["label"] = rc.index

# Figure 1: each individual's label placed at its (F1, F2) position.
alt.Chart(rc.reset_index(drop=True)).mark_text(fontSize=14).encode(
    x=alt.X("F1:Q", title="F1"),
    y=alt.Y("F2:Q", title="F2"),
    text="label:N",
).properties(title="Figure 1 — Individuals", width=400, height=300)
# Variable coordinates restricted to the first two components.
cc = wiki_famd.column_coordinates_.iloc[:, :2].copy()
cc.columns = ["F1", "F2"]
cc["variable"] = cc.index
# NOTE(review): assumes the rows are ordered as three quantitative variables
# followed by three qualitative ones — confirm against cc.index.
cc["type"] = ["quanti"] * 3 + ["quali"] * 3

# Figure 2: variable names placed at their (F1, F2) position and colored by
# variable type; both axes use the fixed domain [0, 1.05].
points = (
    alt.Chart(cc.reset_index(drop=True))
    .mark_text(fontSize=14)
    .encode(
        x=alt.X("F1:Q", title="F1 (r² / η²)", scale=alt.Scale(domain=[0, 1.05])),
        y=alt.Y("F2:Q", title="F2 (r² / η²)", scale=alt.Scale(domain=[0, 1.05])),
        text="variable:N",
        color=alt.Color("type:N", legend=alt.Legend(title="Variable type")),
    )
)

points.properties(title="Figure 2 — Relationship square", width=400, height=400)
import numpy as np

# Column names of the toy dataset, split by variable kind.
num_cols = ["k1", "k2", "k3"]
cat_cols = ["q1", "q2", "q3"]

# Correlations of the quantitative variables with the first two components.
corr_circle = wiki_famd.column_correlations.loc[num_cols].iloc[:, :2].copy()
corr_circle.columns = ["F1", "F2"]
corr_circle["variable"] = corr_circle.index

# One segment per variable, running from the origin to its (F1, F2) position.
arrows = pd.DataFrame(
    [
        {"x": 0, "y": 0, "x2": f1, "y2": f2, "variable": name}
        for f1, f2, name in zip(
            corr_circle["F1"], corr_circle["F2"], corr_circle["variable"]
        )
    ]
)

# Sample the unit circle at 100 evenly spaced angles covering [0, 2π];
# "order" records the sampling sequence of the points.
circle_theta = np.linspace(0, 2 * np.pi, num=100)
unit_circle = pd.DataFrame(
    {
        "x": np.cos(circle_theta),
        "y": np.sin(circle_theta),
        "order": range(circle_theta.size),
    }
)

# Dashed gray outline of the unit circle; the "order" field fixes the order in
# which the sampled points are connected. Both axes use the domain [-1.2, 1.2].
c_circle = (
    alt.Chart(unit_circle)
    .mark_line(color="gray", strokeDash=[4, 4])
    .encode(
        x=alt.X("x:Q", title="F1", scale=alt.Scale(domain=[-1.2, 1.2])),
        y=alt.Y("y:Q", title="F2", scale=alt.Scale(domain=[-1.2, 1.2])),
        order="order:O",
    )
)
# One rule per variable from (x, y) to (x2, y2), colored by variable.
c_arrows = (
    alt.Chart(arrows)
    .mark_rule(strokeWidth=2)
    .encode(
        x="x:Q",
        y="y:Q",
        x2="x2:Q",
        y2="y2:Q",
        color="variable:N",
    )
)
# Variable names at the (F1, F2) position, shifted via dx=15.
c_labels = (
    alt.Chart(corr_circle.reset_index(drop=True))
    .mark_text(fontSize=14, dx=15)
    .encode(x="F1:Q", y="F2:Q", text="variable:N")
)

# Figure 3: layer the circle, the arrows and the labels into one chart.
(c_circle + c_arrows + c_labels).properties(
    title="Figure 3 — Correlation circle", width=400, height=400
)
# Each category is placed at the mean (F1, F2) row coordinate of the
# individuals that carry it; labels read "<column>=<category>".
cat_coords = [
    {
        **rc.loc[wiki[q] == cat, ["F1", "F2"]].mean().to_dict(),
        "label": f"{q}={cat}",
    }
    for q in cat_cols
    for cat in sorted(wiki[q].unique())
]

cat_df = pd.DataFrame(cat_coords)

# Figure 4: each category label placed at its (F1, F2) position.
alt.Chart(cat_df).mark_text(fontSize=14).encode(
    x=alt.X("F1:Q", title="F1"),
    y=alt.Y("F2:Q", title="F2"),
    text="label:N",
).properties(title="Figure 4 — Categories", width=400, height=300)