Factor analysis of mixed data

Table of contents

Resources

Data

Factor analysis of mixed data (FAMD) is a general-purpose method: it supports both numeric and categorical data.
import prince
# Load the beers dataset that ships with prince and keep its first 1000 rows.
dataset = (
    prince.datasets
    .load_beers()
    .head(1000)
)

# Preview the first rows of the working dataset.
dataset.head()
is_organic style alcohol_by_volume international_bitterness_units standard_reference_method final_gravity name Lightshine Radler False Blonde 4.50 20.0 5.0 1.012 LightSwitch Lager False American Light Lager 3.95 7.5 3.0 1.005 Lightwave Belgian Pale False Belgian Pale 5.00 25.0 9.0 1.011 Like Weisse False Berlinerweisse 3.10 4.5 3.0 1.005 Lil Heaven Session IPA False Session 4.55 20.0 2.0 1.007
Fitting

The `rescale_with_mean` and `rescale_with_std` parameters control whether centering and standardization, respectively, are applied to the underlying PCA. By default, `rescale_with_mean` is `True` and `rescale_with_std` is `False`.
# Configure the estimator and fit it on the beers data in a single expression.
# `famd` ends up bound to whatever `fit` returns, exactly as with a separate
# construct-then-fit sequence.
famd = prince.FAMD(
    n_components=2,
    n_iter=3,
    rescale_with_mean=True,
    rescale_with_std=False,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    # same parameter as sklearn.preprocessing.OneHotEncoder
    handle_unknown="error",
).fit(dataset)
Eigenvalues

component  eigenvalue  % of variance  % of variance (cumulative)
0          3.735       3.70%          3.70%
1          1.662       1.65%          5.34%
# Coordinates
# Project the rows of the dataset onto the fitted components and preview
# the first few.
famd.row_coordinates(dataset).head()
component 0 1 name Lightshine Radler -1.795872 -0.316854 LightSwitch Lager -3.351119 -0.193896 Lightwave Belgian Pale -1.429076 -0.083288 Like Weisse -3.774585 -0.255144 Lil Heaven Session IPA -2.570021 -0.069867
component 0 1 variable alcohol_by_volume 8.474727e-01 0.024329 international_bitterness_units 6.648504e-01 0.224303 standard_reference_method 3.828369e-01 0.386307 final_gravity 8.401140e-01 0.025106 is_organic 6.361371e-07 0.002685 style 9.994811e-01 0.999218
# Visualization
# Plot the dataset on the first two components (0 on the x axis, 1 on the y axis).
famd.plot(dataset, x_component=0, y_component=1)
# Contributions
# Show the five rows that contribute most to component 0, formatted as
# percentages with three decimals.
(
    famd.row_contributions_
    .sort_values(0, ascending=False)
    .head(5)
    .style.format("{:.3%}")
)
component 0 1 name Agamemnon 0.536% 0.255% Epic Blackout Stout 0.536% 0.202% Entire Wood Aged Stout 0.536% 0.202% Agamemnon Monster Shake 0.536% 0.202% After Midnight Imperial Stout 0.536% 0.202%
# Column (variable) contributions to each component, formatted as whole
# percentages. The format string has no padding, so cells render as e.g. "23%".
famd.column_contributions_.style.format("{:.0%}")
component 0 1 variable alcohol_by_volume 23% 1% international_bitterness_units 18% 13% standard_reference_method 10% 23% final_gravity 22% 2% is_organic 0% 0% style 27% 60%
Wikipedia example

The Wikipedia article on FAMD uses a small dataset with 6 individuals, 3 quantitative variables (k₁, k₂, k₃), and 3 qualitative variables (q₁, q₂, q₃). Let's reproduce it.
import pandas as pd
# Toy dataset: 6 individuals described by 3 quantitative columns (k1..k3)
# and 3 qualitative columns (q1..q3). Rows are labelled "i1" .. "i6".
wiki = pd.DataFrame(
    {
        "k1": [2.0, 5.0, 3.0, 4.0, 1.0, 6.0],
        "k2": [4.5, 4.5, 1.0, 1.0, 1.0, 1.0],
        "k3": [4.0, 4.0, 2.0, 2.0, 1.0, 2.0],
        "q1": ["A", "C", "B", "B", "A", "C"],
        "q2": ["B", "B", "B", "B", "A", "A"],
        "q3": ["C", "C", "B", "B", "A", "A"],
    },
    index=[f"i{i}" for i in range(1, 7)],
)

# Display the table.
wiki
k1 k2 k3 q1 q2 q3 i1 2.0 4.5 4.0 A B C i2 5.0 4.5 4.0 C B C i3 3.0 1.0 2.0 B B B i4 4.0 1.0 2.0 B B B i5 1.0 1.0 1.0 A A A i6 6.0 1.0 2.0 C A A
# Fit a 5-component FAMD on the toy dataset using the "scipy" engine.
# `wiki_famd` is bound to the value returned by `fit`.
wiki_famd = prince.FAMD(n_components=5, engine="scipy").fit(wiki)

# Summary table of eigenvalues and explained variance.
wiki_famd.eigenvalues_summary
eigenvalue % of variance % of variance (cumulative) component 0 3.467 43.34% 43.34% 1 2.500 31.25% 74.59% 2 1.971 24.64% 99.22% 3 0.056 0.70% 99.92% 4 0.006 0.08% 100.00%
import altair as alt
# Row coordinates of the individuals on the fitted components.
rc = wiki_famd.row_coordinates(wiki)

# Rename the component columns to "F1", "F2", ... (one-based), which is
# what the chart encodings below refer to.
rc.columns = [f"F{c + 1}" for c in rc.columns]

# Keep the row labels as a regular column so they survive reset_index(drop=True).
rc["label"] = rc.index

# Figure 1: each individual drawn as its text label at (F1, F2).
alt.Chart(rc.reset_index(drop=True)).mark_text(fontSize=14).encode(
    x=alt.X("F1:Q", title="F1"),
    y=alt.Y("F2:Q", title="F2"),
    text="label:N",
).properties(title="Figure 1 — Individuals", width=400, height=300)
# First two columns of the fitted column coordinates, relabelled F1 / F2.
cc = wiki_famd.column_coordinates_.iloc[:, :2].copy()
cc.columns = ["F1", "F2"]

# Attach the variable name (taken from the index) and a type tag: the first
# three rows are tagged "quanti", the last three "quali".
cc = cc.assign(
    variable=cc.index,
    type=["quanti"] * 3 + ["quali"] * 3,
)

# Text marks positioned at (F1, F2), coloured by variable type; both axes
# are fixed to the [0, 1.05] range.
points = (
    alt.Chart(cc.reset_index(drop=True))
    .mark_text(fontSize=14)
    .encode(
        x=alt.X(
            "F1:Q",
            title="F1 (r² / η²)",
            scale=alt.Scale(domain=[0, 1.05]),
        ),
        y=alt.Y(
            "F2:Q",
            title="F2 (r² / η²)",
            scale=alt.Scale(domain=[0, 1.05]),
        ),
        text="variable:N",
        color=alt.Color("type:N", legend=alt.Legend(title="Variable type")),
    )
)

points.properties(
    title="Figure 2 — Relationship square",
    width=400,
    height=400,
)
import numpy as np
# Column groups of the toy dataset.
num_cols = ["k1", "k2", "k3"]
cat_cols = ["q1", "q2", "q3"]

# Correlations of the numeric variables with the first two components.
corr_circle = wiki_famd.column_correlations.loc[num_cols].iloc[:, :2].copy()
corr_circle.columns = ["F1", "F2"]
corr_circle["variable"] = corr_circle.index

# One segment per variable, running from the origin (x, y) = (0, 0)
# to the variable's (F1, F2) position.
arrows = pd.DataFrame(
    [
        {"x": 0, "y": 0, "x2": rec.F1, "y2": rec.F2, "variable": rec.variable}
        for rec in corr_circle.itertuples(index=False)
    ]
)
# 100 angles evenly spaced over [0, 2π], endpoints included.
circle_theta = np.linspace(0, 2 * np.pi, 100)

# Points on the unit circle; "order" records the sequence in which the
# points are to be connected.
unit_circle = pd.DataFrame(
    {
        "x": np.cos(circle_theta),
        "y": np.sin(circle_theta),
        "order": range(circle_theta.size),
    }
)
# Dashed gray outline of the unit circle. The points are joined following
# the "order" field, and both axes span [-1.2, 1.2].
c_circle = alt.Chart(unit_circle).mark_line(
    color="gray",
    strokeDash=[4, 4],
).encode(
    x=alt.X("x:Q", title="F1", scale=alt.Scale(domain=[-1.2, 1.2])),
    y=alt.Y("y:Q", title="F2", scale=alt.Scale(domain=[-1.2, 1.2])),
    order="order:O",
)
# One rule (line segment) per variable from (x, y) to (x2, y2),
# coloured by variable name.
c_arrows = alt.Chart(arrows).mark_rule(strokeWidth=2).encode(
    x="x:Q",
    y="y:Q",
    x2="x2:Q",
    y2="y2:Q",
    color="variable:N",
)
# Variable names drawn at each (F1, F2) position, shifted 15 px along x
# so the text does not sit on top of the segment end.
c_labels = alt.Chart(corr_circle.reset_index(drop=True)).mark_text(
    fontSize=14,
    dx=15,
).encode(
    x="F1:Q",
    y="F2:Q",
    text="variable:N",
)

# Layer the circle outline, the segments and the labels into one figure.
(c_circle + c_arrows + c_labels).properties(
    title="Figure 3 — Correlation circle",
    width=400,
    height=400,
)
# Category coordinates are the mean of row coordinates for each category.
cat_coords = []
for q in cat_cols:
    # Visit the categories of each qualitative variable in sorted order.
    for cat in sorted(wiki[q].unique()):
        # Average F1/F2 over the individuals that carry this category.
        mean_rc = rc.loc[wiki[q] == cat, ["F1", "F2"]].mean()
        cat_coords.append(
            {"F1": mean_rc["F1"], "F2": mean_rc["F2"], "label": f"{q}={cat}"}
        )
cat_df = pd.DataFrame(cat_coords)

# Figure 4: each category drawn as a "<variable>=<category>" label at its mean position.
alt.Chart(cat_df).mark_text(fontSize=14).encode(
    x=alt.X("F1:Q", title="F1"),
    y=alt.Y("F2:Q", title="F2"),
    text="label:N",
).properties(title="Figure 4 — Categories", width=400, height=300)