Handling Binary and Categorical Variables
2026-04-08
Source:vignettes/binary_variables_tutorial.rmd
binary_variables_tutorial.rmd

The CISS-VAE model can handle binary and categorical variables, but categorical variables must first be converted into binary dummy variables.
The Palmer
Penguins dataset has both continuous (bill_length, bill_depth,
flipper_length, body_mass) and categorical (species, island, sex) variables,
so it makes a good example for this. We can use the
dummy_cols() function from the fastDummies
package to create dummy variables for our categories. Set
ignore_na = TRUE and
remove_selected_columns = TRUE to avoid creating a new
column for NA values and to remove the original categoricals once the
dummies are created. The dummy variable columns must be mapped back to
their original categorical variables using a
categorical_column_map, which is a named list where each
entry corresponds to a categorical feature and contains the names of its
associated dummy columns.
library(tidyverse)
library(kableExtra)
library(reticulate)
library(rCISSVAE)
library(fastDummies)
library(palmerpenguins)## Warning: package 'palmerpenguins' was built under R version 4.5.2
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
## Load the penguins data explicitly. `data(package = ...)` with no dataset
## name only lists the datasets a package provides; naming the dataset makes
## sure the palmerpenguins version is the one used below.
data("penguins", package = "palmerpenguins")
## removing existing incomplete rows for illustration purposes
## (year is moved to the first column so it can be skipped when masking)
penguins_clean <- penguins %>%
  na.omit() %>%
  select(year, everything())
glue::glue("Dimensions: {paste0(dim(penguins), collapse = ',')}")## Dimensions: 344,8
| year | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---|---|---|---|---|---|---|
| 2007 | Adelie | Torgersen | 39.1 | 18.7 | 181 | 3750 | male |
| 2007 | Adelie | Torgersen | 39.5 | 17.4 | 186 | 3800 | female |
| 2007 | Adelie | Torgersen | 40.3 | 18.0 | 195 | 3250 | female |
| 2007 | Adelie | Torgersen | 36.7 | 19.3 | 193 | 3450 | female |
| 2007 | Adelie | Torgersen | 39.3 | 20.6 | 190 | 3650 | male |
| 2007 | Adelie | Torgersen | 38.9 | 17.8 | 181 | 3625 | female |
## create penguins_missing
## Fix the RNG seed so the same cells are masked every time the tutorial runs.
set.seed(42)
n <- nrow(penguins_clean)
p <- ncol(penguins_clean)
m <- floor(0.20 * n * p) # number of cells to mask
idx <- sample.int(n * p, m) # positions in a logical matrix
mask <- matrix(FALSE, nrow = n, ncol = p)
mask[idx] <- TRUE
penguins_missing <- penguins_clean
## anything can be missing except the year (column 1); masked positions that
## fall in that column are skipped, so the realised missingness rate ends up
## slightly below the 20% that was sampled
for (j in seq_len(p)[-1]) { # seq_len(p)[-1] is empty (no error) when p == 1
  penguins_missing[[j]][mask[, j]] <- NaN
}
# quick check of missingness rate
glue::glue("\nMissingness proportion of penguins_missing: {round(mean(is.na(as.matrix(penguins_missing))), 2)}") ## Missingness proportion of penguins_missing: 0.17
## create dummy vars
## The complete and the masked data are expanded with identical settings:
## NA stays NA (no extra "NA" dummy column), every level keeps its own dummy,
## and the original categorical columns are dropped afterwards.
penguin_dummies_complete <- dummy_cols(
  penguins_clean,
  select_columns = c("species", "island", "sex"),
  ignore_na = TRUE,
  remove_first_dummy = FALSE,
  remove_selected_columns = TRUE
)
penguin_dummies <- dummy_cols(
  penguins_missing,
  select_columns = c("species", "island", "sex"),
  ignore_na = TRUE,
  remove_first_dummy = FALSE,
  remove_selected_columns = TRUE
)
head(penguin_dummies) %>% kable()| year | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | species_Adelie | species_Chinstrap | species_Gentoo | island_Biscoe | island_Dream | island_Torgersen | sex_female | sex_male |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2007 | 39.1 | NaN | 181 | 3750 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 2007 | 39.5 | 17.4 | 186 | 3800 | NA | NA | NA | NA | NA | NA | 1 | 0 |
| 2007 | 40.3 | 18.0 | 195 | 3250 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 2007 | NaN | NaN | 193 | 3450 | NA | NA | NA | NA | NA | NA | 1 | 0 |
| 2007 | 39.3 | NaN | 190 | 3650 | 1 | 0 | 0 | NA | NA | NA | NA | NA |
| 2007 | 38.9 | NaN | NaN | NaN | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
## define categorical_column_map
## Named list with one entry per original categorical variable; each entry
## holds the names of the dummy columns whose name starts with "<variable>_".
categorical_column_map <- lapply(
  c(species = "^species_", island = "^island_", sex = "^sex_"),
  function(prefix_pattern) {
    grep(prefix_pattern, names(penguin_dummies), value = TRUE)
  }
)
categorical_column_map## $species
## [1] "species_Adelie" "species_Chinstrap" "species_Gentoo"
##
## $island
## [1] "island_Biscoe" "island_Dream" "island_Torgersen"
##
## $sex
## [1] "sex_female" "sex_male"
Now that the dummy variables are created and the data contain missing
values, we can create a binary_feature_mask (a logical vector with one entry
per column, TRUE for the binary dummy columns) and impute with
run_cissvae().
## One logical per column of penguin_dummies: TRUE for the dummy (binary)
## columns, FALSE for everything else. Derived from categorical_column_map
## instead of hard-coded positions so it stays correct if columns are
## reordered or the number of dummy columns changes.
binary_feature_mask <- names(penguin_dummies) %in%
  unlist(categorical_column_map, use.names = FALSE)
glue::glue("Binary Feature Mask: {paste0(binary_feature_mask, collapse = ', ')}")
results <- run_cissvae(
  data = penguin_dummies,
  val_proportion = 0.20, ## small dataset so using higher val proportion
  columns_ignore = "year",
  binary_feature_mask = binary_feature_mask,
  categorical_column_map = categorical_column_map,
  clusters = NULL,
  n_clusters = 1,
  scale_features = TRUE,
  epochs = 500,
  debug = FALSE
)
head(results$imputed_dataset)
head(penguin_dummies)## year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 0 -1.283678 39.10000 18.70000 197.3811 3827.858
## 1 -1.283678 39.50000 17.40000 186.0000 3800.000
## 2 -1.283678 42.15094 16.55605 195.0000 3250.000
## 3 -1.283678 36.70000 17.30473 193.0000 3450.000
## 4 -1.283678 39.30000 20.60000 190.0000 3650.000
## 5 -1.283678 38.90000 17.22320 181.0000 3666.441
## species_Adelie species_Chinstrap species_Gentoo island_Biscoe island_Dream
## 0 1 0 0 0 0
## 1 1 0 0 0 1
## 2 1 0 0 0 1
## 3 1 0 0 0 0
## 4 1 0 0 0 1
## 5 1 0 0 0 0
## island_Torgersen sex_female sex_male
## 0 1 0 1
## 1 0 1 0
## 2 0 1 0
## 3 1 1 0
## 4 0 0 1
## 5 1 1 0
## # A tibble: 6 × 13
## year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 2007 39.1 NaN 181 3750
## 2 2007 39.5 17.4 186 3800
## 3 2007 40.3 18 195 3250
## 4 2007 NaN NaN 193 3450
## 5 2007 39.3 NaN 190 3650
## 6 2007 38.9 NaN NaN NaN
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## # species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## # island_Torgersen <int>, sex_female <int>, sex_male <int>
As we can see above, the imputed values for the binary variables are returned as probabilities rather than hard 0/1 values, so we have to convert them to binary by thresholding at 0.5. The ‘imputed_dataset’ is returned as a data.frame, so we can use the tidyverse mutate() function to convert the binary variables.
## Threshold the imputed dummy columns at 0.5: values above 0.5 become 1,
## values at or below 0.5 become 0, and anything that cannot be compared
## (NA / NaN) is passed through unchanged.
results$imputed_dataset <- mutate(
  results$imputed_dataset,
  across(
    matches("species|island|sex"),
    function(prob) {
      case_when(
        prob > 0.5 ~ 1,
        prob <= 0.5 ~ 0,
        TRUE ~ prob
      )
    }
  )
)
head(results$imputed_dataset)
head(penguin_dummies)
head(penguin_dummies_complete)## year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 0 -1.283678 39.10000 18.70000 197.3811 3827.858
## 1 -1.283678 39.50000 17.40000 186.0000 3800.000
## 2 -1.283678 42.15094 16.55605 195.0000 3250.000
## 3 -1.283678 36.70000 17.30473 193.0000 3450.000
## 4 -1.283678 39.30000 20.60000 190.0000 3650.000
## 5 -1.283678 38.90000 17.22320 181.0000 3666.441
## species_Adelie species_Chinstrap species_Gentoo island_Biscoe island_Dream
## 0 1 0 0 0 0
## 1 1 0 0 0 1
## 2 1 0 0 0 1
## 3 1 0 0 0 0
## 4 1 0 0 0 1
## 5 1 0 0 0 0
## island_Torgersen sex_female sex_male
## 0 1 0 1
## 1 0 1 0
## 2 0 1 0
## 3 1 1 0
## 4 0 0 1
## 5 1 1 0
## # A tibble: 6 × 13
## year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 2007 39.1 NaN 181 3750
## 2 2007 39.5 17.4 186 3800
## 3 2007 40.3 18 195 3250
## 4 2007 NaN NaN 193 3450
## 5 2007 39.3 NaN 190 3650
## 6 2007 38.9 NaN NaN NaN
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## # species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## # island_Torgersen <int>, sex_female <int>, sex_male <int>
## # A tibble: 6 × 13
## year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <int> <dbl> <dbl> <int> <int>
## 1 2007 39.1 18.7 181 3750
## 2 2007 39.5 17.4 186 3800
## 3 2007 40.3 18 195 3250
## 4 2007 36.7 19.3 193 3450
## 5 2007 39.3 20.6 190 3650
## 6 2007 38.9 17.8 181 3625
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## # species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## # island_Torgersen <int>, sex_female <int>, sex_male <int>