Handling Binary and Categorical Variables

The CISS-VAE model can handle binary and categorical variables, but categorical variables must first be converted into binary dummy variables.

The Palmer Penguins dataset has both continuous (bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g) and categorical (species, island, sex) variables, so it makes a good example for this. We can use the dummy_cols() function from the fastDummies package to create dummy variables for our categories. Set ignore_na = TRUE to avoid creating a new column for NA values, and remove_selected_columns = TRUE to remove the original categorical columns once the dummies are created. The dummy variable columns must then be mapped back to their original categorical variables using a categorical_column_map: a named list in which each entry corresponds to one categorical feature and contains the names of its associated dummy columns.

library(tidyverse)
library(kableExtra)
library(reticulate)
library(rCISSVAE)
library(fastDummies)
library(palmerpenguins)

## 
## Attaching package: 'palmerpenguins'

## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw

# List the datasets bundled with palmerpenguins; `penguins` itself is already
# available because the package was attached with library() above.
data(package = "palmerpenguins")

# Drop the rows that already contain missing values (for illustration
# purposes) and move `year` to the first column.
penguins_clean <- penguins %>%
  na.omit() %>%
  select(year, everything())

# Report the dimensions of the cleaned data, which is what the rest of the
# example works with (previously this reported the raw `penguins` data).
glue::glue("Dimensions: {paste0(dim(penguins_clean), collapse = ',')}")

## Dimensions: 333,8

# Render the first rows of the cleaned data as a table.
kable(head(penguins_clean))

year	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
2007	Adelie	Torgersen	39.1	18.7	181	3750	male
2007	Adelie	Torgersen	39.5	17.4	186	3800	female
2007	Adelie	Torgersen	40.3	18.0	195	3250	female
2007	Adelie	Torgersen	36.7	19.3	193	3450	female
2007	Adelie	Torgersen	39.3	20.6	190	3650	male
2007	Adelie	Torgersen	38.9	17.8	181	3625	female

## create penguins_missing
# Fix the RNG seed so the same cells are masked on every run.
set.seed(42)

n <- nrow(penguins_clean)
p <- ncol(penguins_clean)
m <- floor(0.20 * n * p)     # number of cells drawn for masking
idx <- sample.int(n * p, m)  # linear positions in the n x p logical matrix

mask <- matrix(FALSE, nrow = n, ncol = p)
mask[idx] <- TRUE

penguins_missing <- penguins_clean

## anything can be missing except the year (column 1); cells drawn in that
## column are skipped, so the realised missingness is a little below 20%
for (j in seq_len(p)[-1]) {  # empty when p < 2, unlike seq(2, p, 1)
  penguins_missing[[j]][mask[, j]] <- NaN
}

# quick check of missingness rate
glue::glue("\nMissingness proportion of penguins_missing: {round(mean(is.na(as.matrix(penguins_missing))), 2)}")

## Missingness proportion of penguins_missing: 0.17

## create dummy vars

# Dummy-code species, island and sex in the complete data. ignore_na = TRUE
# avoids an extra dummy column for NA, and remove_selected_columns = TRUE
# drops the original categorical columns once the dummies exist.
penguin_dummies_complete <- dummy_cols(
  penguins_clean,
  select_columns = c("species", "island", "sex"),
  remove_first_dummy = FALSE,
  ignore_na = TRUE,
  remove_selected_columns = TRUE
)

# Same encoding, applied to the data with artificially masked cells.
penguin_dummies <- dummy_cols(
  penguins_missing,
  select_columns = c("species", "island", "sex"),
  remove_first_dummy = FALSE,
  ignore_na = TRUE,
  remove_selected_columns = TRUE
)

kable(head(penguin_dummies))

year	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	species_Adelie	species_Chinstrap	species_Gentoo	island_Biscoe	island_Dream	island_Torgersen	sex_female	sex_male
2007	39.1	NaN	181	3750	1	0	0	0	0	1	0	1
2007	39.5	17.4	186	3800	NA	NA	NA	NA	NA	NA	1	0
2007	40.3	18.0	195	3250	1	0	0	0	0	1	1	0
2007	NaN	NaN	193	3450	NA	NA	NA	NA	NA	NA	1	0
2007	39.3	NaN	190	3650	1	0	0	NA	NA	NA	NA	NA
2007	38.9	NaN	NaN	NaN	1	0	0	0	0	1	1	0

## define categorical_column_map
# For each original categorical variable, collect the names of the dummy
# columns that start with "<variable>_". lapply() keeps the names of its
# input, so the result is a list named species / island / sex.
categorical_column_map <- lapply(
  c(species = "species", island = "island", sex = "sex"),
  function(variable) {
    grep(paste0("^", variable, "_"), names(penguin_dummies), value = TRUE)
  }
)

categorical_column_map

## $species
## [1] "species_Adelie"    "species_Chinstrap" "species_Gentoo"   
## 
## $island
## [1] "island_Biscoe"    "island_Dream"     "island_Torgersen"
## 
## $sex
## [1] "sex_female" "sex_male"

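Now that the dummy variables are created and the data contain missing values, we can create a binary_feature_mask (a logical vector with one entry per column, TRUE for the binary dummy columns and FALSE for the others) and impute with run_cissvae().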

# One logical per column of penguin_dummies: TRUE for the dummy columns listed
# in categorical_column_map, FALSE otherwise. Deriving the mask from the column
# names keeps it aligned with the data if columns or factor levels change,
# instead of hard-coding 5 FALSE followed by 8 TRUE.
binary_feature_mask <- names(penguin_dummies) %in%
  unlist(categorical_column_map, use.names = FALSE)

glue::glue("Binary Feature Mask: {paste0(binary_feature_mask, collapse = ', ')}")

# Impute the masked dummy-coded data with CISS-VAE. All arguments are passed
# by name, so their order does not matter.
results <- run_cissvae(
  data = penguin_dummies,
  columns_ignore = "year",
  binary_feature_mask = binary_feature_mask,
  categorical_column_map = categorical_column_map,
  clusters = NULL,
  n_clusters = 1,
  val_proportion = 0.20, # small dataset so using higher val proportion
  scale_features = TRUE,
  epochs = 500,
  debug = FALSE
)

# Show the imputed rows next to the corresponding rows with missing values.
head(results$imputed_dataset)
head(penguin_dummies)

##        year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 0 -1.283678       39.10000      18.70000          197.3811    3827.858
## 1 -1.283678       39.50000      17.40000          186.0000    3800.000
## 2 -1.283678       42.15094      16.55605          195.0000    3250.000
## 3 -1.283678       36.70000      17.30473          193.0000    3450.000
## 4 -1.283678       39.30000      20.60000          190.0000    3650.000
## 5 -1.283678       38.90000      17.22320          181.0000    3666.441
##   species_Adelie species_Chinstrap species_Gentoo island_Biscoe island_Dream
## 0              1                 0              0             0            0
## 1              1                 0              0             0            1
## 2              1                 0              0             0            1
## 3              1                 0              0             0            0
## 4              1                 0              0             0            1
## 5              1                 0              0             0            0
##   island_Torgersen sex_female sex_male
## 0                1          0        1
## 1                0          1        0
## 2                0          1        0
## 3                1          1        0
## 4                0          0        1
## 5                1          1        0

## # A tibble: 6 × 13
##    year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <int>          <dbl>         <dbl>             <dbl>       <dbl>
## 1  2007           39.1         NaN                 181        3750
## 2  2007           39.5          17.4               186        3800
## 3  2007           40.3          18                 195        3250
## 4  2007          NaN           NaN                 193        3450
## 5  2007           39.3         NaN                 190        3650
## 6  2007           38.9         NaN                 NaN         NaN
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## #   species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## #   island_Torgersen <int>, sex_female <int>, sex_male <int>

The imputed values for the binary variables may be returned as probabilities rather than strict 0/1 values (the rows shown above happen to already be 0 or 1), so we threshold them at 0.5 to convert them to binary. The 'imputed_dataset' is returned as a data.frame, so we can use tidyverse mutate() to convert the binary variables.

# Convert the imputed dummy columns to 0/1 by thresholding at 0.5.
# The columns are selected by their exact names from categorical_column_map
# rather than by an unanchored regex, so an unrelated column whose name merely
# contains "species", "island" or "sex" is never modified; all_of() errors if
# an expected dummy column is absent.
results$imputed_dataset <- results$imputed_dataset %>%
  mutate(across(
    .cols = all_of(unlist(categorical_column_map, use.names = FALSE)),
    .fns = ~ case_when(
      .x > 0.5 ~ 1,
      .x <= 0.5 ~ 0,
      TRUE ~ .x # comparisons with a missing value are NA, so it is kept as-is
    )
  ))

# Compare the binarised imputation, the masked input, and the ground truth.
head(results$imputed_dataset)
head(penguin_dummies)
head(penguin_dummies_complete)

##        year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 0 -1.283678       39.10000      18.70000          197.3811    3827.858
## 1 -1.283678       39.50000      17.40000          186.0000    3800.000
## 2 -1.283678       42.15094      16.55605          195.0000    3250.000
## 3 -1.283678       36.70000      17.30473          193.0000    3450.000
## 4 -1.283678       39.30000      20.60000          190.0000    3650.000
## 5 -1.283678       38.90000      17.22320          181.0000    3666.441
##   species_Adelie species_Chinstrap species_Gentoo island_Biscoe island_Dream
## 0              1                 0              0             0            0
## 1              1                 0              0             0            1
## 2              1                 0              0             0            1
## 3              1                 0              0             0            0
## 4              1                 0              0             0            1
## 5              1                 0              0             0            0
##   island_Torgersen sex_female sex_male
## 0                1          0        1
## 1                0          1        0
## 2                0          1        0
## 3                1          1        0
## 4                0          0        1
## 5                1          1        0

## # A tibble: 6 × 13
##    year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <int>          <dbl>         <dbl>             <dbl>       <dbl>
## 1  2007           39.1         NaN                 181        3750
## 2  2007           39.5          17.4               186        3800
## 3  2007           40.3          18                 195        3250
## 4  2007          NaN           NaN                 193        3450
## 5  2007           39.3         NaN                 190        3650
## 6  2007           38.9         NaN                 NaN         NaN
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## #   species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## #   island_Torgersen <int>, sex_female <int>, sex_male <int>

## # A tibble: 6 × 13
##    year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <int>          <dbl>         <dbl>             <int>       <int>
## 1  2007           39.1          18.7               181        3750
## 2  2007           39.5          17.4               186        3800
## 3  2007           40.3          18                 195        3250
## 4  2007           36.7          19.3               193        3450
## 5  2007           39.3          20.6               190        3650
## 6  2007           38.9          17.8               181        3625
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## #   species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## #   island_Torgersen <int>, sex_female <int>, sex_male <int>

2026-04-21