Skip to contents

The CISS-VAE model can handle binary and categorical variables, but categorical variables must first be converted into binary dummy variables.

The Palmer Penguins dataset contains both continuous variables (bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g) and categorical variables (species, island, sex), which makes it a good example for this. We can use the dummy_cols() function from the fastDummies package to create dummy variables for the categorical columns. Set ignore_na = TRUE to avoid creating an extra column for NA values, and remove_selected_columns = TRUE to drop the original categorical columns once the dummies are created. The dummy variable columns must then be mapped back to their original categorical variables using a categorical_column_map: a named list in which each entry corresponds to one categorical feature and contains the names of its associated dummy columns.

## Warning: package 'palmerpenguins' was built under R version 4.5.2
## 
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw
## Load the dataset explicitly. data() called with only `package =` merely
## lists the datasets available in that package and loads nothing; naming the
## dataset makes sure `penguins` is the palmerpenguins version and not the
## datasets::penguins object that it masks.
data("penguins", package = "palmerpenguins")

## removing existing incomplete rows for illustration purposes
penguins_clean <- na.omit(penguins) %>%
  select(year, everything()) # move `year` to the first column

## dimensions of the raw data, i.e. before the incomplete rows are dropped
glue::glue("Dimensions: {paste0(dim(penguins), collapse = ',')}")
## Dimensions: 344,8
head(penguins_clean) %>% kable()
year species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
2007 Adelie Torgersen 39.1 18.7 181 3750 male
2007 Adelie Torgersen 39.5 17.4 186 3800 female
2007 Adelie Torgersen 40.3 18.0 195 3250 female
2007 Adelie Torgersen 36.7 19.3 193 3450 female
2007 Adelie Torgersen 39.3 20.6 190 3650 male
2007 Adelie Torgersen 38.9 17.8 181 3625 female
## create penguins_missing
## Fix the RNG seed so the set of masked cells (and everything derived from it)
## is the same on every run.
set.seed(42)

n <- nrow(penguins_clean)
p <- ncol(penguins_clean)
m <- floor(0.20 * n * p)    # number of cells to mask
idx <- sample.int(n * p, m) # linear positions in an n x p logical matrix

mask <- matrix(FALSE, nrow = n, ncol = p)
mask[idx] <- TRUE

penguins_missing <- penguins_clean

## anything can be missing except the year (column 1). Masked cells that fall
## in that column are skipped, so the realised missingness is below the
## nominal 20%.
## seq_len(p)[-1] is empty when p < 2, whereas seq(2, p, 1) would error.
for (j in seq_len(p)[-1]) {
  penguins_missing[[j]][mask[, j]] <- NaN
}

# quick check of missingness rate
glue::glue("\nMissingness proportion of penguins_missing: {round(mean(is.na(as.matrix(penguins_missing))), 2)}")
## Missingness proportion of penguins_missing: 0.17
## create dummy vars
## Each categorical column is expanded into one indicator column per level.
## ignore_na = TRUE avoids an extra dummy column for NA values, and
## remove_selected_columns = TRUE drops the original categorical columns.

penguin_dummies_complete <- dummy_cols(
  penguins_clean,
  select_columns = c("species", "island", "sex"),
  remove_first_dummy = FALSE,
  remove_selected_columns = TRUE,
  ignore_na = TRUE
)

penguin_dummies <- dummy_cols(
  penguins_missing,
  select_columns = c("species", "island", "sex"),
  remove_first_dummy = FALSE,
  remove_selected_columns = TRUE,
  ignore_na = TRUE
)

kable(head(penguin_dummies))
year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species_Adelie species_Chinstrap species_Gentoo island_Biscoe island_Dream island_Torgersen sex_female sex_male
2007 39.1 NaN 181 3750 1 0 0 0 0 1 0 1
2007 39.5 17.4 186 3800 NA NA NA NA NA NA 1 0
2007 40.3 18.0 195 3250 1 0 0 0 0 1 1 0
2007 NaN NaN 193 3450 NA NA NA NA NA NA 1 0
2007 39.3 NaN 190 3650 1 0 0 NA NA NA NA NA
2007 38.9 NaN NaN NaN 1 0 0 0 0 1 1 0
## define categorical_column_map
## Named list: one entry per original categorical variable, holding the names
## of the dummy columns whose name starts with that variable's prefix.
categorical_column_map <- lapply(
  c(species = "^species_", island = "^island_", sex = "^sex_"),
  grep,
  x = names(penguin_dummies),
  value = TRUE
)

categorical_column_map
## $species
## [1] "species_Adelie"    "species_Chinstrap" "species_Gentoo"   
## 
## $island
## [1] "island_Biscoe"    "island_Dream"     "island_Torgersen"
## 
## $sex
## [1] "sex_female" "sex_male"

Now that the dummy variables are created and the data contains missing values, we can create a binary_feature_mask (a logical vector with one entry per column, TRUE for the binary dummy columns) and impute with run_cissvae().

## Flag the binary (dummy) columns by name rather than by hard-coded counts,
## so the mask stays aligned with the columns of penguin_dummies even if
## columns are added, removed, or reordered.
binary_feature_mask <- names(penguin_dummies) %in%
  unlist(categorical_column_map, use.names = FALSE)

glue::glue("Binary Feature Mask: {paste0(binary_feature_mask, collapse = ', ')}")

## Run CISS-VAE on the dummy-encoded data that contains the missing values.
results <- run_cissvae(
  data = penguin_dummies,
  columns_ignore = "year",
  binary_feature_mask = binary_feature_mask,
  categorical_column_map = categorical_column_map,
  val_proportion = 0.20, ## small dataset so using higher val proportion
  scale_features = TRUE,
  clusters = NULL,
  n_clusters = 1,
  epochs = 500,
  debug = FALSE
)

## show the imputed data next to the input that still has missing values
head(results$imputed_dataset)
head(penguin_dummies)
##        year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 0 -1.283678       39.10000      18.70000          197.3811    3827.858
## 1 -1.283678       39.50000      17.40000          186.0000    3800.000
## 2 -1.283678       42.15094      16.55605          195.0000    3250.000
## 3 -1.283678       36.70000      17.30473          193.0000    3450.000
## 4 -1.283678       39.30000      20.60000          190.0000    3650.000
## 5 -1.283678       38.90000      17.22320          181.0000    3666.441
##   species_Adelie species_Chinstrap species_Gentoo island_Biscoe island_Dream
## 0              1                 0              0             0            0
## 1              1                 0              0             0            1
## 2              1                 0              0             0            1
## 3              1                 0              0             0            0
## 4              1                 0              0             0            1
## 5              1                 0              0             0            0
##   island_Torgersen sex_female sex_male
## 0                1          0        1
## 1                0          1        0
## 2                0          1        0
## 3                1          1        0
## 4                0          0        1
## 5                1          1        0
## # A tibble: 6 × 13
##    year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <int>          <dbl>         <dbl>             <dbl>       <dbl>
## 1  2007           39.1         NaN                 181        3750
## 2  2007           39.5          17.4               186        3800
## 3  2007           40.3          18                 195        3250
## 4  2007          NaN           NaN                 193        3450
## 5  2007           39.3         NaN                 190        3650
## 6  2007           38.9         NaN                 NaN         NaN
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## #   species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## #   island_Torgersen <int>, sex_female <int>, sex_male <int>

As we can see above, the imputed values for the binary variables are expressed as probabilities rather than hard 0/1 labels, so we have to convert them to binary (here by thresholding at 0.5). The `imputed_dataset` is returned as a data.frame, so we can use tidyverse `mutate()` to convert the binary variables.

## Threshold every species/island/sex dummy column at 0.5: values above 0.5
## become 1 and values at or below 0.5 become 0. Entries for which neither
## comparison is TRUE (e.g. NA/NaN) are passed through unchanged.
results$imputed_dataset <- mutate(
  results$imputed_dataset,
  across(
    .cols = matches("species|island|sex"),
    .fns = function(prob) {
      case_when(
        prob > 0.5 ~ 1,
        prob <= 0.5 ~ 0,
        TRUE ~ prob
      )
    }
  )
)

## imputed result, input with missing values, and the complete ground truth
head(results$imputed_dataset)
head(penguin_dummies)
head(penguin_dummies_complete)
##        year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## 0 -1.283678       39.10000      18.70000          197.3811    3827.858
## 1 -1.283678       39.50000      17.40000          186.0000    3800.000
## 2 -1.283678       42.15094      16.55605          195.0000    3250.000
## 3 -1.283678       36.70000      17.30473          193.0000    3450.000
## 4 -1.283678       39.30000      20.60000          190.0000    3650.000
## 5 -1.283678       38.90000      17.22320          181.0000    3666.441
##   species_Adelie species_Chinstrap species_Gentoo island_Biscoe island_Dream
## 0              1                 0              0             0            0
## 1              1                 0              0             0            1
## 2              1                 0              0             0            1
## 3              1                 0              0             0            0
## 4              1                 0              0             0            1
## 5              1                 0              0             0            0
##   island_Torgersen sex_female sex_male
## 0                1          0        1
## 1                0          1        0
## 2                0          1        0
## 3                1          1        0
## 4                0          0        1
## 5                1          1        0
## # A tibble: 6 × 13
##    year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <int>          <dbl>         <dbl>             <dbl>       <dbl>
## 1  2007           39.1         NaN                 181        3750
## 2  2007           39.5          17.4               186        3800
## 3  2007           40.3          18                 195        3250
## 4  2007          NaN           NaN                 193        3450
## 5  2007           39.3         NaN                 190        3650
## 6  2007           38.9         NaN                 NaN         NaN
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## #   species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## #   island_Torgersen <int>, sex_female <int>, sex_male <int>
## # A tibble: 6 × 13
##    year bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <int>          <dbl>         <dbl>             <int>       <int>
## 1  2007           39.1          18.7               181        3750
## 2  2007           39.5          17.4               186        3800
## 3  2007           40.3          18                 195        3250
## 4  2007           36.7          19.3               193        3450
## 5  2007           39.3          20.6               190        3650
## 6  2007           38.9          17.8               181        3625
## # ℹ 8 more variables: species_Adelie <int>, species_Chinstrap <int>,
## #   species_Gentoo <int>, island_Biscoe <int>, island_Dream <int>,
## #   island_Torgersen <int>, sex_female <int>, sex_male <int>