--- title: "Analyzing the capacity of the colBiSBM to recover structure for missing data from other networks" output: html_document: toc: true theme: journal pdf_document: keep_tex: true --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r required_libs, echo = FALSE, include=FALSE} require("ggplot2") require("tidyverse") ``` ```{r import_data, echo=FALSE, include=FALSE} NA_robustness_raw <- readRDS("simulation/data/NA_robustness_results-alpha_0.7, 0.4, 0.3, 0.4, 0.2, 0.05, 0.3, 0.05, 0.05-reps-10-14-06-23_17-41.Rds") NA_robustness_df <- NA_robustness_raw %>% mutate( auc_diff = auc_colBiSBM - auc_LBM, ari_row_diff = NA_net_ari_row - LBM_ari_row, ari_col_diff = NA_net_ari_col - LBM_ari_col ) %>% group_by(prop_NAs, model) %>% summarise( mean_auc_diff = mean(auc_diff), sd_auc_diff = sd(auc_diff), mean_ari_row_diff = mean(ari_row_diff), sd_ari_row_diff = sd(ari_row_diff), mean_ari_col_diff = mean(ari_col_diff), sd_ari_col_diff = sd(ari_col_diff), mean_LBM_ari_row = mean(LBM_ari_row), sd_LBM_ari_row = sd(LBM_ari_row), mean_LBM_ari_col = mean(LBM_ari_col), sd_LBM_ari_col = sd(LBM_ari_col), mean_NA_net_ari_row = mean(NA_net_ari_row), sd_NA_net_ari_row = sd(NA_net_ari_row), mean_NA_net_ari_col = mean(NA_net_ari_col), sd_NA_net_ari_col = sd(NA_net_ari_col), mean_elapsed_secs = mean(elapsed_secs), sd_elapsed_secs = sd(elapsed_secs) ) %>% ungroup() ``` ```{r useful_function, echo = FALSE} write_matex2 <- function(x) { if (!is.matrix(x)) { x <- matrix(x) } begin <- "\\begin{bmatrix}" end <- "\\end{bmatrix}" X <- apply(x, 1, function(x) { paste( paste(x, collapse = "&"), "\\\\" ) }) paste(c(begin, X, end), collapse = "") } ``` # Simulation context The idea is to benchmark the capacity of the models when NAs are in the data. To do this, we chose the below structure: ```{r simulation_parameters, echo = FALSE} eps <- 0.05 M <- 3 # Defining parameters nr <- 100 nc <- 150 pir <- c(0.5, 0.3, 0.2) pic <- c(0.5, 0.3, 0.2) alpha <- matrix(c( 0.7, 0.4, 0.3, 0.4, 0.2, eps, 0.3, eps, eps ), byrow = TRUE, nrow = length(pir), ncol = length(pic)) ``` $$M = `r M`, n_r = `r nr`, n_c = `r nc` \\ \alpha = `r write_matex2(alpha)` \\ \pi = `r write_matex2(pir)` \rho = `r write_matex2(pic)`$$ With $M$ the number of networks, $n_r$ the number of nodes in row of the incidence matrix, $n_c$ the number of nodes in column, $\alpha$ the connectivity parameters between the row and column clusters. $\pi$ and $\rho$ are the proportion of nodes in the row and columns clusters. And set some randomly chosen interactions to NA. The below plots will show the different quality indicators in function of proportion of NAs in the first of the 3 networks. # AUC in function of the proportion of NAs ```{r auc_plots, echo = FALSE} auc_plot <- NA_robustness_df %>% ggplot() + geom_ribbon(aes(ymin = mean_auc_diff - sd_auc_diff, ymax = mean_auc_diff + sd_auc_diff, x = prop_NAs, fill = model), alpha = 0.1) + geom_line(aes(x = prop_NAs, y = mean_auc_diff, color = model)) + geom_point(aes(x = prop_NAs, y = mean_auc_diff, color = model)) + xlab("NA proportion") + ylab("AUC difference (colBiSBM - LBM)") + scale_x_continuous(breaks = seq(0, 0.9, 0.1)) auc_plot ``` ```{r auc_, echo = FALSE} auc_plot <- NA_robustness_df %>% ggplot() + geom_ribbon(aes(ymin = mean_auc_diff - sd_auc_diff, ymax = mean_auc_diff + sd_auc_diff, x = prop_NAs, fill = model), alpha = 0.1) + geom_line(aes(x = prop_NAs, y = mean_auc_diff, color = model)) + geom_point(aes(x = prop_NAs, y = mean_auc_diff, color = model)) + xlab("NA proportion") + ylab("AUC difference (colBiSBM - LBM)") + scale_x_continuous(breaks = seq(0, 0.9, 0.1)) auc_plot ``` # ARI in function of the proportion of NAs ```{r ARI_row_plot, echo = FALSE, fig.cap="Difference of ARI for the row clusterings"} ari_row_plot <- NA_robustness_df %>% ggplot() + # ylim(-1, 1) + geom_ribbon(aes(ymin = mean_ari_row_diff - sd_ari_row_diff, ymax = mean_ari_row_diff + sd_ari_row_diff, x = prop_NAs, fill = model), alpha = 0.25) + geom_line(aes(x = prop_NAs, y = mean_ari_row_diff, color = model)) + geom_point(aes(x = prop_NAs, y = mean_ari_row_diff, color = model)) + xlab("NA proportion") + ylab("ARI difference") + ggtitle("ARI on the row clustering, difference (colBiSBM - LBM)") + scale_x_continuous(breaks = seq(0, 0.9, 0.1)) ari_row_plot ``` ```{r ARI_col_plot, echo = FALSE, fig.cap="Difference of ARI for the columns clusterings"} ari_col_plot <- NA_robustness_df %>% ggplot() + # ylim(-1, 1) + geom_ribbon(aes(ymin = mean_ari_col_diff - sd_ari_col_diff, ymax = mean_ari_col_diff + sd_ari_col_diff, x = prop_NAs, fill = model), alpha = 0.25) + geom_line(aes(x = prop_NAs, y = mean_ari_col_diff, color = model)) + geom_point(aes(x = prop_NAs, y = mean_ari_col_diff, color = model)) + xlab("NA proportion") + ylab("ARI difference") + ggtitle("ARI on the column clustering, difference (colBiSBM - LBM)") + scale_x_continuous(breaks = seq(0, 0.9, 0.1)) ari_col_plot ```