diff --git a/.gitignore b/.gitignore index ee239e9..43a0ff5 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ analyze_inference_bernoulli_*/ *.e[1-9]* *.pe[1-9]* *.po[1-9]* + +/.luarc.json diff --git a/code/for_report/inference_analyze.R b/code/for_report/inference_analyze.R new file mode 100644 index 0000000..cec5949 --- /dev/null +++ b/code/for_report/inference_analyze.R @@ -0,0 +1,162 @@ +## ----libraries, echo = FALSE, include = FALSE---------------------------------------------------------------------------------------------------------------------------------------- +library("ggplot2") +library("ggokabeito") +library("tidyr") +library("dplyr") +library("stringr") +library("knitr") +library("pander") +library("patchwork") +library("latex2exp") +library("here") + + +## ----setup, echo = FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------- +options(dplyr.summarise.inform = FALSE) +knitr::opts_knit$set(kable.force.latex = TRUE) + +meanse <- function(x, ...) { + mean1 <- signif(round(mean(x, na.rm = T), 2), 5) # calculate mean and round + se1 <- signif(round(sd(x, na.rm = T) / sqrt(sum(!is.na(x))), 2), 2) # std error - round adding zeros + out <- paste(mean1, "$\\pm$", se1) # paste together mean plus/minus and standard error + if (str_detect(out, "NA")) { + out <- "NA" + } # if missing do not add plusminus + if (se1 == 0) { + out <- paste(mean1) + } + return(out) +} + + +## ----import-data, echo = FALSE------------------------------------------------------------------------------------------------------------------------------------------------------- +filenames <- list.files( + path = "./data/", + pattern = "inference_testing_2023-07-*", + full.names = TRUE +) +data_list <- lapply(filenames, readRDS) +col_id_BICLS <- c(11, 16, 23, 30, 37) +result_data_frame <- dplyr::bind_rows(data_list) + +# Compute the preferred model +result_data_frame <- cbind(result_data_frame, preferred_model = sapply(seq_len(nrow(result_data_frame)), function(n) sub("_BICL", "", names(which.max(result_data_frame[n, col_id_BICLS]))))) +result_data_frame$preferred_model <- factor(result_data_frame$preferred_model, levels = c( + "sep", "iid", "pi", + "rho", "pirho" +)) + + +## ----inference_table, echo = FALSE--------------------------------------------------------------------------------------------------------------------------------------------------- +averaged_data <- result_data_frame %>% + group_by(epsilon_alpha) %>% + summarise(across(-preferred_model, list("avrg" = meanse))) %>% + select(-c(2:10)) +averaged_data <- averaged_data %>% + select(which(!grepl("*_BICL_*", colnames(averaged_data)), + arr.ind = TRUE + )) + + +## ----function_per_model, echo = FALSE------------------------------------------------------------------------------------------------------------------------------------------------ +dataframe_per_model <- function(model) { + averaged_data %>% + select(epsilon_alpha, starts_with(paste0(model, "_"))) +} + + +## ----per_model_table, echo = FALSE, results='asis', message=FALSE, warning = FALSE--------------------------------------------------------------------------------------------------- +for (model in c("sep", "iid", "pi", "rho", "pirho")) { + kable_colnames <- c( + "$\\eps[\\alpha]$", # "BIC-L", + "$\\overline{\\text{ARI}}_{1}$", + "$\\overline{\\text{ARI}}_{2}$", "$\\text{ARI}_{1}$", "$\\text{ARI}_{2}$" + ) + model_name <- model + if (model != "sep") { + kable_colnames <- c( + kable_colnames, "Recovered $Q_1$", + "Recovered $Q_2$" + ) + } + if (model == "pirho") { + model_name <- "$\\pi\\rho$" + } else { + if (model != "iid" && model != "sep") { + model_name <- paste0("$\\", model, "$") + } else { + model_name <- paste0("$", model, "$") + } + } + print(kable(dataframe_per_model(model), + escape = FALSE, + booktabs = TRUE, + digits = 2, + position = "!h", + caption = paste0( + "\\label{tab:per_model_", model, + "}Quality metrics for ", + ifelse(model != "sep", paste0(model_name, "$\\text{-}colBiSBM$"), "$sep\\text{-}BiSBM$") + ), + col.names = kable_colnames + )) +} + + +## ----proportion-preferred_model, echo = FALSE---------------------------------------------------------------------------------------------------------------------------------------- +proportion_preferred_data <- result_data_frame %>% + group_by(epsilon_alpha, preferred_model) %>% + summarise(n = n()) %>% + mutate(prop_model = n / sum(n)) %>% + ungroup() %>% + select(-n) + +proportion_preferred_table <- proportion_preferred_data %>% + pivot_wider( + names_from = preferred_model, + values_from = prop_model, values_fill = 0 + ) + +kable(proportion_preferred_table, + escape = FALSE, + booktabs = TRUE, + digits = 2, + position = "!h", + caption = "\\label{tab:proportion-preferred-table}Proportions of models selected per \\eps[\\alpha] (data for Figure \\ref{fig:inference-proportion-preferred})", + col.names = c( + "\\eps[\\alpha]", + "$sep\\text{-}BiSBM$", + "$iid\\text{-}colBiSBM$", + "$\\pi\\text{-}colBiSBM$", + "$\\rho\\text{-}colBiSBM$", + "$\\pi\\rho\\text{-}colBiSBM$" + ), + align = "rccccc", + format = "latex" +) + + +## ----proportion_preferred_figure, echo = FALSE--------------------------------------------------------------------------------------------------------------------------------------- +#| fig.cap="\\label{fig:inference-proportion-preferred}Plot of the proportions of different preferred models in function of \\eps[\\alpha]", +#| fig.asp = 0.5, +#| fig.pos = "H", +#| fig.width = 7, +#| fig.height = 4, +#| dpi=300 + +plot <- proportion_preferred_data %>% ggplot() + + aes( + x = epsilon_alpha, y = prop_model, color = preferred_model, + fill = preferred_model + ) + + guides( + fill = guide_legend(title = "Preferred Model"), + color = guide_legend(title = "Preferred Model") + ) + + scale_x_continuous(breaks = seq(from = 0.0, to = 0.24, by = 0.03)) + + scale_color_okabe_ito() + + scale_fill_okabe_ito() + + xlab(TeX("$\\epsilon_{\\alpha}$")) + + ylab("Model proportions") + + geom_col(position = "stack") +print(plot) diff --git a/code/for_report/model_selection_analyze.R b/code/for_report/model_selection_analyze.R new file mode 100644 index 0000000..a23913d --- /dev/null +++ b/code/for_report/model_selection_analyze.R @@ -0,0 +1,161 @@ +## ----libraries, echo = FALSE, include = FALSE----------------------------------------------------------------------------------------------------------------------- +require("ggplot2") +require("ggokabeito") +require("knitr") +require("kableExtra") +require("stringr") +require("tidyr") +require("dplyr") +require("patchwork") +require("latex2exp") + + +## ----setup, echo = FALSE, include= FALSE---------------------------------------------------------------------------------------------------------------------------- +options(knitr.table.format = "latex") + +meanse <- function(x, ...) { + mean1 <- signif(round(mean(x, na.rm = T), 2), 5) # calculate mean and round + se1 <- signif(round(sd(x, na.rm = T) / sqrt(sum(!is.na(x))), 2), 2) # std error - round adding zeros + out <- paste(mean1, "$\\pm$", se1) # paste together mean plus/minus and standard error + if (str_detect(out, "NA")) { + out <- "NA" + } # if missing do not add plusminus + if (se1 == 0) { + out <- paste(mean1) + } + return(out) +} + + +## ----import-data, echo = FALSE-------------------------------------------------------------------------------------------------------------------------------------- +filenames <- list.files( + path = "./data/", + pattern = "model_selection_check_batch_report_3_rep_*", + full.names = TRUE +) +data_list <- lapply(filenames, readRDS) +result_data_frame <- dplyr::bind_rows(data_list) + +result_data_frame$preferred_model <- factor(result_data_frame$preferred_model, levels = c( + "sep","iid","pi", + "rho","pirho" +)) + +# Adding a column accounting for true model iid, pi, rho or pirho +# result_data_frame <- result_data_frame %>% mutate(true_model = if (all( c(epsilon_pi >0, epsilon_rho > 0) == c(TRUE, TRUE))) print("pirho") else if (all( c(epsilon_pi >0, epsilon_rho > 0) == c(TRUE, FALSE))) print("pi") else if (all( c(epsilon_pi >0, epsilon_rho > 0) == c(F, T))) print("rho") else print("iid")) + + + +## ----compute-table, echo = FALSE, include = FALSE------------------------------------------------------------------------------------------------------------------- +model_comparison_eps_pi_rho <- result_data_frame %>% + group_by(epsilon_pi, epsilon_rho, preferred_model) %>% + summarise(n = n()) %>% + mutate(prop_model = n / sum(n)) + +model_comparison_eps_pi <- result_data_frame %>% + group_by(epsilon_pi, preferred_model) %>% + summarise(n = n(), rec_Q1 = mean(iid_Q1 + pi_Q1 + rho_Q1 + pirho_Q1)/4) %>% + mutate(prop_model = n / sum(n)) + +model_comparison_eps_rho <- result_data_frame %>% + group_by(epsilon_rho, preferred_model) %>% + summarise(n = n(), rec_Q2 = mean(iid_Q2 + pi_Q2 + rho_Q2 + pirho_Q2)/4) %>% + mutate(prop_model = n / sum(n)) + + +## ----epsilon_plot, echo = FALSE, include = FALSE-------------------------------------------------------------------------------------------------------------------- +#| fig.asp = 0.5, +#| fig.pos = "H", +#| fig.width = 7, +#| fig.height = 4, +#| dpi=300 + +plot_pi <- model_comparison_eps_pi %>% ggplot() + + aes( + x = epsilon_pi, y = prop_model, color = preferred_model, + fill = preferred_model + ) + + guides(fill = guide_legend(title = "Preferred Model"), + color = guide_legend(title = "Preferred Model")) + + xlab(TeX("$\\epsilon_{\\pi}$")) + + ylab("Model proportions") + + scale_color_okabe_ito() + + scale_fill_okabe_ito() + + geom_col(position = "stack") +plot_rho <- model_comparison_eps_rho %>% ggplot() + + aes( + x = epsilon_rho, y = prop_model, color = preferred_model, + fill = preferred_model + ) + + guides(fill = guide_legend(title = "Preferred Model"), + color = guide_legend(title = "Preferred Model")) + + xlab(TeX("$\\epsilon_{\\rho}$")) + + ylab("") + + scale_color_okabe_ito() + + scale_fill_okabe_ito() + + geom_col(position = "stack") + +ggsave("./img/plot_model_function_eps.png", plot_pi + plot_rho + + plot_layout(guides = "collect")) + + +## ----tables, echo = FALSE, results='asis'--------------------------------------------------------------------------------------------------------------------------- +kable( + (model_comparison_eps_pi %>% + select(-one_of("n")) %>% + pivot_wider( + names_from = preferred_model, + values_from = prop_model, + values_fill = 0 + ) %>% group_by(epsilon_pi) %>% + summarise( + rec_Q1 = meanse(rec_Q1), + iid = sum(iid), + pi = sum(pi), + rho = sum(rho), + pirho = sum(pirho) + ))[, c(1, 3:6, 2)], + digits = 2, + col.names = c( + "$\\eps[\\pi]$", + "$iid\\text{-}colBiSBM$ ", + "$\\pi\\text{-}colBiSBM$", + "$\\rho\\text{-}colBiSBM$", + "$\\pi\\rho\\text{-}colBiSBM$", + "Recovered $Q_1$" + ), align = "lcccc", + booktab = TRUE, + position = "!h", + escape = FALSE, + caption = "\\label{tab:pi-model-sel}Model selection for varying $\\pi$ mixture parameters", + format = "latex" +) %>% kableExtra::add_header_above(c(" "=1,"Models"=4,"Blocks"=1)) +kable( + (model_comparison_eps_rho %>% + select(-one_of("n")) %>% + pivot_wider( + names_from = preferred_model, + values_from = prop_model, + values_fill = 0 + ) %>% group_by(epsilon_rho) %>% + summarise(rec_Q2 = meanse(rec_Q2), + iid = sum(iid), + pi = sum(pi), + rho = sum(rho), + pirho = sum(pirho)))[,c(1,3:6, 2)], + digits = 2, + col.names = c( + "$\\eps[\\rho]$", + "$iid\\text{-}colBiSBM$ ", + "$\\pi\\text{-}colBiSBM$", + "$\\rho\\text{-}colBiSBM$", + "$\\pi\\rho\\text{-}colBiSBM$", + "Recovered $Q_2$" + ), align = "lcccc", + booktab = TRUE, + position = "!h", + escape = FALSE, + caption = "\\label{tab:rho-model-sel}Model selection for varying $\\rho$ mixture parameters", + format = "latex" +) %>% kableExtra::add_header_above(c(" "=1,"Models"=4,"Blocks"=1)) + diff --git a/mia-rapport-2024 b/mia-rapport-2024 index 4348507..e412e00 160000 --- a/mia-rapport-2024 +++ b/mia-rapport-2024 @@ -1 +1 @@ -Subproject commit 43485078a719b65f23009ecf5b2112cb8f29ff22 +Subproject commit e412e0015127b1361bb7cee42d4f84dc98121cce