```{r libraries, echo = FALSE, include = FALSE} require("ggplot2") require("knitr") # require("kableExtra") require("tidyr") require("dplyr") require("patchwork") require("latex2exp") ``` ```{r setup, echo = FALSE, include= FALSE} options(knitr.table.knitr.table.format = "latex") ``` ```{r import-data, echo = FALSE} filenames <- list.files( path = "./data/", pattern = "model_selection_check_batch_report_3_rep_*", full.names = TRUE ) data_list <- lapply(filenames, readRDS) result_data_frame <- dplyr::bind_rows(data_list) result_data_frame$preferred_model <- factor(result_data_frame$preferred_model, levels = c( "sep","iid","pi", "rho","pirho" )) # Adding a column accounting for true model iid, pi, rho or pirho # result_data_frame <- result_data_frame %>% mutate(true_model = if (all( c(epsilon_pi >0, epsilon_rho > 0) == c(TRUE, TRUE))) print("pirho") else if (all( c(epsilon_pi >0, epsilon_rho > 0) == c(TRUE, FALSE))) print("pi") else if (all( c(epsilon_pi >0, epsilon_rho > 0) == c(F, T))) print("rho") else print("iid")) ``` # Capacity to distinguish $\pi\rho\text{-}colBiSBM$ from $iid\text{-}colBiSBM$ and other variants The idea of this model selection simulations is to assess how the model select the correct *colBiSBM* model among the possible ones: \textit{iid, pi, rho, pirho}. This difference being based on the row and col block proportions. For this task we choose the same simulation context as \cite{chabert-liddellLearningCommonStructures2023}. Namely $n_{1}^{m} = 90, n_{2}^{m} = 90, Q_1 = Q_2 = 3$, $\bm{\alpha}, \bm{\pi}$ and $\bm{\rho}$ are set as follows: \begin{align*} \bm{\alpha} =.25 + \begin{pmatrix} 3 \eps[\alpha] & 2 \eps[\alpha] & \eps[\alpha] \\ 2 \eps[\alpha] & 2 \eps[\alpha] & - \eps[\alpha] \\ \eps[\alpha] & - \eps[\alpha] & \eps[\alpha] \end{pmatrix}, & & \bm{\pi}^1 = \begin{pmatrix} \frac{1}{3}, & \frac{1}{3}, & \frac{1}{3} \end{pmatrix}, & & \bm{\pi}^2 = \sigma\begin{pmatrix} \frac{1}{3} - \eps[\pi], & \frac{1}{3}, & \frac{1}{3} + \eps[\pi] \end{pmatrix},\\ & & \bm{\rho}^1 = \begin{pmatrix} \frac{1}{3}, & \frac{1}{3}, & \frac{1}{3} \end{pmatrix}, & & \bm{\rho}^2 = \sigma\begin{pmatrix} \frac{1}{3} - \eps[\rho], & \frac{1}{3}, & \frac{1}{3} + \eps[\rho] \end{pmatrix}, \end{align*} with $\eps[\alpha] = 0.16$, $\eps[\pi]$ and $\eps[\rho]$ taking 9 values equally spaced in $\left[ 0, .28\right]$. We simulate 324 different collections for each value of $\eps[\pi]$ and $\eps[\rho]$. $\pi\rho\text{-}colBiSBM$, $\pi\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, $iid\text{-}colBiSBM$ and $sep\text{-}colBiSBM$ are put in competition and the model with the greater BIC-L is selected as the \emph{preferred model}. ```{r compute-table, echo = FALSE, include = FALSE} model_comparison_eps_pi_rho <- result_data_frame %>% group_by(epsilon_pi, epsilon_rho, preferred_model) %>% summarise(n = n()) %>% mutate(prop_model = n / sum(n)) model_comparison_eps_pi <- result_data_frame %>% group_by(epsilon_pi, preferred_model) %>% summarise(n = n(), rec_Q1 = mean(iid_Q1 + pi_Q1 + rho_Q1 + pirho_Q1)/4) %>% mutate(prop_model = n / sum(n)) model_comparison_eps_rho <- result_data_frame %>% group_by(epsilon_rho, preferred_model) %>% summarise(n = n(), rec_Q2 = mean(iid_Q2 + pi_Q2 + rho_Q2 + pirho_Q2)/4) %>% mutate(prop_model = n / sum(n)) ``` ```{r epsilon_plot, echo = FALSE, include = FALSE} #| fig.asp = 0.5, #| fig.pos = "H", #| fig.width = 7, #| fig.height = 4, #| dpi=300 plot_pi <- model_comparison_eps_pi %>% ggplot() + aes(x = epsilon_pi, y = prop_model, color = preferred_model, fill = preferred_model) + xlab(TeX("$\\epsilon_{\\pi}$")) + ylab("Model proportions")+ geom_col(position = "stack") plot_rho <- model_comparison_eps_rho %>% ggplot() + aes(x = epsilon_rho, y = prop_model, color = preferred_model, fill = preferred_model) + xlab(TeX("$\\epsilon_{\\rho}$")) + ylab("")+ geom_col(position = "stack") ggsave("./img/plot_model_function_eps.png", plot_pi + plot_rho + plot_layout(guides = "collect")) ``` When $\eps[\pi] = 0$, $\bm{\pi}^1 = \bm{\pi}^2$, $\eps[\rho] = 0$ and $\bm{\rho}^1 = \bm{\rho}^2$, the generated collection is an $iid\text{-}colBiSBM$. When $\eps[\pi] > 0$ or $\bm{\pi}^1 \neq \bm{\pi}^2$, the model is a $\pi\text{-}colBiSBM$. When $\eps[\rho] > 0$ or $\bm{\rho}^1 \neq \bm{\rho}^2$, the model is a $\rho\text{-}colBiSBM$. Finally, when $\eps[\pi] > 0$ or $\bm{\pi}^1 \neq \bm{\pi}^2$ and $\eps[\rho] > 0$ or $\bm{\rho}^1 \neq \bm{\rho}^2$, the model is a $\pi\rho\text{-}colBiSBM$. ```{r tables, echo = FALSE, results='asis'} kable( (model_comparison_eps_pi %>% select(-one_of("n")) %>% pivot_wider( names_from = preferred_model, values_from = prop_model, values_fill = 0 ) %>% group_by(epsilon_pi) %>% summarise( rec_Q1 = mean(rec_Q1), iid = sum(iid), pi = sum(pi), rho = sum(rho), pirho = sum(pirho) ))[, c(1, 3:6, 2)], digits = 2, col.names = c( "$\\eps[\\pi]$", "$iid\\text{-}colBiSBM$ ", "$\\pi\\text{-}colBiSBM$", "$\\rho\\text{-}colBiSBM$", "$\\pi\\rho\\text{-}colBiSBM$", "Recovered $Q_1$" ), align = "lcccc", booktab = TRUE, position = "!h", escape = FALSE, caption = "\\label{tab:pi-model-sel}Model selection for varying $\\pi$ mixture parameters" ) %>% kableExtra::add_header_above(c(" " = 1, "Proportions of model selection" = 4, "Blocks" = 1)) kable( (model_comparison_eps_rho %>% select(-one_of("n")) %>% pivot_wider( names_from = preferred_model, values_from = prop_model, values_fill = 0 ) %>% group_by(epsilon_rho) %>% summarise(rec_Q2 = mean(rec_Q2), iid = sum(iid), pi = sum(pi), rho = sum(rho), pirho = sum(pirho)))[,c(1,3:6, 2)], digits = 2, col.names = c( "$\\eps[\\rho]$", "$iid\\text{-}colBiSBM$ ", "$\\pi\\text{-}colBiSBM$", "$\\rho\\text{-}colBiSBM$", "$\\pi\\rho\\text{-}colBiSBM$", "Recovered $Q_2$" ), align = "lcccc", booktab = TRUE, position = "!h", escape = FALSE, caption = "\\label{tab:rho-model-sel}Model selection for varying $\\rho$ mixture parameters" ) %>% kableExtra::add_header_above(c(" " = 1, "Proportions of model selection" = 4, "Blocks" = 1)) ``` \begin{figure}[H] \includegraphics{./Rcodes/simulation/img/plot_model_function_eps.png} \caption{Plot of preferred model in function of $\eps[\pi]$ and $\eps[\rho]$} \label{fig:pref_model_func_eps} \end{figure} \paragraph{Results:}On the figure \ref{fig:pref_model_func_eps} and tables \ref{tab:pi-model-sel} and \ref{tab:rho-model-sel}, one can see that there is a turning point around $\eps[\pi] = 0.2$ (resp. $\eps[\rho] = 0.2$), before which $iid\text{-}colBiSBM$ and $\rho\text{-}colBiSBM$ (resp. $\pi\text{-}colBiSBM$) are selected most of the times and after $0.2$ the $\pi\text{-}colBiSBM$ (resp. $\rho\text{-}colBiSBM$) and $\pi\rho\text{-}colBiSBM$ gets more and more selected, highlighting our capacity to recover the simulated structure. \paragraph*{Remark:} Please note that when "Recovered $Q_1$(or $Q_2$)" is not an integer it's because some procedures returned a value other than 3.