diff --git a/.vscode/settings.json b/.vscode/settings.json
index b087b8f..6523ba8 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -4,7 +4,7 @@
             {
                 "match": ".*\\.Rmd",
                 "isAsync": true,
-                "cmd": "Rscript Rmd2Latex-fragment.R '${file}' "
+                "cmd": "/bin/bash -c \"Rscript Rmd2Latex-fragment.R '${file}'\""
             },
             {
                 "match": ".*",
diff --git a/Rcodes/simulation/inference_analyze.Rmd b/Rcodes/simulation/inference_analyze.Rmd
index e69de29..503c687 100644
--- a/Rcodes/simulation/inference_analyze.Rmd
+++ b/Rcodes/simulation/inference_analyze.Rmd
@@ -0,0 +1,196 @@
+```{r libraries, echo = FALSE, include = FALSE}
+# library() stops with an error when a package is missing, whereas require()
+# only returns FALSE and lets the document fail later with an obscure message.
+library("ggplot2")
+library("tidyr")
+library("dplyr")
+library("stringr")
+library("knitr")
+library("pander")
+library("patchwork")
+library("latex2exp")
+```
+
+```{r setup, echo = FALSE}
+options(dplyr.summarise.inform = FALSE)
+knitr::opts_knit$set(kable.force.latex = TRUE)
+
+#' Format the mean and standard error of a numeric vector for a LaTeX table.
+#'
+#' @param x Numeric vector; `NA` entries are ignored.
+#' @param ... Unused; accepted so extra arguments do not raise an error.
+#' @return A length-one character string: `"<mean> $\\pm$ <se>"`, only the
+#'   mean when the rounded standard error is 0, or `"NA"` when the mean or
+#'   the standard error is undefined (e.g. fewer than two non-missing values).
+meanse <- function(x, ...) {
+  n_obs <- sum(!is.na(x))
+  mean1 <- signif(round(mean(x, na.rm = TRUE), 2), 5) # mean, rounded to 2 decimals
+  se1 <- signif(round(sd(x, na.rm = TRUE) / sqrt(n_obs), 2), 2) # standard error
+  # Test for missing values before comparing `se1` to 0: `if (NA == 0)` is an
+  # error, which made the previous version crash instead of returning "NA".
+  if (is.na(mean1) || is.na(se1)) {
+    return("NA")
+  }
+  # No spread after rounding: do not print a "plus/minus 0".
+  if (se1 == 0) {
+    return(paste(mean1))
+  }
+  paste(mean1, "$\\pm$", se1)
+}
+```
+
+```{r import-data, echo = FALSE}
+filenames <- list.files(
+  path = "./data/",
+  # `pattern` is a regular expression, so the shell-style glob is translated.
+  pattern = utils::glob2rx("inference_testing_2023-07-*"),
+  full.names = TRUE
+)
+if (length(filenames) == 0L) {
+  stop("No 'inference_testing_2023-07-*' result file found in ./data/", call. = FALSE)
+}
+data_list <- lapply(filenames, readRDS)
+# Positions of the BIC-L columns compared below; the model name is obtained by
+# removing "_BICL" from the name of the winning column.
+col_id_BICLS <- c(11, 16, 23, 30, 37)
+result_data_frame <- dplyr::bind_rows(data_list)
+
+# Compute the preferred model: for each row, the model whose BIC-L is largest
+bicl_values <- result_data_frame[, col_id_BICLS, drop = FALSE]
+preferred_model <- vapply(
+  seq_len(nrow(bicl_values)),
+  function(n) {
+    best_column <- names(which.max(unlist(bicl_values[n, ])))
+    # which.max() returns nothing when every BIC-L of the row is missing.
+    if (length(best_column) == 0L) NA_character_ else sub("_BICL", "", best_column)
+  },
+  character(1)
+)
+result_data_frame <- cbind(result_data_frame, preferred_model = preferred_model)
+
+```
+
+# Efficiency of the inference
+
+\paragraph{Simulation settings} For this simulation the data is simulated with
+$M = 2, n_{1}^{m} =
120, n_{2}^{m} = 120, Q_1 = Q_2 = 4$, $\bm{\alpha}, \bm{\pi}$ +and $\bm{\rho}$ are set as follows: +\begin{align*} + &&\bm{\alpha} = .25 + + \begin{pmatrix} + 3 \eps[\alpha] & 2 \eps[\alpha] & \eps[\alpha] & - \eps[\alpha]\\ + 2 \eps[\alpha] & 2 \eps[\alpha] & - \eps[\alpha] & \eps[\alpha]\\ + \eps[\alpha] & - \eps[\alpha] & \eps[\alpha] & 2 \eps[\alpha]\\ + - \eps[\alpha] & \eps[\alpha] & 2 \eps[\alpha] & 0 + \end{pmatrix}, \\ \bm{\pi}^1 = \sigma_1 + \begin{pmatrix} + 0.2 & 0.4 & 0.4 & 0 + \end{pmatrix}, + && \bm{\pi}^2 = + \begin{pmatrix} + 0.25 & 0.25 & 0.25 & 0.25 + \end{pmatrix}, \\ + \bm{\rho}^1 = + \begin{pmatrix} + 0.25 & 0.25 & 0.25 & 0.25 + \end{pmatrix}, && + \bm{\rho}^2 = \sigma_2 + \begin{pmatrix} + 0 & 0.33 & 0.33 & 0.33 + \end{pmatrix}, && +\end{align*} +with $\eps[\alpha]$ taking nine equally spaced values ranging from 0 to 0.24. +For each value of $\eps[\alpha]$, 108 datasets ($X_1, X_2$) are simulated, +resulting in $9 \times 108 = 972$ datasets. More precisely, for each dataset, +we pick uniformly at random two permutations of $\{ 1, \dots , 4 \}$ +($\sigma_1, \sigma_2$) with the constraint that $\sigma_1(4) \neq \sigma_2(1)$. +This ensures that each of the two networks has a non-empty block that is empty +in the other one. Then the networks are simulated with +$\mathcal{B}$ern-$BiSBM_{120}(4, \bm{\alpha}, \bm{\pi}^m, \bm{\rho}^m)$ +with the previous parameters. Each network has 2 blocks in common and their +connectivity structures encompass a mix of core-periphery, assortative +community and disassortative community structures, depending on which 3 of the 4 +blocks are selected for each network. $\eps[\alpha]$ represents the strength of +these structures: the larger it is, the easier it is to tell apart one block from +another. + +\paragraph{Inference} We want to measure the quality of the +inference procedure; for this, we use the inference described in Section +\ref{sec:variational-estimation-of-the-parameters}. 
+ +\paragraph{Quality indicators} To assess the quality of the inference, we will +use the following indicators: +\begin{itemize} + \item First, for each dataset, we put in competition $\pi\text{-}colBiSBM$ with + $sep\text{-}BiSBM$, $iid\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ + respectively. To do so, for each dataset, we compute the + BIC-L of each model; $\pi\text{-}colBiSBM$ is preferred to $sep\text{-}BiSBM$ + (resp. $iid\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$) if + its BIC-L is greater. + \item When considering $\pi\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ we compare $\widehat{Q_1}$, $\widehat{Q_2}$ to + their true values ($Q_1 = 4$ and $Q_2 = 4$). + \item Finally, we assess the quality of the node grouping by computing the + Adjusted Rand Index \parencite[][, ARI = 0 for a random grouping, ARI = 1 for a perfect recovery]{hubertComparingPartitions1985}. For each network, for the + $\pi\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ we compare the inferred block memberships to the + real ones by computing the mean of the ARI per axis over the two networks + \begin{equation*} + \overline{\text{ARI}}_d = \frac{1}{2} \big( \text{ARI}(\widehat{\bm{Z}^1_d},\bm{Z}^1_d) + \text{ARI}(\widehat{\bm{Z}^2_d},\bm{Z}^2_d) \big) + \end{equation*} + where $d$ is the dimension or axis (i.e., rows, $d=1$, or columns, $d=2$) of + the block memberships. + And we compute the ARI of the whole set of nodes to account for block + pairing between networks + \begin{equation*} + \text{ARI}_d = \text{ARI}\big((\widehat{\bm{Z}^1_d},\widehat{\bm{Z}^2_d}),(\bm{Z}^1_d,\bm{Z}^2_d) \big) + \end{equation*} +\end{itemize} + +All these quality indicators are averaged over the 108 datasets. The results are +provided in the tables \ref{tab:per_model_sep} to \ref{tab:per_model_pirho}. Each line corresponds to the +108 datasets for a given value of $\eps[\alpha]$. 
+
+```{r inference_table, echo = FALSE}
+# Per-epsilon_alpha summary: every column except `preferred_model` is
+# collapsed by meanse() into a "mean $\pm$ se" string; across() appends the
+# suffix "_avrg" to the column names.
+averaged_data <- result_data_frame %>%
+  group_by(epsilon_alpha) %>%
+  summarise(across(-preferred_model, list("avrg" = meanse))) %>%
+  # NOTE(review): drops the summarised columns 2 to 10 by position; confirm
+  # which columns these are if the layout of the result files changes.
+  select(-c(2:10))
+# Drop the BIC-L summaries. `fixed = TRUE` matches the literal substring
+# instead of relying on the malformed regular expression "*_BICL_*".
+is_bicl_column <- grepl("_BICL", colnames(averaged_data), fixed = TRUE)
+averaged_data <- averaged_data[!is_bicl_column]
+```
+
+```{r function_per_model, echo = FALSE}
+#' Extract the averaged quality metrics of a single model.
+#'
+#' @param model Model prefix used in the column names (e.g. "sep", "pi").
+#' @param data Summary table to subset; defaults to the global `averaged_data`.
+#' @return `data` restricted to `epsilon_alpha` and to the columns whose name
+#'   starts with `paste0(model, "_")`.
+dataframe_per_model <- function(model, data = averaged_data) {
+  data %>%
+    select(epsilon_alpha, starts_with(paste0(model, "_")))
+}
+```
+
+```{r per_model_table, echo = FALSE, results='asis', message=FALSE, warning = FALSE}
+# Column headers shared by the five tables.
+base_colnames <- c(
+  "$\\eps[\\alpha]$", # "BIC-L",
+  "$\\overline{\\text{ARI}}_{1}$",
+  "$\\overline{\\text{ARI}}_{2}$", "$\\text{ARI}_{1}$", "$\\text{ARI}_{2}$"
+)
+for (model in c("sep", "iid", "pi", "rho", "pirho")) {
+  # Only the tables of the models other than "sep" carry the two
+  # recovered-block-number columns.
+  kable_colnames <- base_colnames
+  if (model != "sep") {
+    kable_colnames <- c(kable_colnames, "Recovered $Q_1$", "Recovered $Q_2$")
+  }
+  # Greek-letter models are typeset in math mode; "sep" and "iid" stay as text.
+  model_name <- switch(model,
+    pirho = "$\\pi\\rho$",
+    pi = ,
+    rho = paste0("$\\", model, "$"),
+    model
+  )
+  # The text names the separate model "sep-BiSBM"; the others are "-colBiSBM".
+  model_suffix <- if (model == "sep") "-BiSBM" else "-colBiSBM"
+  print(kable(dataframe_per_model(model),
+    escape = FALSE,
+    booktabs = TRUE,
+    digits = 2,
+    position = "!h",
+    caption = paste0(
+      "\\label{tab:per_model_", model,
+      "}Quality metrics for ",
+      paste0(model_name, model_suffix)
+    ),
+    col.names = kable_colnames
+  ))
+}
+```
+
+
+```{r proportion-preferred_model, echo = FALSE}
+# Share of datasets for which each model has the largest BIC-L, per value of
+# epsilon_alpha (one column per model, 0 when a model is never preferred).
+proportion_preferred_table <- result_data_frame %>%
+  count(epsilon_alpha, preferred_model, name = "n") %>%
+  group_by(epsilon_alpha) %>%
+  mutate(freq = n / sum(n)) %>%
+  ungroup() %>%
+  select(-n) %>%
+  pivot_wider(
+    names_from = preferred_model,
+    values_from = freq, values_fill = 0
+  )
+```
+
+\paragraph{Results} For the model comparison, when $\eps[\alpha]$ is small
+($\eps[\alpha]\in[0, .04]$), the simulation model is close to the
+Erd\H{o}s-Rényi network and it is very hard to find any structure beyond the one
+of a single block on each dimension.
\ No newline at end of file diff --git a/Rcodes/simulation/inference_analyze.tex b/Rcodes/simulation/inference_analyze.tex new file mode 100644 index 0000000..da0f782 --- /dev/null +++ b/Rcodes/simulation/inference_analyze.tex @@ -0,0 +1,205 @@ +\hypertarget{efficiency-of-the-inference}{% +\section{Efficiency of the +inference}\label{efficiency-of-the-inference}} + +\paragraph{Simulation settings} + +For this simulation the data is simulated with +\(M = 2, n_{1}^{m} = 120, n_{2}^{m} = 120, Q_1 = Q_2 = 4\), +\(\bm{\alpha}, \bm{\pi}\) and \(\bm{\rho}\) are set as follows: +\begin{align*} + &&\bm{\alpha} = .25 + + \begin{pmatrix} + 3 \eps[\alpha] & 2 \eps[\alpha] & \eps[\alpha] & - \eps[\alpha]\\ + 2 \eps[\alpha] & 2 \eps[\alpha] & - \eps[\alpha] & \eps[\alpha]\\ + \eps[\alpha] & - \eps[\alpha] & \eps[\alpha] & 2 \eps[\alpha]\\ + - \eps[\alpha] & \eps[\alpha] & 2 \eps[\alpha] & 0 + \end{pmatrix}, \\ \bm{\pi}^1 = \sigma_1 + \begin{pmatrix} + 0.2 & 0.4 & 0.4 & 0 + \end{pmatrix}, + && \bm{\pi}^2 = + \begin{pmatrix} + 0.25 & 0.25 & 0.25 & 0.25 + \end{pmatrix}, \\ + \bm{\rho}^1 = + \begin{pmatrix} + 0.25 & 0.25 & 0.25 & 0.25 + \end{pmatrix}, && + \bm{\rho}^2 = \sigma_2 + \begin{pmatrix} + 0 & 0.33 & 0.33 & 0.33 + \end{pmatrix}, && +\end{align*} with \(\eps[\alpha]\) taking nine equally spaced values +ranging from 0 to 0.24. For each value of \(\eps[\alpha]\), 108 datasets +(\(X_1, X_2\)) are simulated, resulting in \(9 \times 108 = 972\) +datasets. More precisely, for each dataset, we pick uniformly at random +two permutations of \(\{ 1, \dots , 4 \}\) (\(\sigma_1, \sigma_2\)) with +the constraint that \(\sigma_1(4) \neq \sigma_2(1)\). This ensures that +each of the two networks have a non-empty block that is empty in the +other one. Then the networks are simulated with +\(\mathcal{B}\)ern-\(BiSBM_{120}(4, \bm{\alpha}, \bm{\pi}^m, \bm{\rho}^m)\) +with the previous parameters. 
Each network has 2 blocks in common and +their connectivity structures encompass a mix of core-periphery, +assortative community and disassortative community structures, depending +on which 3 of the 4 blocks are selected for each network. +\(\eps[\alpha]\) represents the strength of these structures, the +larger, the easier it is to tell apart one block from another. + +\paragraph{Inference} + +We want to measure the quality of the inference procedure, for this we +use the inference described in the section +\ref{sec:variational-estimation-of-the-parameters}. + +\paragraph{Quality indicators} + +To assess the quality of the inference, we will use the following +indicators: + +\begin{itemize} + \item First, for each dataset, we put in competition $\pi\text{-}colBiSBM$ with + $sep\text{-}BiSBM$, $iid\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ + respectively. To do so, for each dataset, we compute the + BIC-L of each model $\pi\text{-}colBiSBM$ is preferred to $sep\text{-}BiSBM$ + (resp. $iid\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$) if + its BIC-L is greater. + \item When considering $\pi\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ we compare $\widehat{Q_1}$, $\widehat{Q_2}$ to + their true values. ($Q_1 = 4$ and $Q_2 = 4$) + \item Finally, we assess the quality of the node grouping by computing the + Adjusted Rand Index \parencite[][, ARI = 0 for a random grouping, ARI = 1 for a perfect recovery]{hubertComparingPartitions1985}. 
For each network, for the + $\pi\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ we compare the inferred block memberships to the + real ones by computing the mean of the ARI per axis over the two networks + \begin{equation*} + \overline{\text{ARI}}_d = \frac{1}{2} \text{ARI}\big( \text{ARI}(\widehat{\bm{Z}^1_d},\bm{Z}^1_d) + \text{ARI}(\widehat{\bm{Z}^2_d},\bm{Z}^2_d) \big) + \end{equation*} + where $d$ is the dimension or axis (i.e., rows, $d=1$, or columns, $d=2$) of + the block memberships. + And we compute the ARI of the whole set of nodes to account for block + pairing between networks + \begin{equation*} + \text{ARI}_d = \text{ARI}\big((\widehat{\bm{Z}^1_d},\widehat{\bm{Z}^2_d}),(\bm{Z}^1_d,\bm{Z}^2_d) \big) + \end{equation*} +\end{itemize} + +All these quality indicators are averaged over the 108 datasets. The +results are provided in the tables \ref{tab:per_model_sep} to +\ref{tab:per_model_pirho}. Each line corresponds to the 108 datasets for +a given value of value of \(\eps[\alpha]\). 
+ +\begin{table}[!h] + +\caption{\label{tab:per_model_table}\label{tab:per_model_sep}Quality metrics for sep-colBiSBM} +\centering +\begin{tabular}[t]{rllll} +\toprule +$\eps[\alpha]$ & $\overline{\text{ARI}}_{1}$ & $\overline{\text{ARI}}_{2}$ & $\text{ARI}_{1}$ & $\text{ARI}_{2}$\\ +\midrule +0.00 & 0 & 0 & 0 & 0\\ +0.03 & 0 & 0 & 0 & 0\\ +0.06 & 0.1 $\pm$ 0.01 & 0.08 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 0.05 $\pm$ 0.01\\ +0.09 & 0.71 $\pm$ 0.02 & 0.7 $\pm$ 0.01 & 0.37 $\pm$ 0.02 & 0.37 $\pm$ 0.02\\ +0.12 & 0.94 $\pm$ 0.01 & 0.93 $\pm$ 0.01 & 0.5 $\pm$ 0.02 & 0.49 $\pm$ 0.02\\ +\addlinespace +0.15 & 0.99 & 0.99 & 0.54 $\pm$ 0.02 & 0.49 $\pm$ 0.01\\ +0.18 & 0.99 & 0.99 & 0.52 $\pm$ 0.02 & 0.52 $\pm$ 0.02\\ +0.21 & 0.99 & 0.99 & 0.54 $\pm$ 0.02 & 0.52 $\pm$ 0.02\\ +0.24 & 1 & 1 & 0.55 $\pm$ 0.02 & 0.52 $\pm$ 0.02\\ +\bottomrule +\end{tabular} +\end{table} +\begin{table}[!h] + +\caption{\label{tab:per_model_table}\label{tab:per_model_iid}Quality metrics for iid-colBiSBM} +\centering +\begin{tabular}[t]{rllllll} +\toprule +$\eps[\alpha]$ & $\overline{\text{ARI}}_{1}$ & $\overline{\text{ARI}}_{2}$ & $\text{ARI}_{1}$ & $\text{ARI}_{2}$ & Recovered $Q_1$ & Recovered $Q_2$\\ +\midrule +0.00 & 0 & 0 & 0 & 0 & 1 & 1\\ +0.03 & 0 & 0 & 0 & 0 & 1 & 1\\ +0.06 & 0.08 $\pm$ 0.01 & 0.08 $\pm$ 0.01 & 0.08 $\pm$ 0.01 & 0.07 $\pm$ 0.01 & 1.4 $\pm$ 0.05 & 1.49 $\pm$ 0.05\\ +0.09 & 0.72 $\pm$ 0.01 & 0.71 $\pm$ 0.01 & 0.53 $\pm$ 0.02 & 0.52 $\pm$ 0.02 & 3.4 $\pm$ 0.06 & 3.41 $\pm$ 0.06\\ +0.12 & 0.94 & 0.93 & 0.75 $\pm$ 0.03 & 0.72 $\pm$ 0.03 & 4.06 $\pm$ 0.02 & 3.97 $\pm$ 0.02\\ +\addlinespace +0.15 & 0.98 & 0.98 & 0.77 $\pm$ 0.03 & 0.76 $\pm$ 0.03 & 4.11 $\pm$ 0.03 & 4.11 $\pm$ 0.03\\ +0.18 & 0.99 & 0.99 & 0.82 $\pm$ 0.03 & 0.82 $\pm$ 0.03 & 4.15 $\pm$ 0.04 & 4.13 $\pm$ 0.03\\ +0.21 & 0.99 & 0.99 & 0.8 $\pm$ 0.02 & 0.79 $\pm$ 0.03 & 4.35 $\pm$ 0.06 & 4.19 $\pm$ 0.04\\ +0.24 & 0.99 & 0.99 & 0.77 $\pm$ 0.03 & 0.77 $\pm$ 0.03 & 4.3 $\pm$ 0.06 & 4.43 $\pm$ 0.07\\ +\bottomrule +\end{tabular} 
+\end{table} +\begin{table}[!h] + +\caption{\label{tab:per_model_table}\label{tab:per_model_pi}Quality metrics for $\pi$-colBiSBM} +\centering +\begin{tabular}[t]{rllllll} +\toprule +$\eps[\alpha]$ & $\overline{\text{ARI}}_{1}$ & $\overline{\text{ARI}}_{2}$ & $\text{ARI}_{1}$ & $\text{ARI}_{2}$ & Recovered $Q_1$ & Recovered $Q_2$\\ +\midrule +0.00 & 0 & 0 & 0 & 0 & 1 & 1\\ +0.03 & 0 & 0 & 0 & 0 & 1.01 $\pm$ 0.01 & 1\\ +0.06 & 0.07 $\pm$ 0.01 & 0.08 $\pm$ 0.01 & 0.07 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 1.49 $\pm$ 0.05 & 1.5 $\pm$ 0.05\\ +0.09 & 0.73 $\pm$ 0.02 & 0.72 $\pm$ 0.01 & 0.56 $\pm$ 0.02 & 0.53 $\pm$ 0.02 & 3.78 $\pm$ 0.07 & 3.37 $\pm$ 0.07\\ +0.12 & 0.96 & 0.93 & 0.79 $\pm$ 0.02 & 0.74 $\pm$ 0.03 & 4.46 $\pm$ 0.07 & 3.95 $\pm$ 0.02\\ +\addlinespace +0.15 & 0.99 & 0.97 & 0.82 $\pm$ 0.02 & 0.76 $\pm$ 0.03 & 4.62 $\pm$ 0.08 & 4\\ +0.18 & 1 & 0.98 & 0.83 $\pm$ 0.02 & 0.79 $\pm$ 0.03 & 4.65 $\pm$ 0.09 & 4\\ +0.21 & 1 & 0.98 & 0.84 $\pm$ 0.02 & 0.79 $\pm$ 0.03 & 4.69 $\pm$ 0.1 & 4\\ +0.24 & 1 & 0.99 & 0.86 $\pm$ 0.02 & 0.79 $\pm$ 0.03 & 4.74 $\pm$ 0.11 & 4.01 $\pm$ 0.01\\ +\bottomrule +\end{tabular} +\end{table} +\begin{table}[!h] + +\caption{\label{tab:per_model_table}\label{tab:per_model_rho}Quality metrics for $\rho$-colBiSBM} +\centering +\begin{tabular}[t]{rllllll} +\toprule +$\eps[\alpha]$ & $\overline{\text{ARI}}_{1}$ & $\overline{\text{ARI}}_{2}$ & $\text{ARI}_{1}$ & $\text{ARI}_{2}$ & Recovered $Q_1$ & Recovered $Q_2$\\ +\midrule +0.00 & 0 & 0 & 0 & 0 & 1 & 1\\ +0.03 & 0 & 0 & 0 & 0 & 1.01 $\pm$ 0.01 & 1.01 $\pm$ 0.01\\ +0.06 & 0.08 $\pm$ 0.01 & 0.08 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 0.07 $\pm$ 0.01 & 1.39 $\pm$ 0.05 & 1.6 $\pm$ 0.06\\ +0.09 & 0.72 $\pm$ 0.01 & 0.72 $\pm$ 0.01 & 0.53 $\pm$ 0.02 & 0.54 $\pm$ 0.02 & 3.39 $\pm$ 0.07 & 3.74 $\pm$ 0.07\\ +0.12 & 0.93 & 0.95 & 0.71 $\pm$ 0.03 & 0.75 $\pm$ 0.02 & 3.95 $\pm$ 0.02 & 4.5 $\pm$ 0.07\\ +\addlinespace +0.15 & 0.97 & 0.99 & 0.78 $\pm$ 0.03 & 0.81 $\pm$ 0.02 & 4 & 4.49 $\pm$ 0.07\\ +0.18 & 0.98 & 1 & 0.76 
$\pm$ 0.03 & 0.81 $\pm$ 0.02 & 4.01 $\pm$ 0.01 & 4.71 $\pm$ 0.09\\ +0.21 & 0.98 & 1 & 0.76 $\pm$ 0.03 & 0.81 $\pm$ 0.02 & 4.03 $\pm$ 0.02 & 4.72 $\pm$ 0.09\\ +0.24 & 0.98 & 1 & 0.74 $\pm$ 0.03 & 0.8 $\pm$ 0.02 & 4.06 $\pm$ 0.02 & 4.8 $\pm$ 0.1\\ +\bottomrule +\end{tabular} +\end{table} +\begin{table}[!h] + +\caption{\label{tab:per_model_table}\label{tab:per_model_pirho}Quality metrics for $\pi\rho$-colBiSBM} +\centering +\begin{tabular}[t]{rllllll} +\toprule +$\eps[\alpha]$ & $\overline{\text{ARI}}_{1}$ & $\overline{\text{ARI}}_{2}$ & $\text{ARI}_{1}$ & $\text{ARI}_{2}$ & Recovered $Q_1$ & Recovered $Q_2$\\ +\midrule +0.00 & 0 & 0 & 0 & 0 & 1 & 1\\ +0.03 & 0 & 0 & 0 & 0 & 1.01 $\pm$ 0.01 & 1.01 $\pm$ 0.01\\ +0.06 & 0.07 $\pm$ 0.01 & 0.07 $\pm$ 0.01 & 0.07 $\pm$ 0.01 & 0.06 $\pm$ 0.01 & 1.48 $\pm$ 0.05 & 1.57 $\pm$ 0.06\\ +0.09 & 0.74 $\pm$ 0.01 & 0.73 $\pm$ 0.01 & 0.56 $\pm$ 0.03 & 0.55 $\pm$ 0.02 & 3.69 $\pm$ 0.06 & 3.66 $\pm$ 0.06\\ +0.12 & 0.96 $\pm$ 0.01 & 0.95 $\pm$ 0.01 & 0.73 $\pm$ 0.03 & 0.73 $\pm$ 0.03 & 4.31 $\pm$ 0.05 & 4.26 $\pm$ 0.05\\ +\addlinespace +0.15 & 0.99 & 0.99 & 0.79 $\pm$ 0.02 & 0.78 $\pm$ 0.03 & 4.31 $\pm$ 0.05 & 4.35 $\pm$ 0.05\\ +0.18 & 1 & 1 & 0.83 $\pm$ 0.02 & 0.83 $\pm$ 0.02 & 4.31 $\pm$ 0.05 & 4.25 $\pm$ 0.04\\ +0.21 & 1 & 1 & 0.77 $\pm$ 0.03 & 0.77 $\pm$ 0.03 & 4.42 $\pm$ 0.05 & 4.34 $\pm$ 0.05\\ +0.24 & 1 & 1 & 0.82 $\pm$ 0.02 & 0.82 $\pm$ 0.02 & 4.25 $\pm$ 0.04 & 4.31 $\pm$ 0.05\\ +\bottomrule +\end{tabular} +\end{table} + +\paragraph{Results} + +For the model comparison, when \(\eps[\alpha]\) is small +(\(\eps[\alpha]\in[0, .04]\)), the simulation model is close to the +Erd\H{o}s-Reńyi network and it is very hard to find any structure beyond +the one of a single block on each dimension. 
diff --git a/Rcodes/simulation/model_selection_analyze.Rmd b/Rcodes/simulation/model_selection_analyze.Rmd
index 564e373..d21af4a 100644
--- a/Rcodes/simulation/model_selection_analyze.Rmd
+++ b/Rcodes/simulation/model_selection_analyze.Rmd
@@ -1,11 +1,18 @@
 ```{r libraries, echo = FALSE, include = FALSE}
 require("ggplot2")
+require("knitr")
+# require("kableExtra")
 require("tidyr")
 require("dplyr")
 require("patchwork")
 require("latex2exp")
 ```
 
+```{r setup, echo = FALSE, include= FALSE}
+options(knitr.table.format = "latex") # option read by kable() for its default output format
+```
+
+
 ```{r import-data, echo = FALSE}
 filenames <- list.files(
   path = "./data/",
@@ -113,7 +120,7 @@ Finally, when $\eps[\pi] > 0$ or $\bm{\pi}^1 \neq \bm{\pi}^2$ and
 $\eps[\rho] > 0$ or $\bm{\rho}^1 \neq \bm{\rho}^2$, the model is a
 $\pi\rho\text{-}colBiSBM$.
 
-```{r tables, echo = FALSE}
+```{r tables, echo = FALSE, results='asis'}
 kable(
   (model_comparison_eps_pi %>%
     select(-one_of("n")) %>%
@@ -121,23 +128,28 @@ kable(
       names_from = preferred_model,
       values_from = prop_model,
       values_fill = 0
-    ) %>% group_by(epsilon_pi) %>%
-    summarise(rec_Q1 = mean(rec_Q1),
-    iid = sum(iid),
-    pi = sum(pi),
-    rho = sum(rho),
-    pirho = sum(pirho)))[,c(1,3:6, 2)],
+    ) %>% group_by(epsilon_pi) %>%
+    summarise(
+      rec_Q1 = mean(rec_Q1),
+      iid = sum(iid),
+      pi = sum(pi),
+      rho = sum(rho),
+      pirho = sum(pirho)
+    ))[, c(1, 3:6, 2)],
   digits = 2,
   col.names = c(
     "$\\eps[\\pi]$",
-    "$iid\\text{-}colBiSBM$",
+    "$iid\\text{-}colBiSBM$ ",
     "$\\pi\\text{-}colBiSBM$",
     "$\\rho\\text{-}colBiSBM$",
     "$\\pi\\rho\\text{-}colBiSBM$", "Recovered $Q_1$"
   ),
   align = "lcccc",
+  booktab = TRUE,
+  position = "!h",
+  escape = FALSE,
   caption = "\\label{tab:pi-model-sel}Model selection for varying $\\pi$ mixture parameters"
-)
+) %>% kableExtra::add_header_above(c(" " = 1, "Proportions of model selection" = 4, "Blocks" = 1))
 kable(
   (model_comparison_eps_rho %>%
     select(-one_of("n")) %>%
@@ -154,14 +166,17 @@ kable(
   digits = 2,
   col.names = c(
     "$\\eps[\\rho]$",
-
"$iid\\text{-}colBiSBM$", + "$iid\\text{-}colBiSBM$ ", "$\\pi\\text{-}colBiSBM$", "$\\rho\\text{-}colBiSBM$", "$\\pi\\rho\\text{-}colBiSBM$", "Recovered $Q_2$" ), align = "lcccc", + booktab = TRUE, + position = "!h", + escape = FALSE, caption = "\\label{tab:rho-model-sel}Model selection for varying $\\rho$ mixture parameters" -) +) %>% kableExtra::add_header_above(c(" " = 1, "Proportions of model selection" = 4, "Blocks" = 1)) ``` \begin{figure}[H] @@ -170,7 +185,7 @@ kable( \label{fig:pref_model_func_eps} \end{figure} -On the figure \ref{fig:pref_model_func_eps} and tables \ref{tab:pi-model-sel} +\paragraph{Results:}On the figure \ref{fig:pref_model_func_eps} and tables \ref{tab:pi-model-sel} and \ref{tab:rho-model-sel}, one can see that there is a turning point around $\eps[\pi] = 0.2$ (resp. $\eps[\rho] = 0.2$), before which $iid\text{-}colBiSBM$ @@ -180,5 +195,5 @@ $\rho\text{-}colBiSBM$) and $\pi\rho\text{-}colBiSBM$ gets more and more selected, highlighting our capacity to recover the simulated structure. -Please note that when "Recovered $Q_1$(or $Q_2$)" is not an integer it's because +\paragraph*{Remark:} Please note that when "Recovered $Q_1$(or $Q_2$)" is not an integer it's because some procedures returned a value other than 3. \ No newline at end of file diff --git a/Rcodes/simulation/model_selection_analyze.tex b/Rcodes/simulation/model_selection_analyze.tex index 70ad597..754b600 100644 --- a/Rcodes/simulation/model_selection_analyze.tex +++ b/Rcodes/simulation/model_selection_analyze.tex @@ -49,315 +49,53 @@ is a \(\rho\text{-}colBiSBM\). Finally, when \(\eps[\pi] > 0\) or \(\bm{\rho}^1 \neq \bm{\rho}^2\), the model is a \(\pi\rho\text{-}colBiSBM\). 
-\begin{longtable}[]{@{}lccccl@{}} -\caption{\label{tab:pi-model-sel}Model selection for varying \(\pi\) -mixture parameters}\tabularnewline -\toprule -\begin{minipage}[b]{0.08\columnwidth}\raggedright -\(\eps[\pi]\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(iid\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(\pi\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\centering -\(\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.18\columnwidth}\centering -\(\pi\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.11\columnwidth}\raggedright -Recovered \(Q_1\)\strut -\end{minipage}\tabularnewline -\midrule -\endfirsthead -\toprule -\begin{minipage}[b]{0.08\columnwidth}\raggedright -\(\eps[\pi]\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(iid\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(\pi\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\centering -\(\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.18\columnwidth}\centering -\(\pi\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.11\columnwidth}\raggedright -Recovered \(Q_1\)\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.65\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.35\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.04\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering 
-0.66\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.34\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.07\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.64\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.34\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.11\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.63\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.03\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.31\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.03\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.14\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.55\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.12\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.28\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.05\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.18\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.39\strut -\end{minipage} & 
\begin{minipage}[t]{0.15\columnwidth}\centering -0.26\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.21\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.13\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.21\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.23\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.42\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.13\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.23\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.25\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.10\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.56\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.05\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.29\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.02\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.08\columnwidth}\raggedright -0.28\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.65\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.33\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\bottomrule -\end{longtable} +\begin{table}[!h] -\begin{longtable}[]{@{}lccccl@{}} -\caption{\label{tab:rho-model-sel}Model selection for varying \(\rho\) -mixture parameters}\tabularnewline 
+\caption{\label{tab:tables}\label{tab:pi-model-sel}Model selection for varying $\pi$ mixture parameters} +\centering +\begin{tabular}[t]{lccccl} \toprule -\begin{minipage}[b]{0.09\columnwidth}\raggedright -\(\eps[\rho]\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(iid\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(\pi\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\centering -\(\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.18\columnwidth}\centering -\(\pi\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.11\columnwidth}\raggedright -Recovered \(Q_2\)\strut -\end{minipage}\tabularnewline +\multicolumn{1}{c}{ } & \multicolumn{4}{c}{Proportions of model selection} & \multicolumn{1}{c}{Blocks} \\ +\cmidrule(l{3pt}r{3pt}){2-5} \cmidrule(l{3pt}r{3pt}){6-6} +$\eps[\pi]$ & $iid\text{-}colBiSBM$ & $\pi\text{-}colBiSBM$ & $\rho\text{-}colBiSBM$ & $\pi\rho\text{-}colBiSBM$ & Recovered $Q_1$\\ \midrule -\endfirsthead -\toprule -\begin{minipage}[b]{0.09\columnwidth}\raggedright -\(\eps[\rho]\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(iid\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.15\columnwidth}\centering -\(\pi\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\centering -\(\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.18\columnwidth}\centering -\(\pi\rho\text{-}colBiSBM\)\strut -\end{minipage} & \begin{minipage}[b]{0.11\columnwidth}\raggedright -Recovered \(Q_2\)\strut -\end{minipage}\tabularnewline -\midrule -\endhead -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.63\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.37\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.00\strut 
-\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.04\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.65\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.34\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.00\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.07\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.64\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.33\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.11\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.64\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.31\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.03\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.02\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.14\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.53\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.29\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.11\strut -\end{minipage} & 
\begin{minipage}[t]{0.18\columnwidth}\centering -0.06\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.18\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.42\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.20\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.24\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.14\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.21\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.25\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.12\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.40\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.22\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.25\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.08\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.06\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.58\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering -0.29\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.01\strut -\end{minipage}\tabularnewline -\begin{minipage}[t]{0.09\columnwidth}\raggedright -0.28\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.15\columnwidth}\centering -0.01\strut -\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\centering -0.65\strut -\end{minipage} & \begin{minipage}[t]{0.18\columnwidth}\centering 
-0.32\strut -\end{minipage} & \begin{minipage}[t]{0.11\columnwidth}\raggedright -3.00\strut -\end{minipage}\tabularnewline +0.00 & 0.65 & 0.00 & 0.35 & 0.00 & 3.00\\ +0.04 & 0.66 & 0.00 & 0.34 & 0.00 & 3.00\\ +0.07 & 0.64 & 0.01 & 0.34 & 0.01 & 3.01\\ +0.11 & 0.63 & 0.03 & 0.31 & 0.03 & 3.01\\ +0.14 & 0.55 & 0.12 & 0.28 & 0.05 & 3.00\\ +\addlinespace +0.18 & 0.39 & 0.26 & 0.21 & 0.13 & 3.01\\ +0.21 & 0.23 & 0.42 & 0.13 & 0.23 & 3.01\\ +0.25 & 0.10 & 0.56 & 0.05 & 0.29 & 3.02\\ +0.28 & 0.01 & 0.65 & 0.01 & 0.33 & 3.01\\ \bottomrule -\end{longtable} +\end{tabular} +\end{table} + +\begin{table}[!h] + +\caption{\label{tab:tables}\label{tab:rho-model-sel}Model selection for varying $\rho$ mixture parameters} +\centering +\begin{tabular}[t]{lccccl} +\toprule +\multicolumn{1}{c}{ } & \multicolumn{4}{c}{Proportions of model selection} & \multicolumn{1}{c}{Blocks} \\ +\cmidrule(l{3pt}r{3pt}){2-5} \cmidrule(l{3pt}r{3pt}){6-6} +$\eps[\rho]$ & $iid\text{-}colBiSBM$ & $\pi\text{-}colBiSBM$ & $\rho\text{-}colBiSBM$ & $\pi\rho\text{-}colBiSBM$ & Recovered $Q_2$\\ +\midrule +0.00 & 0.63 & 0.37 & 0.00 & 0.00 & 3.00\\ +0.04 & 0.65 & 0.34 & 0.00 & 0.01 & 3.00\\ +0.07 & 0.64 & 0.33 & 0.01 & 0.01 & 3.00\\ +0.11 & 0.64 & 0.31 & 0.03 & 0.02 & 3.00\\ +0.14 & 0.53 & 0.29 & 0.11 & 0.06 & 3.00\\ +\addlinespace +0.18 & 0.42 & 0.20 & 0.24 & 0.14 & 3.01\\ +0.21 & 0.25 & 0.12 & 0.40 & 0.22 & 3.01\\ +0.25 & 0.08 & 0.06 & 0.58 & 0.29 & 3.01\\ +0.28 & 0.01 & 0.01 & 0.65 & 0.32 & 3.00\\ +\bottomrule +\end{tabular} +\end{table} \begin{figure}[H] \includegraphics{./Rcodes/simulation/img/plot_model_function_eps.png} @@ -365,6 +103,8 @@ Recovered \(Q_2\)\strut \label{fig:pref_model_func_eps} \end{figure} +\paragraph{Results:} + On the figure \ref{fig:pref_model_func_eps} and tables \ref{tab:pi-model-sel} and \ref{tab:rho-model-sel}, one can see that there is a turning point around \(\eps[\pi] = 0.2\) (resp. @@ -375,5 +115,7 @@ most of the times and after \(0.2\) the \(\pi\text{-}colBiSBM\) (resp. 
more selected, highlighting our capacity to recover the simulated structure. +\paragraph*{Remark:} + Please note that when ``Recovered \(Q_1\)(or \(Q_2\))'' is not an integer it's because some procedures returned a value other than 3. diff --git a/Rcodes/simulation/netclustering_analyze.R b/Rcodes/simulation/netclustering_analyze.R index 70638a9..645df0e 100644 --- a/Rcodes/simulation/netclustering_analyze.R +++ b/Rcodes/simulation/netclustering_analyze.R @@ -1,9 +1,8 @@ require("ggplot2") require("tictoc") +require("colSBM") -devtools::load_all("R/") - -result_clustering <- readRDS("simulation/data/simulated_collection_clustering_rho_10-05-23-14:40:46.Rds") +result_clustering <- readRDS("./Rcodes/simulation/data/simulated") list_clustering <- lapply( seq_along(result_clustering), function(s) result_clustering[[s]]$list_of_clusterings @@ -13,7 +12,7 @@ list_best_partition <- lapply( seq_along(list_clustering), function(s) { list( epsilon = result_clustering[[s]]$epsilon, - best_partition = extract_bipartite_best_partition(list_clustering[[s]]) + best_partition = unlist(extract_best_bipartite_partition(list_clustering[[s]])) ) } ) diff --git a/Rcodes/simulation/netclustering_analyze.Rmd b/Rcodes/simulation/netclustering_analyze.Rmd new file mode 100644 index 0000000..3e5f15b --- /dev/null +++ b/Rcodes/simulation/netclustering_analyze.Rmd @@ -0,0 +1,19 @@ +```{r libraries, echo = FALSE, include = FALSE} +require("ggplot2") +require("tidyr") +require("dplyr") +require("patchwork") +require("latex2exp") +``` + +\section{Network clustering of simulated networks}\label{sec:network-clustering-of-simulated-networks} + +```{r impoting-data, echo = FALSE} +filenames <- list.files( + path = "./data/", + pattern = "simulated_collection_clustering_*", + full.names = TRUE +) +# data_list <- lapply(filenames, function(file) lapply(readRDS(file), function(model) model$list_clustering)) +data_list <- lapply(filenames, readRDS) +``` \ No newline at end of file diff --git 
a/Rcodes/simulation/netclustering_analyze.tex b/Rcodes/simulation/netclustering_analyze.tex new file mode 100644 index 0000000..290de08 --- /dev/null +++ b/Rcodes/simulation/netclustering_analyze.tex @@ -0,0 +1 @@ +\section{Network clustering of simulated networks}\label{sec:network-clustering-of-simulated-networks} diff --git a/Rmd2Latex-fragment.R b/Rmd2Latex-fragment.R index ed044e9..ac13366 100644 --- a/Rmd2Latex-fragment.R +++ b/Rmd2Latex-fragment.R @@ -1,11 +1,14 @@ -#!/usr/bin/env Rscript -require("knitr") +#!/usr/bin/Rscript + +print(getwd()) + +options(knitr.table.format = "latex") create_latex <- function(f) { knitr::knit(f, "/tmp/tmp-outputfile.md") newname <- paste0(tools::file_path_sans_ext(f), ".tex") - mess <- paste("pandoc --extract-media=./img -f markdown -t latex -p -o", shQuote(newname), "/tmp/tmp-outputfile.md") + mess <- paste("pandoc --extract-media=./img -f markdown -t latex -p /tmp/tmp-outputfile.md -o", shQuote(newname)) system(mess) } args <- commandArgs(trailingOnly = TRUE) -create_latex(args) \ No newline at end of file +create_latex(unlist(args)) diff --git a/img/plot_model_function_eps.png b/img/plot_model_function_eps.png index 46afb58..b11b813 100644 Binary files a/img/plot_model_function_eps.png and b/img/plot_model_function_eps.png differ diff --git a/presentation_UMR.tex b/presentation_UMR.tex index c2c8dd6..509261d 100644 --- a/presentation_UMR.tex +++ b/presentation_UMR.tex @@ -69,4 +69,6 @@ nouvelles problématiques passionnantes. De plus j'ai beaucoup progressé dans les domaines abordés pendant mon stage, et cela m'a rendu confiant dans le choix de faire le master \emph{MathSV} pour l'année scolaire 2023-2024. Ce stage a donc été -déterminant et confirme l'orientation de mon parcours professionnel. \ No newline at end of file +déterminant et confirme l'orientation de mon parcours professionnel. + +\paragraph*{Note} La suite de ce rapport a été rédigée en anglais. 
\ No newline at end of file diff --git a/rapport.pdf b/rapport.pdf index 14d7066..7b4f9db 100644 Binary files a/rapport.pdf and b/rapport.pdf differ diff --git a/rapport.tex b/rapport.tex index ee8c78e..e623056 100644 --- a/rapport.tex +++ b/rapport.tex @@ -21,6 +21,7 @@ \usepackage{rotating} % For allowing to rotate figures \usepackage{svg} % To allow svg inclusions \usepackage{float} % To allow Pandoc to control figure placement +% \usepackage{booktabs} % For good tables %% Bibliography \usepackage[style=apa,citestyle=authoryear-comp]{biblatex} @@ -272,23 +273,25 @@ This model supposes that: \label{fig:LBMvisu} \end{figure} -Parameters -% TODO fix parameters according to presentation \begin{itemize} - \item $Q_1 = \{{\color{blueind}\bullet},{\color{cyanind}\bullet},{\color{electricblue}\bullet}\}$ blocks in rows - \item $Q_2 = \{{\color{burntorange}\bullet},{\color{goldenyellow}\bullet},{\color{yellow}\bullet}\}$ blocks in columns - \item $\pi_{\bullet} = \mathbb{P}(i\in\bullet)$ in row and $\rho_{\bullet} = \mathbb{P}(j\in\bullet)$ in column - \item $\alpha_{{\color{blueind}\bullet}{\color{burntorange}\bullet}} = \mathbb{P}(i \leftrightarrow j | i \in {\color{blueind}\bullet}, j \in {\color{burntorange}\bullet})$ connectivity probability between two nodes, given their clustering + \item $Q_1 = |\{{\color{blueind}\bullet},{\color{cyanind}\bullet},{\color{electricblue}\bullet}\}|$ \emph{given} blocks in rows + \item $Q_2 = |\{{\color{burntorange}\bullet},{\color{goldenyellow}\bullet},{\color{yellow}\bullet}\}|$ \emph{given} blocks in columns +\end{itemize} +Parameters +\begin{itemize} + \item $\pi_{\bullet} = \mathbb{P}(Z_i = \bullet)$ for rows and $\rho_{\bullet} = \mathbb{P}(W_j = \bullet)$ for columns + \item $\alpha_{{\color{blueind}\bullet}{\color{burntorange}\bullet}} = \mathbb{P}(X_{ij} = 1 | Z_i = {\color{blueind}\bullet}, W_j = {\color{burntorange}\bullet})$, probability of connectivity knowing node membership blocks. 
\end{itemize} -On \ref{fig:LBMvisu}, $\pi$ are the probabilities for a row node to belong to -the row block of corresponding color, $\rho$ are the probabilities for a column -node to belong to the column block of corresponding color and $\alpha$ are the +On \ref{fig:LBMvisu}, $\bm{\pi}$ are the probabilities for a row node to belong to +the row block of corresponding color, $\bm{\rho}$ are the probabilities for a column +node to belong to the column block of corresponding color and $\bm{\alpha}$ are the connectivity parameters between the row and column blocks. This model can be used to easily generate bipartite graphs with complex and very varied structures. But when trying to determine the structure of a given network -we need to find those parameters. +we need to find those parameters, and the row and column block memberships are +\emph{latent}, i.e.,\ they are not known and must be inferred. For this a common approach is to use a VEM algorithm (proposed for SBM in ~\cite{daudinMixtureModelRandom2008} and for LBM in ~\cite{govaertEMAlgorithmBlock2005}) @@ -316,7 +319,7 @@ We define a collection of bipartite networks as $\bm{X} = (X^1, \dots, X^M)$ the collection of incidence matrix. Moreover, all the networks in the collection have the same type of interaction (e.g., all interactions are binary). -\section{Separate BiSBM (sepBiSBM)}\label{sec:separate-bisbm-sepbisbm} +\section{Separate BiSBM (sep-BiSBM)}\label{sec:separate-bisbm-sepbisbm} A first approach to deal with a collection of networks is to adjust separate BiSBM for each network of the collection. @@ -531,9 +534,6 @@ $\bm{\tau}$. \\ % \mathcal{J}(\mathcal{R};\bm{\theta}) \coloneqq \mathbb{E}_{\mathcal{R}}[\ell(\bm{X},\bm{Z},\bm{W};\bm{\theta})] + \mathcal{H}(\bm{Z,W}) \leq \ell(\bm{X};\bm{\theta}) % \end{equation*} - -% TODO Develop the formula - The VEM algorithm alternates between two steps, the variational E step and the M step. 
The E step consists in optimizing $\mathcal{J}(\bm{\tau};\bm{\theta})$ for a current value of $\bm{\theta}$ with respect to $\bm{\tau}$. And the M step @@ -1060,8 +1060,9 @@ We illustrate our capacity to perform a partition of a collection for all colBiSBM models in \ref{sec:network-clustering-of-simulated-networks}. \chapter{Simulation studies}\label{chap:simulation-studies} +\include{Rcodes/simulation/inference_analyze} \include{Rcodes/simulation/model_selection_analyze} -\section{Network clustering of simulated networks}\label{sec:network-clustering-of-simulated-networks} +\include{Rcodes/simulation/netclustering_analyze} \chapter{Applications} \include{Rcodes/real_data/application_dore_data} diff --git a/references.bib b/references.bib index e4f855d..f017e93 100644 --- a/references.bib +++ b/references.bib @@ -198,6 +198,25 @@ file = {/home/polarolouis/Zotero/storage/6F8YT8AD/holland1983.pdf.pdf;/home/polarolouis/Zotero/storage/7DSZ3KD9/Holland et al. - 1983 - Stochastic blockmodels First steps.pdf;/home/polarolouis/Zotero/storage/DUL2RV8Q/holland1983.pdf.pdf;/home/polarolouis/Zotero/storage/G9KZBG9W/0378873383900217.html} } +@article{hubertComparingPartitions1985, + title = {Comparing Partitions}, + author = {Hubert, Lawrence and Arabie, Phipps}, + date = {1985-12-01}, + journaltitle = {Journal of Classification}, + shortjournal = {Journal of Classification}, + volume = {2}, + number = {1}, + pages = {193--218}, + issn = {1432-1343}, + doi = {10.1007/BF01908075}, + url = {https://doi.org/10.1007/BF01908075}, + urldate = {2023-07-04}, + abstract = {The problem of comparing two different partitions of a finite set of objects reappears continually in the clustering literature. 
We begin by reviewing a well-known measure of partition correspondence often attributed to Rand (1971), discuss the issue of correcting this index for chance, and note that a recent normalization strategy developed by Morey and Agresti (1984) and adopted by others (e.g., Miligan and Cooper 1985) is based on an incorrect assumption. Then, the general problem of comparing partitions is approached indirectly by assessing the congruence of two proximity matrices using a simple cross-product measure. They are generated from corresponding partitions using various scoring rules. Special cases derivable include traditionally familiar statistics and/or ones tailored to weight certain object pairs differentially. Finally, we propose a measure based on the comparison of object triples having the advantage of a probabilistic interpretation in addition to being corrected for chance (i.e., assuming a constant value under a reasonable null hypothesis) and bounded between ±1.}, + langid = {english}, + keywords = {Consensus indices,Measures of agreement,Measures of association}, + file = {/home/polarolouis/Zotero/storage/7TKW7HEM/Hubert et Arabie - 1985 - Comparing partitions.pdf} +} + @article{kaszewska-gilasGlobalStudiesHostParasite2021, title = {Global {{Studies}} of the {{Host-Parasite Relationships}} between {{Ectoparasitic Mites}} of the {{Family Syringophilidae}} and {{Birds}} of the {{Order Columbiformes}}}, author = {Kaszewska-Gilas, Katarzyna and Kosicki, Jakub Ziemowit and Hromada, Martin and Skoracki, Maciej},