diff --git a/Rcodes/simulation/inference_analyze.Rnw b/Rcodes/simulation/inference_analyze.Rnw new file mode 100644 index 0000000..bec3191 --- /dev/null +++ b/Rcodes/simulation/inference_analyze.Rnw @@ -0,0 +1,264 @@ +<>= +require("ggplot2") +require("ggokabeito") +require("tidyr") +require("dplyr") +require("stringr") +require("knitr") +require("pander") +require("patchwork") +require("latex2exp") +@ + + +<>= +options(dplyr.summarise.inform = FALSE) +knitr::opts_knit$set(kable.force.latex = TRUE) + +meanse <- function(x, ...) { + mean1 <- signif(round(mean(x, na.rm = T), 2), 5) # calculate mean and round + se1 <- signif(round(sd(x, na.rm = T) / sqrt(sum(!is.na(x))), 2), 2) # std error - round adding zeros + out <- paste(mean1, "$\\pm$", se1) # paste together mean plus/minus and standard error + if (str_detect(out, "NA")) { + out <- "NA" + } # if missing do not add plusminus + if (se1 == 0) { + out <- paste(mean1) + } + return(out) +} +@ + +<>= +filenames <- list.files( + path = "./data/", + pattern = "inference_testing_2023-07-*", + full.names = TRUE +) +data_list <- lapply(filenames, readRDS) +col_id_BICLS <- c(11, 16, 23, 30, 37) +result_data_frame <- dplyr::bind_rows(data_list) + +# Compute the preferred model +result_data_frame <- cbind(result_data_frame, preferred_model = sapply(seq_len(nrow(result_data_frame)), function(n) sub("_BICL", "", names(which.max(result_data_frame[n, col_id_BICLS]))))) +result_data_frame$preferred_model <- factor(result_data_frame$preferred_model, levels = c( + "sep", "iid", "pi", + "rho", "pirho" +)) +@ + +# Efficiency of the inference + +\paragraph{Simulation settings} For this simulation the data is simulated with +$M = 2, n_{1}^{m} = 120, n_{2}^{m} = 120, Q_1 = Q_2 = 4$, $\bm{\alpha}, \bm{\pi}$ +and $\bm{\rho}$ are set as follows: +\begin{align*} + &&\bm{\alpha} = .25 + + \begin{pmatrix} + 3 \eps[\alpha] & 2 \eps[\alpha] & \eps[\alpha] & - \eps[\alpha]\\ + 2 \eps[\alpha] & 2 \eps[\alpha] & - \eps[\alpha] & \eps[\alpha]\\ + \eps[\alpha] & - \eps[\alpha] & \eps[\alpha] & 2 \eps[\alpha]\\ + - \eps[\alpha] & \eps[\alpha] & 2 \eps[\alpha] & 0 + \end{pmatrix}, \\ \bm{\pi}^1 = \sigma_1 + \begin{pmatrix} + 0.2 & 0.4 & 0.4 & 0 + \end{pmatrix}, + && \bm{\pi}^2 = + \begin{pmatrix} + 0.25 & 0.25 & 0.25 & 0.25 + \end{pmatrix}, \\ + \bm{\rho}^1 = + \begin{pmatrix} + 0.25 & 0.25 & 0.25 & 0.25 + \end{pmatrix}, && + \bm{\rho}^2 = \sigma_2 + \begin{pmatrix} + 0 & 0.33 & 0.33 & 0.33 + \end{pmatrix}, && +\end{align*} +with $\eps[\alpha]$ taking nine equally spaced values ranging from 0 to 0.24. +For each value of $\eps[\alpha]$, 108 datasets ($X_1, X_2$) are simulated, +resulting in $9 \times 108 = 972$ datasets. More precisely, for each dataset, +we pick uniformly at random two permutations of $\{ 1, \dots , 4 \}$ +($\sigma_1, \sigma_2$) with the constraint that $\sigma_1(4) \neq \sigma_2(1)$. +This ensures that each of the two networks have a non-empty block that is empty +in the other one. Then the networks are simulated with +$\mathcal{B}$ern-$BiSBM_{120}(4, \bm{\alpha}, \bm{\pi}^m, \bm{\rho}^m)$ +with the previous parameters. Each network has 2 blocks in common and their +connectivity structures encompass a mix of core-periphery, assortative +community and disassortative community structures, depending on which 3 of the 4 +blocks are selected for each network. $\eps[\alpha]$ represents the strength of +these structures, the larger, the easier it is to tell apart one block from +another. +The true model of all the simulation is a $\pi\rho\text{-}colBiSBM$. + +\paragraph{Inference} We want to measure the quality of the +inference procedure, for this we use the inference described in the section +\ref{sec:variational-estimation-of-the-parameters}. + +\paragraph{Quality indicators} To assess the quality of the inference, we will +use the following indicators: +\begin{itemize} + \item First, for each dataset, we put in competition $\pi\text{-}colBiSBM$ with + $sep\text{-}BiSBM$, $iid\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ + respectively. To do so, for each dataset, we compute the + BIC-L of each model $\pi\text{-}colBiSBM$ is preferred to $sep\text{-}BiSBM$ + (resp. $iid\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$) if + its BIC-L is greater. + \item When considering $\pi\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ we compare $\widehat{Q_1}$, $\widehat{Q_2}$ to + their true values. ($Q_1 = 4$ and $Q_2 = 4$) + \item Finally, we assess the quality of the node grouping by computing the + Adjusted Rand Index \parencite[][, ARI = 0 for a random grouping, ARI = 1 for a perfect recovery]{hubertComparingPartitions1985}. For each network, for the + $\pi\text{-}colBiSBM$, $\rho\text{-}colBiSBM$, + $\pi\rho\text{-}colBiSBM$ we compare the inferred block memberships to the + real ones by computing the mean of the ARI per axis over the two networks + \begin{equation*} + \overline{\text{ARI}}_d = \frac{1}{2} \text{ARI}\big( \text{ARI}(\widehat{\bm{Z}^1_d},\bm{Z}^1_d) + \text{ARI}(\widehat{\bm{Z}^2_d},\bm{Z}^2_d) \big) + \end{equation*} + where $d$ is the dimension or axis (i.e., rows, $d=1$, or columns, $d=2$) of + the block memberships. + And we compute the ARI of the whole set of nodes to account for block + pairing between networks + \begin{equation*} + \text{ARI}_d = \text{ARI}\big((\widehat{\bm{Z}^1_d},\widehat{\bm{Z}^2_d}),(\bm{Z}^1_d,\bm{Z}^2_d) \big) + \end{equation*} +\end{itemize} + +All these quality indicators are averaged over the 108 datasets. The results are +provided in the tables \ref{tab:per_model_sep} to \ref{tab:per_model_pirho}. Each line corresponds to the +108 datasets for a given value of value of $\eps[\alpha]$. + +<>= +averaged_data <- result_data_frame %>% + group_by(epsilon_alpha) %>% + summarise(across(-preferred_model, list("avrg" = meanse))) %>% + select(-c(2:10)) +averaged_data <- averaged_data %>% + select(which(!grepl("*_BICL_*", colnames(averaged_data)), + arr.ind = TRUE)) +@ + +<>= +dataframe_per_model <- function(model) { + averaged_data %>% + select(epsilon_alpha, starts_with(paste0(model, "_"))) +} +@ + +\tiny +<>= +for (model in c("sep", "iid", "pi", "rho", "pirho")) { + kable_colnames <- c( + "$\\eps[\\alpha]$", #"BIC-L", + "$\\overline{\\text{ARI}}_{1}$", + "$\\overline{\\text{ARI}}_{2}$", "$\\text{ARI}_{1}$", "$\\text{ARI}_{2}$" + ) + model_name <- model + if (model != "sep") { + kable_colnames <- c( + kable_colnames, "Recovered $Q_1$", + "Recovered $Q_2$" + ) + } + if (model == "pirho") { + model_name <- "$\\pi\\rho$" + } else { + if (model != "iid" && model != "sep") { + model_name <- paste0("$\\", model, "$") + } else { + model_name <- paste0("$", model, "$") + } + } + print(kable(dataframe_per_model(model), + escape = FALSE, + booktabs = TRUE, + digits = 2, + position = "!h", + caption = paste0( + "\\label{tab:per_model_", model, + "}Quality metrics for ", + ifelse(model != "sep", paste0(model_name, "$\\text{-}colBiSBM$"),"$sep\\text{-}BiSBM$") + ), + col.names = kable_colnames + )) +} +@ +\normalsize + +<>= +proportion_preferred_data <- result_data_frame %>% + group_by(epsilon_alpha, preferred_model) %>% + summarise(n = n()) %>% + mutate(prop_model = n / sum(n)) %>% + ungroup() %>% + select(-n) + +proportion_preferred_table <- proportion_preferred_data %>% + pivot_wider( + names_from = preferred_model, + values_from = prop_model, values_fill = 0 + ) + +kable(proportion_preferred_table, + escape = FALSE, + booktabs = TRUE, + digits = 2, + position = "!h", + caption = "\\label{tab:proportion-preferred-table}Proportions of models selected per \\eps[\\alpha] (data for Figure \\ref{fig:inference-proportion-preferred})", + col.names = c( + "\\eps[\\alpha]", + "$sep\\text{-}BiSBM$", + "$iid\\text{-}colBiSBM$", + "$\\pi\\text{-}colBiSBM$", + "$\\rho\\text{-}colBiSBM$", + "$\\pi\\rho\\text{-}colBiSBM$" + ), + align = "rccccc", + format = "latex" +) +@ + +<>= +#| fig.cap="\\label{fig:inference-proportion-preferred}Plot of the proportions of different preferred models in function of \\eps[\\alpha]", +#| fig.asp = 0.5, +#| fig.pos = "H", +#| fig.width = 7, +#| fig.height = 4, +#| dpi=300 + +plot <- proportion_preferred_data %>% ggplot() + + aes( + x = epsilon_alpha, y = prop_model, color = preferred_model, + fill = preferred_model + ) + + guides( + fill = guide_legend(title = "Preferred Model"), + color = guide_legend(title = "Preferred Model") + ) + + scale_x_continuous(breaks = seq(from = 0.0, to = 0.24, by = 0.03)) + + scale_color_okabe_ito() + + scale_fill_okabe_ito() + + xlab(TeX("$\\epsilon_{\\alpha}$")) + + ylab("Model proportions") + + geom_col(position = "stack") +print(plot) +@ + +\paragraph{Results} For the model comparison, when $\eps[\alpha]$ is small +($\eps[\alpha]\in[0, .04]$), the simulation model is close to the +Erd\H{o}s-Reńyi network and it is very hard to find any structure beyond the one +of a single block on each dimension. + +On the figure \ref{fig:inference-proportion-preferred} and table +\ref{tab:proportion-preferred-table} we can see that from +$\eps[\alpha] = 0.12$ around $70\%$ of the time the $\pi\rho\text{-}colBiSBM$ +model (i.e., the correct one) is selected. + +An interesting result we can read in the tables is that our models outperform +the $sep\text{-}BiSBM$ when considering the ARI on the whole set of nodes +($\text{ARI}_d$). This means that our models are able to recover the block +pairing \emph{between the networks} in addition to recovering the blocks and +their parameters. \ No newline at end of file diff --git a/rapport/chapter3-structure-detection.tex b/rapport/chapter3-structure-detection.tex index e8b7961..a3e5cf8 100644 --- a/rapport/chapter3-structure-detection.tex +++ b/rapport/chapter3-structure-detection.tex @@ -70,7 +70,7 @@ network $m$ is assumed to follow a $BiSBM$ with its own parameters ($\bm{\pi}^m, % Here are some common notations and conventions that we will use in the following % sections. -\subsection{A collection of i.i.d bipartite SBM}\label{ssec:a-collection-of-i-i-d-bipartite-sbm} +\subsection{A collection of iid bipartite SBM}\label{ssec:a-collection-of-i-i-d-bipartite-sbm} As for \emph{colSBM} this first model is the most constrained. It assumes that all the networks are the independent realizations of the same $Q_1$-$Q_2$-BiSBM with identical parameters. The \emph{iid-colBiSBM} is defined as follows: @@ -653,9 +653,12 @@ The process of clustering a collection of networks involves discovering a partition $\mathcal{G} = (\mathcal{M}_g)_{g=1,\dots,G}$ of $\{1,\dots, M\}$. Given $\mathcal{G}$ we set the following model on $\bm{X}$: -\[ - \forall g \in \{1,\dots, G\}, \forall m \in \mathcal{M}_g, X^m \sim \mathcal{F}\text{-}BiSBM(Q_1^g, Q_2^g, \bm{\pi^m, \rho^m,} \bm{\alpha}^g) -\] +\begin{align*} + \forall g \in \{1,\dots, G\}, + ~\forall m \in \mathcal{M}_g, + ~X^m \sim + \mathcal{F}\text{-}BiSBM(Q_1^g, Q_2^g, \bm{\pi^m, \rho^m,} \bm{\alpha}^g) +\end{align*} And we defined the score of a given partition $\mathcal{G}$: \[ diff --git a/rapport/rapport.pdf b/rapport/rapport.pdf index 1274125..94a386a 100644 Binary files a/rapport/rapport.pdf and b/rapport/rapport.pdf differ diff --git a/rapport/rapport.tex b/rapport/rapport.tex index a031de1..7de3824 100644 --- a/rapport/rapport.tex +++ b/rapport/rapport.tex @@ -221,8 +221,6 @@ automata,positioning} \addtocounter{maincontentend}{1} \addtocounter{customchapter}{1} \printbibliography -\end{selectlanguage} -\begin{selectlanguage}{french} \listoffigures \listoftables \end{selectlanguage}