Cleaning repo and continue working and fixing netclust inference

2023-07-10 11:13:09 +02:00 · 2023-07-10 11:13:09 +02:00 · bd6a12b8df
commit bd6a12b8df
parent 2f561d57ce
22 changed files with 135 additions and 3724 deletions
--- a/Rcodes/real_data/CoOPLBM_completion_analyze.html
+++ b/Rcodes/real_data/CoOPLBM_completion_analyze.html
--- a/Rcodes/real_data/CoOPLBM_completion_analyze.pdf
+++ b/Rcodes/real_data/CoOPLBM_completion_analyze.pdf
--- a/Rcodes/real_data/presentation_dore.html
+++ b/Rcodes/real_data/presentation_dore.html
--- a/Rcodes/real_data/presentation_dore.pdf
+++ b/Rcodes/real_data/presentation_dore.pdf
--- a/Rcodes/simulation/NA_robustness_analyse.html
+++ b/Rcodes/simulation/NA_robustness_analyse.html
--- a/Rcodes/simulation/NA_robustness_analyse.pdf
+++ b/Rcodes/simulation/NA_robustness_analyse.pdf
--- a/Rcodes/simulation/data/simulated_collection_data_clustering_iid_08-07-23-16:51:52.Rds
+++ b/Rcodes/simulation/data/simulated_collection_data_clustering_iid_08-07-23-16:51:52.Rds
--- a/Rcodes/simulation/data/simulated_collection_data_clustering_iid_10-07-23-10:08:21.Rds
+++ b/Rcodes/simulation/data/simulated_collection_data_clustering_iid_10-07-23-10:08:21.Rds
--- a/Rcodes/simulation/data/simulated_collection_data_clustering_pi_10-07-23-10:39:58.Rds
+++ b/Rcodes/simulation/data/simulated_collection_data_clustering_pi_10-07-23-10:39:58.Rds
--- a/Rcodes/simulation/data/simulated_collection_data_clustering_pirho_08-07-23-17:04:36.Rds
+++ b/Rcodes/simulation/data/simulated_collection_data_clustering_pirho_08-07-23-17:04:36.Rds
--- a/Rcodes/simulation/data/simulated_collection_data_clustering_pirho_10-07-23-10:34:17.Rds
+++ b/Rcodes/simulation/data/simulated_collection_data_clustering_pirho_10-07-23-10:34:17.Rds
--- a/Rcodes/simulation/data/simulated_collection_data_clustering_rho_08-07-23-16:58:41.Rds
+++ b/Rcodes/simulation/data/simulated_collection_data_clustering_rho_08-07-23-16:58:41.Rds
--- a/Rcodes/simulation/data/simulated_collection_data_clustering_rho_10-07-23-10:45:16.Rds
+++ b/Rcodes/simulation/data/simulated_collection_data_clustering_rho_10-07-23-10:45:16.Rds
--- a/Rcodes/simulation/inference_analyze.Rmd
+++ b/Rcodes/simulation/inference_analyze.Rmd
@ -227,7 +227,7 @@ kable(proportion_preferred_table,
 #| fig.height = 4,
 #| dpi=300

-proportion_preferred_data %>% ggplot() +
+plot <- proportion_preferred_data %>% ggplot() +
    aes(
        x = epsilon_alpha, y = prop_model, color = preferred_model,
        fill = preferred_model
@ -237,12 +237,12 @@ proportion_preferred_data %>% ggplot() +
        color = guide_legend(title = "Preferred Model")
    ) +
    scale_x_continuous(breaks = seq(from = 0.0, to = 0.24, by = 0.03)) +
-    scale_color_okabe_ito() +
-    scale_fill_okabe_ito() +
-    xlab(TeX("$\\epsilon_{\\alpha}$")) +
+        scale_color_okabe_ito() +
+        scale_fill_okabe_ito() +
+        xlab(TeX("$\\epsilon_{\\alpha}$")) +
        ylab("Model proportions") +
        geom_col(position = "stack")
-
+print(plot)
 ```

 \paragraph{Results} For the model comparison, when $\eps[\alpha]$ is small 
--- a/Rcodes/simulation/netclustering_analyze.Rmd
+++ b/Rcodes/simulation/netclustering_analyze.Rmd
@ -18,8 +18,62 @@ filenames <- list.files(

 # data_list <- lapply(filenames, function(file) lapply(readRDS(file), function(model) model$list_clustering))
 df_netclust <- do.call("rbind", lapply(filenames, readRDS))
+df_netclust$model <- factor(df_netclust$model, levels = c(
+    "iid", "pi",
+    "rho", "pirho"
+))

 ```
+\paragraph{Simulation settings} For all models, we simulate $M = 9$ networks
+with $n^m_1 = n^m_2 = 75$ for all $m \in \{ 1, \dots, M \}$ and
+$Q_1 = Q_2 = 3$. For the simulations, the proportions are the following:
+
+\begin{align*}
+\bm{\pi}^1 = \left( 0.2, 0.3, 0.5 \right) & &  \bm{\rho}^1 = \left( 0.2, 0.3, 0.5 \right)
+\end{align*}
+and for all $m = 2,\dots,9$
+\begin{align*}
+\bm{\pi}^m = \begin{cases}
+    \bm{\pi}^1 & \text{for } iid\text{-}colBiSBM \\
+    \sigma^1_m(\bm{\pi}^1) & \text{for } \pi\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
+\end{cases}\\
+\bm{\rho}^m = 
+\begin{cases}
+    \bm{\rho}^1 & \text{for } iid\text{-}colBiSBM \\
+    \sigma^2_m(\bm{\rho}^1) & \text{for } \rho\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
+\end{cases} 
+\end{align*}
+where $\sigma^1_m$ and $\sigma^2_m$ are permutations of $\{1, 2, 3\}$ specific to network $m$ and 
+$\sigma^1_m (\bm{\pi})= {(\pi_{\sigma^1_m (i)})}_{i \in \{1,\dots,3\}}$ 
+and $\sigma^2_m (\bm{\rho})= {(\rho_{\sigma^2_m (i)})}_{i \in \{1,\dots,3\}}$. 
+The networks are divided into 3 sub-collections of 3
+networks with connectivity parameters as follows:
+
+\begin{align*}
+\bm{\alpha}^{as} = .3 + \begin{pmatrix}
+    \epsilon & - \frac{\epsilon}{2} & - \frac{\epsilon}{2}\\
+    - \frac{\epsilon}{2} & \epsilon & - \frac{\epsilon}{2}\\
+    - \frac{\epsilon}{2} & - \frac{\epsilon}{2} & \epsilon
+\end{pmatrix}, &&
+\bm{\alpha}^{cp} = .3 + \begin{pmatrix}
+    \frac{3 \epsilon}{2} & \epsilon & \frac{\epsilon}{2}\\
+    \epsilon & \frac{\epsilon}{2} & 0\\
+    \frac{\epsilon}{2} & 0 & - \frac{\epsilon}{2}
+\end{pmatrix}, &&
+\bm{\alpha}^{dis} = .3 + \begin{pmatrix}
+    - \frac{\epsilon}{2} & \epsilon & \epsilon\\
+    \epsilon & - \frac{\epsilon}{2} & \epsilon\\
+    \epsilon & \epsilon & - \frac{\epsilon}{2}
+\end{pmatrix},
+\end{align*}
+with $\epsilon \in [.1, .4]$. $\bm{\alpha}^{as}$ represents a classical
+assortative community structure, 
+while $\bm{\alpha}^{cp}$ is a layered core-periphery structure with block 2
+acting as a semi-core. Finally, $\bm{\alpha}^{dis}$ is a disassortative
+community structure with stronger
+connections between blocks than within blocks. If $\epsilon = 0$, the three
+matrices are equal and the 9 networks have the same connection structure. 
+Increasing $\epsilon$ differentiates the 3 sub-collections of networks.

 ```{r netclustering-ARI-boxplot, echo = FALSE}
 #| dpi = 300,
@ -34,4 +88,10 @@ df_netclust %>%
    guides(fill = guide_legend(title = "Model")) +
    ylab("ARI of obtained netclustering") +
    geom_boxplot(aes(fill = model))
-```
+```
+
+\paragraph{Results} We evaluate our method by comparing the resulting
+partition of the network collection with the simulated partition 
+using the Adjusted Rand Index (ARI). As the value of $\epsilon$ increases, our ability to 
+distinguish between the sub-collections of networks improves, and this distinction becomes nearly
+perfect in all setups of the $colBiSBM$.
--- a/Rcodes/simulation/netclustering_analyze.tex
+++ b/Rcodes/simulation/netclustering_analyze.tex
@ -1,8 +1,67 @@
 \section{Network clustering of simulated networks}\label{sec:network-clustering-of-simulated-networks}

+\paragraph{Simulation settings}
+
+For all models, we simulate \(M = 9\) networks with
+\(n^m_1 = n^m_2 = 75\) for all \(m \in \{ 1, \dots, M \}\) and
+\(Q_1 = Q_2 = 3\). For the simulations, the proportions are the
+following:
+
+\begin{align*}
+\bm{\pi}^1 = \left( 0.2, 0.3, 0.5 \right) & &  \bm{\rho}^1 = \left( 0.2, 0.3, 0.5 \right)
+\end{align*} and for all \(m = 2,\dots,9\) \begin{align*}
+\bm{\pi}^m = \begin{cases}
+    \bm{\pi}^1 & \text{for } iid\text{-}colBiSBM \\
+    \sigma^1_m(\bm{\pi}^1) & \text{for } \pi\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
+\end{cases}\\
+\bm{\rho}^m =
+\begin{cases}
+    \bm{\rho}^1 & \text{for } iid\text{-}colBiSBM \\
+    \sigma^2_m(\bm{\rho}^1) & \text{for } \rho\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
+\end{cases}
+\end{align*} where \(\sigma^1_m\) and \(\sigma^2_m\) are permutations of
+\{1, 2, 3\} specific to network \(m\) and
+\(\sigma^1_m (\bm{\pi})= {(\pi_{\sigma^1_m (i)})}_{i \in \{1,\dots,3\}}\) and
+\(\sigma^2_m (\bm{\rho})= {(\rho_{\sigma^2_m (i)})}_{i \in \{1,\dots,3\}}\). The
+networks are divided into 3 sub-collections of 3 networks with
+connectivity parameters as follows:
+
+\begin{align*}
+\bm{\alpha}^{as} = .3 + \begin{pmatrix}
+    \epsilon & - \frac{\epsilon}{2} & - \frac{\epsilon}{2}\\
+    - \frac{\epsilon}{2} & \epsilon & - \frac{\epsilon}{2}\\
+    - \frac{\epsilon}{2} & - \frac{\epsilon}{2} & \epsilon
+\end{pmatrix}, &&
+\bm{\alpha}^{cp} = .3 + \begin{pmatrix}
+    \frac{3 \epsilon}{2} & \epsilon & \frac{\epsilon}{2}\\
+    \epsilon & \frac{\epsilon}{2} & 0\\
+    \frac{\epsilon}{2} & 0 & - \frac{\epsilon}{2}
+\end{pmatrix}, &&
+\bm{\alpha}^{dis} = .3 + \begin{pmatrix}
+    - \frac{\epsilon}{2} & \epsilon & \epsilon\\
+    \epsilon & - \frac{\epsilon}{2} & \epsilon\\
+    \epsilon & \epsilon & - \frac{\epsilon}{2}
+\end{pmatrix},
+\end{align*} with \(\epsilon \in [.1, .4]\). \(\bm{\alpha}^{as}\)
+represents a classical assortative community structure, while
+\(\bm{\alpha}^{cp}\) is a layered core-periphery structure with block 2
+acting as a semi-core. Finally, \(\bm{\alpha}^{dis}\) is a
+disassortative community structure with stronger connections between
+blocks than within blocks. If \(\epsilon = 0\), the three matrices are
+equal and the 9 networks have the same connection structure. Increasing
+\(\epsilon\) differentiates the 3 sub-collections of networks.
+
 \begin{figure}
 \centering
-\includegraphics{./img/99d363f6aa43bf0eba413cb994dc00b130709107.png}
+\includegraphics{./img/ca0adc96e26b9b41eb8dec4c472696309ebcf0fe.png}
 \caption{\label{}ARI of the partition obtained by clustering as a function
 of \(\eps\)}
 \end{figure}
+
+\paragraph{Results}
+
+We evaluate our method by comparing the resulting partition of the
+network collection with the simulated partition using the Adjusted Rand
+Index (ARI). As the value of \(\epsilon\) increases, our ability to
+distinguish between the sub-collections of networks improves, and this
+distinction becomes nearly perfect in all setups of the \(colBiSBM\).
--- a/Rcodes/simulation/netclustering_check.R
+++ b/Rcodes/simulation/netclustering_check.R
@ -10,7 +10,7 @@ if (!exists("model_to_test")) {
 }

 if (!exists("repetitions")) {
-    repetitions <- seq.int(3)
+    repetitions <- seq.int(30)
 }

 nr <- 75
@ -32,7 +32,7 @@ if (identical(arg, character(0))) {

 conditions <- tidyr::crossing(epsilons, pi, rho, repetitions)

-results <- lapply(seq_len(nrow(conditions)), function(s) {
+results <- bettermc::mclapply(seq_len(nrow(conditions)), function(s) {
    eps <- conditions[s, ]$epsilons
    current_pi <- conditions[s, ]$pi
    current_rho <- conditions[s, ]$rho
@ -195,6 +195,9 @@ results <- lapply(seq_len(nrow(conditions)), function(s) {
    )

    best_partitions <- unlist(extract_best_bipartite_partition(list_collection))
+    if (!is(best_partitions, "list")) {
+        best_partitions <- list(best_partitions)
+    }
    clustering <- unlist(lapply(seq_along(best_partitions), function(col_idx) {
        setNames(
            rep(col_idx, best_partitions[[col_idx]]$M),
@ -206,15 +209,13 @@ results <- lapply(seq_len(nrow(conditions)), function(s) {
    ari <- aricode::ARI(rep(c(1, 2, 3), each = 3), clustering)

    toc()
-    cat(paste("Finished", s))
    return(
        data.frame(epsilon = eps, model = model_to_test, ARI = ari)
    )
-}
-# ,
-# mc.cores = parallel::detectCores() - 1,
-# mc.progress = TRUE,
-# mc.retry = -1
+},
+mc.cores = parallel::detectCores() - 1,
+mc.progress = TRUE,
+mc.retry = -1
 )

 data_frame_result <- do.call("rbind", results)
--- a/figure/netclustering-ARI-boxplot-1.png
+++ b/figure/netclustering-ARI-boxplot-1.png
--- a/img/6a5c3c2748922aace8a2034349434383ce4a9f11.png
+++ b/img/6a5c3c2748922aace8a2034349434383ce4a9f11.png
--- a/img/ca0adc96e26b9b41eb8dec4c472696309ebcf0fe.png
+++ b/img/ca0adc96e26b9b41eb8dec4c472696309ebcf0fe.png
--- a/img/d424b38c3b69ae646295e877eee9ae4e8602ec6c.png
+++ b/img/d424b38c3b69ae646295e877eee9ae4e8602ec6c.png
--- a/rapport.pdf
+++ b/rapport.pdf