Cleaning repo and continue working and fixing netclust inference

This commit is contained in:
Louis Lacoste 2023-07-10 11:13:09 +02:00
parent 2f561d57ce
commit bd6a12b8df
22 changed files with 135 additions and 3724 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -227,7 +227,7 @@ kable(proportion_preferred_table,
#| fig.height = 4,
#| dpi=300
proportion_preferred_data %>% ggplot() +
plot <- proportion_preferred_data %>% ggplot() +
aes(
x = epsilon_alpha, y = prop_model, color = preferred_model,
fill = preferred_model
@ -237,12 +237,12 @@ proportion_preferred_data %>% ggplot() +
color = guide_legend(title = "Preferred Model")
) +
scale_x_continuous(breaks = seq(from = 0.0, to = 0.24, by = 0.03)) +
scale_color_okabe_ito() +
scale_fill_okabe_ito() +
xlab(TeX("$\\epsilon_{\\alpha}$")) +
scale_color_okabe_ito() +
scale_fill_okabe_ito() +
xlab(TeX("$\\epsilon_{\\alpha}$")) +
ylab("Model proportions") +
geom_col(position = "stack")
print(plot)
```
\paragraph{Results} For the model comparison, when $\eps[\alpha]$ is small

View file

@ -18,8 +18,62 @@ filenames <- list.files(
# data_list <- lapply(filenames, function(file) lapply(readRDS(file), function(model) model$list_clustering))
df_netclust <- do.call("rbind", lapply(filenames, readRDS))
df_netclust$model <- factor(df_netclust$model, levels = c(
"iid", "pi",
"rho", "pirho"
))
```
\paragraph{Simulation settings} For all models we simulate $M = 9$ networks with
$n^m_1 = n^m_2 = 75$ for all $m \in \{1, \dots, M\}$ and $Q_1 = Q_2 = 3$. For
the simulations the proportions are the following:
\begin{align*}
\bm{\pi}^1 = \left( 0.2, 0.3, 0.5 \right) & & \bm{\rho}^1 = \left( 0.2, 0.3, 0.5 \right)
\end{align*}
and for all $m = 2,\dots,9$
\begin{align*}
\bm{\pi}^m = \begin{cases}
\bm{\pi}^1 & \text{for } iid\text{-}colBiSBM \\
\sigma^1_m(\bm{\pi}^1) & \text{for } \pi\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
\end{cases}\\
\bm{\rho}^m =
\begin{cases}
\bm{\rho}^1 & \text{for } iid\text{-}colBiSBM \\
\sigma^2_m(\bm{\rho}^1) & \text{for } \rho\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
\end{cases}
\end{align*}
where $\sigma^1_m$ and $\sigma^2_m$ are permutations of $\{1, 2, 3\}$ specific to network $m$ and
$\sigma^1_m (\bm{\pi})= {(\pi_{\sigma^1_m (i)})}_{i \in \{1,2,3\}}$
and $\sigma^2_m (\bm{\rho})= {(\rho_{\sigma^2_m (i)})}_{i \in \{1,2,3\}}$.
The networks are divided into 3 sub-collections of 3
networks with connectivity parameters as follows:
\begin{align*}
\bm{\alpha}^{as} = .3 + \begin{pmatrix}
\epsilon & - \frac{\epsilon}{2} & - \frac{\epsilon}{2}\\
- \frac{\epsilon}{2} & \epsilon & - \frac{\epsilon}{2}\\
- \frac{\epsilon}{2} & - \frac{\epsilon}{2} & \epsilon
\end{pmatrix}, &&
\bm{\alpha}^{cp} = .3 + \begin{pmatrix}
\frac{3 \epsilon}{2} & \epsilon & \frac{\epsilon}{2}\\
\epsilon & \frac{\epsilon}{2} & 0\\
\frac{\epsilon}{2} & 0 & - \frac{\epsilon}{2}
\end{pmatrix}, &&
\bm{\alpha}^{dis} = .3 + \begin{pmatrix}
- \frac{\epsilon}{2} & \epsilon & \epsilon\\
\epsilon & - \frac{\epsilon}{2} & \epsilon\\
\epsilon & \epsilon & - \frac{\epsilon}{2}
\end{pmatrix},
\end{align*}
with $\epsilon \in [.1, .4]$. $\bm{\alpha}^{as}$ represents a classical
assortative community structure,
while $\bm{\alpha}^{cp}$ is a layered core-periphery structure with block 2
acting as a semi-core. Finally, $\bm{\alpha}^{dis}$ is a disassortative
community structure with stronger
connections between blocks than within blocks. If $\epsilon = 0$, the three
matrices are equal and the 9 networks have the same connection structure.
Increasing $\epsilon$ differentiates the 3 sub-collections of networks.
```{r netclustering-ARI-boxplot, echo = FALSE}
#| dpi = 300,
@ -34,4 +88,10 @@ df_netclust %>%
guides(fill = guide_legend(title = "Model")) +
ylab("ARI of obtained netclustering") +
geom_boxplot(aes(fill = model))
```
```
\paragraph{Results} The evaluation of our method involves a comparison between
the resulting partition of the network collection and the simulated partition
using the ARI index. As the value of $\epsilon$ increases, our ability to
distinguish between the networks improves, and this distinction becomes nearly
perfect in all setups of the $colBiSBM$.

View file

@ -1,8 +1,67 @@
\section{Network clustering of simulated networks}\label{sec:network-clustering-of-simulated-networks}
\paragraph{Simulation settings}
For all models we simulate \(M = 9\) networks with
\(n^m_1 = n^m_2 = 75\) for all \(m \in \{1, \dots, M\}\) and
\(Q_1 = Q_2 = 3\). For the simulations the proportions are the
following:
\begin{align*}
\bm{\pi}^1 = \left( 0.2, 0.3, 0.5 \right) & & \bm{\rho}^1 = \left( 0.2, 0.3, 0.5 \right)
\end{align*} and for all \(m = 2,\dots,9\) \begin{align*}
\bm{\pi}^m = \begin{cases}
\bm{\pi}^1 & \text{for } iid\text{-}colBiSBM \\
\sigma^1_m(\bm{\pi}^1) & \text{for } \pi\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
\end{cases}\\
\bm{\rho}^m =
\begin{cases}
\bm{\rho}^1 & \text{for } iid\text{-}colBiSBM \\
\sigma^2_m(\bm{\rho}^1) & \text{for } \rho\text{-}colBiSBM \text{ and } \pi\rho\text{-}colBiSBM
\end{cases}
\end{align*} where \(\sigma^1_m\) and \(\sigma^2_m\) are permutations of
\(\{1, 2, 3\}\) specific to network \(m\) and
\(\sigma^1_m (\bm{\pi})= {(\pi_{\sigma^1_m (i)})}_{i \in \{1,2,3\}}\) and
\(\sigma^2_m (\bm{\rho})= {(\rho_{\sigma^2_m (i)})}_{i \in \{1,2,3\}}\). The
networks are divided into 3 sub-collections of 3 networks with
connectivity parameters as follows:
\begin{align*}
\bm{\alpha}^{as} = .3 + \begin{pmatrix}
\epsilon & - \frac{\epsilon}{2} & - \frac{\epsilon}{2}\\
- \frac{\epsilon}{2} & \epsilon & - \frac{\epsilon}{2}\\
- \frac{\epsilon}{2} & - \frac{\epsilon}{2} & \epsilon
\end{pmatrix}, &&
\bm{\alpha}^{cp} = .3 + \begin{pmatrix}
\frac{3 \epsilon}{2} & \epsilon & \frac{\epsilon}{2}\\
\epsilon & \frac{\epsilon}{2} & 0\\
\frac{\epsilon}{2} & 0 & - \frac{\epsilon}{2}
\end{pmatrix}, &&
\bm{\alpha}^{dis} = .3 + \begin{pmatrix}
- \frac{\epsilon}{2} & \epsilon & \epsilon\\
\epsilon & - \frac{\epsilon}{2} & \epsilon\\
\epsilon & \epsilon & - \frac{\epsilon}{2}
\end{pmatrix},
\end{align*} with \(\epsilon \in [.1, .4]\). \(\bm{\alpha}^{as}\)
represents a classical assortative community structure, while
\(\bm{\alpha}^{cp}\) is a layered core-periphery structure with block 2
acting as a semi-core. Finally, \(\bm{\alpha}^{dis}\) is a
disassortative community structure with stronger connections between
blocks than within blocks. If \(\epsilon = 0\), the three matrices are
equal and the 9 networks have the same connection structure. Increasing
\(\epsilon\) differentiates the 3 sub-collections of networks.
\begin{figure}
\centering
\includegraphics{./img/99d363f6aa43bf0eba413cb994dc00b130709107.png}
\includegraphics{./img/ca0adc96e26b9b41eb8dec4c472696309ebcf0fe.png}
\caption{\label{}ARI of the partition obtained by clustering as a function
of \(\epsilon\)}
\end{figure}
\paragraph{Results}
The evaluation of our method involves a comparison between the resulting
partition of the network collection and the simulated partition using
the ARI index. As the value of \(\epsilon\) increases, our ability to
distinguish between the networks improves, and this distinction becomes
nearly perfect in all setups of the \(colBiSBM\).

View file

@ -10,7 +10,7 @@ if (!exists("model_to_test")) {
}
if (!exists("repetitions")) {
repetitions <- seq.int(3)
repetitions <- seq.int(30)
}
nr <- 75
@ -32,7 +32,7 @@ if (identical(arg, character(0))) {
conditions <- tidyr::crossing(epsilons, pi, rho, repetitions)
results <- lapply(seq_len(nrow(conditions)), function(s) {
results <- bettermc::mclapply(seq_len(nrow(conditions)), function(s) {
eps <- conditions[s, ]$epsilons
current_pi <- conditions[s, ]$pi
current_rho <- conditions[s, ]$rho
@ -195,6 +195,9 @@ results <- lapply(seq_len(nrow(conditions)), function(s) {
)
best_partitions <- unlist(extract_best_bipartite_partition(list_collection))
if (!is(best_partitions, "list")) {
best_partitions <- list(best_partitions)
}
clustering <- unlist(lapply(seq_along(best_partitions), function(col_idx) {
setNames(
rep(col_idx, best_partitions[[col_idx]]$M),
@ -206,15 +209,13 @@ results <- lapply(seq_len(nrow(conditions)), function(s) {
ari <- aricode::ARI(rep(c(1, 2, 3), each = 3), clustering)
toc()
cat(paste("Finished", s))
return(
data.frame(epsilon = eps, model = model_to_test, ARI = ari)
)
}
# ,
# mc.cores = parallel::detectCores() - 1,
# mc.progress = TRUE,
# mc.retry = -1
},
mc.cores = parallel::detectCores() - 1,
mc.progress = TRUE,
mc.retry = -1
)
data_frame_result <- do.call("rbind", results)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 49 KiB

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

Binary file not shown.