mia-rapport-2024/rapport/chapter3-structure-detection.tex

\addtocounter{customchapter}{1}
\chapter[Structure detection in bipartite collection]{Structure detection in a collection of bipartite networks}
\section{Definition of a collection}
\label{sec:definition-of-a-collection}

We define a collection of bipartite networks as
$\bm{X} = (X^1,\dots X^m,\dots, X^M)$
the collection of incidence matrix. Moreover, all the networks in the
collection have the same type of interaction (e.g., all interactions are
binary).

\section{Separate BiSBM (sep-BiSBM)}\label{sec:separate-bisbm-sepbisbm}

A first approach to deal with a collection of networks is to adjust separate
BiSBM for each network of the collection.

For network $m$, let $n_1^m$ (resp. $n_2^m$) be the number of nodes in row
(resp. column) divided into $Q_1^m$ row clusters (resp. $Q_2^m$ column
clusters).\\ Let $Z^m=(Z^m_1, \dots, Z^m_i, \dots, Z^m_{n_1^m})$ and
$W^m = (W^m_1, \dots,W^m_j, \dots, W^m_{n_2^m})$ be independent latent variables
such that $Z^m_i = q$ if row node $i$ of network $m$ belongs to row cluster $q$
($q\in\{1,\dots,Q_1^m\}$) and
$W^m_j = r$ if column node $j$ of network $m$ belong to column block $r$
($r\in\{1,\dots,Q_2^m\}$). And we have
\begin{align}\label{eqn:lbm-block-membership-prob}
    \mathbb{P}(Z_i^m=q)=\pi_q^m, &  & \mathbb{P}(W_j^m=r)=\rho_r^m
\end{align}
where $\pi_q^m > 0$, $\rho_r^m > 0$, $\sum_{q=1}^{Q_1^m}\pi_q^m = 1$ and
$\sum_{r=1}^{Q_2^m}\rho_r^m = 1$. Given the latent variables
$Z^m, W^m$, the $X_{ij}^m$s are assumed to be independent and distributed
as

\begin{align}\label{eqn:lbm-conditional-to-latent}
    X_{ij}^m|Z_i^m = q,W_j^m = r \sim \mathcal{F}(.;\alpha_{qr}^m)
\end{align}
where $\mathcal{F}$ is referred to as the emission distribution. $\mathcal{F}$ is chosen to
be the Bernoulli distribution for binary interactions, and the Poisson
distribution for weighted interactions such as counts. Let $f$ be the density of
the emission distribution, then:

\begin{equation}\label{eqn:lbm-emission}
    \log f(X^m_{ij};\alpha_{qr}^m) =
    \begin{cases}
        X_{ij}^m \log(\alpha_{qr}^m) + (1-X_{ij}^m) \log(1-\alpha_{qr}^m) & \text{for Bernoulli emission} \\
        -\alpha_{qr}^m + X_{ij}^m \log(\alpha_{qr}^m) - \log(X_{ij}^m!)   & \text{for Poisson emission}
    \end{cases}
\end{equation}

Equations~\eqref{eqn:lbm-block-membership-prob},
\eqref{eqn:lbm-conditional-to-latent} and \eqref{eqn:lbm-emission} defines the
BiSBM model and we will now use a short notation:

\begin{equation}
    \tag{\emph{sep-BiSBM}}
    X^m \sim \mathcal{F}\text{-BiSBM}_{n_1^m,n_2^m}(Q_1^m, Q_2^m, \bm{\pi^m}, \bm{\rho^m}, \bm{\alpha^m})
\end{equation}
where $\mathcal{F}$ encodes the emission distribution, $n_1^m,n_2^m$ are the row
and column nodes, $Q_1^m, Q_2^m$ are the number of row and column blocks in
network $m$, $\bm{\pi}^m~=~{(\pi^m_q)}_{q=1,\dots,Q_1^m}$ and
$\bm{\rho}^m~=~{(\rho^m_r)}_{r=1,\dots,Q_2^m}$ are the vectors  of their
proportions. The $Q_1^m \times Q_2^m$ matrix
$\bm{\alpha}^m = {(\alpha^m_{qr})}_{\substack{q = 1,\dots,Q_1^m \\ r = 1,\dots,Q_2^m}}$
are the connectivity parameters, the parameters of the emission distribution.
$\alpha^m_{qr}\in\mathcal{A}_{\mathcal{F}}$ where, for the Bernoulli
(resp. Poisson) emission distribution, $\mathcal{A}_{\mathcal{F}} = (0,1)$ (resp.
$\mathcal{A}_{\mathcal{F}} = \mathbb{R}^{*+}$). In this $sep$-$BiSBM$ each
network $m$ is assumed to follow a $BiSBM$ with its own parameters ($\bm{\pi}^m,
    \bm{\rho}^m, \bm{\alpha}^m$).
% DONE Finish explaining

\section{Definition of the \emph{colBiSBM} models}\label{sec:definition-of-the-colbisbm-models}
% Here are some common notations and conventions that we will use in the following
% sections.

\subsection{A collection of iid bipartite SBM}\label{ssec:a-collection-of-i-i-d-bipartite-sbm}
As for \emph{colSBM} this first model is the most constrained. It assumes that
all the networks are the independent realizations of the same $Q_1$-$Q_2$-BiSBM
with identical parameters. The \emph{iid-colBiSBM} is defined as follows:

\begin{align}
    \tag{\emph{iid}-colBiSBM}
    X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi}, \bm{\rho}, \bm{\alpha}), &  & \forall m = 1, \dots M
\end{align}
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
$\pi_q \in \left( 0,1 \right], \sum_{q=1}^{Q_1} \pi_q = 1 $ and $\rho_r \in \left( 0,1 \right], \sum_{r=1}^{Q_2} \rho_r = 1 $.
This model involves $(Q_1 - 1) + (Q_2 - 1) + Q_1\times Q_2$ parameters, the two
first terms corresponding to block proportions on the row and column dimensions
and the third term to connectivity parameters.

But the assumption that block proportions are the same among the networks is a
strong assumption. In plant-pollinator networks, the proportion of specialist
species can differ between networks and thus the model may benefit from not
having the same block proportions but sharing a common connectivity structure.
The following models relaxes this assumption on either row, column or both.

\subsection{A collection of bipartite SBM with varying block size on either rows or columns}\label{ssec:a-collection-of-bipartite-sbm-with-varying-block-size-on-either-rows-or-columns}
% DONE Finish explaining

$\pi$-colBiSBM model still assumes that the networks share a common connectivity
structure represented by $\bm{\alpha}$ but that each network has its own row
block proportions. For $m \in \{1,\dots,M\}$, the $X^m$ are independent and
\begin{align}
    \tag{\emph{$\pi$}-colBiSBM}
    X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi^m}, \bm{\rho}, \bm{\alpha}), &  & \forall m = 1, \dots, M
\end{align}
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
$\pi^m_q \in \left[ 0,1 \right], \sum_{q=1}^{Q_1} \pi^m_q~=~1, \forall m \in \{1,\dots,M\}$ and $\rho_r \in \left( 0,1 \right], \sum_{r=1}^{Q_2} \rho_r = 1 $.
This model is more flexible than the iid-colBiSBM as it allows some row block
proportions to be null
in certain networks ($\pi^m_q\in\left[ 0,1 \right]$): if $\pi_q^m = 0$ then the
block $q$ is not represented in the network $m$. The connectivity structure is
thus a subset of a large connectivity structure common to all networks. We face
the same problems as~\cite{chabert-liddellLearningCommonStructures2024a} and
adapt the support $S$ they define for the $\pi$-colSBM to the bipartite case by
having $S^1$ of size $M\times Q_1$ the support for the rows and $S^2$ of size
$M\times Q_2$ the support for the columns. Thus
$S^1_{mq} = \mathbbb{1}_{\pi^m_q > 0}$ and
$S^2_{mr} = \mathbbb{1}_{\rho^m_r > 0}$. In this case, $S^2 = \bm{1}$, because
there is no freedom on the column dimension.

For a given number of blocks $Q_1$, $Q_2$ and matrix $S^1$ ($S^2$ being in this
case the matrix full of ones), the number of parameters is:
\begin{equation*}
    \text{NP}(\pi\text{-colBiSBM}) = \sum_{m=1}^{M}\Bigg( \sum_{q=1}^{Q_1} S^1_{mq} - 1 \Bigg) + (Q_2 - 1) + \sum_{\substack{q=1,\dots,Q_1 \\ r=1,\dots,Q_2}} \mathbbb{1}_{{(S^{1\prime}S^2)}_{qr}>0}
\end{equation*}
The first term corresponds to the non-null block proportions in each network.
The third quantity accounts for the fact that some blocks may never be
represented simultaneously in any network, so the corresponding connection
parameters $\alpha_{qr}$ are not useful for defining the model.

$\rho$-colBiSBM model still assumes that the networks share a common connectivity
structure represented by $\bm{\alpha}$ but that each network has its own column
block proportions. For $m \in \{1,\dots,M\}$, the $X^m$ are independent and
\begin{align}
    \tag{\emph{$\rho$}-colBiSBM}
    X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi}, \bm{\rho^m}, \bm{\alpha}), &  & \forall m = 1, \dots, M
\end{align}
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
$\pi_q \in \left( 0,1 \right], \sum_{q=1}^{Q_1} \pi_q = 1 $ and
$\rho^m_r \in \left[ 0,1 \right], \sum_{r=1}^{Q_2} \rho^m_r = 1 $.
This model is more flexible than the iid-colBiSBM as it allows some column block
proportions to be
null in certain networks ($\rho^m_r\in\left[ 0,1 \right]$): if $\rho_r^m = 0$
then the column block $r$ is not represented in the network $m$.

\enquote{Mirroring} the formulas for the $\pi$-colBiSBM we relax the constraints on
the column dimension.

For a given number of blocks $Q_1$, $Q_2$ and matrix $S^2$ ($S^1$ being in this
case the matrix full of ones), the number of parameters is:
\begin{equation*}
    \text{NP}(\rho\text{-colBiSBM}) = (Q_1 - 1) + \sum_{m=1}^{M}\Bigg( \sum_{r=1}^{Q_2} S^2_{mr} - 1 \Bigg) + \sum_{\substack{q=1,\dots,Q_1 \\ r=1,\dots,Q_2}} \mathbbb{1}_{{(S^{1\prime}S^2)}_{qr}>0}
\end{equation*}

$\pi\rho$-colBiSBM model still assumes that the networks share a common connectivity
structure represented by $\bm{\alpha}$ but that each network has its own row and
column block proportions, it is the less constrained model.
For $m \in \{1,\dots,M\}$, the $X^m$ are independent and
\begin{align}
    \tag{\emph{$\pi\rho$}-colBiSBM}
    X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi^m}, \bm{\rho^m}, \bm{\alpha}), &  & \forall m = 1, \dots, M
\end{align}
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
$\pi^m_q \in \left[ 0,1 \right], \sum_{q=1}^{Q_1} \pi^m_q~=~1, \forall m \in \{1,\dots,M\}$ and
$\rho^m_r \in \left[ 0,1 \right], \sum_{r=1}^{Q_2} \rho^m_r = 1 $.

For a given number of blocks $Q_1$, $Q_2$ and matrices $S^1$, $S^2$, the number
of parameters is:
\begin{equation*}
    \text{NP}(\pi\rho\text{-colBiSBM}) = \sum_{m=1}^{M}\Bigg( \sum_{q=1}^{Q_1} S^1_{mq} - 1 \Bigg) + \sum_{m=1}^{M}\Bigg( \sum_{r=1}^{Q_2} S^2_{mr} - 1 \Bigg) + \sum_{\substack{q=1,\dots,Q_1 \\ r=1,\dots,Q_2}} \mathbbb{1}_{{(S^{1\prime}S^2)}_{qr}>0}
\end{equation*}

\section{Variational estimation of the parameters}\label{sec:variational-estimation-of-the-parameters}

In practice, the estimation of the likelihood is not tractable. Following the
classical approach defined in~\cite{daudinMixtureModelRandom2008} we use a
variational version of the Expectation Maximization (VEM) algorithm.

We maximize a variational lower bound of the log-likelihood of the observed
data, the so-called Evidence Lower Bound (or ELBO), by approximating
$p(\bm{Z,W}|\bm{X};\bm{\theta})$ with a distribution on
$\bm{Z}$ and $\bm{W}$ named $\mathcal{R}$ defined as $\mathcal{R} =
    \otimes_{m=1}^M \mathcal{R}_m$.\

The lower bound is defined as:
\begin{equation*}
    \mathcal{J}(\mathcal{R};\bm{\theta}) \coloneqq \sum_{m=1}^{M} \bigg( \mathbb{E}_{\mathcal{R}_m}[\ell(X^m,Z^m,W^m;\bm{\theta})] + \mathcal{H}(\mathcal{R}_m) \bigg)  \leq \ell(\bm{X};\bm{\theta})
\end{equation*}

$(Z^m_i)_{i=1\dots n_1^m}$ and $(W^m_j)_{j=1\dots n_2^m}$ are
redefined using the \emph{one-hot encoded} conversion (i.e., $Z_i^m = q
    \rightarrow Z_{iq}^m = 1$ and $W_j^m = r \rightarrow W_{jr}^m = 1$).\\ % W_{jr\prime}^m pour r != r égal 0

% TODO Demander une confirmation des formules

When $\mathcal{R}_m$ is issued from the set of the factorizable distributions,
we denote $\tau_{iq}^{1,m} = \mathbb{P}_{\mathcal{R}_m}(Z_{iq}^m = 1|X_{i\bullet}^m)$
and $\tau_{jr}^{2,m} = \mathbb{P}_{\mathcal{R}_m}(W_{jr}^m = 1|X_{\bullet j}^m)$, thus
we have: $\mathbb{P}_{\mathcal{R}_m} (Z_{iq}^m = 1, W_{jr}^m = 1|X^m) =
    \mathbb{P}_{\mathcal{R}_m}(Z_{iq}^m = 1|X_{i\bullet}^m) {\color{red}\times}
    \mathbb{P}_{\mathcal{R}_m}(W_{jr}^m = 1|X_{\bullet j}^m) = \tau_{iq}^{1,m}
    {\color{red}\times} \tau_{jr}^{2,m}$.

The formula for the entropy per network is thus:
\begin{equation*}
    \mathcal{H}(\mathcal{R}_m) = - \sum_{i=1}^{n_1^m} \tau^{1,m}_{i,q} \log \tau^{1,m}_{i,q} - \sum_{j=1}^{n_2^m} \tau^{2,m}_{j,r} \log \tau^{2,m}_{j,r}
\end{equation*}

And the expectation of the completed log-likelihood under the $\mathcal{R}_m$
variational distribution for network $m$ is:
\begin{align*}
    \mathbb{E}_{\mathcal{R}_m}[\ell(X^m,Z^m,W^m;\bm{\theta})] = \sum_{i = 1}^{n_1^m}\sum_{j=1}^{n_2^m}\sum_{q \in \mathcal{Q}_{1,m}} \sum_{r \in \mathcal{Q}_{2,m}} \tau^{1,m}_{i,q} \tau^{2,m}_{j,r} \log f(X^{m}_{ij}; \alpha_{qr}) \\
    + \sum_{i=1}^{n_1^m} \sum_{q \in \mathcal{Q}_{1,m}} \tau^{1,m}_{i,q} \log \pi_{\color{black}q}^{\color{gray}m} + \sum_{j=1}^{n_2^m} \sum_{r \in \mathcal{Q}_{2,m}} \tau^{2,m}_{j,r} \log \rho_{\color{black}r}^{\color{gray}m}
\end{align*}

And thus the lower bound becomes:

\begin{align*}
    \mathcal{J}(\bm{\tau};\bm{\theta}) \coloneqq \sum_{m=1}^{M} \bigg(\sum_{i = 1}^{n_1^m}\sum_{j=1}^{n_2^m}\sum_{q \in \mathcal{Q}_{1,m}} \sum_{r \in \mathcal{Q}_{2,m}} \tau^{1,m}_{i,q} \tau^{2,m}_{j,r} \log f(X^{m}_{ij}; \alpha_{qr}) \\
    + \sum_{i=1}^{n_1^m} \sum_{q \in \mathcal{Q}_{1,m}} \tau^{1,m}_{i,q} \log \pi_{\color{black}q}^{\color{gray}m} + \sum_{j=1}^{n_2^m} \sum_{r \in \mathcal{Q}_{2,m}} \tau^{2,m}_{j,r} \log \rho_{\color{black}r}^{\color{gray}m}          \\
    - \sum_{i=1}^{n_1^m} \tau^{1,m}_{i,q} \log \tau^{1,m}_{i,q} - \sum_{j=1}^{n_2^m} \tau^{2,m}_{j,r} \log \tau^{2,m}_{j,r} \bigg) \color{black}
\end{align*}

where we identify the variational distribution $\mathcal{R}$ with its parameter
$\bm{\tau}$. \\

% \begin{equation*}
%     \mathcal{J}(\mathcal{R};\bm{\theta}) \coloneqq \mathbb{E}_{\mathcal{R}}[\ell(\bm{X},\bm{Z},\bm{W};\bm{\theta})] + \mathcal{H}(\bm{Z,W}) \leq \ell(\bm{X};\bm{\theta})
% \end{equation*}

The VEM algorithm alternates between two steps, the variational E step and the
M step. The E steps consists in optimizing $\mathcal{J}(\bm{\tau};\bm{\theta})$
for a current value of $\bm{\theta}$ with respect to $\bm{\tau}$. And the M
step consists of maximizing $\mathcal{J}(\bm{\tau};\bm{\theta})$ with respect
to $\bm{\theta}$ and for a given variational distribution $\bm{\tau}$.

\subsection{Variational E step}
\label{ssec:variational-e-step}

At this step we maximize with respect to the variational distribution
$\bm{\tau}$: $$\widehat{\bm{\tau}}^{(t+1)} = \arg \max_{\bm{\tau}}
    \mathcal{J}(\mathcal{\bm{\tau}},\bm{\widehat{\theta}}^{(t)}).$$

And we obtain the following formulae for the $\bm{\tau^m}$:
\begin{equation*}
    \begin{cases}
        \widehat{\tau}_{iq}^{1,m} \propto \widehat{\pi}_{q}^{m(t)} \prod_{j=1}^{n_2^m}\prod_{r\in\mathcal{Q}_2^m} f(X_{ij}^m;\widehat{\alpha}_{qr}^{(t)})^{\widehat{\tau}_{jr}^{2,m(t+1)}}  & \forall i = 1, \dots , n_1^m, q \in \mathcal{Q}_1^m \\
        \widehat{\tau}_{jr}^{2,m} \propto \widehat{\rho}_{r}^{m(t)} \prod_{i=1}^{n_1^m}\prod_{q\in\mathcal{Q}_1^m} f(X_{ij}^m;\widehat{\alpha}_{qr}^{(t)})^{\widehat{\tau}_{iq}^{1,m(t+1)}} & \forall j = 1, \dots , n_2^m, r \in \mathcal{Q}_2^m
    \end{cases}
\end{equation*}

which are used to update iteratively the values by a fixed point algorithm with
only one step.

\subsection{M step of the algorithm}
\label{ssec:m-step-of-the-algorithm}
At iteration $(t)$ the M-step maximizes the variational bound with respect to
the model parameters $\bm{\theta}$:
\[
    \widehat{\bm{\theta}}^{(t+1)} = \arg \max_{\bm{\theta}} \mathcal{J}(\mathcal{\bm{\widehat{\tau}}}^{(t+1)},\bm{\theta})
\]

The following quantities are involved in the obtained formulae:

\begin{align*}
    e^{m}_{qr} = \sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \tau_{iq}^{1,m} \tau_{jr}^{2,m} X_{ij}^m
     & , & n^{m}_{qr} = \sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \tau_{iq}^{1,m} \tau_{jr}^{2,m}
     & , & n^{1,m}_{q} = \sum_{i=1}^{n_1^m} \tau_{iq}^{1,m}
     & , & n^{2,m}_{r} = \sum_{j=1}^{n_2^m} \tau_{jr}^{2,m}
\end{align*}

The block proportions, in free mixture models,
$(\pi_q^m)_{q\in\mathcal{Q}_1^m}, (\rho_r^m)_{r\in\mathcal{Q}_2^m}$ are
estimated as
\begin{align*}
    \widehat{\pi}_q^{m}= \frac{n^{1,m}_{q}}{n_1^m}  &  & \text{for } \pi\text{-colBiSBM} \text{ and } \pi\rho\text{-colBiSBM}  \\
    \widehat{\rho}_r^{m}= \frac{n^{2,m}_{r}}{n_2^m} &  & \text{for } \rho\text{-colBiSBM} \text{ and } \pi\rho\text{-colBiSBM}
\end{align*}
while on the other hand,
\begin{align*}
    \widehat{\pi}_q = \frac{\sum_{m=1}^{M} n^{1,m}_{q}}{\sum_{m=1}^{M} n_1^m}  &  & \text{for } iid\text{-colBiSBM} \text{ and } \rho\text{-colBiSBM} \\
    \widehat{\rho}_r = \frac{\sum_{m=1}^{M} n^{2,m}_{r}}{\sum_{m=1}^{M} n_2^m} &  & \text{for } iid\text{-colBiSBM} \text{ and } \pi\text{-colBiSBM}
\end{align*}
the parameters takes into account all the networks at the same time. The
connectivity parameters $\alpha_{qr}$ for all models are estimated as the ratio
of the number of interactions between row block $q$ and column block $r$ among
all networks over the number of number of possible interactions:
\begin{align*}
    \widehat{\alpha}_{qr} = \frac{\sum_{m=1}^{M} e^{m}_{qr}}{\sum_{m=1}^{M} n^{m}_{qr}}
\end{align*}

\section{Model selection}\label{sec:model-selection}
% DONE
% Adapt bicl, methode explo car defi
% 1 bicl 2 model exploration
% Citer la conclusion de l'article de St Clair discussion sur bipartite
The section \ref{sec:variational-estimation-of-the-parameters} explains how we
estimate the parameters of the model for \emph{fixed} number of blocks
$Q_1$ and $Q_2$. But as they are in general not known we need to explore the
latent space to find the \emph{best} values.
As discussed in~\cite{chabert-liddellLearningCommonStructures2024a}, the
algorithmic aspect becomes complex when dealing with the bipartite case. Due to
the size of the latent space being $\mathbb{N}^2$, conducting a complete
exploration of the latent space is practically infeasible. Therefore, in
addition to adapting the existing formulas, our contribution to addressing this
challenge involved making significant choices, which are outlined below.

The below procedures are implemented in the \emph{colSBM} package, available on
\url{https://github.com/Chabert-Liddell/colSBM}.

\subsection{The \emph{Bayesian Information Criterion like} (BIC-L) criterion for model selection}
\label{ssec:the-bic-l-criterion-for-model-selection}
To select the best number of blocks we need a criterion to
measure adequacy between our model and data. The ELBO might seem a good
criterion at first but as for the likelihood, the more complex a model the
higher it gets. And thus a good criterion should make a \emph{trade-off} between
fitting to data and model complexity.

The Integrated Classified Likelihood (ICL) is a well-established tool in the SBM
and LBM domains for selecting the appropriate number of blocks. It was
introduced by~\cite{biernackiAssessingMixtureModel2000,
    daudinMixtureModelRandom2008}. The ICL is derived from an asymptotic
approximation of the marginal complete likelihood. In this approach, the model
parameters are integrated out using a prior distribution, resulting in a
penalized likelihood criterion. By employing the ICL, one can effectively
determine the optimal number of blocks for the given problem in a systematic
manner.
We obtain the following expression
\[
    \text{ICL} = \max_{\theta} \mathbb{E}_{\widehat{\mathcal{R}}} [\ell(\bm{X,Z,W;\theta})] - \frac{1}{2}\text{pen}
\]
with pen the penalties.\\ Using the formula $\mathbb{E}_{\widehat{\mathcal{R}}}
    [\ell(\bm{X,Z,W;\theta})] \approx \ell (\bm{X;\theta}) -
    \mathcal{H(\widehat{R})}$, it becomes clearer, as highlighted in the existing
literature, that the Integrated Classified Likelihood (ICL) gives preference to
well-separated blocks by imposing a penalty on the entropy of node grouping.
However, the objective of our study extends beyond grouping nodes into coherent
blocks. We also aim to assess the similarity of connectivity patterns across
different networks. Consequently, we aim to permit models that offer more
flexible node grouping without penalizing entropy.

This leads us to formulate a BIC-like criterion in the following manner:

\[
    \text{BIC-L} = \max_{\bm{\theta}} \mathbb{E}_{\widehat{\mathcal{R}}} [\ell(\bm{X,Z,W;\theta})] + \mathcal{H(\widehat{R})} - \frac{1}{2}\text{pen} = \max_{\bm{\theta}} \mathcal{J(\widehat{R}, \bm{\theta})} - \frac{1}{2}\text{pen}
\]

We provide below the expression for the penalties for the 4 models that we
propose.
\begin{description}
    \item[\textit{iid}-colBiSBM] For the $\bm\pi$ and $\bm\rho$:
          \begin{align*}
              \text{pen}_{\pi}(Q_1) = (Q_1 - 1)\log(\sum_{m=1}^{M}n_{1}^{m}) & , &
              \text{pen}_{\rho}(Q_2) = (Q_2 - 1)\log(\sum_{m=1}^{M}n_{2}^{m})
          \end{align*}
          For the $\bm\alpha$:
          \[\text{pen}_{\alpha}(Q_1, Q_2) = Q_1 \times Q_2 \log(N_M)\]
          with
          \[ N_M = \sum_{m = 1}^{M} n_{1}^{m} \times n_{2}^{m} \]
          And thus the $\text{BIC-L}$ formula is the following:
          \[ \text{BIC-L}(\bm{X},Q_1, Q_2) = \max_{\theta}
              \mathcal{J} (\mathcal{\hat{R}}, \bm{\theta})
              - \frac{1}{2} [\text{pen}_{\pi}(Q_1) + \text{pen}_{\rho}(Q_2) +
                  \text{pen}_{\alpha}(Q_1, Q_2)]\]
    \item[$\bm{\pi\rho}$-colBiSBM] The support penalties are
          \begin{align*}
              \text{pen}_{S_1}(Q_1) = -2 \log p_{Q_1} (S_1) & , &
              \text{pen}_{S_2}(Q_2) = -2 \log p_{Q_2} (S_2)
          \end{align*}
          with \begin{align*}
              \textstyle \log p_{Q_1}(S_1) = - M \log(Q_1) - \sum_{m=1}^{M} \log {Q_1
              \choose Q_1^{(m)}}, \\
              \textstyle \log p_{Q_2}(S_2) = - M \log(Q_2) - \sum_{m=1}^{M} \log {Q_2
              \choose Q_2^{(m)}}.
          \end{align*}
          And penalties for the $\bm\rho$ and $\bm\pi$ are
          \[ \text{pen}_{\pi}(Q_1, S_1) = \sum_{m=1}^{M} (Q_{1}^{(m)} - 1)
              \log n_{1}^{m},
              ~\text{pen}_{\rho}(Q_2, S_2) = \sum_{m=1}^{M} (Q_{2}^{(m)} - 1)
              \log n_{2}^{m}. \]
          Penalties for the $\bm\alpha$
          \[ \text{pen}_{\alpha}(Q_1, Q_2, S_1, S_2) = (\sum_{q=1}^{Q_1}
              \sum_{r=1}^{Q_2} \mathbbb{1}_{(S_1)'S_2 > 0}) \log (N_M). \]
          And the corresponding BIC-L formula,
          \[
              \begin{aligned}
                  \text{BIC-L}(\bm{X},Q_1, Q_2) =
                  \max_{S_1,S_2} [
                                & \max_{\theta_{S_1,S_2} \in \Theta_{S_1,S_2}} \mathcal{J}(\mathcal{\hat{R}},\theta_{S_1,S_2}) \\
                  - \frac{1}{2} & (\text{pen}_{\pi}(Q_1, S_1)  + \text{pen}_{\rho}(Q_2, S_2)                                   \\
                                & + \text{pen}_{\alpha}(Q_1, Q_2, S_1, S_2)                                                    \\
                                & + \text{pen}_{S_1}(Q_1) + \text{pen}_{S_2}(Q_2))]                                            \\
              \end{aligned}
          \]
\end{description}

\subsection{Initialization and pairing of the models}
\label{ssec:initialization-and-pairing-of-the-models}
The row (resp. column) block memberships are the labels of row (resp. column)
nodes corresponding to the group to which they were assigned based on their
connection patterns. This adds another layer of complexity to the model
selection as we need to find the best $Q_1, Q_2$ and the best memberships for
each vertex.

First to combine the information from the $M$ networks we fit a LBM model
for each network at the two points $Q = (1, 2)$ and $Q = (2, 1)$. Using the
previously described VEM algorithm we obtain for each network its parameters
($\bm{\rho,\pi,\alpha}$).
We then compute the marginal laws for each dimension, for each network. Then we
order the network blocks by the probabilities obtained in decreasing order.

For the memberships on the columns: $col~order_m = order\left(\pi_m \times
    \alpha_m\right)$.

For the memberships on the rows: $row~order_m = order\left(\rho_m \times
    ~^{t}(\alpha_m)\right)$.

Using this order we relabel the memberships for the $M$ fitted collection of a
single network.
We then use the $M$ memberships to fit a collection containing
the $M$ networks.
\subsection{Greedy exploration to find an estimation of the mode}\label{ssec:greedy-exploration-to-find-an-estimation-of-the-mode}
Using the previously fitted models for $Q = (1,2)$ and $Q = (2,1)$ we choose to
perform a greedy exploration to find a first mode.

Meaning that for a given $Q = (Q_1, Q_2)$ we will compute all the possible
memberships for the points $Q \in \{(Q_1 + 1, Q_2),(Q_1, Q_2 + 1),(Q_1 - 1,
    Q_2), (Q_1, Q_2 - 1)\}$, fit the corresponding models and choose the one that
maximizes the BIC-L as the next point from which to repeat the procedure. We
repeat the procedure until the BIC-L stops increasing $2$ times in a row.

\begin{algorithm}[H]
    \caption{Greedy Exploration for Mode Estimation}
    \SetAlgoLined
    \SetKwInOut{Input}{Input}
    \SetKwInOut{Output}{Output}

    \Input{Fitted models for $Q = (1,2)$ and $Q = (2,1)$}
    \Output{Estimation of the mode using greedy exploration}

    \BlankLine
    Initialize $Q = (1,2)$ as the starting point
    Initialize $\text{BIC-L}_{\text{max}}$ as the maximum achieved BIC-L value
    Initialize $consecutive\_count$ as 0

    \BlankLine
    \While{$consecutive\_count < 2$}{
    Compute possible memberships for $Q \in \{(Q_1 + 1, Q_2), (Q_1, Q_2 + 1), (Q_1 - 1, Q_2), (Q_1, Q_2 - 1)\}$\;
    Fit models with the computed memberships
    Choose the model with the maximum BIC-L as the next point

    \BlankLine
    \If{$\text{BIC-L} > \text{BIC-L}_{\text{max}}$}{
    $\text{BIC-L}_{\text{max}} \leftarrow \text{BIC-L}$
    $consecutive\_count \leftarrow 0$
    }
    \Else{
        $consecutive\_count \leftarrow consecutive\_count + 1$
    }

    \BlankLine
    $Q \leftarrow$ Next selected point
    }

    \BlankLine
    \textbf{Output:} Estimation of the mode using greedy exploration
\end{algorithm}

When this first estimation of the BIC-L mode has been find we apply the moving
window on it.

\subsection{Moving window to update the block memberships and the BIC-L}
\label{ssec:moving-window-to-update-the-block-memberships-and-the-bic-l}
The \emph{moving window} is used to update the block memberships on rows and
columns and fit new models with those changes.
To define the window, we use a center point and a \emph{depth}, giving us the
bottom left corner ($Q_{1,center} - depth, Q_{2,center} - depth$) and the top right corner of the
window ($Q_{1,center} + depth, Q_{2,center} + depth$). All the points in this square will be
updated and contribute to the update of the others.
This procedure is repeated until convergence of the BIC-L.

The figure \ref{fig:moving-window-procedure} illustrates the procedure. It
consists of two alternating steps:
\begin{itemize}
    \item the \emph{forward pass}: repeatedly computing the possible splits to fit the
          current model.
    \item the \emph{backward pass}: computing the possible merges to fit the current
          model.
\end{itemize}

\begin{algorithm}[t]
    \caption{Moving Window Procedure}
    \SetAlgoLined
    \SetKwInOut{Input}{Input}
    \SetKwInOut{Output}{Output}

    \Input{Center point $(Q_{1,\text{center}}, Q_{2,\text{center}})$, depth}
    \Output{Best model with maximum BIC-L in the window}

    \BlankLine
    Define bottom left corner $(Q_{1,\text{center}} - \text{depth}, Q_{2,\text{center}} - \text{depth})$\\
    Define top right corner $(Q_{1,\text{center}} + \text{depth}, Q_{2,\text{center}} + \text{depth})$

    \BlankLine
    \While{not converged}{
        \textbf{Forward pass:}

        \For{$Q_1 \in \left[ Q_{1,\text{center}} - \text{depth} ; Q_{1,\text{center}} + \text{depth} \right]$}{
            \For{$Q_2 \in \left[ Q_{2,\text{center}} - \text{depth}; Q_{2,\text{center}} + \text{depth} \right] $}{
                Compute possible splits from predecessors $(Q_1 - 1, Q_2)$ and $(Q_1, Q_2 - 1)$
                Fit models with the block membership changes
                Compare and keep the best model based on BIC-L
            }
        }

        \BlankLine
        \textbf{Backward pass:}

        \For{$Q_1 \in \left[ Q_{1,\text{center}} + \text{depth} ; Q_{1,\text{center}} - \text{depth} \right]$}{
            \For{$Q_2 \in \left[ Q_{2,\text{center}} + \text{depth}; Q_{2,\text{center}} - \text{depth} \right] $}{
                Compute possible merges from predecessors $(Q_1 + 1, Q_2)$ and $(Q_1, Q_2 + 1)$
                Fit models with the block membership changes
                Compare and keep the best model based on BIC-L
            }
        }

        \BlankLine
        Update the best model based on the maximum BIC-L
    }

    \BlankLine
    \textbf{Output:} Best model with maximum BIC-L in the window
\end{algorithm}

\begin{figure}[t]
    \definecolor{mypurple}{RGB}{128,0,128}
    \begin{subfigure}[b]{0.48\textwidth}
        \begin{tikzpicture}[scale=1.5]
            \tikzstyle{model}=[circle,draw=none,fill=gray, thick]
            \tikzstyle{split}=[>=stealth,->,thick, draw=blueps]
            \tikzstyle{merge}=[>=stealth,->,thick, draw=red]
            \draw[step=1cm, help lines] (-2,-2) grid (2,2);
            \node[model] (mode) at (0,0) {{\color{red}X}};

            \draw[color=red, line width=1pt, dashed] (-1.5,-1.5) rectangle ++(3,3);

            \node[model] (bottom_left) at (-1,-1) {};
            \node[model, draw=blueps] (row_1) at (0,-1) {};
            \node[model, draw=blueps] (col_1) at (-1,0) {};
            \node[model, draw=blueps] (row_2) at (1,-1) {};
            \node[model, draw=blueps] (col_2) at (-1,1) {};
            \node[model, draw=blueps] (mode) at (0,0) {{\color{red}X}};
            \node[model, draw=blueps] (row_3) at (1,0) {};
            \node[model, draw=blueps] (col_3) at (0,1) {};
            \node[model, draw=blueps] (top_right) at (1,1) {};

            \draw[split] (bottom_left) -- (col_1);
            \draw[split] (-1.75,0) -- (col_1);
            \draw[split] (bottom_left) -- (row_1);
            \draw[split] (0,-1.75) -- (row_1);

            \draw[split] (col_1) -- (col_2);
            \draw[split] (-1.75,1) -- (col_2);
            \draw[split] (row_1) -- (row_2);
            \draw[split] (1,-1.75) -- (row_2);
            \draw[split] (row_1) -- (mode);
            \draw[split] (col_1) -- (mode);

            \draw[split] (col_2) -- (col_3);
            \draw[split] (row_2) -- (row_3);
            \draw[split] (mode) -- (row_3);
            \draw[split] (mode) -- (col_3);

            \draw[split] (col_3) -- (top_right);
            \draw[split] (row_3) -- (top_right);
        \end{tikzpicture}
        \caption[forward]{Visualisation of a forward pass of moving window}\label{fig:visualisation-forward-pass}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.48\textwidth}
        \begin{tikzpicture}[scale=1.5]
            \tikzstyle{model}=[circle,draw=none,fill=gray]
            \tikzstyle{split}=[>=stealth,->,thick, draw=blueind]
            \tikzstyle{merge}=[>=stealth,->,thick, draw=red]
            \draw[step=1cm, help lines] (-2,-2) grid (2,2);
            \draw[color=red, line width=1pt, dashed] (-1.5,-1.5) rectangle ++(3,3);

            \node[model, draw=mypurple] (top_right) at (1,1) {};
            \node[model, draw=mypurple] (row_3) at (1,0) {};
            \node[model, draw=mypurple] (col_3) at (0,1) {};
            \node[model, draw=mypurple] (row_2) at (1,-1) {};
            \node[model, draw=mypurple] (col_2) at (-1,1) {};
            \node[model, draw=mypurple] (mode) at (0,0) {{\color{red}X}};
            \node[model, draw=red] (bottom_left) at (-1,-1) {};
            \node[model, draw=mypurple] (row_1) at (0,-1) {};
            \node[model, draw=mypurple] (col_1) at (-1,0) {};

            \draw[merge] (1,1.75) -- (top_right);
            \draw[merge] (1.75,1) -- (top_right);
            \draw[merge] (0,1.75) -- (col_3);
            \draw[merge] (1.75,0) -- (row_3);
            \draw[merge] (1.75,-1) -- (row_2);
            \draw[merge] (-1,1.75) -- (col_2);

            \draw[merge] (top_right) -- (col_3);
            \draw[merge] (top_right) -- (row_3);
            \draw[merge] (col_3) -- (col_2);
            \draw[merge] (row_3) -- (row_2) ;
            \draw[merge] (row_3) -- (mode);
            \draw[merge] (col_3) -- (mode);
            \draw[merge] (col_2) --(col_1);
            \draw[merge] (row_2) -- (row_1);
            \draw[merge] (mode) -- (row_1);
            \draw[merge] (mode) -- (col_1);
            \draw[merge] (col_1) -- (bottom_left);
            \draw[merge] (row_1) -- (bottom_left);
        \end{tikzpicture}
        \caption[forward]{Visualisation of a backward pass of moving window}\label{fig:visualisation-backward-pass}
    \end{subfigure}
    \caption{Moving window procedure, the center node marked with an {\color{red}X} is the mode of BIC-L}\label{fig:moving-window-procedure}
\end{figure}

\paragraph*{Forward pass} The forward pass consists for a model at $(Q_1, Q_2)$
to compute the possible splits from the block memberships of its ``predecessors``.
The predecessors are the point at the left $(Q_1 - 1, Q_2)$ and below
$(Q_1, Q_2 - 1)$ the current model (if they exist). To update the current model,
we take its predecessors block memberships and try to split one of the blocks in
two. Then the current model is fitted using this clustering as a starting
clustering. Once all the possible splits are fitted, they are compared, keeping
the best, in the sense of the BIC-L. If a model was already present it is also
compared and the best is chosen as the model for this round at $(Q_1, Q_2)$.\\
The procedure then repeats for the point at $(Q_1 + 1, Q_2)$ until it reaches
$(Q_{1,center} + depth, Q_2)$ from which it repeats from
$(Q_{1,center} - depth, Q_2 + 1)$. This repeats until computing the best model
for $(Q_{1,center} + depth, Q_{2,center} + depth)$.
\textit{Note on the initialization:} The forward pass starts from the point
$(Q_{1,center} + depth, Q_{2,center} + depth)$, so this points needs to have at
least a model fitted. In the best case, the greedy exploration will have visited
this point. But if the point has not been visited, a model will be fitted from
a spectral initialization (i.e the block memberships is computed by using a
spectral clustering). From this point, the next model will have at least one
predecessor and the procedure can iterate.

\paragraph*{Backward pass} The backward pass consists for a model at $(Q_1, Q_2)$
to compute the possible merges from the block memberships of its ``predecessors``.
The predecessors are the point at the right $(Q_1 + 1, Q_2)$ and on top
$(Q_1, Q_2 + 1)$ of the current model (if the predecessors exist). To update the
current model, we take its predecessors block memberships and try to merge two
blocks in one. Then the current model is fitted using this clustering as a starting clustering. Once all the possible merges are fitted, they are
compared, keeping the best, in the sense of the BIC-L.
If a model was already present it is also
compared and the best is chosen as the model for this round at $(Q_1, Q_2)$.\\
The procedure then repeats for the point at $(Q_1 - 1, Q_2)$ until it reaches
$(Q_{1,center} - depth, Q_2)$ from which it repeats from
$(Q_{1,center} - depth, Q_2 - 1)$. This repeats until computing the best model
for ($Q_{1,center} - depth, Q_{2,center} - depth$).
\textit{Note on the initialization:} The backward pass starts from
$(Q_{1,center} + depth, Q_{2,center} + depth)$, we know it was initialized at
least by the forward pass, no special case here.\\

At the end of the moving window pass, the model of max BIC-L is the new best
fit and the procedure can repeat until convergence.

\section{Networks clustering}
\label{sec:networks-clustering}
As in~\cite{chabert-liddellLearningCommonStructures2024a} we use a recursive
algorithm to determine the best clustering of the given networks. The procedure
being the same, we will present it briefly and focus on adjustments.

When networks in a collection do not share the same mesoscale connectivity
structure we want to be able to partition them correctly. For this we perform a
clustering of networks.

The process of clustering a collection of networks involves discovering a
partition $\mathcal{G} = (\mathcal{M}_g)_{g=1,\dots,G}$ of $\{1,\dots, M\}$.
Given $\mathcal{G}$ we set the following model on $\bm{X}$:

\begin{align*}
    \forall g \in \{1,\dots, G\},
    ~\forall m \in \mathcal{M}_g,
    ~X^m \sim
    \mathcal{F}\text{-}BiSBM(Q_1^g, Q_2^g, \bm{\pi^m, \rho^m,} \bm{\alpha}^g)
\end{align*}

And we defined the score of a given partition $\mathcal{G}$:
\[
    Sc(\mathcal{G}) = \sum_{g=1}^{G} \max_{Q^g=1,\dots,Q_{\max}} \text{BIC-L}((X^m)_{m\in\mathcal{M}_g},Q_1^g, Q_2^g)
\]
Thus the score consists of the sum of the BIC-L of the sub-collections for the
partition $\mathcal{G}$.

\subsection{Dissimilarity between two networks}
\label{ssec:dissimilarity-between-two-networks}
The parameters for the dissimilarity are defined as follow:
\begin{align*}
    \widetilde{n}_{qr}^m & = \sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \widehat{\tau}_{iq}^{1,m} \widehat{\tau}_{jr}^{2,m},
                         &                                                                                              & \widetilde{\alpha}_{qr}^m = \frac{\sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \widehat{\tau}_{iq}^{1,m} \widehat{\tau}_{jr}^{2,m} X_{ij}^m}{\widetilde{n}_{qr}^m}, \\
    \widetilde{\pi}_q^m  & = \frac{\sum_{i=1}^{n_1^m} \widehat{\tau}_{iq}^{1,m}}{n_1^m},
                         &                                                                                              & \widetilde{\rho}_r^m = \frac{\sum_{j=1}^{n_2^m} \widehat{\tau_{jr}}^{2,m}}{n_2^m}.
\end{align*}
And the pairwise dissimilarity for networks $(m,m')\in\mathcal{M}^2$ is then:
\[
    D_{\mathcal{M}}(m,m') = \sum_{q = 1}^{Q_1} \sum_{r = 1}^{Q_2} \max(\widetilde{\pi}_{q}^{m}, \widetilde{\pi}_{q}^{m'}) \left( \widetilde{\alpha}_{qr}^{m} - \widetilde{\alpha}_{qr}^{m'}\right)^{2} \max(\widetilde{\rho}_{r}^{m}, \widetilde{\rho}_{r}^{m'})
\]

\begin{figure}[t]
    \centering
    \begin{tikzpicture}
        \tikzstyle{instruct}=[font=\small, text justified, rectangle,draw,fill=yellow!50]
        \tikzstyle{first_col}=[rectangle, text justified, draw,fill=gray!50]
        \tikzstyle{second_col}=[scale=0.55, circle, draw,fill=red!50]
        \tikzstyle{test}=[font=\small, text justified, diamond, aspect=2.5,thick,
        draw=blueps,fill=yellow!50]
        \tikzstyle{es}=[font=\small, text justified, rectangle,draw,rounded corners=4pt,fill=cyanind!25]

        \node[es] (liste) at (0,4) {Supply a collection to partition};
        \node[instruct, text width=5cm, below = 0.45cm of liste] (1-collection) {Fit colBiSBM};
        \node[first_col, right = 0.5cm of 1-collection] (1-col-obj) {};
        \node[instruct, text width=5cm, below = 0.45cm of 1-collection] (dissimi) {Compute a dissimilarity matrix over the collection};
        \node[instruct, text width=5cm, below = 0.45cm of dissimi] (2-sous-collection) {Split the \emph{collection in 2 sub-collections} and fit the colBiSBM};
        \node[second_col, right = 0.25cm of 2-sous-collection] (1-sec-col-obj) {1};
        \node[second_col, right = 0.25cm of 1-sec-col-obj] (1-sec-col-obj) {2};
        \node[test,below = 0.45cm of 2-sous-collection, scale=0.7] (BICL-test) {$\sum_{i=1}^{2} (\text{BIC-L}(\tikz[baseline=-0.25cm]{\node[second_col] {i};} )) > \text{BIC-L}(\tikz[baseline=-0.25cm]{\node[first_col] {};})$?};
        \node[es, right = 0.55cm of BICL-test] (sortie) {Output \tikz{\node[rectangle, draw, fill=gray!50, rounded corners=0pt] {};}};
        \node[es, left = 0.45cm of dissimi, text width = 2cm] (recursion) {Loop over \tikz{\node[second_col] {1};} and \tikz{\node[second_col] {2};} };

        \tikzstyle{suite}=[->,>=stealth,thick,rounded corners=4pt]
        \draw[suite] (liste) -- (1-collection);
        \draw[suite] (1-collection) -- (dissimi);
        \draw[suite] (dissimi) -- (2-sous-collection);
        \draw[suite] (2-sous-collection) -- (BICL-test);
        \draw[suite] (BICL-test) -| node[near start, above, fill=none] {Yes} (recursion);
        \draw[suite] (recursion.east) -- (dissimi.west);
        \draw[suite] (BICL-test) -- node[near start, above, fill=none] {No} (sortie);
    \end{tikzpicture}
    \caption{Network clustering procedure}
    \label{fig:netclustering-procedure}
\end{figure}

The above figure (\ref{fig:netclustering-procedure}) shows a condensed
explanation of the network clustering algorithm.

The idea is to adjust the colBiSBM model over the full collection of $M$
networks and then compute the dissimilarity matrix between all networks of the
collection. We obtain the collection $\mathcal{G} = \{\mathcal{M}\}$ the
trivial partition in a unique group.

Then using the \emph{Kmeans} we split the collection in two sub-collections
with the dissimilarity matrix. The two sub-collections are fitted and we
compute the score of this new partition $\mathcal{G}^{*} = \{G_1, G_2\}$.

If $Sc(\mathcal{G}^{*}) > Sc(\mathcal{G})$ then we repeat the same procedure on
$G_1$ and $G_2$. Else we return $\mathcal{G}$.

We illustrate our capacity to perform a partition of a collection for all
colBiSBM models in %\ref{sec:network-clustering-of-simulated-networks}.

\section{Model identifiability}
\label{sec:model-identifiability}
% Ici l'identifiabilité du modèle
The goal here is to prove that if $\ell(\bm{X};\bm{\theta}) = \ell(\bm{X};\bm{\theta}')$ for any collection $\bm{X}$ then $\bm{\theta} = \bm{\theta}'$.

Following the proof proposed by~\cite{chabert-liddellLearningCommonStructures2024a}, that adapted it to the collection case and~\cite{keribinEstimationSelectionLatent2015} that extended the result of~\cite{celisseConsistencyMaximumlikelihoodVariational2012} to the LBM Bernoulli model,
we obtain the following proof of identifiability for the $iid$-colBiSBM:
\begin{theorem}[Identifiability of $iid$-colBiSBM]
    \label{thm:identifiability-iid}
    The parameters $(\bm{\pi}, \bm{\rho}, \bm{\alpha})$ are
    identifiable up to a label switching of the blocks if those
    conditions are achieved:
    \begin{itemize}
        \item[(1.1)] $\exists m^*\in\{1,\dots,M\} : n^1_{m^*} \geq 2 Q_2 - 1~\text{and}~n^2_{m^*} \geq 2 Q_1 - 1$.
        \item[(1.2)] $\forall 1\leq q \leq Q_1, \pi_q > 0$
              and the coordinates of vector $\bm{\rho}
                  {X^{m^*}}^T$ are distinct (where ${X^{m^*}}^T$ is the transpose of $X^{m^*}$).
        \item[(1.3)] $\forall 1\leq r \leq Q_2, \rho_r > 0$
              and the coordinates of vector $\bm{\pi}
                  X^{m^*}$ are distinct.
    \end{itemize}
\end{theorem}