789 lines
42 KiB
TeX
789 lines
42 KiB
TeX
\addtocounter{customchapter}{1}
|
|
\chapter[Structure detection in bipartite collection]{Structure detection in a collection of bipartite networks}
|
|
\label{chap:struct-detection}
|
|
\section{Definition of a collection}
|
|
\label{sec:definition-of-a-collection}
|
|
|
|
We define a collection of bipartite networks as
|
|
$\bm{X} = (X^1,\dots X^m,\dots, X^M)$
|
|
the collection of incidence matrix. Moreover, all the networks in the
|
|
collection have the same valuation of the interactions (e.g., they are
|
|
all binary).
|
|
|
|
\section{Separate BiSBM (sep-BiSBM)}\label{sec:separate-bisbm-sepbisbm}
|
|
|
|
A first approach to deal with a collection of networks is to adjust separate
|
|
BiSBM for each network of the collection.
|
|
|
|
For network $m$, let $n_1^m$ (resp. $n_2^m$) be the number of nodes in row
|
|
(resp. column) divided into $Q_1^m$ row clusters (resp. $Q_2^m$ column
|
|
clusters).\\ Let $Z^m=(Z^m_1, \dots, Z^m_i, \dots, Z^m_{n_1^m})$ and
|
|
$W^m = (W^m_1, \dots,W^m_j, \dots, W^m_{n_2^m})$ be independent latent variables
|
|
such that $Z^m_i = q$ if row node $i$ of network $m$ belongs to row cluster $q$
|
|
($q\in\{1,\dots,Q_1^m\}$) and
|
|
$W^m_j = r$ if column node $j$ of network $m$ belong to column block $r$
|
|
($r\in\{1,\dots,Q_2^m\}$). And we have
|
|
\begin{align}\label{eqn:lbm-block-membership-prob}
|
|
\mathbb{P}(Z_i^m=q)=\pi_q^m, & & \mathbb{P}(W_j^m=r)=\rho_r^m
|
|
\end{align}
|
|
where $\pi_q^m > 0$, $\rho_r^m > 0$, $\sum_{q=1}^{Q_1^m}\pi_q^m = 1$ and
|
|
$\sum_{r=1}^{Q_2^m}\rho_r^m = 1$. Given the latent variables
|
|
$Z^m, W^m$, the $X_{ij}^m$s are assumed to be independent and distributed
|
|
as
|
|
|
|
\begin{align}\label{eqn:lbm-conditional-to-latent}
|
|
X_{ij}^m|Z_i^m = q,W_j^m = r \sim \mathcal{F}(.;\alpha_{qr}^m)
|
|
\end{align}
|
|
where $\mathcal{F}$ is referred to as the emission distribution. $\mathcal{F}$ is chosen to
|
|
be the Bernoulli distribution for binary interactions, and the Poisson
|
|
distribution for weighted interactions such as counts. Let $f$ be the density of
|
|
the emission distribution, then:
|
|
|
|
\begin{equation}\label{eqn:lbm-emission}
|
|
\log f(X^m_{ij};\alpha_{qr}^m) =
|
|
\begin{cases}
|
|
X_{ij}^m \log(\alpha_{qr}^m) + (1-X_{ij}^m) \log(1-\alpha_{qr}^m) & \text{for Bernoulli emission} \\
|
|
-\alpha_{qr}^m + X_{ij}^m \log(\alpha_{qr}^m) - \log(X_{ij}^m!) & \text{for Poisson emission}
|
|
\end{cases}
|
|
\end{equation}
|
|
|
|
Equations~\eqref{eqn:lbm-block-membership-prob},
|
|
\eqref{eqn:lbm-conditional-to-latent} and \eqref{eqn:lbm-emission} defines the
|
|
BiSBM model and we will now use a short notation:
|
|
|
|
\begin{align}
|
|
\tag{\emph{sep-BiSBM}}
|
|
X^m \sim \mathcal{F}\text{-BiSBM}_{n_1^m,n_2^m}(Q_1^m, Q_2^m, \bm{\pi^m}, \bm{\rho^m}, \bm{\alpha^m}) & & \forall m = 1, \dots M
|
|
\end{align}
|
|
where $\mathcal{F}$ encodes the emission distribution, $n_1^m,n_2^m$ are the number of row
|
|
and column nodes, $Q_1^m, Q_2^m$ are the number of row and column blocks in
|
|
network $m$, $\bm{\pi}^m~=~{(\pi^m_q)}_{q=1,\dots,Q_1^m}$ and
|
|
$\bm{\rho}^m~=~{(\rho^m_r)}_{r=1,\dots,Q_2^m}$ are the vectors of their
|
|
proportions. The $Q_1^m \times Q_2^m$ matrix
|
|
$\bm{\alpha}^m = {(\alpha^m_{qr})}_{\substack{q = 1,\dots,Q_1^m \\ r = 1,\dots,Q_2^m}}$
|
|
are the connectivity parameters, i.e.~the parameters of the emission distribution.
|
|
$\alpha^m_{qr}\in\mathcal{A}_{\mathcal{F}}$ where, for the Bernoulli
|
|
(resp. Poisson) emission distribution, $\mathcal{A}_{\mathcal{F}} = (0,1)$ (resp.
|
|
$\mathcal{A}_{\mathcal{F}} = \mathbb{R}^{*+}$). In this $sep$-BiSBM model each
|
|
network $m$ is assumed to follow a BiSBM with its own parameters ($\bm{\pi}^m,
|
|
\bm{\rho}^m, \bm{\alpha}^m$).
|
|
% DONE Finish explaining
|
|
|
|
\section{Definition of the \emph{colBiSBM} models}\label{sec:definition-of-the-colbisbm-models}
|
|
% Here are some common notations and conventions that we will use in the following
|
|
% sections.
|
|
|
|
\subsection{A collection of iid bipartite SBM}\label{ssec:a-collection-of-i-i-d-bipartite-sbm}
|
|
As for \emph{colSBM} this first model is the most constrained. It assumes that
|
|
all the networks are the independent realizations of the same $Q_1$-$Q_2$-BiSBM
|
|
with identical parameters. The \emph{iid}-colBiSBM is defined as follows:
|
|
|
|
\begin{align}
|
|
\tag{\emph{iid}-colBiSBM}
|
|
X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi}, \bm{\rho}, \bm{\alpha}), & & \forall m = 1, \dots M
|
|
\end{align}
|
|
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
|
|
$\pi_q \in \left( 0,1 \right], \sum_{q=1}^{Q_1} \pi_q = 1 $ and $\rho_r \in \left( 0,1 \right], \sum_{r=1}^{Q_2} \rho_r = 1 $.
|
|
This model involves $(Q_1 - 1) + (Q_2 - 1) + Q_1\times Q_2$ parameters, the two
|
|
first terms corresponding to block proportions on the row and column
|
|
and the third term to connectivity parameters.
|
|
|
|
But the assumption that block proportions are the same among the networks is a
|
|
strong assumption. In plant-pollinator networks, the proportion of specialist
|
|
species can differ between networks and thus the model may benefit from not
|
|
having the same block proportions but sharing a common connectivity structure.
|
|
The following models relaxes this assumption on either row, column or both.
|
|
|
|
\subsection{A collection of bipartite SBM with varying block size on either rows or columns}\label{ssec:a-collection-of-bipartite-sbm-with-varying-block-size-on-either-rows-or-columns}
|
|
% DONE Finish explaining
|
|
|
|
$\pi$-colBiSBM model still assumes that the networks share a common connectivity
|
|
structure represented by $\bm{\alpha}$ but that each network has its own row
|
|
block proportions. For $m \in \{1,\dots,M\}$, the $X^m$ are independent and
|
|
\begin{align}
|
|
\tag{\emph{$\pi$}-colBiSBM}
|
|
X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi^m}, \bm{\rho}, \bm{\alpha}), & & \forall m = 1, \dots, M
|
|
\end{align}
|
|
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
|
|
$\pi^m_q \in \left[ 0,1 \right], \sum_{q=1}^{Q_1} \pi^m_q~=~1, \forall m \in \{1,\dots,M\}$ and $\rho_r \in \left( 0,1 \right], \sum_{r=1}^{Q_2} \rho_r = 1 $.
|
|
This model is more flexible than the iid-colBiSBM as it allows the row block
|
|
proportions to vary between networks and even to be null
|
|
($\pi^m_q\in\left[ 0,1 \right]$): if $\pi_q^m = 0$ then the
|
|
block $q$ is not represented in the network $m$. The connectivity structure is
|
|
thus a subset of a large connectivity structure common to all networks. We face
|
|
the same problems as~\cite{chabert-liddellLearningCommonStructures2024a} and
|
|
adapt the support $S$ they define for the $\pi$-colSBM to the bipartite case by
|
|
having $S^1$ of size $M\times Q_1$ the support for the rows and $S^2$ of size
|
|
$M\times Q_2$ the support for the columns. Thus
|
|
$S^1_{mq} = \mathbbb{1}_{\pi^m_q > 0}$ and
|
|
$S^2_{mr} = \mathbbb{1}_{\rho^m_r > 0}$. In this case, $S^2 = \bm{1}$, because
|
|
there is no freedom on the column dimension.
|
|
|
|
For a given number of blocks $Q_1$, $Q_2$ and matrix $S^1$ ($S^2$ being in this
|
|
case the matrix full of ones), the number of parameters is:
|
|
\begin{equation*}
|
|
\text{NP}(\pi\text{-colBiSBM}) = \sum_{m=1}^{M}\Bigg( \sum_{q=1}^{Q_1} S^1_{mq} - 1 \Bigg) + (Q_2 - 1) + \sum_{\substack{q=1,\dots,Q_1 \\ r=1,\dots,Q_2}} \mathbbb{1}_{{(S^{1\prime}S^2)}_{qr}>0}
|
|
\end{equation*}
|
|
The first term corresponds to the non-null block proportions in each network.
|
|
The third quantity accounts for the fact that some blocks may never be
|
|
represented simultaneously in any network, so the corresponding connection
|
|
parameters $\alpha_{qr}$ are not useful for defining the model.
|
|
|
|
$\rho$-colBiSBM model still assumes that the networks share a common connectivity
|
|
structure represented by $\bm{\alpha}$ but that each network has its own column
|
|
block proportions. For $m \in \{1,\dots,M\}$, the $X^m$ are independent and
|
|
\begin{align}
|
|
\tag{\emph{$\rho$}-colBiSBM}
|
|
X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi}, \bm{\rho^m}, \bm{\alpha}), & & \forall m = 1, \dots, M
|
|
\end{align}
|
|
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
|
|
$\pi_q \in \left( 0,1 \right], \sum_{q=1}^{Q_1} \pi_q = 1 $ and
|
|
$\rho^m_r \in \left[ 0,1 \right], \sum_{r=1}^{Q_2} \rho^m_r = 1 $.
|
|
This model is more flexible than the iid-colBiSBM as it allows
|
|
proportions to vary between networks and even to be null
|
|
($\rho^m_r\in\left[ 0,1 \right]$): if $\rho_r^m = 0$
|
|
then the column block $r$ is not represented in the network $m$.
|
|
|
|
\enquote{Mirroring} the formulas for the $\pi$-colBiSBM we relax the constraints on
|
|
the column dimension.
|
|
|
|
For a given number of blocks $Q_1$, $Q_2$ and matrix $S^2$ ($S^1$ being in this
|
|
case the matrix full of ones), the number of parameters is:
|
|
\begin{equation*}
|
|
\text{NP}(\rho\text{-colBiSBM}) = (Q_1 - 1) + \sum_{m=1}^{M}\Bigg( \sum_{r=1}^{Q_2} S^2_{mr} - 1 \Bigg) + \sum_{\substack{q=1,\dots,Q_1 \\ r=1,\dots,Q_2}} \mathbbb{1}_{{(S^{1\prime}S^2)}_{qr}>0}
|
|
\end{equation*}
|
|
|
|
$\pi\rho$-colBiSBM model still assumes that the networks share a common connectivity
|
|
structure represented by $\bm{\alpha}$ but that each network has its own row and
|
|
column block proportions, it is the least constrained model.
|
|
For $m \in \{1,\dots,M\}$, the $X^m$ are independent and
|
|
\begin{align}
|
|
\tag{\emph{$\pi\rho$}-colBiSBM}
|
|
X^m \sim \mathcal{F}-BiSBM_{n_1^m,n_2^m} (Q_1, Q_2, \bm{\pi^m}, \bm{\rho^m}, \bm{\alpha}), & & \forall m = 1, \dots, M
|
|
\end{align}
|
|
where $\forall (q,r) \in \{1,\dots,Q_1\}\times\{1,\dots,Q_2\}$, $\alpha_{qr} \in \mathcal{A}_{\mathcal{F}}$,
|
|
$\pi^m_q \in \left[ 0,1 \right], \sum_{q=1}^{Q_1} \pi^m_q~=~1, \forall m \in \{1,\dots,M\}$ and
|
|
$\rho^m_r \in \left[ 0,1 \right], \sum_{r=1}^{Q_2} \rho^m_r = 1 $.
|
|
|
|
For a given number of blocks $Q_1$, $Q_2$ and matrices $S^1$, $S^2$, the number
|
|
of parameters is:
|
|
\begin{equation*}
|
|
\text{NP}(\pi\rho\text{-colBiSBM}) = \sum_{m=1}^{M}\Bigg( \sum_{q=1}^{Q_1} S^1_{mq} - 1 \Bigg) + \sum_{m=1}^{M}\Bigg( \sum_{r=1}^{Q_2} S^2_{mr} - 1 \Bigg) + \sum_{\substack{q=1,\dots,Q_1 \\ r=1,\dots,Q_2}} \mathbbb{1}_{{(S^{1\prime}S^2)}_{qr}>0}
|
|
\end{equation*}
|
|
|
|
\section{Variational estimation of the parameters}\label{sec:variational-estimation-of-the-parameters}
|
|
|
|
In practice, the estimation of the likelihood is not tractable. Following the
|
|
classical approach defined in~\cite{daudinMixtureModelRandom2008} we use a
|
|
variational version of the Expectation Maximization (VEM) algorithm.
|
|
|
|
We maximize a variational lower bound of the log-likelihood of the observed
|
|
data, the so-called Evidence Lower Bound (or ELBO), by approximating
|
|
$p(\bm{Z,W}|\bm{X};\bm{\theta})$ with a distribution on
|
|
$\bm{Z}$ and $\bm{W}$ named $\mathcal{R}$ defined as $\mathcal{R} =
|
|
\otimes_{m=1}^M \mathcal{R}_m$.\
|
|
|
|
The lower bound is defined as:
|
|
\begin{equation*}
|
|
\mathcal{J}(\mathcal{R};\bm{\theta}) \coloneqq \sum_{m=1}^{M} \bigg( \mathbb{E}_{\mathcal{R}_m}[\ell(X^m,Z^m,W^m;\bm{\theta})] + \mathcal{H}(\mathcal{R}_m) \bigg) \leq \ell(\bm{X};\bm{\theta})
|
|
\end{equation*}
|
|
|
|
$(Z^m_i)_{i=1\dots n_1^m}$ and $(W^m_j)_{j=1\dots n_2^m}$ are
|
|
redefined using the \emph{one-hot encoded} conversion (i.e., $Z_i^m = q
|
|
\rightarrow Z_{iq}^m = 1$ and $W_j^m = r \rightarrow W_{jr}^m = 1$).\\ % W_{jr\prime}^m pour r != r égal 0
|
|
|
|
% TODO Demander une confirmation des formules
|
|
|
|
When $\mathcal{R}_m$ is issued from the set of the factorizable distributions,
|
|
we denote $\tau_{iq}^{1,m} = \mathbb{P}_{\mathcal{R}_m}(Z_{iq}^m = 1|X_{i\bullet}^m)$
|
|
and $\tau_{jr}^{2,m} = \mathbb{P}_{\mathcal{R}_m}(W_{jr}^m = 1|X_{\bullet j}^m)$, thus
|
|
we have: $\mathbb{P}_{\mathcal{R}_m} (Z_{iq}^m = 1, W_{jr}^m = 1|X^m) =
|
|
\mathbb{P}_{\mathcal{R}_m}(Z_{iq}^m = 1|X_{i\bullet}^m) {\color{red}\times}
|
|
\mathbb{P}_{\mathcal{R}_m}(W_{jr}^m = 1|X_{\bullet j}^m) = \tau_{iq}^{1,m}
|
|
{\color{red}\times} \tau_{jr}^{2,m}$.
|
|
|
|
The formula for the entropy per network is thus:
|
|
\begin{equation*}
|
|
\mathcal{H}(\mathcal{R}_m) = - \sum_{i=1}^{n_1^m} \tau_{iq}^{1,m} \log \tau_{iq}^{1,m} - \sum_{j=1}^{n_2^m} \tau_{jr}^{2,m} \log \tau_{jr}^{2,m}
|
|
\end{equation*}
|
|
|
|
And the expectation of the completed log-likelihood under the $\mathcal{R}_m$
|
|
variational distribution for network $m$ is:
|
|
\begin{align*}
|
|
\mathbb{E}_{\mathcal{R}_m}[\ell(X^m,Z^m,W^m;\bm{\theta})] = \sum_{i = 1}^{n_1^m}\sum_{j=1}^{n_2^m}\sum_{q \in \mathcal{Q}_1^m} \sum_{r \in \mathcal{Q}_2^m} \tau_{iq}^{1,m} \tau_{jr}^{2,m} \log f(X^{m}_{ij}; \alpha_{qr}) \\
|
|
+ \sum_{i=1}^{n_1^m} \sum_{q \in \mathcal{Q}_1^m} \tau_{iq}^{1,m} \log \pi_{\color{black}q}^{\color{gray}m} + \sum_{j=1}^{n_2^m} \sum_{r \in \mathcal{Q}_2^m} \tau_{jr}^{2,m} \log \rho_{\color{black}r}^{\color{gray}m}
|
|
\end{align*}
|
|
with $\mathcal{Q}_1^m = \{q\in \{1 \dots, Q_1\}|\pi_q^m > 0\}$ and
|
|
$\mathcal{Q}_2^m = \{r\in \{1 \dots, Q_2\}|\rho_r^m > 0\}$
|
|
And thus the lower bound becomes:
|
|
|
|
\begin{align*}
|
|
\mathcal{J}(\bm{\tau};\bm{\theta}) \coloneqq \sum_{m=1}^{M} \bigg(\sum_{i = 1}^{n_1^m}\sum_{j=1}^{n_2^m}\sum_{q \in \mathcal{Q}_1^m} \sum_{r \in \mathcal{Q}_2^m} \tau_{iq}^{1,m} \tau_{jr}^{2,m} \log f(X^{m}_{ij}; \alpha_{qr}) \\
|
|
+ \sum_{i=1}^{n_1^m} \sum_{q \in \mathcal{Q}_1^m} \tau_{iq}^{1,m} \log \pi_{\color{black}q}^{\color{gray}m} + \sum_{j=1}^{n_2^m} \sum_{r \in \mathcal{Q}_2^m} \tau_{jr}^{2,m} \log \rho_{\color{black}r}^{\color{gray}m} \\
|
|
- \sum_{i=1}^{n_1^m} \tau_{iq}^{1,m} \log \tau_{iq}^{1,m} - \sum_{j=1}^{n_2^m} \tau_{jr}^{2,m} \log \tau_{jr}^{2,m} \bigg) \color{black}
|
|
\end{align*}
|
|
|
|
where we identify the variational distribution $\mathcal{R}$ with its parameter
|
|
$\bm{\tau}$. \\
|
|
|
|
% \begin{equation*}
|
|
% \mathcal{J}(\mathcal{R};\bm{\theta}) \coloneqq \mathbb{E}_{\mathcal{R}}[\ell(\bm{X},\bm{Z},\bm{W};\bm{\theta})] + \mathcal{H}(\bm{Z,W}) \leq \ell(\bm{X};\bm{\theta})
|
|
% \end{equation*}
|
|
|
|
The VEM algorithm alternates between two steps, the variational E step and the
|
|
M step. The E steps consists in optimizing $\mathcal{J}(\bm{\tau};\bm{\theta})$
|
|
for a current value of $\bm{\theta}$ with respect to $\bm{\tau}$. And the M
|
|
step consists of maximizing $\mathcal{J}(\bm{\tau};\bm{\theta})$ with respect
|
|
to $\bm{\theta}$ and for a given variational distribution $\bm{\tau}$.
|
|
|
|
\subsection{Variational E step}
|
|
\label{ssec:variational-e-step}
|
|
|
|
At this step we maximize with respect to the variational distribution
|
|
$\bm{\tau}$: $$\widehat{\bm{\tau}}^{(t+1)} = \arg \max_{\bm{\tau}}
|
|
\mathcal{J}(\mathcal{\bm{\tau}},\bm{\widehat{\theta}}^{(t)}).$$
|
|
|
|
And we obtain the following formulae for the $\bm{\tau^m}$:
|
|
\begin{equation*}
|
|
\begin{cases}
|
|
\widehat{\tau}_{iq}^{1,m} \propto \widehat{\pi}_{q}^{m(t)} \prod_{j=1}^{n_2^m}\prod_{r\in\mathcal{Q}_2^m} f(X_{ij}^m;\widehat{\alpha}_{qr}^{(t)})^{\widehat{\tau}_{jr}^{2,m(t+1)}} & \forall i = 1, \dots , n_1^m, q \in \mathcal{Q}_1^m \\
|
|
\widehat{\tau}_{jr}^{2,m} \propto \widehat{\rho}_{r}^{m(t)} \prod_{i=1}^{n_1^m}\prod_{q\in\mathcal{Q}_1^m} f(X_{ij}^m;\widehat{\alpha}_{qr}^{(t)})^{\widehat{\tau}_{iq}^{1,m(t+1)}} & \forall j = 1, \dots , n_2^m, r \in \mathcal{Q}_2^m
|
|
\end{cases}
|
|
\end{equation*}
|
|
|
|
which are used to update iteratively the values by a fixed point algorithm with
|
|
only one step.
|
|
|
|
\subsection{M step of the algorithm}
|
|
\label{ssec:m-step-of-the-algorithm}
|
|
At iteration $(t)$ the M-step maximizes the variational bound with respect to
|
|
the model parameters $\bm{\theta}$:
|
|
\[
|
|
\widehat{\bm{\theta}}^{(t+1)} = \arg \max_{\bm{\theta}} \mathcal{J}(\mathcal{\bm{\widehat{\tau}}}^{(t+1)},\bm{\theta})
|
|
\]
|
|
|
|
The following quantities are involved in the obtained formulae:
|
|
|
|
\begin{align*}
|
|
e^{m}_{qr} = \sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \tau_{iq}^{1,m} \tau_{jr}^{2,m} X_{ij}^m
|
|
& , & n^{m}_{qr} = \sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \tau_{iq}^{1,m} \tau_{jr}^{2,m}
|
|
& , & n^{1,m}_{q} = \sum_{i=1}^{n_1^m} \tau_{iq}^{1,m}
|
|
& , & n^{2,m}_{r} = \sum_{j=1}^{n_2^m} \tau_{jr}^{2,m}
|
|
\end{align*}
|
|
|
|
The block proportions, in free mixture models,
|
|
$(\pi_q^m)_{q\in\mathcal{Q}_1^m}, (\rho_r^m)_{r\in\mathcal{Q}_2^m}$ are
|
|
estimated as
|
|
\begin{align*}
|
|
\widehat{\pi}_q^{m}= \frac{n^{1,m}_{q}}{n_1^m} & & \text{for } \pi\text{-colBiSBM} \text{ and } \pi\rho\text{-colBiSBM} \\
|
|
\widehat{\rho}_r^{m}= \frac{n^{2,m}_{r}}{n_2^m} & & \text{for } \rho\text{-colBiSBM} \text{ and } \pi\rho\text{-colBiSBM}
|
|
\end{align*}
|
|
while on the other hand,
|
|
\begin{align*}
|
|
\widehat{\pi}_q = \frac{\sum_{m=1}^{M} n^{1,m}_{q}}{\sum_{m=1}^{M} n_1^m} & & \text{for } iid\text{-colBiSBM} \text{ and } \rho\text{-colBiSBM} \\
|
|
\widehat{\rho}_r = \frac{\sum_{m=1}^{M} n^{2,m}_{r}}{\sum_{m=1}^{M} n_2^m} & & \text{for } iid\text{-colBiSBM} \text{ and } \pi\text{-colBiSBM}
|
|
\end{align*}
|
|
the parameters take into account all the networks at the same time.
|
|
The connectivity parameters $\alpha_{qr}$ for all models are estimated as the
|
|
ratio of the number of observed interactions between row block $q$ and column block $r$
|
|
among all networks over the number of possible interactions:
|
|
\begin{align*}
|
|
\widehat{\alpha}_{qr} = \frac{\sum_{m=1}^{M} e^{m}_{qr}}{\sum_{m=1}^{M} n^{m}_{qr}}
|
|
\end{align*}
|
|
|
|
Please note that those formulae can vary with the emission distribution used.
|
|
|
|
\section{Model selection}\label{sec:model-selection}
|
|
% DONE
|
|
% Adapt bicl, methode explo car defi
|
|
% 1 bicl 2 model exploration
|
|
% Citer la conclusion de l'article de St Clair discussion sur bipartite
|
|
The section \ref{sec:variational-estimation-of-the-parameters} explains how we
|
|
estimate the parameters of the model for \emph{fixed} number of blocks
|
|
$Q_1$ and $Q_2$. But as they are in general not known we need to explore the
|
|
latent space to find the \emph{best} values.
|
|
As discussed in~\cite{chabert-liddellLearningCommonStructures2024a}, the
|
|
algorithmic aspect becomes complex when dealing with the bipartite case. Due to
|
|
the latent space being $\mathbb{N}^2$, conducting a complete
|
|
exploration of the latent space is practically infeasible. Therefore, in
|
|
addition to adapting the existing formulas, our contribution to addressing this
|
|
challenge involved making significant choices, which are outlined below.
|
|
|
|
The below procedures are implemented in the \emph{colSBM} package, available on
|
|
\url{https://github.com/Chabert-Liddell/colSBM}.
|
|
|
|
\subsection{The \emph{Bayesian Information Criterion like} (BIC-L) criterion for model selection}
|
|
\label{ssec:the-bic-l-criterion-for-model-selection}
|
|
To select the best number of blocks we need a criterion to
|
|
measure adequacy between our model and data. The ELBO might seem a good
|
|
criterion at first but as for the likelihood, the more complex the model, the
|
|
higher it gets. And thus a good criterion should make a \emph{trade-off} between
|
|
fitting to data and model complexity.
|
|
|
|
The Integrated Classified Likelihood (ICL) is a well-established tool in the SBM
|
|
and LBM domains for selecting the appropriate number of blocks. It was
|
|
introduced by~\cite{biernackiAssessingMixtureModel2000,
|
|
daudinMixtureModelRandom2008}. The ICL is derived from an asymptotic
|
|
approximation of the marginal complete likelihood. In this approach, the model
|
|
parameters are integrated out using a prior distribution, resulting in a
|
|
penalized likelihood criterion. By employing the ICL, one can effectively
|
|
determine the optimal number of blocks for the given problem in a systematic
|
|
manner.
|
|
We obtain the following expression
|
|
\[
|
|
\text{ICL} = \max_{\theta} \mathbb{E}_{\widehat{\mathcal{R}}} [\ell(\bm{X,Z,W;\theta})] - \frac{1}{2}\text{pen}
|
|
\]
|
|
with pen the penalties.\\ Using the formula $\mathbb{E}_{\widehat{\mathcal{R}}}
|
|
[\ell(\bm{X,Z,W;\theta})] \approx \ell (\bm{X;\theta}) -
|
|
\mathcal{H(\widehat{R})}$, it becomes clearer, as highlighted in the existing
|
|
literature, that the Integrated Classified Likelihood (ICL) gives preference to
|
|
well-separated blocks by imposing a penalty on the entropy of node grouping.
|
|
However, the objective of our study extends beyond grouping nodes into coherent
|
|
blocks. We also aim to assess the similarity of connectivity patterns across
|
|
different networks. Consequently, we aim to permit models that offer more
|
|
flexible node grouping by not penalizing on entropy.
|
|
|
|
This leads us to formulate a BIC-like criterion in the following manner:
|
|
|
|
\[
|
|
\text{BIC-L} = \max_{\bm{\theta}} \mathbb{E}_{\widehat{\mathcal{R}}} [\ell(\bm{X,Z,W;\theta})] + \mathcal{H(\widehat{R})} - \frac{1}{2}\text{pen} = \max_{\bm{\theta}} \mathcal{J(\widehat{R}, \bm{\theta})} - \frac{1}{2}\text{pen}
|
|
\]
|
|
|
|
We provide below the expression for the penalties for the 4 models that we
|
|
propose.
|
|
\begin{description}
|
|
\item[\textit{iid}-colBiSBM] For the $\bm\pi$ and $\bm\rho$:
|
|
\begin{align*}
|
|
\text{pen}_{\pi}(Q_1) = (Q_1 - 1)\log(\sum_{m=1}^{M}n_{1}^{m}) & , &
|
|
\text{pen}_{\rho}(Q_2) = (Q_2 - 1)\log(\sum_{m=1}^{M}n_{2}^{m})
|
|
\end{align*}
|
|
For the $\bm\alpha$:
|
|
\[\text{pen}_{\alpha}(Q_1, Q_2) = Q_1 \times Q_2 \log(N_M)\]
|
|
with
|
|
\[ N_M = \sum_{m = 1}^{M} n_{1}^{m} \times n_{2}^{m} \]
|
|
And thus the $\text{BIC-L}$ formula is the following:
|
|
\[ \text{BIC-L}(\bm{X},Q_1, Q_2) = \max_{\theta}
|
|
\mathcal{J} (\mathcal{\hat{R}}, \bm{\theta})
|
|
- \frac{1}{2} [\text{pen}_{\pi}(Q_1) + \text{pen}_{\rho}(Q_2) +
|
|
\text{pen}_{\alpha}(Q_1, Q_2)]\]
|
|
\item[$\bm{\pi\rho}$-colBiSBM] The support penalties are
|
|
\begin{align*}
|
|
\text{pen}_{S_1}(Q_1) = -2 \log p_{Q_1} (S_1) & , &
|
|
\text{pen}_{S_2}(Q_2) = -2 \log p_{Q_2} (S_2)
|
|
\end{align*}
|
|
with \begin{align*}
|
|
\textstyle \log p_{Q_1}(S_1) = - M \log(Q_1) - \sum_{m=1}^{M} \log {Q_1
|
|
\choose Q_1^{(m)}}, \\
|
|
\textstyle \log p_{Q_2}(S_2) = - M \log(Q_2) - \sum_{m=1}^{M} \log {Q_2
|
|
\choose Q_2^{(m)}}.
|
|
\end{align*}
|
|
And penalties for the $\bm\rho$ and $\bm\pi$ are
|
|
\[ \text{pen}_{\pi}(Q_1, S_1) = \sum_{m=1}^{M} (Q_{1}^{(m)} - 1)
|
|
\log n_{1}^{m},
|
|
~\text{pen}_{\rho}(Q_2, S_2) = \sum_{m=1}^{M} (Q_{2}^{(m)} - 1)
|
|
\log n_{2}^{m}. \]
|
|
Penalties for the $\bm\alpha$
|
|
\[ \text{pen}_{\alpha}(Q_1, Q_2, S_1, S_2) = (\sum_{q=1}^{Q_1}
|
|
\sum_{r=1}^{Q_2} \mathbbb{1}_{(S_1)'S_2 > 0}) \log (N_M). \]
|
|
And the corresponding BIC-L formula,
|
|
\[
|
|
\begin{aligned}
|
|
\text{BIC-L}(\bm{X},Q_1, Q_2) =
|
|
\max_{S_1,S_2} [
|
|
& \max_{\theta_{S_1,S_2} \in \Theta_{S_1,S_2}} \mathcal{J}(\mathcal{\hat{R}},\theta_{S_1,S_2}) \\
|
|
- \frac{1}{2} & (\text{pen}_{\pi}(Q_1, S_1) + \text{pen}_{\rho}(Q_2, S_2) \\
|
|
& + \text{pen}_{\alpha}(Q_1, Q_2, S_1, S_2) \\
|
|
& + \text{pen}_{S_1}(Q_1) + \text{pen}_{S_2}(Q_2))] \\
|
|
\end{aligned}
|
|
\]
|
|
\end{description}
|
|
|
|
\subsection{Initialization and pairing of the models}
|
|
\label{ssec:initialization-and-pairing-of-the-models}
|
|
The row (resp. column) block memberships are the labels of row (resp. column)
|
|
nodes corresponding to the group to which they were assigned based on their
|
|
connection patterns. This adds another layer of complexity to the model
|
|
selection as we need to find the best $Q_1, Q_2$ and the best memberships for
|
|
each vertex.
|
|
|
|
First to combine the information from the $M$ networks we fit a LBM model
|
|
for each network at the two points $Q = (1, 2)$ and $Q = (2, 1)$. Using the
|
|
previously described VEM algorithm we obtain for each network its parameters
|
|
($\bm{\rho,\pi,\alpha}$).
|
|
We then compute the marginal laws for each dimension, for each network. Then we
|
|
order the network blocks by the probabilities obtained in decreasing order.
|
|
|
|
For the memberships on the columns: $col~order_m = order\left(\pi_m \times
|
|
\alpha_m\right)$.
|
|
|
|
For the memberships on the rows: $row~order_m = order\left(\rho_m \times
|
|
~^{t}(\alpha_m)\right)$.
|
|
|
|
Using this order we relabel the memberships for the $M$ fitted collection of a
|
|
single network.
|
|
We then use the $M$ memberships to compute first $\bm{\tau}$ to fit a collection
|
|
containing the $M$ networks.
|
|
\subsection{Greedy exploration to find an estimation of the mode}\label{ssec:greedy-exploration-to-find-an-estimation-of-the-mode}
|
|
Using the previously fitted models for $Q = (1,2)$ and $Q = (2,1)$ we choose to
|
|
perform a greedy exploration from each of those points to find a first mode.
|
|
|
|
Meaning that for a given $Q = (Q_1, Q_2)$ we will compute all the possible
|
|
memberships for the points $Q \in \{(Q_1 + 1, Q_2),(Q_1, Q_2 + 1),(Q_1 - 1,
|
|
Q_2), (Q_1, Q_2 - 1)\}$, fit the corresponding models and choose the one that
|
|
maximizes the BIC-L as the next point from which to repeat the procedure. We
|
|
repeat the procedure until the BIC-L stops increasing $2$ times in a row.
|
|
|
|
Let us denote the neighborhood in the latent space of a point $Q$ by
|
|
$\mathcal{N}(Q) = Q + {(1,0), (0,1), (-1,0), (0,-1)}$, the four neighbors of $Q$
|
|
in the grid.
|
|
|
|
\begin{algorithm}[H]
|
|
\small
|
|
\caption{Greedy Exploration for Mode Estimation}
|
|
\SetAlgoLined
|
|
\SetKwInOut{Input}{Input}
|
|
\SetKwInOut{Output}{Output}
|
|
|
|
\Input{Fitted models for $Q = (1,2)$ and $Q = (2,1)$}
|
|
\Output{Estimation of the mode using greedy exploration}
|
|
|
|
\BlankLine
|
|
\For{$Q_{\text{start}} \in \{(1,2), (2,1)\}$}{ % and $Q = (2,1)$ as starting point
|
|
\BlankLine
|
|
Initialize $\text{BIC-L}_{\text{max}} \leftarrow \text{BIC-L}(Q_{\text{start}})$\\
|
|
Initialize $consecutive\_count$ as 0
|
|
|
|
\BlankLine
|
|
$Q_{\text{curr}} \leftarrow Q_{\text{start}}$
|
|
|
|
\While{$consecutive\_count < 2$}{
|
|
Fit models in $\mathcal{N}(Q_{\text{curr}})$\;
|
|
|
|
\BlankLine
|
|
$Q \leftarrow \arg\max_{Q \in \mathcal{N}(Q_{\text{curr}})} \text{BIC-L}(Q)$
|
|
|
|
$\text{BIC-L}_{\text{curr}} \leftarrow \max_{Q \in \mathcal{N}(Q_{\text{curr}})} \text{BIC-L}(Q)$
|
|
\BlankLine
|
|
\If{$\text{BIC-L}_{\text{curr}} > \text{BIC-L}_{\text{max}}$}{
|
|
$\text{BIC-L}_{\text{max}} \leftarrow \text{BIC-L}_{\text{curr}}$\\
|
|
$consecutive\_count \leftarrow 0$
|
|
}
|
|
\Else{
|
|
$consecutive\_count \leftarrow consecutive\_count + 1$
|
|
}
|
|
}
|
|
}
|
|
\BlankLine
|
|
\textbf{Output:} Estimation of the mode using greedy exploration
|
|
\end{algorithm}
|
|
|
|
When this first estimation of the BIC-L mode has been find we apply the moving
|
|
window on it.
|
|
|
|
\subsection{Moving window to update the block memberships and the BIC-L}
|
|
\label{ssec:moving-window-to-update-the-block-memberships-and-the-bic-l}
|
|
The \emph{moving window} is used to update the block memberships on rows and
|
|
columns and fit new models with those changes.
|
|
To define the window, we use a center point and a \emph{depth}, giving us the
|
|
bottom left corner ($Q_{1,center} - depth, Q_{2,center} - depth$) and the top right corner of the
|
|
window ($Q_{1,center} + depth, Q_{2,center} + depth$). All the points in this square will be
|
|
updated and contribute to the update of the others.
|
|
This procedure is repeated until convergence of the BIC-L.
|
|
|
|
The figure \ref{fig:moving-window-procedure} illustrates the procedure. It
|
|
consists of two alternating steps:
|
|
\begin{itemize}
|
|
\item the \emph{forward pass}: repeatedly computing the possible splits to fit the
|
|
current model.
|
|
\item the \emph{backward pass}: computing the possible merges to fit the current
|
|
model.
|
|
\end{itemize}
|
|
|
|
\begin{algorithm}[t]
|
|
\small
|
|
\caption{Moving Window Procedure}
|
|
\SetAlgoLined
|
|
\SetKwInOut{Input}{Input}
|
|
\SetKwInOut{Output}{Output}
|
|
|
|
\Input{Center point $(Q_{1,\text{center}}, Q_{2,\text{center}})$, depth}
|
|
\Output{Best model with maximum BIC-L in the window}
|
|
|
|
\BlankLine
|
|
Define bottom left corner $(Q_{1,\text{center}} - \text{depth}, Q_{2,\text{center}} - \text{depth})$\\
|
|
Define top right corner $(Q_{1,\text{center}} + \text{depth}, Q_{2,\text{center}} + \text{depth})$
|
|
|
|
\BlankLine
|
|
\While{not converged}{
|
|
\textbf{Forward pass:}
|
|
|
|
\For{$Q_1 \in \left[ Q_{1,\text{center}} - \text{depth} ; Q_{1,\text{center}} + \text{depth} \right]$}{
|
|
\For{$Q_2 \in \left[ Q_{2,\text{center}} - \text{depth}; Q_{2,\text{center}} + \text{depth} \right] $}{
|
|
Compute possible splits from predecessors $(Q_1 - 1, Q_2)$ and $(Q_1, Q_2 - 1)$\\
|
|
Among the model generated from the splits choose the best in regard of the BIC-L
|
|
}
|
|
}
|
|
|
|
\BlankLine
|
|
\textbf{Backward pass:}
|
|
|
|
\For{$Q_1 \in \left[ Q_{1,\text{center}} + \text{depth} ; Q_{1,\text{center}} - \text{depth} \right]$}{
|
|
\For{$Q_2 \in \left[ Q_{2,\text{center}} + \text{depth}; Q_{2,\text{center}} - \text{depth} \right] $}{
|
|
Compute possible merges from predecessors $(Q_1 + 1, Q_2)$ and $(Q_1, Q_2 + 1)$\\
|
|
Among the model generated from the merges choose the best in regard of the BIC-L
|
|
}
|
|
}
|
|
|
|
\BlankLine
|
|
Choose the mode as the one that maximizes the BIC-L
|
|
}
|
|
|
|
\BlankLine
|
|
\textbf{Output:} Best model with maximum BIC-L in the window
|
|
\end{algorithm}
|
|
|
|
\begin{figure}[t]
|
|
\definecolor{mypurple}{RGB}{128,0,128}
|
|
\begin{subfigure}[b]{0.48\textwidth}
|
|
\begin{tikzpicture}[scale=1.5]
|
|
\tikzstyle{model}=[circle,draw=none,fill=gray, thick]
|
|
\tikzstyle{split}=[>=stealth,->,thick, draw=blueps]
|
|
\tikzstyle{merge}=[>=stealth,->,thick, draw=red]
|
|
\draw[step=1cm, help lines] (-2,-2) grid (2,2);
|
|
\node[model] (mode) at (0,0) {{\color{red}X}};
|
|
|
|
\draw[color=red, line width=1pt, dashed] (-1.5,-1.5) rectangle ++(3,3);
|
|
|
|
\node[model] (bottom_left) at (-1,-1) {};
|
|
\node[model, draw=blueps] (row_1) at (0,-1) {};
|
|
\node[model, draw=blueps] (col_1) at (-1,0) {};
|
|
\node[model, draw=blueps] (row_2) at (1,-1) {};
|
|
\node[model, draw=blueps] (col_2) at (-1,1) {};
|
|
\node[model, draw=blueps] (mode) at (0,0) {{\color{red}X}};
|
|
\node[model, draw=blueps] (row_3) at (1,0) {};
|
|
\node[model, draw=blueps] (col_3) at (0,1) {};
|
|
\node[model, draw=blueps] (top_right) at (1,1) {};
|
|
|
|
\draw[split] (bottom_left) -- (col_1);
|
|
\draw[split] (-1.75,0) -- (col_1);
|
|
\draw[split] (bottom_left) -- (row_1);
|
|
\draw[split] (0,-1.75) -- (row_1);
|
|
|
|
\draw[split] (col_1) -- (col_2);
|
|
\draw[split] (-1.75,1) -- (col_2);
|
|
\draw[split] (row_1) -- (row_2);
|
|
\draw[split] (1,-1.75) -- (row_2);
|
|
\draw[split] (row_1) -- (mode);
|
|
\draw[split] (col_1) -- (mode);
|
|
|
|
\draw[split] (col_2) -- (col_3);
|
|
\draw[split] (row_2) -- (row_3);
|
|
\draw[split] (mode) -- (row_3);
|
|
\draw[split] (mode) -- (col_3);
|
|
|
|
\draw[split] (col_3) -- (top_right);
|
|
\draw[split] (row_3) -- (top_right);
|
|
\end{tikzpicture}
|
|
\caption[forward]{Visualisation of a forward pass of moving window}\label{fig:visualisation-forward-pass}
|
|
\end{subfigure}
|
|
\hfill
|
|
\begin{subfigure}[b]{0.48\textwidth}
|
|
\begin{tikzpicture}[scale=1.5]
|
|
\tikzstyle{model}=[circle,draw=none,fill=gray]
|
|
\tikzstyle{split}=[>=stealth,->,thick, draw=blueind]
|
|
\tikzstyle{merge}=[>=stealth,->,thick, draw=red]
|
|
\draw[step=1cm, help lines] (-2,-2) grid (2,2);
|
|
\draw[color=red, line width=1pt, dashed] (-1.5,-1.5) rectangle ++(3,3);
|
|
|
|
\node[model, draw=mypurple] (top_right) at (1,1) {};
|
|
\node[model, draw=mypurple] (row_3) at (1,0) {};
|
|
\node[model, draw=mypurple] (col_3) at (0,1) {};
|
|
\node[model, draw=mypurple] (row_2) at (1,-1) {};
|
|
\node[model, draw=mypurple] (col_2) at (-1,1) {};
|
|
\node[model, draw=mypurple] (mode) at (0,0) {{\color{red}X}};
|
|
\node[model, draw=red] (bottom_left) at (-1,-1) {};
|
|
\node[model, draw=mypurple] (row_1) at (0,-1) {};
|
|
\node[model, draw=mypurple] (col_1) at (-1,0) {};
|
|
|
|
\draw[merge] (1,1.75) -- (top_right);
|
|
\draw[merge] (1.75,1) -- (top_right);
|
|
\draw[merge] (0,1.75) -- (col_3);
|
|
\draw[merge] (1.75,0) -- (row_3);
|
|
\draw[merge] (1.75,-1) -- (row_2);
|
|
\draw[merge] (-1,1.75) -- (col_2);
|
|
|
|
\draw[merge] (top_right) -- (col_3);
|
|
\draw[merge] (top_right) -- (row_3);
|
|
\draw[merge] (col_3) -- (col_2);
|
|
\draw[merge] (row_3) -- (row_2) ;
|
|
\draw[merge] (row_3) -- (mode);
|
|
\draw[merge] (col_3) -- (mode);
|
|
\draw[merge] (col_2) --(col_1);
|
|
\draw[merge] (row_2) -- (row_1);
|
|
\draw[merge] (mode) -- (row_1);
|
|
\draw[merge] (mode) -- (col_1);
|
|
\draw[merge] (col_1) -- (bottom_left);
|
|
\draw[merge] (row_1) -- (bottom_left);
|
|
\end{tikzpicture}
|
|
\caption[forward]{Visualisation of a backward pass of moving window}\label{fig:visualisation-backward-pass}
|
|
\end{subfigure}
|
|
\caption{Moving window procedure, the center node marked with an {\color{red}X} is the mode of BIC-L}\label{fig:moving-window-procedure}
|
|
\end{figure}
|
|
|
|
\paragraph*{Forward pass} The forward pass consists for a model at $(Q_1, Q_2)$
|
|
to compute the possible splits from the block memberships of its ``predecessors``.
|
|
The predecessors are the point at the left $(Q_1 - 1, Q_2)$ and below
|
|
$(Q_1, Q_2 - 1)$ the current model (if they exist). To update the current model,
|
|
we take its predecessors block memberships and try to split one of the blocks in
|
|
two. Then the current model is fitted using this clustering as a starting
|
|
clustering. Once all the possible splits are fitted, they are compared, keeping
|
|
the best, in the sense of the BIC-L. If a model was already present it is also
|
|
compared and the best is chosen as the model for this round at $(Q_1, Q_2)$.\\
|
|
The procedure then repeats for the point at $(Q_1 + 1, Q_2)$ until it reaches
|
|
$(Q_{1,center} + depth, Q_2)$ from which it repeats from
|
|
$(Q_{1,center} - depth, Q_2 + 1)$. This repeats until computing the best model
|
|
for $(Q_{1,center} + depth, Q_{2,center} + depth)$.
|
|
|
|
\textit{Note on the initialization:} The forward pass starts from the point
|
|
$(Q_{1,center} + depth, Q_{2,center} + depth)$, so this points needs to have at
|
|
least a model fitted. In the best case, the greedy exploration will have visited
|
|
this point. But if the point has not been visited, a model will be fitted from
|
|
a spectral initialization (i.e the block memberships is computed by using a
|
|
spectral clustering). From this point, the next model will have at least one
|
|
predecessor and the procedure can iterate.
|
|
|
|
\paragraph*{Backward pass} The backward pass consists for a model at $(Q_1, Q_2)$
|
|
to compute the possible merges from the block memberships of its ``predecessors``.
|
|
The predecessors are the point at the right $(Q_1 + 1, Q_2)$ and on top
|
|
$(Q_1, Q_2 + 1)$ of the current model (if the predecessors exist). To update the
|
|
current model, we take its predecessors block memberships and try to merge two
|
|
blocks in one. Then the current model is fitted using this clustering as a starting clustering. Once all the possible merges are fitted, they are
|
|
compared, keeping the best, in the sense of the BIC-L.
|
|
If a model was already present it is also
|
|
compared and the best is chosen as the model for this round at $(Q_1, Q_2)$.\\
|
|
The procedure then repeats for the point at $(Q_1 - 1, Q_2)$ until it reaches
|
|
$(Q_{1,center} - depth, Q_2)$ from which it repeats from
|
|
$(Q_{1,center} - depth, Q_2 - 1)$. This repeats until computing the best model
|
|
for ($Q_{1,center} - depth, Q_{2,center} - depth$).
|
|
\textit{Note on the initialization:} The backward pass starts from
|
|
$(Q_{1,center} + depth, Q_{2,center} + depth)$, we know it was initialized at
|
|
least by the forward pass, no special case here.\\
|
|
|
|
At the end of the moving window pass, the model of max BIC-L is the new best
|
|
fit and the procedure repeats until convergence.
|
|
|
|
\section{Networks clustering}
|
|
\label{sec:networks-clustering}
|
|
As in~\cite{chabert-liddellLearningCommonStructures2024a} we use a recursive
|
|
algorithm to determine the best clustering of the given networks. The procedure
|
|
being the same, we will present it briefly and focus on adjustments.
|
|
|
|
When networks in a collection do not share the same mesoscale connectivity
|
|
structure we want to be able to partition them correctly. For this we perform a
|
|
clustering of networks.
|
|
|
|
The process of clustering a collection of networks involves discovering a
|
|
partition $\mathcal{G} = (\mathcal{M}_g)_{g=1,\dots,G}$ of $\{1,\dots, M\}$.
|
|
Given $\mathcal{G}$ we set the following model on $\bm{X}$:
|
|
|
|
\begin{align*}
|
|
\forall g \in \{1,\dots, G\},
|
|
~\forall m \in \mathcal{M}_g,
|
|
~X^m \sim
|
|
\mathcal{F}\text{-}BiSBM(Q_1^g, Q_2^g, \bm{\pi^m, \rho^m,} \bm{\alpha}^g)
|
|
\end{align*}
|
|
|
|
And we defined the score of a given partition $\mathcal{G}$:
|
|
\[
|
|
Sc(\mathcal{G}) = \sum_{g=1}^{G} \max_{Q^g=1,\dots,Q_{\max}} \text{BIC-L}((X^m)_{m\in\mathcal{M}_g},Q_1^g, Q_2^g)
|
|
\]
|
|
Thus the score consists of the sum of the BIC-L of the sub-collections for the
|
|
partition $\mathcal{G}$.
|
|
|
|
\subsection{Dissimilarity between two networks}
|
|
\label{ssec:dissimilarity-between-two-networks}
|
|
The parameters for the dissimilarity are defined as follow:
|
|
\begin{align*}
|
|
\widetilde{n}_{qr}^m & = \sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \widehat{\tau}_{iq}^{1,m} \widehat{\tau}_{jr}^{2,m},
|
|
& & \widetilde{\alpha}_{qr}^m = \frac{\sum_{i=1}^{n_1^m} \sum_{j=1}^{n_2^m} \widehat{\tau}_{iq}^{1,m} \widehat{\tau}_{jr}^{2,m} X_{ij}^m}{\widetilde{n}_{qr}^m}, \\
|
|
\widetilde{\pi}_q^m & = \frac{\sum_{i=1}^{n_1^m} \widehat{\tau}_{iq}^{1,m}}{n_1^m},
|
|
& & \widetilde{\rho}_r^m = \frac{\sum_{j=1}^{n_2^m} \widehat{\tau_{jr}}^{2,m}}{n_2^m}.
|
|
\end{align*}
|
|
And the pairwise dissimilarity for networks $(m,m')\in\mathcal{M}^2$ is then:
|
|
\[
|
|
D_{\mathcal{M}}(m,m') = \sum_{q = 1}^{Q_1} \sum_{r = 1}^{Q_2} \max(\widetilde{\pi}_{q}^{m}, \widetilde{\pi}_{q}^{m'}) \left( \widetilde{\alpha}_{qr}^{m} - \widetilde{\alpha}_{qr}^{m'}\right)^{2} \max(\widetilde{\rho}_{r}^{m}, \widetilde{\rho}_{r}^{m'})
|
|
\]
|
|
|
|
\begin{figure}[t]
|
|
\centering
|
|
\begin{tikzpicture}[scale=0.7]
|
|
\tikzstyle{instruct}=[font=\small, text justified, rectangle,draw,fill=yellow!50]
|
|
\tikzstyle{first_col}=[rectangle, text justified, draw,fill=gray!50]
|
|
\tikzstyle{second_col}=[scale=0.55, circle, draw,fill=red!50]
|
|
\tikzstyle{test}=[font=\small, text justified, diamond, aspect=2.5,thick,
|
|
draw=blueps,fill=yellow!50]
|
|
\tikzstyle{es}=[font=\small, text justified, rectangle,draw,rounded corners=4pt,fill=cyanind!25]
|
|
|
|
\node[es] (liste) at (0,4) {Supply a collection to partition};
|
|
\node[instruct, text width=5cm, below = 0.45cm of liste] (1-collection) {Fit colBiSBM};
|
|
\node[first_col, right = 0.5cm of 1-collection] (1-col-obj) {};
|
|
\node[instruct, text width=5cm, below = 0.45cm of 1-collection] (dissimi) {Compute a dissimilarity matrix over the collection};
|
|
\node[instruct, text width=5cm, below = 0.45cm of dissimi] (2-sous-collection) {Split the \emph{collection in 2 sub-collections} and fit the colBiSBM};
|
|
\node[second_col, right = 0.25cm of 2-sous-collection] (1-sec-col-obj) {1};
|
|
\node[second_col, right = 0.25cm of 1-sec-col-obj] (1-sec-col-obj) {2};
|
|
\node[test,below = 0.45cm of 2-sous-collection, scale=0.7] (BICL-test) {$\sum_{i=1}^{2} (\text{BIC-L}(\tikz[baseline=-0.25cm]{\node[second_col] {i};} )) > \text{BIC-L}(\tikz[baseline=-0.25cm]{\node[first_col] {};})$?};
|
|
\node[es, right = 0.55cm of BICL-test] (sortie) {Output \tikz{\node[rectangle, draw, fill=gray!50, rounded corners=0pt] {};}};
|
|
\node[es, left = 0.45cm of dissimi, text width = 2cm] (recursion) {Loop over \tikz{\node[second_col] {1};} and \tikz{\node[second_col] {2};} };
|
|
|
|
\tikzstyle{suite}=[->,>=stealth,thick,rounded corners=4pt]
|
|
\draw[suite] (liste) -- (1-collection);
|
|
\draw[suite] (1-collection) -- (dissimi);
|
|
\draw[suite] (dissimi) -- (2-sous-collection);
|
|
\draw[suite] (2-sous-collection) -- (BICL-test);
|
|
\draw[suite] (BICL-test) -| node[near start, above, fill=none] {Yes} (recursion);
|
|
\draw[suite] (recursion.east) -- (dissimi.west);
|
|
\draw[suite] (BICL-test) -- node[near start, above, fill=none] {No} (sortie);
|
|
\end{tikzpicture}
|
|
\caption{Network clustering procedure}
|
|
\label{fig:netclustering-procedure}
|
|
\end{figure}
|
|
|
|
The above figure,~\ref{fig:netclustering-procedure}, shows a condensed
|
|
explanation of the network clustering algorithm.
|
|
|
|
The idea is to adjust the colBiSBM model over the full collection of $M$
|
|
networks and then compute the dissimilarity matrix between all networks of the
|
|
collection. We obtain the collection $\mathcal{G} = \{\mathcal{M}\}$ the
|
|
trivial partition in a unique group.
|
|
|
|
Then using the \emph{Kmeans} we split the collection in two sub-collections
|
|
with the dissimilarity matrix. The two sub-collections are fitted and we
|
|
compute the score of this new partition $\mathcal{G}^{*} = \{G_1, G_2\}$.
|
|
If $Sc(\mathcal{G}^{*}) > Sc(\mathcal{G})$, we repeat the same procedure on
|
|
$G_1$ and $G_2$. Else we return $\mathcal{G}$.
|
|
We illustrate our capacity to perform a partition of a collection for all
|
|
colBiSBM models in~\ref{sec:network-clustering-of-simulated-networks}.
|
|
|
|
\section{Model identifiability}
|
|
\label{sec:model-identifiability}
|
|
% Ici l'identifiabilité du modèle
|
|
The goal here is to prove that if $\ell(\bm{X};\bm{\theta}) = \ell(\bm{X};\bm{\theta}')$ for any collection $\bm{X}$ then $\bm{\theta} = \bm{\theta}'$.
|
|
|
|
Following the proof proposed by~\cite{chabert-liddellLearningCommonStructures2024a}, that adapted it to the collection case and~\cite{keribinEstimationSelectionLatent2015} that extended the result of~\cite{celisseConsistencyMaximumlikelihoodVariational2012} to the LBM Bernoulli model,
|
|
we obtain the following result of identifiability\footnote{The proof is in appendix. \ref{sec:proof-identifiability}} for the $iid$-colBiSBM:
|
|
\begin{theorem}[Identifiability of $iid$-colBiSBM]
|
|
\label{thm:identifiability-iid}
|
|
The parameters $(\bm{\pi}, \bm{\rho}, \bm{\alpha})$ are
|
|
identifiable up to a label switching of the blocks if those
|
|
conditions are achieved:
|
|
\begin{itemize}
|
|
\item[(1.1)] $\exists m^*\in\{1,\dots,M\} : n^1_{m^*} \geq 2 Q_2 - 1~\text{and}~n^2_{m^*} \geq 2 Q_1 - 1$.
|
|
\item[(1.2)] $\forall 1\leq q \leq Q_1, \pi_q > 0$
|
|
and the coordinates of vector $\bm{\rho}
|
|
{X^{m^*}}^T$ are distinct (where ${X^{m^*}}^T$ is the transpose of $X^{m^*}$).
|
|
\item[(1.3)] $\forall 1\leq r \leq Q_2, \rho_r > 0$
|
|
and the coordinates of vector $\bm{\pi}
|
|
X^{m^*}$ are distinct.
|
|
\end{itemize}
|
|
\end{theorem}
|
|
|