Continuing COoPLBM application

2023-08-16 15:50:17 +02:00 · 2023-08-16 15:50:17 +02:00 · 9fad423a5b
commit 9fad423a5b
parent ae38d0ce4e
6 changed files with 173 additions and 72 deletions
--- a/Rcodes/real_data/CoOPLBM_completion_analyze.Rmd
+++ b/Rcodes/real_data/CoOPLBM_completion_analyze.Rmd
@ -3,13 +3,25 @@ output:
  md_document:
    citation_package: biblatex
 ---
 \subsection{Completing raw data using CoOPLBM~\parencite{anakokDisentanglingStructureEcological2022}}
 ```{r, setup, include=FALSE, warning=FALSE}
 knitr::opts_chunk$set(echo = FALSE, dpi = 300)
 ```
 ```{r libraries, echo = FALSE, include=FALSE}
 require("colSBM")
-require(aricode)
+require("aricode")
 require("ggplot2")
 ```
 ```{r useful_functions, echo = FALSE}
 if (getwd() == "/home/polarolouis/Nextcloud/Documents/APT/CEI/Stage Recherche Mathématiques/Depuis PC Portable/Stage MIA 2023/rapport-MIA-2023") {
    path_to_add <- "Rcodes/real_data/"
 } else {
    path_to_add <- ""
 }
 extract_unlist <- function(data) {
  readRDS(data) |>
    extract_best_bipartite_partition() |>
@ -71,10 +83,10 @@ extract_full_reorder <- function(model_collections_list, target) {
 ```{r data_importation, echo = FALSE}
 # Uncompleted
 uncompleted_model_list <- list(
-  "iid" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_iid_70_networks_08-06-23-16:31:17.Rds"),
+  "iid" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_iid_70_networks_08-06-23-16:31:17.Rds")),
-  "pi" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_pi_70_networks_08-06-23-16:52:16.Rds"),
+  "pi" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_pi_70_networks_08-06-23-16:52:16.Rds")),
-  "rho" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_rho_70_networks_08-06-23-16:49:58.Rds"),
+  "rho" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_rho_70_networks_08-06-23-16:49:58.Rds")),
-  "pirho" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_pirho_70_networks_08-06-23-16:41:33.Rds")
+  "pirho" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_pirho_70_networks_08-06-23-16:41:33.Rds"))
 )
 # Below we will need to have the netid in the same order so we choose to use the
@ -86,28 +98,28 @@ uncompleted_clusterings <- extract_full_reorder(uncompleted_model_list, netid_or
 # 0.2 threshold
 point_2_model_list <- list(
-  "iid" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-18:40:10.Rds"),
+  "iid" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-18:40:10.Rds")),
-  "pi" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-19:22:19.Rds"),
+  "pi" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-19:22:19.Rds")),
-  "rho" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-20:03:53.Rds"),
+  "rho" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-20:03:53.Rds")),
-  "pirho" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:09:12.Rds")
+  "pirho" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:09:12.Rds"))
 )
 point_2_clusterings <- extract_full_reorder(point_2_model_list, netid_order)
 # 0.5 threshold
 point_5_model_list <- list(
-  "iid" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-19:19:53.Rds"),
+  "iid" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-19:19:53.Rds")),
-  "pi" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-21:31:20.Rds"),
+  "pi" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-21:31:20.Rds")),
-  "rho" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-21:03:50.Rds"),
+  "rho" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-21:03:50.Rds")),
-  "pirho" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:13:10.Rds")
+  "pirho" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:13:10.Rds"))
 )
 point_5_clusterings <- extract_full_reorder(point_5_model_list, netid_order)
 # Uniform re-sampled
 random_model_list <- list(
-  "iid" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-21:44:14.Rds"),
+  "iid" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-21:44:14.Rds")),
-  "pi" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-22:52:47.Rds"),
+  "pi" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-22:52:47.Rds")),
-  "rho" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_rho_70_networks_08-06-23-18:16:04.Rds"),
+  "rho" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_rho_70_networks_08-06-23-18:16:04.Rds")),
-  "pirho" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-23:07:08.Rds")
+  "pirho" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-23:07:08.Rds"))
 )
 random_clusterings <- extract_full_reorder(random_model_list, netid_order)
 ```
@ -116,8 +128,11 @@ random_clusterings <- extract_full_reorder(random_model_list, netid_order)
 After performing a netclustering on the raw data, we will see if the detect
 structure resulting in the clustering comes from the sampling effort. To test
-this we will use the CoOPLBM model by
+this we will use the CoOPLBM model by~\cite{anakokDisentanglingStructureEcological2022} to complete the data.
-\cite{anakokDisentanglingStructureEcological2022} to complete the data.
+
 \emph{Note:}~\cite{anakokDisentanglingStructureEcological2022} provided data
 for the networks for which the method was applicable, this explains that 
 there are fewer networks in the collections. 
 The CoOPLBM model assumes that the observed incidence matrix $R$ is an
 element-wise product of an $M$ matrix following an LBM and an $N$ matrix which
@ -149,8 +164,9 @@ $$X_{ij} = \begin{cases}
 ARI_netclustering_models <- function(
    clustering_compare,
    uncompleted_clustering = uncompleted_clustering,
-    models = c("iid", "pi", "rho", "pirho")) {
+    models = c("iid", "pi", "rho", "pirho"),
-  sapply(models, function(model) {
+    models_names = c("$iid\\text{-}colSBM$", "$\\pi\\text{-}colSBM$", "$\\rho\\text{-}colSBM$", "$\\pi\\rho\\text{-}colSBM$")) {
      out <- sapply(models, function(model) {
        ARI(
          uncompleted_clusterings[
            which(uncompleted_clusterings$model == model),
@ -160,6 +176,8 @@ ARI_netclustering_models <- function(
          ]$collection_id
        )
      })
  names(out) <- models_names
  out
 }
 ```
@ -168,26 +186,82 @@ Here, the completion threshold is set to $0.5$.
 First we will compute an ARI on the collection id given by the raw data and the
 completed matrix.
-```{r 0.5_ARI, echo = FALSE, results="asis"}
+```{r 0.5_ARI, echo = FALSE}
 knitr::kable(ARI_netclustering_models(point_5_clusterings),
-  col.names = c("ARI with uncompleted data")
+  col.names = c("ARI with uncompleted data"),
  escape = FALSE,
  booktabs = TRUE,
  digits = 2,
  position = "h!",
  caption = "\\label{tab:ari-table-0-5-completed} Table of ARI between 0.5 completed data and uncompleted data"
 )
 ```
-In the above table, one can see the network clustering obtained after applying
+In the table \ref{tab:ari-table-0-5-completed}, one can see the network clustering obtained after applying
-CoOPLBM has not much in common with the clustering of the uncompleted data.
+CoOPLBM has not much in common with the clustering of the uncompleted data. Thus
 we can think that the completion changed significantly the interactions in 
 the collections.
 ##### Number of sub-collections and details of each sub-collection
 ```{r 0.5_partition_numbers, echo = FALSE}
 ```
 ##### Supplementary information
 ```{r supinfo, echo = FALSE}
 supinfo <- readxl::read_xlsx(paste0(path_to_add, "data/supinfo.xlsx"), sheet = 2)
 interaction_data <- read.table(file = paste0(path_to_add, "data/interaction-data.txt"), sep = "\t", header = TRUE)
 seq_ids_network_aggreg <- unique(interaction_data$id_network_aggreg)
 incidence_matrices <- readRDS(file = paste0(path_to_add, "data/dore-matrices.Rds"))
 names_aggreg_networks <- names(incidence_matrices)
 get_vector_clustering_net <- function(unlisted_model) {
  vectorClusteringNet <- numeric(nrow(supinfo))
  for (k in 1:length(unlisted_model)) {
    idclust <- match(unlisted_model[[k]]$net_id, names_aggreg_networks)
    supinfoclust <- match(seq_ids_network_aggreg[idclust], supinfo$Idweb)
    vectorClusteringNet[supinfoclust] <- k
  }
  vectorClusteringNet
  # [! vectorClusteringNet %in% 0] # Filtering the network not present in uncompleted data
 }
 ```
 ```{r boxplot-function, echo = FALSE}
 supinfo_boxplot <- function(supinfo_vector, parameter, pretty_name) {
    return(ggplot(supinfo) +
        aes(
          x = factor(supinfo_vector), y = parameter,
          fill = as.factor(supinfo_vector), group = as.factor(supinfo_vector)
        ) +
        geom_boxplot() +
          theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
        labs(
            x = "Collection number", y = pretty_name,
            fill = "Collection number"
        ))
 }
 ```
 ```{r, supinfo_point_5}
 supinfo_point_5_iid <- get_vector_clustering_net(point_5_model_list[["iid"]])
 ```
 ### 0.2 completed threshold
 The $0.2$ threshold adds a lot of interactions compared to raw matrix.
 ```{r 0.2_ARI, echo = FALSE, results="asis"}
 knitr::kable(ARI_netclustering_models(point_2_clusterings),
-  col.names = c("ARI with uncompleted data")
+  col.names = c("ARI with uncompleted data"),
  escape = FALSE,
  booktabs = TRUE,
  digits = 2,
  position = "h!",
  caption = "\\label{tab:ari-table-0-2-completed} Table of ARI between 0.2 completed data and uncompleted data"
 )
 ```
@ -202,6 +276,12 @@ $$\mathbb{P}(X_{i,j} = 1) = M_{i,j} $$
 ```{r random_ARI, echo = FALSE, results="asis"}
 knitr::kable(ARI_netclustering_models(random_clusterings),
-  col.names = c("ARI with uncompleted data")
+  col.names = c("ARI with uncompleted data"),
  escape = FALSE,
  booktabs = TRUE,
  digits = 2,
  position = "h!",
  caption = "\\label{tab:ari-table-random-completed} Table of ARI between
  randomly completed data and uncompleted data"
 )
 ```
--- a/Rcodes/real_data/CoOPLBM_completion_analyze.tex
+++ b/Rcodes/real_data/CoOPLBM_completion_analyze.tex
@ -1,11 +1,18 @@
 \subsection{Completing raw data using CoOPLBM~\parencite{anakokDisentanglingStructureEcological2022}}
 \hypertarget{context-of-this-analysis}{%
 \subsubsection{Context of this
 analysis}\label{context-of-this-analysis}}
 After performing a netclustering on the raw data, we will see if the
 detect structure resulting in the clustering comes from the sampling
-effort. To test this we will use the CoOPLBM model by
+effort. To test this we will use the CoOPLBM model
-\cite{anakokDisentanglingStructureEcological2022} to complete the data.
+by\textasciitilde{}\cite{anakokDisentanglingStructureEcological2022} to
 complete the data.
 \emph{Note:}\textasciitilde{}\cite{anakokDisentanglingStructureEcological2022}
 provided data for the networks for which the method was applicable, this
 explains that there are fewer networks in the collections.
 The CoOPLBM model assumes that the observed incidence matrix \(R\) is an
 element-wise product of an \(M\) matrix following an LBM and an \(N\)
@ -49,42 +56,55 @@ Here, the completion threshold is set to \(0.5\).
 First we will compute an ARI on the collection id given by the raw data
 and the completed matrix.
-\begin{longtable}[]{@{}lr@{}}
+\begin{table}[h!]
 \toprule
 & ARI with uncompleted data\tabularnewline
 \midrule
 \endhead
 iid & 0.1142823\tabularnewline
 pi & 0.0263660\tabularnewline
 rho & 0.0933340\tabularnewline
 pirho & 0.2158747\tabularnewline
 \bottomrule
 \end{longtable}
-In the above table, one can see the network clustering obtained after
+\caption{\label{tab:0.5_ARI}\label{tab:ari-table-0-5-completed} Table of ARI between 0.5 completed data and uncompleted data}
-applying CoOPLBM has not much in common with the clustering of the
+\centering
-uncompleted data.
+\begin{tabular}[t]{lr}
 \toprule
  & ARI with uncompleted data\\
 \midrule
 $iid\text{-}colSBM$ & 0.11\\
 $\pi\text{-}colSBM$ & 0.03\\
 $\rho\text{-}colSBM$ & 0.09\\
 $\pi\rho\text{-}colSBM$ & 0.22\\
 \bottomrule
 \end{tabular}
 \end{table}
 In the table \ref{tab:ari-table-0-5-completed}, one can see the network
 clustering obtained after applying CoOPLBM has not much in common with
 the clustering of the uncompleted data. Thus we can think that the
 completion changed significantly the interactions in the collections.
 \hypertarget{number-of-sub-collections-and-details-of-each-sub-collection}{%
 \subparagraph{Number of sub-collections and details of each
 sub-collection}\label{number-of-sub-collections-and-details-of-each-sub-collection}}
 \hypertarget{supplementary-information}{%
 \subparagraph{Supplementary
 information}\label{supplementary-information}}
 \hypertarget{completed-threshold-1}{%
 \subsubsection{0.2 completed threshold}\label{completed-threshold-1}}
 The \(0.2\) threshold adds a lot of interactions compared to raw matrix.
-\begin{longtable}[]{@{}lr@{}}
+\begin{table}[h!]
 \caption{\label{tab:0.2_ARI}\label{tab:ari-table-0-2-completed} Table of ARI between 0.2 completed data and uncompleted data}
 \centering
 \begin{tabular}[t]{lr}
 \toprule
-& ARI with uncompleted data\tabularnewline
+  & ARI with uncompleted data\\
 \midrule
-\endhead
+$iid\text{-}colSBM$ & 0.04\\
-iid & 0.0429465\tabularnewline
+$\pi\text{-}colSBM$ & 0.03\\
-pi & 0.0330057\tabularnewline
+$\rho\text{-}colSBM$ & 0.02\\
-rho & 0.0187305\tabularnewline
+$\pi\rho\text{-}colSBM$ & 0.04\\
 pirho & 0.0357728\tabularnewline
 \bottomrule
-\end{longtable}
+\end{tabular}
 \end{table}
 Same as for \(0.5\), after applying CoOPLBM the obtained clustering
 doesn't match the uncompleted data.
@ -97,14 +117,19 @@ The \(M\) matrix is used to sample a new \(X\) matrix which elements are
 the realisation of Bernoulli distributions of probability \(M_{i,j}\).
 \[\mathbb{P}(X_{i,j} = 1) = M_{i,j} \]
-\begin{longtable}[]{@{}lr@{}}
+\begin{table}[h!]
 \caption{\label{tab:random_ARI}\label{tab:ari-table-random-completed} Table of ARI between
  randomly completed data and uncompleted data}
 \centering
 \begin{tabular}[t]{lr}
 \toprule
-& ARI with uncompleted data\tabularnewline
+  & ARI with uncompleted data\\
 \midrule
-\endhead
+$iid\text{-}colSBM$ & 0.01\\
-iid & 0.0148172\tabularnewline
+$\pi\text{-}colSBM$ & 0.03\\
-pi & 0.0265793\tabularnewline
+$\rho\text{-}colSBM$ & 0.01\\
-rho & 0.0051536\tabularnewline
+$\pi\rho\text{-}colSBM$ & 0.02\\
 pirho & 0.0152299\tabularnewline
 \bottomrule
-\end{longtable}
+\end{tabular}
 \end{table}
--- a/Rcodes/real_data/application_dore.Rmd
+++ b/Rcodes/real_data/application_dore.Rmd
@ -103,7 +103,6 @@ for (k in 1:length(iid_unlist)) {
 Using supplementary information we obtain the following boxplots.
 A
 ```{r boxplot-function, echo = FALSE}
 supinfo_boxplot <- function(parameter, pretty_name) {
--- a/Rcodes/real_data/application_dore.tex
+++ b/Rcodes/real_data/application_dore.tex
@ -40,8 +40,6 @@ information}\label{comparison-with-additional-information}}
 Using supplementary information we obtain the following boxplots.
 A
 \begin{figure}
 \centering
 \includegraphics{./img/de77b630fb66744d3a3ed68e45be765532d1eb0f.png}
--- a/rapport.pdf
+++ b/rapport.pdf
--- a/rapport.tex
+++ b/rapport.tex
@ -1098,7 +1098,6 @@ analysis can be found on the report repository at
 \chapter{Applications}
 \include{Rcodes/real_data/application_dore}
 \subsection{Completing raw data using CoOPLBM \parencite{anakokDisentanglingStructureEcological2022}}
 \include{Rcodes/real_data/CoOPLBM_completion_analyze}
 \printbibliography