287 lines
11 KiB
Text
287 lines
11 KiB
Text
---
|
|
output:
|
|
md_document:
|
|
citation_package: biblatex
|
|
---
|
|
\subsection{Completing raw data using CoOPLBM~\parencite{anakokDisentanglingStructureEcological2022}}
|
|
```{r, setup, include=FALSE, warning=FALSE}
|
|
knitr::opts_chunk$set(echo = FALSE, dpi = 300)
|
|
```
|
|
|
|
```{r libraries, echo = FALSE, include=FALSE}
|
|
require("colSBM")
|
|
require("aricode")
|
|
require("ggplot2")
|
|
```
|
|
|
|
```{r useful_functions, echo = FALSE}
|
|
|
|
if (getwd() == "/home/polarolouis/Nextcloud/Documents/APT/CEI/Stage Recherche Mathématiques/Depuis PC Portable/Stage MIA 2023/rapport-MIA-2023") {
|
|
path_to_add <- "Rcodes/real_data/"
|
|
} else {
|
|
path_to_add <- ""
|
|
}
|
|
|
|
extract_unlist <- function(data) {
|
|
readRDS(data) |>
|
|
extract_best_bipartite_partition() |>
|
|
unlist()
|
|
}
|
|
|
|
show_groups <- function(collection_list) {
|
|
lapply(collection_list, function(collection) {
|
|
collection$Q
|
|
})
|
|
}
|
|
|
|
partition_BICL <- function(collection_list) {
|
|
sum(sapply(collection_list, function(collection) {
|
|
collection$BICL
|
|
}))
|
|
}
|
|
|
|
extract_collection_clustering_id <- function(collection_list) {
|
|
out_dataframe <- setNames(data.frame(matrix(nrow = 0, ncol = 2)), c("netid", "collection_id"))
|
|
for (col_idx in seq_along(collection_list)) {
|
|
for (net_id in collection_list[[col_idx]]$net_id) {
|
|
out_dataframe[nrow(out_dataframe) + 1, ] <- c(net_id, col_idx)
|
|
}
|
|
}
|
|
out_dataframe
|
|
}
|
|
|
|
extract_full_models <- function(model_collections_list) {
|
|
return(do.call(
|
|
"rbind",
|
|
lapply(
|
|
seq_along(model_collections_list),
|
|
function(idx) {
|
|
model_name <- names(model_collections_list)[idx]
|
|
df <- extract_collection_clustering_id(model_collections_list[[idx]])
|
|
return(cbind(df, model = rep(model_name, length(df$netid))))
|
|
}
|
|
)
|
|
))
|
|
}
|
|
|
|
reorder_match_netids <- function(dataframe, target) {
|
|
df <- dataframe[match(target, dataframe$netid), ]
|
|
return(df)
|
|
}
|
|
|
|
extract_full_reorder <- function(model_collections_list, target) {
|
|
unordered <- extract_full_models(model_collections_list)
|
|
return(do.call("rbind", lapply(
|
|
names(model_collections_list),
|
|
function(model_name) {
|
|
return(reorder_match_netids(unordered[which(unordered$model == model_name), ], target))
|
|
}
|
|
)))
|
|
}
|
|
```
|
|
|
|
```{r data_importation, echo = FALSE}
|
|
# Uncompleted
|
|
uncompleted_model_list <- list(
|
|
"iid" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_iid_70_networks_08-06-23-16:31:17.Rds")),
|
|
"pi" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_pi_70_networks_08-06-23-16:52:16.Rds")),
|
|
"rho" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_rho_70_networks_08-06-23-16:49:58.Rds")),
|
|
"pirho" = extract_unlist(paste0(path_to_add, "data/dore_uncompleted_collection_clustering_nb_run_1_pirho_70_networks_08-06-23-16:41:33.Rds"))
|
|
)
|
|
|
|
# Below we will need to have the netid in the same order so we choose to use the
|
|
# uncompleted iid model order as reference
|
|
netid_order <- extract_collection_clustering_id(uncompleted_model_list$iid)$netid
|
|
model_order <- c("iid", "pi", "rho", "pirho")
|
|
|
|
uncompleted_clusterings <- extract_full_reorder(uncompleted_model_list, netid_order)
|
|
|
|
# 0.2 threshold
|
|
point_2_model_list <- list(
|
|
"iid" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-18:40:10.Rds")),
|
|
"pi" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-19:22:19.Rds")),
|
|
"rho" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-20:03:53.Rds")),
|
|
"pirho" = extract_unlist(paste0(path_to_add, "data/dore_point_2_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:09:12.Rds"))
|
|
)
|
|
point_2_clusterings <- extract_full_reorder(point_2_model_list, netid_order)
|
|
|
|
# 0.5 threshold
|
|
point_5_model_list <- list(
|
|
"iid" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-19:19:53.Rds")),
|
|
"pi" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-21:31:20.Rds")),
|
|
"rho" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-21:03:50.Rds")),
|
|
"pirho" = extract_unlist(paste0(path_to_add, "data/dore_point_5_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:13:10.Rds"))
|
|
)
|
|
point_5_clusterings <- extract_full_reorder(point_5_model_list, netid_order)
|
|
|
|
# Uniform re-sampled
|
|
random_model_list <- list(
|
|
"iid" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-21:44:14.Rds")),
|
|
"pi" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-22:52:47.Rds")),
|
|
"rho" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_rho_70_networks_08-06-23-18:16:04.Rds")),
|
|
"pirho" = extract_unlist(paste0(path_to_add, "data/dore_random_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-23:07:08.Rds"))
|
|
)
|
|
random_clusterings <- extract_full_reorder(random_model_list, netid_order)
|
|
```
|
|
|
|
### Context of this analysis
|
|
|
|
After performing a netclustering on the raw data, we will see if the detect
|
|
structure resulting in the clustering comes from the sampling effort. To test
|
|
this we will use the CoOPLBM model by~\cite{anakokDisentanglingStructureEcological2022} to complete the data.
|
|
|
|
\emph{Note:}~\cite{anakokDisentanglingStructureEcological2022} provided data
|
|
for the networks for which the method was applicable, this explains that
|
|
there are fewer networks in the collections.
|
|
|
|
The CoOPLBM model assumes that the observed incidence matrix $R$ is an
|
|
element-wise product of an $M$ matrix following an LBM and an $N$ matrix which
|
|
elements follow Poisson distributions independent on $M$.
|
|
|
|
The model gives us the $\widehat{M}$ matrix, the elements of which are:
|
|
|
|
$$\widehat{M_{ij}} = \mathbb{P}(M_{ij} = 1)$$
|
|
|
|
Note that if $R_{ij} = 1$ then $\widehat{M_{ij}} = 1$
|
|
|
|
- 1 if the interaction was observed
|
|
- a probability, that there should be an interaction but it wasn't observed
|
|
|
|
This *completed matrix* can be used in different manners to be fed to the colSBM
|
|
model.
|
|
|
|
### Threshold based completions
|
|
With the thresholds, the infered incidence matrix obtained by
|
|
CoOPLBM is used to generate a completed incidence matrix by the following
|
|
procedure :
|
|
$$X_{ij} = \begin{cases}
|
|
1 & \text{if the value is over the threshold} \\
|
|
0 & \text{else} \\
|
|
\end{cases}$$
|
|
|
|
#### 0.5 completed threshold
|
|
```{r useful-functions, echo = FALSE, include=FALSE}
|
|
ARI_netclustering_models <- function(
|
|
clustering_compare,
|
|
uncompleted_clustering = uncompleted_clustering,
|
|
models = c("iid", "pi", "rho", "pirho"),
|
|
models_names = c("$iid\\text{-}colSBM$", "$\\pi\\text{-}colSBM$", "$\\rho\\text{-}colSBM$", "$\\pi\\rho\\text{-}colSBM$")) {
|
|
out <- sapply(models, function(model) {
|
|
ARI(
|
|
uncompleted_clusterings[
|
|
which(uncompleted_clusterings$model == model),
|
|
]$collection_id,
|
|
clustering_compare[
|
|
which(clustering_compare$model == model),
|
|
]$collection_id
|
|
)
|
|
})
|
|
names(out) <- models_names
|
|
out
|
|
}
|
|
```
|
|
|
|
Here, the completion threshold is set to $0.5$.
|
|
|
|
First we will compute an ARI on the collection id given by the raw data and the
|
|
completed matrix.
|
|
|
|
```{r 0.5_ARI, echo = FALSE}
|
|
knitr::kable(ARI_netclustering_models(point_5_clusterings),
|
|
col.names = c("ARI with uncompleted data"),
|
|
escape = FALSE,
|
|
booktabs = TRUE,
|
|
digits = 2,
|
|
position = "h!",
|
|
caption = "\\label{tab:ari-table-0-5-completed} Table of ARI between 0.5 completed data and uncompleted data"
|
|
)
|
|
```
|
|
|
|
In the table \ref{tab:ari-table-0-5-completed}, one can see the network clustering obtained after applying
|
|
CoOPLBM has not much in common with the clustering of the uncompleted data. Thus
|
|
we can think that the completion changed significantly the interactions in
|
|
the collections.
|
|
|
|
##### Number of sub-collections and details of each sub-collection
|
|
```{r 0.5_partition_numbers, echo = FALSE}
|
|
```
|
|
|
|
##### Supplementary information
|
|
|
|
```{r supinfo, echo = FALSE}
|
|
supinfo <- readxl::read_xlsx(paste0(path_to_add, "data/supinfo.xlsx"), sheet = 2)
|
|
interaction_data <- read.table(file = paste0(path_to_add, "data/interaction-data.txt"), sep = "\t", header = TRUE)
|
|
|
|
seq_ids_network_aggreg <- unique(interaction_data$id_network_aggreg)
|
|
incidence_matrices <- readRDS(file = paste0(path_to_add, "data/dore-matrices.Rds"))
|
|
names_aggreg_networks <- names(incidence_matrices)
|
|
get_vector_clustering_net <- function(unlisted_model) {
|
|
vectorClusteringNet <- numeric(nrow(supinfo))
|
|
for (k in 1:length(unlisted_model)) {
|
|
idclust <- match(unlisted_model[[k]]$net_id, names_aggreg_networks)
|
|
supinfoclust <- match(seq_ids_network_aggreg[idclust], supinfo$Idweb)
|
|
vectorClusteringNet[supinfoclust] <- k
|
|
}
|
|
vectorClusteringNet
|
|
# [! vectorClusteringNet %in% 0] # Filtering the network not present in uncompleted data
|
|
}
|
|
```
|
|
|
|
```{r boxplot-function, echo = FALSE}
|
|
supinfo_boxplot <- function(supinfo_vector, parameter, pretty_name) {
|
|
return(ggplot(supinfo) +
|
|
aes(
|
|
x = factor(supinfo_vector), y = parameter,
|
|
fill = as.factor(supinfo_vector), group = as.factor(supinfo_vector)
|
|
) +
|
|
geom_boxplot() +
|
|
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
|
|
labs(
|
|
x = "Collection number", y = pretty_name,
|
|
fill = "Collection number"
|
|
))
|
|
}
|
|
```
|
|
|
|
```{r, supinfo_point_5}
|
|
supinfo_point_5_iid <- get_vector_clustering_net(point_5_model_list[["iid"]])
|
|
```
|
|
|
|
|
|
|
|
|
|
### 0.2 completed threshold
|
|
|
|
The $0.2$ threshold adds a lot of interactions compared to raw matrix.
|
|
|
|
```{r 0.2_ARI, echo = FALSE, results="asis"}
|
|
knitr::kable(ARI_netclustering_models(point_2_clusterings),
|
|
col.names = c("ARI with uncompleted data"),
|
|
escape = FALSE,
|
|
booktabs = TRUE,
|
|
digits = 2,
|
|
position = "h!",
|
|
caption = "\\label{tab:ari-table-0-2-completed} Table of ARI between 0.2 completed data and uncompleted data"
|
|
)
|
|
```
|
|
|
|
Same as for $0.5$, after applying CoOPLBM the obtained clustering doesn't match
|
|
the uncompleted data.
|
|
|
|
### Sample based completions
|
|
|
|
The $M$ matrix is used to sample a new $X$ matrix which elements are the
|
|
realisation of Bernoulli distributions of probability $M_{i,j}$.
|
|
$$\mathbb{P}(X_{i,j} = 1) = M_{i,j} $$
|
|
|
|
```{r random_ARI, echo = FALSE, results="asis"}
|
|
knitr::kable(ARI_netclustering_models(random_clusterings),
|
|
col.names = c("ARI with uncompleted data"),
|
|
escape = FALSE,
|
|
booktabs = TRUE,
|
|
digits = 2,
|
|
position = "h!",
|
|
caption = "\\label{tab:ari-table-random-completed} Table of ARI between
|
|
randomly completed data and uncompleted data"
|
|
)
|
|
```
|