210 lines
7.6 KiB
Text
210 lines
7.6 KiB
Text
---
|
|
title: "Netclustering analysis with the CoOPLBM completion"
|
|
bibliography: references.bib
|
|
suppress-bibliography: true
|
|
output:
|
|
html_document:
|
|
toc: true
|
|
theme: journal
|
|
pdf_document:
|
|
keep_tex: true
|
|
---
|
|
|
|
```{r libraries, echo = FALSE, include=FALSE}
|
|
devtools::load_all()
|
|
require(aricode)
|
|
```
|
|
|
|
```{r useful_functions, echo = FALSE}
|
|
extract_unlist <- function(data) {
|
|
readRDS(data) |>
|
|
extract_best_bipartite_partition() |>
|
|
unlist()
|
|
}
|
|
|
|
show_groups <- function(collection_list) {
|
|
lapply(collection_list, function(collection) {
|
|
collection$Q
|
|
})
|
|
}
|
|
|
|
partition_BICL <- function(collection_list) {
|
|
sum(sapply(collection_list, function(collection) {
|
|
collection$BICL
|
|
}))
|
|
}
|
|
|
|
extract_collection_clustering_id <- function(collection_list) {
|
|
out_dataframe <- setNames(data.frame(matrix(nrow = 0, ncol = 2)), c("netid", "collection_id"))
|
|
for (col_idx in seq_along(collection_list)) {
|
|
for (net_id in collection_list[[col_idx]]$net_id) {
|
|
out_dataframe[nrow(out_dataframe) + 1, ] <- c(net_id, col_idx)
|
|
}
|
|
}
|
|
out_dataframe
|
|
}
|
|
|
|
extract_full_models <- function(model_collections_list) {
|
|
return(do.call(
|
|
"rbind",
|
|
lapply(
|
|
seq_along(model_collections_list),
|
|
function(idx) {
|
|
model_name <- names(model_collections_list)[idx]
|
|
df <- extract_collection_clustering_id(model_collections_list[[idx]])
|
|
return(cbind(df, model = rep(model_name, length(df$netid))))
|
|
}
|
|
)
|
|
))
|
|
}
|
|
|
|
reorder_match_netids <- function(dataframe, target) {
|
|
df <- dataframe[match(target, dataframe$netid), ]
|
|
return(df)
|
|
}
|
|
|
|
extract_full_reorder <- function(model_collections_list, target) {
|
|
unordered <- extract_full_models(model_collections_list)
|
|
return(do.call("rbind", lapply(
|
|
names(model_collections_list),
|
|
function(model_name) {
|
|
return(reorder_match_netids(unordered[which(unordered$model == model_name), ], target))
|
|
}
|
|
)))
|
|
}
|
|
```
|
|
|
|
```{r data_importation, echo = FALSE}
|
|
# Uncompleted
|
|
uncompleted_model_list <- list(
|
|
"iid" = extract_unlist("real_data/data/dore_uncompleted_collection_clustering_nb_run_1_iid_70_networks_08-06-23-16:31:17.Rds"),
|
|
"pi" = extract_unlist("real_data/data/dore_uncompleted_collection_clustering_nb_run_1_pi_70_networks_08-06-23-16:52:16.Rds"),
|
|
"rho" = extract_unlist("real_data/data/dore_uncompleted_collection_clustering_nb_run_1_rho_70_networks_08-06-23-16:49:58.Rds"),
|
|
"pirho" = extract_unlist("real_data/data/dore_uncompleted_collection_clustering_nb_run_1_pirho_70_networks_08-06-23-16:41:33.Rds")
|
|
)
|
|
|
|
# Below we will need to have the netid in the same order so we choose to use the
|
|
# uncompleted iid model order as reference
|
|
netid_order <- extract_collection_clustering_id(uncompleted_model_list$iid)$netid
|
|
model_order <- c("iid", "pi", "rho", "pirho")
|
|
|
|
uncompleted_clusterings <- extract_full_reorder(uncompleted_model_list, netid_order)
|
|
|
|
# 0.2 threshold
|
|
point_2_model_list <- list(
|
|
"iid" = extract_unlist("real_data/data/dore_point_2_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-18:40:10.Rds"),
|
|
"pi" = extract_unlist("real_data/data/dore_point_2_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-19:22:19.Rds"),
|
|
"rho" = extract_unlist("real_data/data/dore_point_2_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-20:03:53.Rds"),
|
|
"pirho" = extract_unlist("real_data/data/dore_point_2_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:09:12.Rds")
|
|
)
|
|
point_2_clusterings <- extract_full_reorder(point_2_model_list, netid_order)
|
|
|
|
# 0.5 threshold
|
|
point_5_model_list <- list(
|
|
"iid" = extract_unlist("real_data/data/dore_point_5_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-19:19:53.Rds"),
|
|
"pi" = extract_unlist("real_data/data/dore_point_5_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-21:31:20.Rds"),
|
|
"rho" = extract_unlist("real_data/data/dore_point_5_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-21:03:50.Rds"),
|
|
"pirho" = extract_unlist("real_data/data/dore_point_5_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:13:10.Rds")
|
|
)
|
|
point_5_clusterings <- extract_full_reorder(point_5_model_list, netid_order)
|
|
|
|
# Uniform re-sampled
|
|
random_model_list <- list(
|
|
"iid" = extract_unlist("real_data/data/dore_random_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-21:44:14.Rds"),
|
|
"pi" = extract_unlist("real_data/data/dore_random_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-22:52:47.Rds"),
|
|
"rho" = extract_unlist("real_data/data/dore_random_completed_collection_clustering_nb_run_1_rho_70_networks_08-06-23-18:16:04.Rds"),
|
|
"pirho" = extract_unlist("real_data/data/dore_random_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-23:07:08.Rds")
|
|
)
|
|
random_clusterings <- extract_full_reorder(random_model_list, netid_order)
|
|
```
|
|
|
|
# Context of this analysis
|
|
|
|
After performing a netclustering on the raw data, we will see if the detect
|
|
structure resulting in the clustering comes from the sampling effort. To test
|
|
this we will use the CoOPLBM model by
|
|
@anakokDisentanglingStructureEcological2022 to complete the data.
|
|
|
|
The CoOPLBM model assumes that the observed incidence matrix $R$ is an
|
|
element-wise product of an $M$ matrix following an LBM and an $N$ matrix which
|
|
elements follow Poisson distributions independent on $M$.
|
|
|
|
The model gives us the $\widehat{M}$ matrix, the elements of which are:
|
|
|
|
$$\widehat{M_{ij}} = \mathbb{P}(M_{ij} = 1)$$
|
|
|
|
Note that if $R_{ij} = 1$ then $\widehat{M_{ij}} = 1$
|
|
|
|
- 1 if the interaction was observed
|
|
- a probability, that there should be an interaction but it wasn't observed
|
|
|
|
This *completed matrix* can be used in different manners to be fed to the colSBM
|
|
model.
|
|
|
|
# Threshold based completions
|
|
With the thresholds, the infered incidence matrix obtained by
|
|
CoOPLBM is used to generate a completed incidence matrix by the following
|
|
procedure :
|
|
$$X_{ij} = \begin{cases}
|
|
1 & \text{if the value is over the threshold} \\
|
|
0 & \text{else} \\
|
|
\end{cases}$$
|
|
|
|
## 0.5 completed threshold
|
|
```{r useful-functions, echo = FALSE, include=FALSE}
|
|
ARI_netclustering_models <- function(
|
|
clustering_compare,
|
|
uncompleted_clustering = uncompleted_clustering,
|
|
models = c("iid", "pi", "rho", "pirho")) {
|
|
sapply(models, function(model) {
|
|
ARI(
|
|
uncompleted_clusterings[
|
|
which(uncompleted_clusterings$model == model),
|
|
]$collection_id,
|
|
clustering_compare[
|
|
which(clustering_compare$model == model),
|
|
]$collection_id
|
|
)
|
|
})
|
|
}
|
|
```
|
|
|
|
Here, the completion threshold is set to $0.5$.
|
|
|
|
First we will compute an ARI on the collection id given by the raw data and the
|
|
completed matrix.
|
|
|
|
```{r 0.5_ARI, echo = FALSE, results="asis"}
|
|
knitr::kable(ARI_netclustering_models(point_5_clusterings),
|
|
col.names = c("ARI with uncompleted data")
|
|
)
|
|
```
|
|
|
|
In the above table, one can see the network clustering obtained after applying
|
|
CoOPLBM has not much in common with the clustering of the uncompleted data.
|
|
|
|
### Number of sub-collections and details of each sub-collection
|
|
```{r 0.5_partition_numbers, echo = FALSE}
|
|
```
|
|
|
|
## 0.2 completed threshold
|
|
|
|
The $0.2$ threshold adds a lot of interactions compared to raw matrix.
|
|
|
|
```{r 0.2_ARI, echo = FALSE, results="asis"}
|
|
knitr::kable(ARI_netclustering_models(point_2_clusterings),
|
|
col.names = c("ARI with uncompleted data")
|
|
)
|
|
```
|
|
|
|
# Sample based completions
|
|
|
|
The $M$ matrix is used to sample a new $X$ matrix which elements are the
|
|
realisation of Bernoulli distributions of probability $M_{i,j}$.
|
|
$$\mathbb{P}(X_{i,j} = 1) = M_{i,j} $$
|
|
|
|
```{r random_ARI, echo = FALSE, results="asis"}
|
|
knitr::kable(ARI_netclustering_models(random_clusterings),
|
|
col.names = c("ARI with uncompleted data")
|
|
)
|
|
```
|