--- output: md_document: citation_package: biblatex --- ```{r libraries, echo = FALSE, include=FALSE} require("colSBM") require(aricode) ``` ```{r useful_functions, echo = FALSE} extract_unlist <- function(data) { readRDS(data) |> extract_best_bipartite_partition() |> unlist() } show_groups <- function(collection_list) { lapply(collection_list, function(collection) { collection$Q }) } partition_BICL <- function(collection_list) { sum(sapply(collection_list, function(collection) { collection$BICL })) } extract_collection_clustering_id <- function(collection_list) { out_dataframe <- setNames(data.frame(matrix(nrow = 0, ncol = 2)), c("netid", "collection_id")) for (col_idx in seq_along(collection_list)) { for (net_id in collection_list[[col_idx]]$net_id) { out_dataframe[nrow(out_dataframe) + 1, ] <- c(net_id, col_idx) } } out_dataframe } extract_full_models <- function(model_collections_list) { return(do.call( "rbind", lapply( seq_along(model_collections_list), function(idx) { model_name <- names(model_collections_list)[idx] df <- extract_collection_clustering_id(model_collections_list[[idx]]) return(cbind(df, model = rep(model_name, length(df$netid)))) } ) )) } reorder_match_netids <- function(dataframe, target) { df <- dataframe[match(target, dataframe$netid), ] return(df) } extract_full_reorder <- function(model_collections_list, target) { unordered <- extract_full_models(model_collections_list) return(do.call("rbind", lapply( names(model_collections_list), function(model_name) { return(reorder_match_netids(unordered[which(unordered$model == model_name), ], target)) } ))) } ``` ```{r data_importation, echo = FALSE} # Uncompleted uncompleted_model_list <- list( "iid" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_iid_70_networks_08-06-23-16:31:17.Rds"), "pi" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_pi_70_networks_08-06-23-16:52:16.Rds"), "rho" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_rho_70_networks_08-06-23-16:49:58.Rds"), "pirho" = extract_unlist("data/dore_uncompleted_collection_clustering_nb_run_1_pirho_70_networks_08-06-23-16:41:33.Rds") ) # Below we will need to have the netid in the same order so we choose to use the # uncompleted iid model order as reference netid_order <- extract_collection_clustering_id(uncompleted_model_list$iid)$netid model_order <- c("iid", "pi", "rho", "pirho") uncompleted_clusterings <- extract_full_reorder(uncompleted_model_list, netid_order) # 0.2 threshold point_2_model_list <- list( "iid" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-18:40:10.Rds"), "pi" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-19:22:19.Rds"), "rho" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-20:03:53.Rds"), "pirho" = extract_unlist("data/dore_point_2_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:09:12.Rds") ) point_2_clusterings <- extract_full_reorder(point_2_model_list, netid_order) # 0.5 threshold point_5_model_list <- list( "iid" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-19:19:53.Rds"), "pi" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-21:31:20.Rds"), "rho" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_rho_70_networks_07-06-23-21:03:50.Rds"), "pirho" = extract_unlist("data/dore_point_5_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-21:13:10.Rds") ) point_5_clusterings <- extract_full_reorder(point_5_model_list, netid_order) # Uniform re-sampled random_model_list <- list( "iid" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_iid_70_networks_07-06-23-21:44:14.Rds"), "pi" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_pi_70_networks_07-06-23-22:52:47.Rds"), "rho" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_rho_70_networks_08-06-23-18:16:04.Rds"), "pirho" = extract_unlist("data/dore_random_completed_collection_clustering_nb_run_1_pirho_70_networks_07-06-23-23:07:08.Rds") ) random_clusterings <- extract_full_reorder(random_model_list, netid_order) ``` ### Context of this analysis After performing a netclustering on the raw data, we will see if the detect structure resulting in the clustering comes from the sampling effort. To test this we will use the CoOPLBM model by \cite{anakokDisentanglingStructureEcological2022} to complete the data. The CoOPLBM model assumes that the observed incidence matrix $R$ is an element-wise product of an $M$ matrix following an LBM and an $N$ matrix which elements follow Poisson distributions independent on $M$. The model gives us the $\widehat{M}$ matrix, the elements of which are: $$\widehat{M_{ij}} = \mathbb{P}(M_{ij} = 1)$$ Note that if $R_{ij} = 1$ then $\widehat{M_{ij}} = 1$ - 1 if the interaction was observed - a probability, that there should be an interaction but it wasn't observed This *completed matrix* can be used in different manners to be fed to the colSBM model. ### Threshold based completions With the thresholds, the infered incidence matrix obtained by CoOPLBM is used to generate a completed incidence matrix by the following procedure : $$X_{ij} = \begin{cases} 1 & \text{if the value is over the threshold} \\ 0 & \text{else} \\ \end{cases}$$ #### 0.5 completed threshold ```{r useful-functions, echo = FALSE, include=FALSE} ARI_netclustering_models <- function( clustering_compare, uncompleted_clustering = uncompleted_clustering, models = c("iid", "pi", "rho", "pirho")) { sapply(models, function(model) { ARI( uncompleted_clusterings[ which(uncompleted_clusterings$model == model), ]$collection_id, clustering_compare[ which(clustering_compare$model == model), ]$collection_id ) }) } ``` Here, the completion threshold is set to $0.5$. First we will compute an ARI on the collection id given by the raw data and the completed matrix. ```{r 0.5_ARI, echo = FALSE, results="asis"} knitr::kable(ARI_netclustering_models(point_5_clusterings), col.names = c("ARI with uncompleted data") ) ``` In the above table, one can see the network clustering obtained after applying CoOPLBM has not much in common with the clustering of the uncompleted data. ##### Number of sub-collections and details of each sub-collection ```{r 0.5_partition_numbers, echo = FALSE} ``` ### 0.2 completed threshold The $0.2$ threshold adds a lot of interactions compared to raw matrix. ```{r 0.2_ARI, echo = FALSE, results="asis"} knitr::kable(ARI_netclustering_models(point_2_clusterings), col.names = c("ARI with uncompleted data") ) ``` Same as for $0.5$, after applying CoOPLBM the obtained clustering doesn't match the uncompleted data. ### Sample based completions The $M$ matrix is used to sample a new $X$ matrix which elements are the realisation of Bernoulli distributions of probability $M_{i,j}$. $$\mathbb{P}(X_{i,j} = 1) = M_{i,j} $$ ```{r random_ARI, echo = FALSE, results="asis"} knitr::kable(ARI_netclustering_models(random_clusterings), col.names = c("ARI with uncompleted data") ) ```