From d7919e3098b8d24dd038c3f59c5f2cca48518312 Mon Sep 17 00:00:00 2001 From: Louis Date: Fri, 23 Jan 2026 15:30:11 +0100 Subject: [PATCH] Extracting sequential taxo function to utils-bm-seq.R --- lbm_at_diff_taxo_seq.R | 210 +---------------------------------------- utils-bm-seq.R | 131 +++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 207 deletions(-) create mode 100644 utils-bm-seq.R diff --git a/lbm_at_diff_taxo_seq.R b/lbm_at_diff_taxo_seq.R index e4b9663..dfe0d02 100644 --- a/lbm_at_diff_taxo_seq.R +++ b/lbm_at_diff_taxo_seq.R @@ -1,4 +1,5 @@ source("utils.R") +source("utils-bm-seq.R") library(biomformat) library(phyloseq) library(R.utils) @@ -23,221 +24,16 @@ the_data <- import_biom("data/mach/kinetic.biom") # return(fit) # }) -bm_propagate_taus_max_ICL <- function(phyloseq_data) { - # Prepare - per_taxa_network <- collapse_otu_at_taxo(phyloseq_data) - - removed_root <- FALSE - # Detect if first rank (higher) is fully collapsed - if (nrow(per_taxa_network[[1]]) <= 1L) { - message("The first network has only one row. Removing it.") - removed_root <- TRUE - per_taxa_network <- per_taxa_network[-1] - } - result_list <- list() - # Init - mean_diff_vec <- c() - PL_per_nb_vec <- c() - max_rank_id <- tax_table(phyloseq_data) |> ncol() - rank_id <- 2 - net_id <- ifelse(removed_root, rank_id - 1, rank_id) - model <- BM_poisson( - membership_type = "LBM", - adj = per_taxa_network[[net_id]], # Account for the root - verbosity = 2, - plotting = "", - ncores = 1L - ) - model$estimate() - result_list <- append(result_list, model) - # Here we extract the memberships - best_model_memberships <- model$memberships[[which.max(model$ICL)]] - next_cc <- list(Z1 = best_model_memberships$Z1, Z2 = best_model_memberships$Z2) - - # Here we dispatch the memberships for the OTU - ## Propagate the rownames - while (rank_id < max_rank_id) { - rownames(next_cc$Z1) <- rownames(per_taxa_network[[net_id]]) - rank_id <- rank_id + 1 - net_id <- ifelse(removed_root, rank_id - 1, rank_id) - next_Z1 <- propagate_taus(tau_matrix = next_cc$Z1, physeq = phyloseq_data, taxrank = phyloseq::rank_names(phyloseq_data)[rank_id]) - - ## Noising of the next_Z1 - dZ1 <- dim(next_Z1) - nZ1 <- dZ1[1] * dZ1[2] - next_Z1 <- next_Z1 + matrix(rnorm(n = nZ1, mean = 0, sd = 0.01), nrow = dZ1[1], ncol = dZ1[2]) - - ## Normalizing by rows - C_Z1 <- rowSums(next_Z1) - next_Z1 <- next_Z1 / C_Z1 - - next_cc$Z1 <- next_Z1 - - - next_res <- blockmodels:::dispatcher( - membership_name = "LBM", - membership_init = next_cc, - model_name = "poisson", - network = list(adjacency = per_taxa_network[[net_id]]), real_EM = TRUE - ) # account for the removed root - result_list <- append(result_list, next_res) - mean_diff_vec[paste0(rank_id - 1, "->", rank_id)] <- mean((next_res$membership$Z1 - next_Z1)^2) - cat("Mean diff across ranks:\n") - print(sqrt(mean_diff_vec)) - next_cc <- next_res[["membership"]][c("Z1", "Z2")] - PL_per_nb_vec[paste0(rank_id - 1, "->", rank_id)] <- next_res$PL / (nrow(next_res$membership$Z1) * nrow(next_res$membership$Z2)) - cat("PL/nZ1*nZ2: \n") - print(PL_per_nb_vec) - } - return(result_list) -} - -# mach_out <- bm_propagate_taus_max_ICL(phyloseq_data = the_data) - -# chaillou_out <- bm_propagate_taus_max_ICL(phyloseq_data = import_biom("data/chaillou/chaillou.biom")) - -# ravel_out <- bm_propagate_taus_max_ICL(phyloseq_data = import_biom("data/ravel/ravel.biom")) - # To init a BM model I need to provide memberships and ICL # And to compute with the dispatcher the previous values to init the models with ICL and memberships - -bm_propagate_taus_Q_next_level <- function(phyloseq_data, first_model, Q, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data), rank_id_start = 2L, target_rank_id = rank_id_start + 1, removed_root = TRUE, sd_noise = 0.001) { - stopifnot("Q must be provided" = !missing(Q), "pĥyloseq_data must be provided" = !missing(phyloseq_data)) - force(per_taxa_networks) - - result_list <- list() - # Init - mean_diff_vec <- c() - PL_per_nb_vec <- c() - max_rank_id <- tax_table(phyloseq_data) |> ncol() - net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start) - result_list <- append(result_list, first_model) - # Here we extract the memberships - current_memberships <- first_model$memberships[[Q]] - if (!is.matrix(current_memberships$Z1)) { - nrow_Z1 <- length(current_memberships$Z1) - } else { - nrow_Z1 <- nrow(current_memberships$Z1) - } - next_cc <- list(Z1 = as.matrix(current_memberships$Z1, nrow = nrow_Z1), Z2 = current_memberships$Z2) - - # Here we dispatch the memberships for the OTU - ## Propagate the rownames - rownames(next_cc$Z1) <- rownames(per_taxa_networks[[net_id_start]]) - target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id) - next_Z1 <- propagate_taus(tau_matrix = next_cc$Z1, physeq = phyloseq_data, taxrank = phyloseq::rank_names(phyloseq_data)[target_rank_id]) - - ## Noising of the next_Z1 - dZ1 <- dim(next_Z1) - nZ1 <- dZ1[1] * dZ1[2] - next_Z1 <- next_Z1 + matrix(rnorm(n = nZ1, mean = 0, sd = sd_noise), nrow = dZ1[1], ncol = dZ1[2]) - - ## Normalizing by rows - C_Z1 <- rowSums(next_Z1) - next_Z1 <- next_Z1 / C_Z1 - - next_cc$Z1 <- next_Z1 - - - next_res <- blockmodels:::dispatcher( - membership_name = "LBM", - membership_init = next_cc, - model_name = "poisson", - network = list(adjacency = per_taxa_networks[[target_net_id]]), real_EM = TRUE - ) # account for the removed root - result_list <- append(result_list, next_res) - mean_diff_vec[paste0(rank_id_start, "->", target_rank_id)] <- mean((next_res$membership$Z1 - next_Z1)^2) - next_cc <- next_res[["membership"]][c("Z1", "Z2")] - PL_per_nb_vec[paste0(rank_id_start, "->", target_rank_id)] <- next_res$PL / (nrow(next_res$membership$Z1) * nrow(next_res$membership$Z2)) - return(next_res) -} -#' Use a preinited model to run blockmodels exploration -#' -#' @param bm_model A pre-initialized model object -bm_estimate_with_inits <- function(bm_model, reinitialization_effort = 1) { - l <- TRUE - n <- 1 - changing_effort <- FALSE - while (l) { - cat("Pass", n, "\n") - - cat("With ascending number of groups\n") - ra <- bm_model$estim_ascend(reinitialization_effort, changing_effort) - - cat("With descending number of groups\n") - rb <- bm_model$estim_descend(reinitialization_effort) - - l <- ra || rb - n <- n + 1 - changing_effort <- FALSE - } -} - -bm_propagate_taus_all_models <- function(phyloseq_data, rank_id_start = 2, target_rank_id = rank_id_start + 1, first_model = NULL, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data)) { - removed_root <- FALSE - # Detect if first rank (higher) is fully collapsed - if (nrow(per_taxa_networks[[1]]) <= 1L) { - message("The first network has only one row. Removing it.") - removed_root <- TRUE - per_taxa_networks <- per_taxa_networks[-1] - } - result_list <- list() - # Init - net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start) - if (missing(first_model)) { - first_model <- BM_poisson( - membership_type = "LBM", - adj = per_taxa_networks[[net_id_start]], # Account for the root - verbosity = 2, - plotting = "", - ncores = 1L - ) - first_model$estimate() - } - - propagated_models <- lapply(seq_along(first_model$ICL), function(current_q) { - if (!is.na(first_model$ICL[current_q])) { - bm_propagate_taus_Q_next_level(phyloseq_data = phyloseq_data, first_model = first_model, per_taxa_networks = per_taxa_networks, Q = current_q, removed_root = removed_root, rank_id_start = rank_id_start, target_rank_id = target_rank_id) - } else { - return(NA) - } - }) - - target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id) - - new_model <- BM_poisson( - membership_type = "LBM", - adj = per_taxa_networks[[target_net_id]], # Account for the root - verbosity = 6, - plotting = "", - ncores = 1L - ) - - lapply(seq_along(propagated_models), function(current_q) { - if (any(is.na(propagated_models[[current_q]]))) { - return(NA) - } - new_model$memberships[[current_q]] <<- getRefClass("LBM")(from_cc = propagated_models[[current_q]]$membership) - new_model$model_parameters[[current_q]] <- propagated_models[[current_q]]$model - new_model$PL[current_q] <- propagated_models[[current_q]]$PL - new_model$H[current_q] <- propagated_models[[current_q]]$H - new_model$ICL[current_q] <- propagated_models[[current_q]]$PL - .5 * (propagated_models[[current_q]]$model$n_parameters * - log(new_model$data_number()) + new_model$memberships[[current_q]]$ICL_penalty()) - }) - - bm_estimate_with_inits(bm_model = new_model) - - return(new_model) -} - per_taxa_networks <- collapse_otu_at_taxo(the_data) r2_model <- BM_poisson( membership_type = "LBM", adj = per_taxa_networks[[2]], # Account for the root - verbosity = 2, + verbosity = 6, plotting = "", - ncores = 1L + ncores = parallelly::availableCores() ) r2_model$estimate() r3_model <- bm_propagate_taus_all_models(phyloseq_data = the_data, rank_id_start = 2, target_rank_id = 3, per_taxa_networks = per_taxa_networks) diff --git a/utils-bm-seq.R b/utils-bm-seq.R new file mode 100644 index 0000000..ef0e602 --- /dev/null +++ b/utils-bm-seq.R @@ -0,0 +1,131 @@ +library(blockmodels) +library(phyloseq) + +bm_propagate_taus_Q_next_level <- function(phyloseq_data, first_model, Q, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data), rank_id_start = 2L, target_rank_id = rank_id_start + 1, removed_root = TRUE, sd_noise = 0.001) { + stopifnot("Q must be provided" = !missing(Q), "pĥyloseq_data must be provided" = !missing(phyloseq_data)) + force(per_taxa_networks) + + result_list <- list() + # Init + mean_diff_vec <- c() + PL_per_nb_vec <- c() + max_rank_id <- tax_table(phyloseq_data) |> ncol() + net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start) + result_list <- append(result_list, first_model) + # Here we extract the memberships + current_memberships <- first_model$memberships[[Q]] + if (!is.matrix(current_memberships$Z1)) { + nrow_Z1 <- length(current_memberships$Z1) + } else { + nrow_Z1 <- nrow(current_memberships$Z1) + } + next_cc <- list(Z1 = as.matrix(current_memberships$Z1, nrow = nrow_Z1), Z2 = current_memberships$Z2) + + # Here we dispatch the memberships for the OTU + ## Propagate the rownames + rownames(next_cc$Z1) <- rownames(per_taxa_networks[[net_id_start]]) + target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id) + next_Z1 <- propagate_taus(tau_matrix = next_cc$Z1, physeq = phyloseq_data, taxrank = phyloseq::rank_names(phyloseq_data)[target_rank_id]) + + ## Noising of the next_Z1 + dZ1 <- dim(next_Z1) + nZ1 <- dZ1[1] * dZ1[2] + next_Z1 <- next_Z1 + matrix(rnorm(n = nZ1, mean = 0, sd = sd_noise), nrow = dZ1[1], ncol = dZ1[2]) + + ## Normalizing by rows + C_Z1 <- rowSums(next_Z1) + next_Z1 <- next_Z1 / C_Z1 + + next_cc$Z1 <- next_Z1 + + + next_res <- blockmodels:::dispatcher( + membership_name = "LBM", + membership_init = next_cc, + model_name = "poisson", + network = list(adjacency = per_taxa_networks[[target_net_id]]), real_EM = TRUE + ) # account for the removed root + result_list <- append(result_list, next_res) + mean_diff_vec[paste0(rank_id_start, "->", target_rank_id)] <- mean((next_res$membership$Z1 - next_Z1)^2) + next_cc <- next_res[["membership"]][c("Z1", "Z2")] + PL_per_nb_vec[paste0(rank_id_start, "->", target_rank_id)] <- next_res$PL / (nrow(next_res$membership$Z1) * nrow(next_res$membership$Z2)) + return(next_res) +} +#' Use a preinited model to run blockmodels exploration +#' +#' @param bm_model A pre-initialized model object +bm_estimate_with_inits <- function(bm_model, reinitialization_effort = 1) { + l <- TRUE + n <- 1 + changing_effort <- FALSE + while (l) { + cat("Pass", n, "\n") + + cat("With ascending number of groups\n") + ra <- bm_model$estim_ascend(reinitialization_effort, changing_effort) + + cat("With descending number of groups\n") + rb <- bm_model$estim_descend(reinitialization_effort) + + l <- ra || rb + n <- n + 1 + changing_effort <- FALSE + } +} + +bm_propagate_taus_all_models <- function(phyloseq_data, rank_id_start = 2, target_rank_id = rank_id_start + 1, first_model = NULL, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data)) { + removed_root <- FALSE + # Detect if first rank (higher) is fully collapsed + if (nrow(per_taxa_networks[[1]]) <= 1L) { + message("The first network has only one row. Removing it.") + removed_root <- TRUE + per_taxa_networks <- per_taxa_networks[-1] + } + result_list <- list() + # Init + net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start) + if (missing(first_model)) { + first_model <- BM_poisson( + membership_type = "LBM", + adj = per_taxa_networks[[net_id_start]], # Account for the root + verbosity = 2, + plotting = "", + ncores = 1L + ) + first_model$estimate() + } + + propagated_models <- lapply(seq_along(first_model$ICL), function(current_q) { + if (!is.na(first_model$ICL[current_q])) { + bm_propagate_taus_Q_next_level(phyloseq_data = phyloseq_data, first_model = first_model, per_taxa_networks = per_taxa_networks, Q = current_q, removed_root = removed_root, rank_id_start = rank_id_start, target_rank_id = target_rank_id) + } else { + return(NA) + } + }) + + target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id) + + new_model <- BM_poisson( + membership_type = "LBM", + adj = per_taxa_networks[[target_net_id]], # Account for the root + verbosity = 6, + plotting = "", + ncores = 1L + ) + + lapply(seq_along(propagated_models), function(current_q) { + if (any(is.na(propagated_models[[current_q]]))) { + return(NA) + } + new_model$memberships[[current_q]] <<- getRefClass("LBM")(from_cc = propagated_models[[current_q]]$membership) + new_model$model_parameters[[current_q]] <- propagated_models[[current_q]]$model + new_model$PL[current_q] <- propagated_models[[current_q]]$PL + new_model$H[current_q] <- propagated_models[[current_q]]$H + new_model$ICL[current_q] <- propagated_models[[current_q]]$PL - .5 * (propagated_models[[current_q]]$model$n_parameters * + log(new_model$data_number()) + new_model$memberships[[current_q]]$ICL_penalty()) + }) + + bm_estimate_with_inits(bm_model = new_model) + + return(new_model) +}