Extracting sequential taxo function to utils-bm-seq.R

This commit is contained in:
Louis 2026-01-23 15:30:11 +01:00
parent 1df7268419
commit d7919e3098
2 changed files with 134 additions and 207 deletions

View file

@ -1,4 +1,5 @@
source("utils.R")
source("utils-bm-seq.R")
library(biomformat)
library(phyloseq)
library(R.utils)
@ -23,221 +24,16 @@ the_data <- import_biom("data/mach/kinetic.biom")
# return(fit)
# })
bm_propagate_taus_max_ICL <- function(phyloseq_data) {
# Prepare
per_taxa_network <- collapse_otu_at_taxo(phyloseq_data)
removed_root <- FALSE
# Detect if first rank (higher) is fully collapsed
if (nrow(per_taxa_network[[1]]) <= 1L) {
message("The first network has only one row. Removing it.")
removed_root <- TRUE
per_taxa_network <- per_taxa_network[-1]
}
result_list <- list()
# Init
mean_diff_vec <- c()
PL_per_nb_vec <- c()
max_rank_id <- tax_table(phyloseq_data) |> ncol()
rank_id <- 2
net_id <- ifelse(removed_root, rank_id - 1, rank_id)
model <- BM_poisson(
membership_type = "LBM",
adj = per_taxa_network[[net_id]], # Account for the root
verbosity = 2,
plotting = "",
ncores = 1L
)
model$estimate()
result_list <- append(result_list, model)
# Here we extract the memberships
best_model_memberships <- model$memberships[[which.max(model$ICL)]]
next_cc <- list(Z1 = best_model_memberships$Z1, Z2 = best_model_memberships$Z2)
# Here we dispatch the memberships for the OTU
## Propagate the rownames
while (rank_id < max_rank_id) {
rownames(next_cc$Z1) <- rownames(per_taxa_network[[net_id]])
rank_id <- rank_id + 1
net_id <- ifelse(removed_root, rank_id - 1, rank_id)
next_Z1 <- propagate_taus(tau_matrix = next_cc$Z1, physeq = phyloseq_data, taxrank = phyloseq::rank_names(phyloseq_data)[rank_id])
## Noising of the next_Z1
dZ1 <- dim(next_Z1)
nZ1 <- dZ1[1] * dZ1[2]
next_Z1 <- next_Z1 + matrix(rnorm(n = nZ1, mean = 0, sd = 0.01), nrow = dZ1[1], ncol = dZ1[2])
## Normalizing by rows
C_Z1 <- rowSums(next_Z1)
next_Z1 <- next_Z1 / C_Z1
next_cc$Z1 <- next_Z1
next_res <- blockmodels:::dispatcher(
membership_name = "LBM",
membership_init = next_cc,
model_name = "poisson",
network = list(adjacency = per_taxa_network[[net_id]]), real_EM = TRUE
) # account for the removed root
result_list <- append(result_list, next_res)
mean_diff_vec[paste0(rank_id - 1, "->", rank_id)] <- mean((next_res$membership$Z1 - next_Z1)^2)
cat("Mean diff across ranks:\n")
print(sqrt(mean_diff_vec))
next_cc <- next_res[["membership"]][c("Z1", "Z2")]
PL_per_nb_vec[paste0(rank_id - 1, "->", rank_id)] <- next_res$PL / (nrow(next_res$membership$Z1) * nrow(next_res$membership$Z2))
cat("PL/nZ1*nZ2: \n")
print(PL_per_nb_vec)
}
return(result_list)
}
# mach_out <- bm_propagate_taus_max_ICL(phyloseq_data = the_data)
# chaillou_out <- bm_propagate_taus_max_ICL(phyloseq_data = import_biom("data/chaillou/chaillou.biom"))
# ravel_out <- bm_propagate_taus_max_ICL(phyloseq_data = import_biom("data/ravel/ravel.biom"))
# To init a BM model I need to provide memberships and ICL
# And to compute with the dispatcher the previous values to init the models with ICL and memberships
bm_propagate_taus_Q_next_level <- function(phyloseq_data, first_model, Q, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data), rank_id_start = 2L, target_rank_id = rank_id_start + 1, removed_root = TRUE, sd_noise = 0.001) {
stopifnot("Q must be provided" = !missing(Q), "pĥyloseq_data must be provided" = !missing(phyloseq_data))
force(per_taxa_networks)
result_list <- list()
# Init
mean_diff_vec <- c()
PL_per_nb_vec <- c()
max_rank_id <- tax_table(phyloseq_data) |> ncol()
net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start)
result_list <- append(result_list, first_model)
# Here we extract the memberships
current_memberships <- first_model$memberships[[Q]]
if (!is.matrix(current_memberships$Z1)) {
nrow_Z1 <- length(current_memberships$Z1)
} else {
nrow_Z1 <- nrow(current_memberships$Z1)
}
next_cc <- list(Z1 = as.matrix(current_memberships$Z1, nrow = nrow_Z1), Z2 = current_memberships$Z2)
# Here we dispatch the memberships for the OTU
## Propagate the rownames
rownames(next_cc$Z1) <- rownames(per_taxa_networks[[net_id_start]])
target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id)
next_Z1 <- propagate_taus(tau_matrix = next_cc$Z1, physeq = phyloseq_data, taxrank = phyloseq::rank_names(phyloseq_data)[target_rank_id])
## Noising of the next_Z1
dZ1 <- dim(next_Z1)
nZ1 <- dZ1[1] * dZ1[2]
next_Z1 <- next_Z1 + matrix(rnorm(n = nZ1, mean = 0, sd = sd_noise), nrow = dZ1[1], ncol = dZ1[2])
## Normalizing by rows
C_Z1 <- rowSums(next_Z1)
next_Z1 <- next_Z1 / C_Z1
next_cc$Z1 <- next_Z1
next_res <- blockmodels:::dispatcher(
membership_name = "LBM",
membership_init = next_cc,
model_name = "poisson",
network = list(adjacency = per_taxa_networks[[target_net_id]]), real_EM = TRUE
) # account for the removed root
result_list <- append(result_list, next_res)
mean_diff_vec[paste0(rank_id_start, "->", target_rank_id)] <- mean((next_res$membership$Z1 - next_Z1)^2)
next_cc <- next_res[["membership"]][c("Z1", "Z2")]
PL_per_nb_vec[paste0(rank_id_start, "->", target_rank_id)] <- next_res$PL / (nrow(next_res$membership$Z1) * nrow(next_res$membership$Z2))
return(next_res)
}
#' Use a preinited model to run blockmodels exploration
#'
#' @param bm_model A pre-initialized model object
bm_estimate_with_inits <- function(bm_model, reinitialization_effort = 1) {
l <- TRUE
n <- 1
changing_effort <- FALSE
while (l) {
cat("Pass", n, "\n")
cat("With ascending number of groups\n")
ra <- bm_model$estim_ascend(reinitialization_effort, changing_effort)
cat("With descending number of groups\n")
rb <- bm_model$estim_descend(reinitialization_effort)
l <- ra || rb
n <- n + 1
changing_effort <- FALSE
}
}
bm_propagate_taus_all_models <- function(phyloseq_data, rank_id_start = 2, target_rank_id = rank_id_start + 1, first_model = NULL, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data)) {
removed_root <- FALSE
# Detect if first rank (higher) is fully collapsed
if (nrow(per_taxa_networks[[1]]) <= 1L) {
message("The first network has only one row. Removing it.")
removed_root <- TRUE
per_taxa_networks <- per_taxa_networks[-1]
}
result_list <- list()
# Init
net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start)
if (missing(first_model)) {
first_model <- BM_poisson(
membership_type = "LBM",
adj = per_taxa_networks[[net_id_start]], # Account for the root
verbosity = 2,
plotting = "",
ncores = 1L
)
first_model$estimate()
}
propagated_models <- lapply(seq_along(first_model$ICL), function(current_q) {
if (!is.na(first_model$ICL[current_q])) {
bm_propagate_taus_Q_next_level(phyloseq_data = phyloseq_data, first_model = first_model, per_taxa_networks = per_taxa_networks, Q = current_q, removed_root = removed_root, rank_id_start = rank_id_start, target_rank_id = target_rank_id)
} else {
return(NA)
}
})
target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id)
new_model <- BM_poisson(
membership_type = "LBM",
adj = per_taxa_networks[[target_net_id]], # Account for the root
verbosity = 6,
plotting = "",
ncores = 1L
)
lapply(seq_along(propagated_models), function(current_q) {
if (any(is.na(propagated_models[[current_q]]))) {
return(NA)
}
new_model$memberships[[current_q]] <<- getRefClass("LBM")(from_cc = propagated_models[[current_q]]$membership)
new_model$model_parameters[[current_q]] <- propagated_models[[current_q]]$model
new_model$PL[current_q] <- propagated_models[[current_q]]$PL
new_model$H[current_q] <- propagated_models[[current_q]]$H
new_model$ICL[current_q] <- propagated_models[[current_q]]$PL - .5 * (propagated_models[[current_q]]$model$n_parameters *
log(new_model$data_number()) + new_model$memberships[[current_q]]$ICL_penalty())
})
bm_estimate_with_inits(bm_model = new_model)
return(new_model)
}
per_taxa_networks <- collapse_otu_at_taxo(the_data)
r2_model <- BM_poisson(
membership_type = "LBM",
adj = per_taxa_networks[[2]], # Account for the root
verbosity = 2,
verbosity = 6,
plotting = "",
ncores = 1L
ncores = parallelly::availableCores()
)
r2_model$estimate()
r3_model <- bm_propagate_taus_all_models(phyloseq_data = the_data, rank_id_start = 2, target_rank_id = 3, per_taxa_networks = per_taxa_networks)

131
utils-bm-seq.R Normal file
View file

@ -0,0 +1,131 @@
library(blockmodels)
library(phyloseq)
bm_propagate_taus_Q_next_level <- function(phyloseq_data, first_model, Q, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data), rank_id_start = 2L, target_rank_id = rank_id_start + 1, removed_root = TRUE, sd_noise = 0.001) {
stopifnot("Q must be provided" = !missing(Q), "pĥyloseq_data must be provided" = !missing(phyloseq_data))
force(per_taxa_networks)
result_list <- list()
# Init
mean_diff_vec <- c()
PL_per_nb_vec <- c()
max_rank_id <- tax_table(phyloseq_data) |> ncol()
net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start)
result_list <- append(result_list, first_model)
# Here we extract the memberships
current_memberships <- first_model$memberships[[Q]]
if (!is.matrix(current_memberships$Z1)) {
nrow_Z1 <- length(current_memberships$Z1)
} else {
nrow_Z1 <- nrow(current_memberships$Z1)
}
next_cc <- list(Z1 = as.matrix(current_memberships$Z1, nrow = nrow_Z1), Z2 = current_memberships$Z2)
# Here we dispatch the memberships for the OTU
## Propagate the rownames
rownames(next_cc$Z1) <- rownames(per_taxa_networks[[net_id_start]])
target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id)
next_Z1 <- propagate_taus(tau_matrix = next_cc$Z1, physeq = phyloseq_data, taxrank = phyloseq::rank_names(phyloseq_data)[target_rank_id])
## Noising of the next_Z1
dZ1 <- dim(next_Z1)
nZ1 <- dZ1[1] * dZ1[2]
next_Z1 <- next_Z1 + matrix(rnorm(n = nZ1, mean = 0, sd = sd_noise), nrow = dZ1[1], ncol = dZ1[2])
## Normalizing by rows
C_Z1 <- rowSums(next_Z1)
next_Z1 <- next_Z1 / C_Z1
next_cc$Z1 <- next_Z1
next_res <- blockmodels:::dispatcher(
membership_name = "LBM",
membership_init = next_cc,
model_name = "poisson",
network = list(adjacency = per_taxa_networks[[target_net_id]]), real_EM = TRUE
) # account for the removed root
result_list <- append(result_list, next_res)
mean_diff_vec[paste0(rank_id_start, "->", target_rank_id)] <- mean((next_res$membership$Z1 - next_Z1)^2)
next_cc <- next_res[["membership"]][c("Z1", "Z2")]
PL_per_nb_vec[paste0(rank_id_start, "->", target_rank_id)] <- next_res$PL / (nrow(next_res$membership$Z1) * nrow(next_res$membership$Z2))
return(next_res)
}
#' Use a preinited model to run blockmodels exploration
#'
#' @param bm_model A pre-initialized model object
bm_estimate_with_inits <- function(bm_model, reinitialization_effort = 1) {
l <- TRUE
n <- 1
changing_effort <- FALSE
while (l) {
cat("Pass", n, "\n")
cat("With ascending number of groups\n")
ra <- bm_model$estim_ascend(reinitialization_effort, changing_effort)
cat("With descending number of groups\n")
rb <- bm_model$estim_descend(reinitialization_effort)
l <- ra || rb
n <- n + 1
changing_effort <- FALSE
}
}
bm_propagate_taus_all_models <- function(phyloseq_data, rank_id_start = 2, target_rank_id = rank_id_start + 1, first_model = NULL, per_taxa_networks = collapse_otu_at_taxo(phyloseq_data)) {
removed_root <- FALSE
# Detect if first rank (higher) is fully collapsed
if (nrow(per_taxa_networks[[1]]) <= 1L) {
message("The first network has only one row. Removing it.")
removed_root <- TRUE
per_taxa_networks <- per_taxa_networks[-1]
}
result_list <- list()
# Init
net_id_start <- ifelse(removed_root, rank_id_start - 1, rank_id_start)
if (missing(first_model)) {
first_model <- BM_poisson(
membership_type = "LBM",
adj = per_taxa_networks[[net_id_start]], # Account for the root
verbosity = 2,
plotting = "",
ncores = 1L
)
first_model$estimate()
}
propagated_models <- lapply(seq_along(first_model$ICL), function(current_q) {
if (!is.na(first_model$ICL[current_q])) {
bm_propagate_taus_Q_next_level(phyloseq_data = phyloseq_data, first_model = first_model, per_taxa_networks = per_taxa_networks, Q = current_q, removed_root = removed_root, rank_id_start = rank_id_start, target_rank_id = target_rank_id)
} else {
return(NA)
}
})
target_net_id <- ifelse(removed_root, target_rank_id - 1, target_rank_id)
new_model <- BM_poisson(
membership_type = "LBM",
adj = per_taxa_networks[[target_net_id]], # Account for the root
verbosity = 6,
plotting = "",
ncores = 1L
)
lapply(seq_along(propagated_models), function(current_q) {
if (any(is.na(propagated_models[[current_q]]))) {
return(NA)
}
new_model$memberships[[current_q]] <<- getRefClass("LBM")(from_cc = propagated_models[[current_q]]$membership)
new_model$model_parameters[[current_q]] <- propagated_models[[current_q]]$model
new_model$PL[current_q] <- propagated_models[[current_q]]$PL
new_model$H[current_q] <- propagated_models[[current_q]]$H
new_model$ICL[current_q] <- propagated_models[[current_q]]$PL - .5 * (propagated_models[[current_q]]$model$n_parameters *
log(new_model$data_number()) + new_model$memberships[[current_q]]$ICL_penalty())
})
bm_estimate_with_inits(bm_model = new_model)
return(new_model)
}