# human-microbiome-compendium/increasing_size_test.R
#
# 117 lines
# 3.6 KiB
# R

library(sbm)
# ---- Command-line arguments ----
# Usage: Rscript increasing_size_test.R [[min] max]
#   * no argument   : default values are used for both bounds
#   * one argument  : treated as the maximum number of columns
#   * two arguments : treated as minimum then maximum number of columns
# A missing, non-numeric or non-positive value falls back to its default
# (min = 50, max = 5000).

# Parse a single command-line value as a strictly positive integer.
# Returns NA_integer_ when `arg` is not exactly one value, is NA, cannot be
# read as an integer, or is <= 0. The coercion warning is silenced because the
# NA it announces is handled explicitly here.
parse_positive_int <- function(arg) {
  if (length(arg) != 1L || is.na(arg)) {
    return(NA_integer_)
  }
  value <- suppressWarnings(as.integer(arg))
  if (is.na(value) || value <= 0L) {
    return(NA_integer_)
  }
  value
}

# Resolve one bound ("min" or "max"): use the parsed argument when valid,
# otherwise `default`. Prints which of the two was chosen and returns it.
resolve_bound <- function(arg, default, label) {
  value <- parse_positive_int(arg)
  if (is.na(value)) {
    print(paste0(
      "No or incorrect argument was passed setting ", label,
      " to default value : ", default
    ))
    return(default)
  }
  print(paste0("Setting to ", label, " provided value : ", value))
  value
}

args <- commandArgs(trailingOnly = TRUE)
if (length(args) > 2L) {
  stop("Too many arguments provided")
}
if (length(args) == 2L) {
  print("Two arguments were provided, will be treated as min and max")
  min_arg <- args[1]
  max_arg <- args[2]
} else {
  if (length(args) == 1L) {
    print("One argument was provided, will be treated as max")
  } else {
    # Previously reported as "One argument was provided" even with no argument.
    print("No argument was provided, default min and max will be used")
  }
  # `max_arg` is character(0) when no argument was given; the NA `min_arg`
  # makes the minimum fall back to its default.
  max_arg <- args
  min_arg <- NA
}

max_nb_col <- resolve_bound(max_arg, 5000L, "max")
min_nb_col <- resolve_bound(min_arg, 50L, "min")

if (max_nb_col <= min_nb_col) {
  stop("The range between min and max should be positive and larger than 0")
}
# ---- Simulation design ----
# Bipartite SBM with a fixed number of rows and a growing number of columns.
# The seed is fixed so that the simulated networks are reproducible.
set.seed(1234)
model <- "bernoulli"
nb_row <- 50

# Block proportions: first element has 2 groups, second element has 3 groups.
blockProp <- list(
  c(0.25, 0.75),
  c(0.1, 0.4, 0.5)
)

# 2 x 3 matrix of connection parameters.
# NOTE(review): matrix() fills column-wise, so the first row is
# (0.9, 0.1, 0.2) and the second is (0.5, 0.3, 0.05), which differs from the
# 2-rows-of-3 layout the literal suggests. Confirm whether byrow = TRUE was
# intended.
connectParam <- list(
  mean = matrix(
    c(0.9, 0.5, 0.1, 0.3, 0.2, 0.05),
    nrow = 2L, ncol = 3L
  )
)

# Column counts to test: from min to max in steps of 50.
nb_col_seq <- seq(min_nb_col, max_nb_col, by = 50)

# Build a sampler for an `nb_row` x `nb_col` network and return the result of
# its rNetwork() method.
simulate_lbm <- function(nb_col) {
  sampler <- sampleBipartiteSBM(
    nbNodes = c(nb_row, nb_col),
    blockProp = blockProp,
    connectParam = connectParam,
    model = model
  )
  sampler$rNetwork()
}

lbm_list <- lapply(nb_col_seq, simulate_lbm)
# Convert a one-hot membership matrix into block indices.
# For every row of `mat`, returns the position(s) of the entries equal to 1.
# When each row contains exactly one 1 the result is an integer vector with one
# index per row; otherwise apply() returns a list.
unonehot <- function(mat) {
  hot_position <- function(indicator) {
    which(indicator == 1)
  }
  apply(mat, MARGIN = 1, FUN = hot_position)
}
# Pull the simulated data out of each sampled object: the network matrix and
# the row / column block memberships, the latter converted from one-hot
# indicator matrices to block indices.
lbm_matrices <- lapply(lbm_list, function(sampled) sampled$networkData)
lbm_row_memberships <- lapply(
  lbm_list,
  function(sampled) unonehot(sampled$indMemberships$row)
)
lbm_col_memberships <- lapply(
  lbm_list,
  function(sampled) unonehot(sampled$indMemberships$col)
)
library(here)
# ---- Output location ----
# Results go to <project root>/results/increasing_size/, created on demand.
results_dir <- here("results", "increasing_size")
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}

# The file name encodes the model and the tested column range.
save_file <- paste0(
  "sbm_incr_", model, "_from_", min_nb_col, "_to_", max_nb_col, ".Rds"
)
save_path <- here(results_dir, save_file)
print(paste0("Final results will be saved to ", save_path))
# (epoch <- as.integer(Sys.time()))
# temp_dir <- here(results_dir, paste0(min_nb_col, "_to_", max_nb_col, "tmp", epoch))
# print(paste0("Temp saved to ", temp_dir))
library(parallelly)
library(future)
library(future.apply)
library(future.callr)
# ---- Parallel estimation ----
# Each network is fitted in its own callr background R session.
# NOTE(review): the worker count is hard-coded to 64 regardless of the
# machine; confirm this matches the cores actually available, since
# oversubscription would distort the recorded timings.
plan(tweak("callr", workers = 64))

# Fit one bipartite SBM per simulated network and record the wall-clock time.
# Iterating over `lbm_matrices` directly (instead of over indices) lets
# future_lapply ship each worker only its own elements rather than exporting
# the whole list as a global to every future.
# Each element of `lbm_res` is list(fit = <fitted model>, time = <difftime>).
lbm_res <- future_lapply(lbm_matrices, function(net_mat) {
  start_time <- Sys.time()
  fit <- estimateBipartiteSBM(netMat = net_mat, estimOptions = list(plot = 0))
  stop_time <- Sys.time()
  # Force seconds: `stop_time - start_time` picks its unit automatically
  # (secs, mins, hours, ...), which makes long and short fits incomparable
  # once the unit attribute is dropped.
  list(fit = fit, time = difftime(stop_time, start_time, units = "secs"))
}, future.seed = TRUE)
# ---- Unpack estimation results ----
lbm_fits <- lapply(lbm_res, function(res) res$fit)

# Fitting times as plain numbers, always expressed in seconds. A difftime may
# carry any unit (secs, mins, ...); simplifying with sapply() would drop the
# unit attribute and silently mix them, so convert explicitly.
lbm_times <- vapply(
  lbm_res,
  function(res) as.numeric(res$time, units = "secs"),
  numeric(1)
)

# Estimated row / column block indices for each fit.
lbm_fit_row <- lapply(lbm_fits, function(fit) unonehot(fit$indMemberships$row))
lbm_fit_col <- lapply(lbm_fits, function(fit) unonehot(fit$indMemberships$col))
library(aricode)
# ---- Clustering quality and export ----
# Adjusted Rand Index between simulated and estimated memberships, one value
# per simulated network. vapply() guarantees a numeric vector whatever the
# number of networks, where sapply()'s return type depends on its input.
ari_row <- vapply(seq_along(lbm_matrices), function(idx) {
  ARI(lbm_row_memberships[[idx]], lbm_fit_row[[idx]])
}, numeric(1))
ari_col <- vapply(seq_along(lbm_matrices), function(idx) {
  ARI(lbm_col_memberships[[idx]], lbm_fit_col[[idx]])
}, numeric(1))

# One line per tested size: dimensions, model, fitting time and ARI scores.
out_df <- data.frame(
  n1 = nb_row,
  n2 = nb_col_seq,
  model = model,
  time = lbm_times,
  ari_row = ari_row,
  ari_col = ari_col
)
saveRDS(out_df, save_path)