simulations : NA robustness, switching to future backend

This commit is contained in:
Louis Lacoste 2024-07-18 09:52:58 +02:00
parent 4dbdc6b61c
commit e9c4dc3883

View file

@ -9,6 +9,7 @@ if (!all((necessary_packages %in% installed.packages()))) {
library("pROC")
library("colSBM")
future::plan(future::multicore())
# Arguments
arg <- commandArgs(trailingOnly = TRUE)
@ -73,13 +74,15 @@ collections <- list(
"iid" = c(
generate_bipartite_collection(nr1, nc1,
pir, pic,
alpha, 1,
alpha,
M = 1,
model = "iid",
return_memberships = TRUE
),
generate_bipartite_collection(nr2, nc2,
pir, pic,
alpha, M - 1,
alpha,
M = M - 1,
model = "iid",
return_memberships = TRUE
)
@ -162,142 +165,144 @@ message(
"Starting NA robustness simulation with ", sampling,
" sampling and ", struct, " structure."
)
result_list <- parallel::mclapply(seq_len(nrow(conditions)), function(current) {
# Looping over conditions
prop_NAs <- conditions[current, ][["prop_NAs"]]
model <- as.character(conditions[current, ][["model"]])
bipartite_collection <- collections[[model]]
result_list <- future.apply::future_lapply(
seq_len(nrow(conditions)),
function(current) {
# Looping over conditions
prop_NAs <- conditions[current, ][["prop_NAs"]]
model <- as.character(conditions[current, ][["model"]])
bipartite_collection <- collections[[model]]
# This is a list of the M incidence matrices
bipartite_collection_incidence <- lapply(seq.int(M), function(m) {
bipartite_collection[[m]][["incidence_matrix"]]
})
# This is a list of the M incidence matrices
bipartite_collection_incidence <- lapply(seq.int(M), function(m) {
bipartite_collection[[m]][["incidence_matrix"]]
})
Z <- lapply(seq.int(M), function(m) {
list(
bipartite_collection[[m]][["row_blockmemberships"]],
bipartite_collection[[m]][["col_blockmemberships"]]
Z <- lapply(seq.int(M), function(m) {
list(
bipartite_collection[[m]][["row_blockmemberships"]],
bipartite_collection[[m]][["col_blockmemberships"]]
)
})
NAs_selected_index <- switch(sampling,
"uniform" = {
seq_len(length(bipartite_collection_incidence[[1]]))
},
"row" = {
row_cluster_selected <- sample.int(n = length(pir), size = 1)
row_nodes_selected <- which(Z[[1]][[1]] == row_cluster_selected)
col_nodes_selected <- seq(1, nc2)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
# Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
},
"col" = {
row_nodes_selected <- seq(1, nr2)
col_cluster_selected <- sample.int(n = length(pic), size = 1)
col_nodes_selected <- which(Z[[1]][[2]] == col_cluster_selected)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
# Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
},
"rowcol" = {
row_cluster_selected <- sample.int(n = length(pir), size = 1)
row_nodes_selected <- which(Z[[1]][[1]] == row_cluster_selected)
col_cluster_selected <- sample.int(n = length(pic), size = 1)
col_nodes_selected <- which(Z[[1]][[2]] == col_cluster_selected)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
# Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
}
)
})
NAs_selected_index <- switch(sampling,
"uniform" = {
seq_len(length(bipartite_collection_incidence[[1]]))
},
"row" = {
row_cluster_selected <- sample.int(n = length(pir), size = 1)
row_nodes_selected <- which(Z[[1]][[1]] == row_cluster_selected)
col_nodes_selected <- seq(1, nc2)
NAs_index <- sample(NAs_selected_index, size = floor(prop_NAs * length(NAs_selected_index)))
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
# Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
},
"col" = {
row_nodes_selected <- seq(1, nr2)
col_cluster_selected <- sample.int(n = length(pic), size = 1)
col_nodes_selected <- which(Z[[1]][[2]] == col_cluster_selected)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
# Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
},
"rowcol" = {
row_cluster_selected <- sample.int(n = length(pir), size = 1)
row_nodes_selected <- which(Z[[1]][[1]] == row_cluster_selected)
col_cluster_selected <- sample.int(n = length(pic), size = 1)
col_nodes_selected <- which(Z[[1]][[2]] == col_cluster_selected)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
# Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
}
)
NAs_index <- sample(NAs_selected_index, size = floor(prop_NAs * length(NAs_selected_index)))
real_val_NAs <- bipartite_collection_incidence[[1]][NAs_index]
bipartite_collection_incidence[[1]][NAs_index] <- NA
NAs_coordinates <- which(is.na(bipartite_collection_incidence[[1]]),
arr.ind = TRUE
)
start_time <- Sys.time()
mybisbmpop <- estimate_colBiSBM(
netlist = bipartite_collection_incidence, colsbm_model = model,
nb_run = 1,
global_opts = list(
nb_cores = parallel::detectCores() - 1, verbosity = 0
real_val_NAs <- bipartite_collection_incidence[[1]][NAs_index]
bipartite_collection_incidence[[1]][NAs_index] <- NA
NAs_coordinates <- which(is.na(bipartite_collection_incidence[[1]]),
arr.ind = TRUE
)
)
stop_time <- Sys.time()
baseline_LBM <- estimate_colBiSBM(
netlist = bipartite_collection_incidence[[1]], colsbm_model = "iid",
nb_run = 1,
global_opts = list(
nb_cores = parallel::detectCores() - 1, verbosity = 0
start_time <- Sys.time()
mybisbmpop <- estimate_colBiSBM(
netlist = bipartite_collection_incidence, colsbm_model = model,
nb_run = 1,
global_opts = list(
nb_cores = parallel::detectCores() - 1, verbosity = 0
)
)
)
stop_time <- Sys.time()
# Predicted links
X_hat_LBM <- baseline_LBM[["best_fit"]][["tau"]][[1]][[1]] %*%
baseline_LBM[["best_fit"]][["alpha"]] %*%
t(baseline_LBM[["best_fit"]][["tau"]][[1]][[2]])
X_hat <- mybisbmpop[["best_fit"]][["tau"]][[1]][[1]] %*%
mybisbmpop[["best_fit"]][["alpha"]] %*%
t(mybisbmpop[["best_fit"]][["tau"]][[1]][[2]])
baseline_LBM <- estimate_colBiSBM(
netlist = bipartite_collection_incidence[[1]], colsbm_model = "iid",
nb_run = 1,
global_opts = list(
nb_cores = parallel::detectCores() - 1, verbosity = 0
)
)
# Compute ROC and AUC
auc_LBM <- auc(c(0, 1, real_val_NAs), c(0, 1, X_hat_LBM[NAs_index]))
auc_colBiSBM <- auc(c(0, 1, real_val_NAs), c(0, 1, X_hat[NAs_index]))
# Predicted links
X_hat_LBM <- baseline_LBM[["best_fit"]][["tau"]][[1]][[1]] %*%
baseline_LBM[["best_fit"]][["alpha"]] %*%
t(baseline_LBM[["best_fit"]][["tau"]][[1]][[2]])
X_hat <- mybisbmpop[["best_fit"]][["tau"]][[1]][[1]] %*%
mybisbmpop[["best_fit"]][["alpha"]] %*%
t(mybisbmpop[["best_fit"]][["tau"]][[1]][[2]])
# Computing ARI on the NAs
out_data_frame <- data.frame(
prop_NAs = prop_NAs,
model = model,
repetition = conditions[current, ][["repetition"]],
auc_LBM = auc_LBM,
auc_colBiSBM = auc_colBiSBM,
arirow_LBM = aricode::ARI(
Z[[1]][[1]],
baseline_LBM[["best_fit"]][["Z"]][[1]][[1]]
),
aricol_LBM = aricode::ARI(
Z[[1]][[2]],
baseline_LBM[["best_fit"]][["Z"]][[1]][[2]]
),
arirow_colBiSBM = aricode::ARI(
Z[[1]][[1]],
mybisbmpop[["best_fit"]][["Z"]][[1]][[1]]
),
aricol_colBiSBM = aricode::ARI(
Z[[1]][[2]],
mybisbmpop[["best_fit"]][["Z"]][[1]][[2]]
),
elapsed_secs = difftime(stop_time, start_time, units = "sec"),
sampling = sampling,
struct = struct
)
# Compute ROC and AUC
auc_LBM <- auc(c(0, 1, real_val_NAs), c(0, 1, X_hat_LBM[NAs_index]))
auc_colBiSBM <- auc(c(0, 1, real_val_NAs), c(0, 1, X_hat[NAs_index]))
message("Finished step ", current, "/", nrow(conditions), "\n")
# Computing ARI on the NAs
out_data_frame <- data.frame(
prop_NAs = prop_NAs,
model = model,
repetition = conditions[current, ][["repetition"]],
auc_LBM = auc_LBM,
auc_colBiSBM = auc_colBiSBM,
arirow_LBM = aricode::ARI(
Z[[1]][[1]],
baseline_LBM[["best_fit"]][["Z"]][[1]][[1]]
),
aricol_LBM = aricode::ARI(
Z[[1]][[2]],
baseline_LBM[["best_fit"]][["Z"]][[1]][[2]]
),
arirow_colBiSBM = aricode::ARI(
Z[[1]][[1]],
mybisbmpop[["best_fit"]][["Z"]][[1]][[1]]
),
aricol_colBiSBM = aricode::ARI(
Z[[1]][[2]],
mybisbmpop[["best_fit"]][["Z"]][[1]][[2]]
),
elapsed_secs = difftime(stop_time, start_time, units = "sec"),
sampling = sampling,
struct = struct
)
# Saving temp
temp_file_save <- file.path(temp_dir, paste0(
"conditions_", current, "_on_",
nrow(conditions), ".Rds"
))
message("Finished step ", current, "/", nrow(conditions), "\n")
saveRDS(out_data_frame, file = temp_file_save)
# Saving temp
temp_file_save <- file.path(temp_dir, paste0(
"conditions_", current, "_on_",
nrow(conditions), ".Rds"
))
return(out_data_frame)
},
mc.cores = parallel::detectCores() - 1
saveRDS(out_data_frame, file = temp_file_save)
return(out_data_frame)
},
future.seed = NULL
)
result_dataframe <- do.call("rbind", result_list)