Adding all

This commit is contained in:
Louis Lacoste 2024-04-19 14:57:02 +02:00
parent 38cefffceb
commit c085f96a78
10 changed files with 507 additions and 43 deletions

View file

@ -0,0 +1,87 @@
library(ggplot2)
library(tidyverse)
library(latex2exp)
library(patchwork)
#  Loading data
data_folder <- file.path("code", "results", "simulations", "NA_robustness")
data <- readRDS(file.path(
data_folder,
"NA_robustness18-04-2024_17-05-44_1-100.Rds"
))
max_repetition <- max(data$repetition)
#  Averaging over repetitions
averaged_data <- data %>%
group_by(prop_NAs, model) %>%
select(-repetition) %>%
summarise_all(list(mean = mean, sd = sd))
#  Preparing auc_data
auc_data <- averaged_data %>%
select(c(prop_NAs, model) | contains("auc_")) %>%
rename_with(~ gsub("auc_", "", .x, fixed = TRUE))
auc_data_long <-
bind_cols(
auc_data %>% select(c("prop_NAs", "model") | contains("_mean")) %>%
pivot_longer(!c(prop_NAs, model),
names_to = c("method"),
values_to = "auc_mean"
),
auc_data %>% select(c("prop_NAs", "model") | contains("_sd")) %>%
pivot_longer(!c(prop_NAs, model),
names_to = NULL,
values_to = "auc_sd"
) %>% ungroup() %>% select(!c("prop_NAs", "model"))
) %>% mutate(method = gsub(
pattern = "_mean",
replacement = "", fixed = TRUE
))
# Preparing ARI data
ari_data <- averaged_data %>% select(c(prop_NAs, model) | contains("ari"))
ari_data_long <- bind_cols(
ari_data %>% select(c("prop_NAs", "model") | contains("_mean")) %>%
pivot_longer(!c(prop_NAs, model),
names_to = c("method"),
values_to = "ari_mean"
),
ari_data %>% select(c("prop_NAs", "model") | contains("_sd")) %>%
pivot_longer(!c(prop_NAs, model),
names_to = NULL,
values_to = "ari_sd"
) %>% ungroup() %>% select(!c("prop_NAs", "model"))
) %>%
separate(method, into = c("dim", "method", "useless"), sep = "_") %>%
select(-useless)
auc_plot <- ggplot(auc_data_long) +
aes(x = prop_NAs, y = auc_mean) +
geom_line(aes(color = method)) +
geom_point(aes(color = method)) +
geom_ribbon(aes(ymin = auc_mean - auc_sd, ymax = auc_mean + auc_sd, fill = method), alpha = 0.2) +
ylim(c(0, 1)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10L)) +
ylab(TeX("\\bar{AUC}")) +
xlab("NA proportion") +
ggtitle(TeX(paste(
"$\\bar{AUC}\\pm s_{AUC}$", ", function of NA proportion. N=", max_repetition
))) +
facet_grid(cols = vars(model))
ari_plot <- ggplot(ari_data_long) +
aes(x = prop_NAs, y = ari_mean) +
geom_line(aes(color = method)) +
geom_point(aes(color = method)) +
geom_ribbon(aes(ymin = ari_mean - ari_sd, ymax = ari_mean + ari_sd, fill = method), alpha = 0.2) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10L)) +
ylab(TeX("$\\bar{ARI^d}$")) +
xlab("NA proportion") +
ggtitle(TeX(paste(
"$\\bar{ARI^d}\\pm s_{ARI^d}$", ", function of NA proportion. N=", max_repetition
))) +
facet_grid(rows = vars(model), cols = vars(dim))
auc_plot + ari_plot

View file

@ -0,0 +1,18 @@
#!/usr/bin/env bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Finding simulations directory
SIMULATIONS_DIR=$(echo ${SCRIPT_DIR%/*}/simulations)
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct nested --sampling uniform"
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct modular --sampling uniform"
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct nested --sampling row"
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct modular --sampling row"
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct nested --sampling col"
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct modular --sampling col"
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct nested --sampling rowcol"
Rscript "$(echo ${SIMULATIONS_DIR}/simulations_NA_robustness.R)" "--struct modular --sampling rowcol"

View file

@ -1,35 +1,77 @@
necessary_packages <- c(
"remotes", "tictoc", "pROC", "sbm", "parallel", "colSBM"
"remotes", "tictoc", "pROC", "parallel", "colSBM"
)
if (!all((necessary_packages %in% installed.packages()))) {
install.packages(necessary_packages[-length(necessary_packages)])
remotes::install_github(repo = "Chabert-Liddell/colSBM@merge-bipartite-2")
install.packages(necessary_packages[-length(necessary_packages)])
remotes::install_github(repo = "Chabert-Liddell/colSBM@merge-bipartite-2")
}
library("sbm")
library("pROC")
library("colSBM")
# Arguments
arg <- commandArgs(trailingOnly = TRUE)
sampling <- "uniform"
struct <- "modular"
if (length(arg) == 0L) {
message("No arguments provided, using default.")
} else {
if ("--struct" %in% arg) {
struct <- arg[(which(arg) == "--struct") + 1L]
} else {
message("No structure provided, defaulting to modular.")
}
if ("--sampling" %in% arg) {
sampling <- arg[(which(arg) == "--sampling") + 1L]
} else {
message("No sampling provided, defaulting to uniform.")
}
}
#  Arguments checks
allowed_sampling <- c("uniform", "row", "col", "rowcol")
allowed_struct <- c("modular", "nested")
stopifnot(
"Unknown sampling, should be : uniform, row, rowcol" = (sampling %in%
allowed_sampling),
"Unknown structure, should be : modular or nested" = (struct %in% allowed_struct)
)
set.seed(1234)
eps <- 0.05
M <- 3
M <- 4
# Defining parameters
nr <- 100
nc <- 150
pir <- c(0.5, 0.3, 0.2)
pic <- c(0.5, 0.3, 0.2)
alpha <- matrix(c(
0.7, 0.4, 0.3,
0.4, 0.2, eps,
0.3, eps, eps
), byrow = TRUE, nrow = length(pir), ncol = length(pic))
max_repetition <- 10
alpha <- switch(struct,
"modular" = {
alpha <- matrix(c(
0.7, eps, eps,
eps, 0.4, eps,
eps, eps, 0.6
), byrow = TRUE, nrow = length(pir), ncol = length(pic))
},
"nested" = {
alpha <- matrix(c(
0.7, 0.4, 0.3,
0.4, 0.2, eps,
0.3, eps, eps
), byrow = TRUE, nrow = length(pir), ncol = length(pic))
}
)
max_repetition <- 5
# Collections
collections <- list(
@ -75,47 +117,89 @@ if (!dir.exists(main_dir)) {
}
start_time <- format(Sys.time(), "%d-%m-%Y_%H-%M-%S")
temp_dir <- file.path(main_dir, paste0("tmp", start_time))
temp_dir <- file.path(main_dir, paste0(
"tmp", start_time, "_",
sampling, "_", struct
))
if (!dir.exists(temp_dir)) {
dir.create(temp_dir, recursive = TRUE)
}
file_save <- file.path(main_dir, paste0(
"NA_robustness",
"NA_robustness_", sampling, "_", struct, "_",
start_time, "_1-", nrow(conditions), ".Rds"
))
message(
"Starting NA robustness simulation with ", sampling,
" sampling and ", struct, " structure."
)
result_list <- parallel::mclapply(seq_len(nrow(conditions)), function(current) {
# Looping over conditions
prop_NAs <- conditions[current, ]$prop_NAs
model <- as.character(conditions[current, ]$model)
prop_NAs <- conditions[current, ][["prop_NAs"]]
model <- as.character(conditions[current, ][["model"]])
bipartite_collection <- collections[[model]]
# This is a list of the M incidence matrices
bipartite_collection_incidence <- lapply(seq.int(M), function(m) {
bipartite_collection[[m]]$incidence_matrix
bipartite_collection[[m]][["incidence_matrix"]]
})
# Sampling values to replace by NAs
NAs_index <- sample(
seq_len(length(bipartite_collection_incidence[[1]])),
floor(prop_NAs * length(bipartite_collection_incidence[[1]]))
Z <- lapply(seq.int(M), function(m) {
list(
bipartite_collection[[m]][["row_blockmemberships"]],
bipartite_collection[[m]][["col_blockmemberships"]]
)
})
NAs_selected_index <- switch(sampling,
"uniform" = {
seq_len(length(bipartite_collection_incidence[[1]]))
},
"row" = {
row_cluster_selected <- sample.int(n = length(pir), size = 1)
row_nodes_selected <- which(Z[[1]][[1]] == row_cluster_selected)
col_nodes_selected <- seq(1, nc)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
#  Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
},
"col" = {
row_nodes_selected <- seq(1, nr)
col_cluster_selected <- sample.int(n = length(pic), size = 1)
col_nodes_selected <- which(Z[[1]][[2]] == col_cluster_selected)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
#  Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
},
"rowcol" = {
row_cluster_selected <- sample.int(n = length(pir), size = 1)
row_nodes_selected <- which(Z[[1]][[1]] == row_cluster_selected)
col_cluster_selected <- sample.int(n = length(pic), size = 1)
col_nodes_selected <- which(Z[[1]][[2]] == col_cluster_selected)
NAs_selected_index_exp <- expand.grid(row = row_nodes_selected, col = col_nodes_selected)
#  Computes the index as a single number, R stores matrices by column
(NAs_selected_index_exp[["col"]] - 1) *
nrow(bipartite_collection_incidence[[1]]) + NAs_selected_index_exp[["row"]]
}
)
NAs_index <- sample(NAs_selected_index, size = floor(prop_NAs * length(NAs_selected_index)))
real_val_NAs <- bipartite_collection_incidence[[1]][NAs_index]
bipartite_collection_incidence[[1]][NAs_index] <- NA
NAs_coordinates <- which(is.na(bipartite_collection_incidence[[1]]),
arr.ind = TRUE
)
Z <- lapply(seq.int(M), function(m) {
list(
bipartite_collection[[m]]$row_blockmemberships,
bipartite_collection[[m]]$col_blockmemberships
)
})
start_time <- Sys.time()
mybisbmpop <- estimate_colBiSBM(
netlist = bipartite_collection_incidence, colsbm_model = model,
@ -135,8 +219,12 @@ result_list <- parallel::mclapply(seq_len(nrow(conditions)), function(current) {
)
# Predicted links
X_hat_LBM <- baseline_LBM$best_fit$tau[[1]][[1]] %*% baseline_LBM$best_fit$alpha %*% t(baseline_LBM$best_fit$tau[[1]][[2]])
X_hat <- mybisbmpop$best_fit$tau[[1]][[1]] %*% mybisbmpop$best_fit$alpha %*% t(mybisbmpop$best_fit$tau[[1]][[2]])
X_hat_LBM <- baseline_LBM[["best_fit"]][["tau"]][[1]][[1]] %*%
baseline_LBM[["best_fit"]][["alpha"]] %*%
t(baseline_LBM[["best_fit"]][["tau"]][[1]][[2]])
X_hat <- mybisbmpop[["best_fit"]][["tau"]][[1]][[1]] %*%
mybisbmpop[["best_fit"]][["alpha"]] %*%
t(mybisbmpop[["best_fit"]][["tau"]][[1]][[2]])
# Compute ROC and AUC
auc_LBM <- auc(c(0, 1, real_val_NAs), c(0, 1, X_hat_LBM[NAs_index]))
@ -146,24 +234,24 @@ result_list <- parallel::mclapply(seq_len(nrow(conditions)), function(current) {
out_data_frame <- data.frame(
prop_NAs = prop_NAs,
model = model,
repetition = conditions[current, ]$repetition,
repetition = conditions[current, ][["repetition"]],
auc_LBM = auc_LBM,
auc_colBiSBM = auc_colBiSBM,
LBM_ari_row = aricode::ARI(
arirow_LBM = aricode::ARI(
Z[[1]][[1]],
baseline_LBM$best_fit$Z[[1]][[1]]
baseline_LBM[["best_fit"]][["Z"]][[1]][[1]]
),
LBM_ari_col = aricode::ARI(
aricol_LBM = aricode::ARI(
Z[[1]][[2]],
baseline_LBM$best_fit$Z[[1]][[2]]
baseline_LBM[["best_fit"]][["Z"]][[1]][[2]]
),
NA_net_ari_row = aricode::ARI(
arirow_colBiSBM = aricode::ARI(
Z[[1]][[1]],
mybisbmpop$best_fit$Z[[1]][[1]]
mybisbmpop[["best_fit"]][["Z"]][[1]][[1]]
),
NA_net_ari_col = aricode::ARI(
aricol_colBiSBM = aricode::ARI(
Z[[1]][[2]],
mybisbmpop$best_fit$Z[[1]][[2]]
mybisbmpop[["best_fit"]][["Z"]][[1]][[2]]
),
elapsed_secs = difftime(stop_time, start_time, units = "sec")
)
@ -190,3 +278,4 @@ saveRDS(
result_dataframe,
file = file_save
)
message("Finished simulations.")

View file

@ -90,6 +90,7 @@ file_save <- file.path(main_dir, paste0("bernoulli_inference_", start_time, "_",
tictoc::tic()
message("Starting bernoulli inference simulation.")
conditions_rows <- seq_len(nrow(conditions))
results <- parallel::mclapply(conditions_rows, function(s) {
start_time_condition <- Sys.time()
@ -311,7 +312,7 @@ results <- parallel::mclapply(conditions_rows, function(s) {
elapsed_secs = difftime(stop_time_condition,
start_time_condition, units = "sec")
)
message("Finished step ", s, "/", nrow(conditions), "\n")
message("Finished step ", s, "/", nrow(conditions))
#  Saving temp
temp_file_save <- file.path(temp_dir, paste0(
@ -333,3 +334,4 @@ full_data_frame <- do.call(rbind, results)
saveRDS(full_data_frame,
file = file_save
)
message("Finished simulations.")

View file

@ -89,7 +89,7 @@ if (!dir.exists(temp_dir)) {
file_save <- file.path(main_dir, paste0("poisson_inference_", start_time, "_", arg[1], "-", arg[2], ".Rds"))
tictoc::tic()
message("Starting poisson inference simulation.")
conditions_rows <- seq_len(nrow(conditions))
results <- parallel::mclapply(conditions_rows, function(s) {
start_time_condition <- Sys.time()
@ -311,7 +311,7 @@ results <- parallel::mclapply(conditions_rows, function(s) {
elapsed_secs = difftime(stop_time_condition,
start_time_condition, units = "sec")
)
message("Finished step ", s, "/", nrow(conditions), "\n")
message("Finished step ", s, "/", nrow(conditions))
#  Saving temp
temp_file_save <- file.path(temp_dir, paste0(
@ -333,3 +333,4 @@ full_data_frame <- do.call(rbind, results)
saveRDS(full_data_frame,
file = file_save
)
message("Finished simulations.")

View file

@ -0,0 +1,267 @@
suppressPackageStartupMessages(library("colSBM"))
suppressPackageStartupMessages(library(progressr))
suppressPackageStartupMessages(library(future))
suppressPackageStartupMessages(library(future.apply))
handlers(global = TRUE)
plan("multicore", workers = parallelly::availableCores() - 1)
# Network param
nr <- 90
nc <- 90
# Changing parameters
epsilons_pi <- seq(from = 0.0, to = 0.28, by = 0.035)
epsilons_rho <- seq(from = 0.0, to = 0.28, by = 0.035)
pi1 <- matrix(rep(1 / 3, 3), nrow = 1)
rho1 <- matrix(rep(1 / 3, 3), nrow = 1)
ea <- 0.16
alpha <- 0.25 + matrix(
c(
3 * ea, 2 * ea, ea,
2 * ea, 2 * ea, -ea,
ea, -ea, ea
),
byrow = TRUE, nrow = 3, ncol = 3
)
prob_order <- seq(1, 3)
prob_order <- t(sapply(combinat::permn(prob_order), function(v) v))
repetitions <- seq.int(3)
conditions <- tidyr::crossing(
epsilon_pi = epsilons_pi,
epsilon_rho = epsilons_rho,
pi2_order = prob_order,
rho2_order = prob_order,
repetition = repetitions
)
# To speed up computations and debug adding an argument based selection
if (!exists("arg")) {
arg <- commandArgs(trailingOnly = TRUE)
}
if (identical(arg, character(0))) {
cat(
"\nNo arguments provided,",
"assuming you want to go over all the conditions."
)
arg <- c(1, nrow(conditions))
batch_name <- "default"
} else {
if (!is.na(arg[3])) {
batch_name <- arg[3]
} else {
batch_name <- "default"
}
arg <- as.numeric(arg[-3])
}
if (arg[1] < 1 | arg[1] > nrow(conditions)) {
warning(paste("Arg 1 was invalid, set to 1."))
arg[1] <- 1
}
if (arg[2] > nrow(conditions) | arg[2] < 1) {
warning(paste("Arg 2 was invalid, set to", nrow(conditions)))
arg[2] <- nrow(conditions)
}
choosed_conditions <- seq.int(from = arg[1], to = arg[2])
conditions <- conditions[choosed_conditions, ]
#  Data params
main_dir <- file.path("code", "results", "simulations", "model_selection")
if (!dir.exists(main_dir)) {
dir.create(main_dir, recursive = TRUE)
}
start_time <- format(Sys.time(), "%d-%m-%Y_%H-%M-%S")
temp_dir <- file.path(main_dir, paste0("tmp", start_time))
if (!dir.exists(temp_dir)) {
dir.create(temp_dir, recursive = TRUE)
}
file_save <- file.path(main_dir, paste0(
"model_selection", start_time, "_",
batch_name, ".Rds"
))
message("Starting model selection simulation.")
tictoc::tic()
with_progress({
row_conditions <- seq_len(nrow(conditions))
p <- progressor(along = row_conditions)
results <- future_lapply(row_conditions, function(s) {
start_time_condition <- Sys.time()
epsilon_pi <- conditions[s, ]$epsilon_pi
epsilon_rho <- conditions[s, ]$epsilon_rho
# Computing the vector with the epsilons
current_pi2 <- c(
1 / 3 - epsilon_pi,
1 / 3,
1 / 3 + epsilon_pi
)
current_rho2 <- c(
1 / 3 - epsilon_rho,
1 / 3,
1 / 3 + epsilon_rho
)
# Permutating the vectors
current_pi2 <- current_pi2[conditions[s, ]$pi2_order]
current_rho2 <- current_rho2[conditions[s, ]$rho2_order]
netlist_generated <- list(
generate_bipartite_collection(
nr, nc, pi1, rho1,
alpha,
M = 1, return_memberships = TRUE
)[[1]],
generate_bipartite_collection(
nr, nc, current_pi2, current_rho2,
alpha,
M = 1, return_memberships = TRUE
)[[1]]
)
# Extracting the incidence matrices
netlist <- lapply(seq_along(netlist_generated), function(m) {
return(netlist_generated[[m]]$incidence_matrix)
})
pin <- progressor(steps = 4)
fitted_bisbmpop_iid <- estimate_colBiSBM(
netlist = netlist,
colsbm_model = "iid",
nb_run = 1,
global_opts = list(
verbosity = 0,
plot_details = 0,
nb_cores = parallel::detectCores() - 1
)
)
pin("iid computed")
fitted_bisbmpop_iid$sep_BiSBM$M <- 2
sep_BiSBM <- fitted_bisbmpop_iid$sep_BiSBM
fitted_bisbmpop_pi <- estimate_colBiSBM(
netlist = netlist,
colsbm_model = "pi",
nb_run = 1,
global_opts = list(
verbosity = 0,
plot_details = 0,
nb_cores = parallel::detectCores() - 1
),
sep_BiSBM = sep_BiSBM
)
pin("pi computed")
fitted_bisbmpop_rho <- estimate_colBiSBM(
netlist = netlist,
colsbm_model = "rho",
nb_run = 1,
global_opts = list(
verbosity = 0,
plot_details = 0,
nb_cores = parallel::detectCores() - 1
),
sep_BiSBM = sep_BiSBM
)
pin("rho computed")
fitted_bisbmpop_pirho <- estimate_colBiSBM(
netlist = netlist,
colsbm_model = "pirho",
nb_run = 1,
global_opts = list(
verbosity = 0,
plot_details = 0,
nb_cores = parallel::detectCores() - 1
),
sep_BiSBM = sep_BiSBM
)
pin("pirho computed")
stop_time_condition <- Sys.time()
# BICLs
sep_BICL <- sum(fitted_bisbmpop_iid$sep_BiSBM$BICL)
iid_BICL <- fitted_bisbmpop_iid$best_fit$BICL
pi_BICL <- fitted_bisbmpop_pi$best_fit$BICL
rho_BICL <- fitted_bisbmpop_rho$best_fit$BICL
pirho_BICL <- fitted_bisbmpop_pirho$best_fit$BICL
BICLs <- c(sep_BICL, iid_BICL, pi_BICL, rho_BICL, pirho_BICL)
data_frame_output <- data.frame(
# The conditions
epsilon_pi = epsilon_pi,
epsilon_rho = epsilon_rho,
pi2 = matrix(current_pi2, nrow = 1),
rho2 = matrix(current_rho2, nrow = 1),
repetition = as.numeric(conditions[s, ]$repetition),
# The results
## sep
sep_BICL = sep_BICL,
## iid
iid_BICL = iid_BICL,
iid_Q1 = fitted_bisbmpop_iid$best_fit$Q[1],
iid_Q2 = fitted_bisbmpop_iid$best_fit$Q[2],
## pi
pi_BICL = pi_BICL,
pi_Q1 = fitted_bisbmpop_pi$best_fit$Q[1],
pi_Q2 = fitted_bisbmpop_pi$best_fit$Q[2],
## pi
rho_BICL = rho_BICL,
rho_Q1 = fitted_bisbmpop_rho$best_fit$Q[1],
rho_Q2 = fitted_bisbmpop_rho$best_fit$Q[2],
## pirho
pirho_BICL = pirho_BICL,
pirho_Q1 = fitted_bisbmpop_pirho$best_fit$Q[1],
pirho_Q2 = fitted_bisbmpop_pirho$best_fit$Q[2],
# Preferred model
preferred_model = c(
"sep", "iid",
"pi", "rho", "pirho"
)[which.max(BICLs)],
elapsed_secs = difftime(stop_time_condition,
start_time_condition,
units = "sec"
)
)
message("Finished step ", s, "/", nrow(conditions))
#  Saving temp
temp_file_save <- file.path(temp_dir, paste0(
"conditions_", s, "_on_",
nrow(conditions), ".Rds"
))
saveRDS(object = data_frame_output, file = temp_file_save)
p(sprintf("Finished condition %i/%i", s, nrow(conditions)))
return(data_frame_output)
})
})
tictoc::toc()
full_data_frame <- do.call(rbind, results)
saveRDS(full_data_frame,
file = file_save
)
message("Finished simulations.")