From a3a44e02a6c44abdf5c28cb118125076fae7fee9 Mon Sep 17 00:00:00 2001 From: Louis Lacoste Date: Tue, 18 Jun 2024 16:57:59 +0200 Subject: [PATCH] Profiling wether max vem imports : IT DOES --- .../impact_of_vem_max_steps.html | 515 ++++++++++++++++++ .../investigating/impact_of_vem_max_steps.qmd | 62 +++ .../investigating/profiling_clustering.R | 170 ++++++ 3 files changed, 747 insertions(+) create mode 100644 code/analysis/investigating/impact_of_vem_max_steps.html create mode 100644 code/analysis/investigating/impact_of_vem_max_steps.qmd create mode 100644 code/analysis/investigating/profiling_clustering.R diff --git a/code/analysis/investigating/impact_of_vem_max_steps.html b/code/analysis/investigating/impact_of_vem_max_steps.html new file mode 100644 index 0000000..9d2c6f5 --- /dev/null +++ b/code/analysis/investigating/impact_of_vem_max_steps.html @@ -0,0 +1,515 @@ + + + + + + + + + +Investigation sur l’impact de max_vem_steps + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

Investigation sur l’impact de max_vem_steps

+
+ + + +
+ + + + +
+ + + +
+ + +
+

Générations des données

+

Générons les données avec une seed fixée (\(s_{net} = 0\))

+
+ +
+ + +
+ + + + + \ No newline at end of file diff --git a/code/analysis/investigating/impact_of_vem_max_steps.qmd b/code/analysis/investigating/impact_of_vem_max_steps.qmd new file mode 100644 index 0000000..3f835e1 --- /dev/null +++ b/code/analysis/investigating/impact_of_vem_max_steps.qmd @@ -0,0 +1,62 @@ +--- +title: Investigation sur l'impact de `max_vem_steps` +format: html +execute: + echo: false + warning: true +--- +```{r libraries} +library(colSBM) +library(here) +library(ggplot2) +``` + +```{r constants} +base_folder <- here("code", "results", "investigating", "vem_steps") + + +if (!dir.exists(base_folder)) { + dir.create(base_folder, recursive = TRUE) +} + +net_seed <- 0 +test_seeds <- c(12, 3) +epsilons <- c(0.1, 0.4) +vem_steps <- seq(10, 300, by = 40) + +conditions <- expand.grid( + seeds = test_seeds, + epsilons = epsilons, + vem_steps = vem_steps +) + + +base_alpha <- matrix(rep(0.3, 9L), nrow = 3L) +pi <- c(0.3, 0.2, 0.5) +rho <- c(0.55, 0.15, 0.3) +M <- 10L +nr <- c(rep(30L, M / 2L), rep(95L, M / 2L)) +nc <- c(rep(40L, M / 2L), rep(70L, M / 2L)) +``` + +```{r} +df <- readRDS(file.path(base_folder, "impact_vem_1718196265.Rds")) + +df$eps <- factor(df$eps) +df$seed <- factor(df$seed) +df$elapsed_time <- as.numeric(df$elapsed_time) +``` + +```{r ari} +ggplot(df) + + aes(x = max_vem_steps, y = ari, color = eps) + + geom_line(aes(linetype = seed)) + + geom_point() +``` + +```{r time} +ggplot(df) + + aes(x = max_vem_steps, y = elapsed_time, color = eps) + + geom_line(aes(linetype = seed)) + + geom_point() +``` \ No newline at end of file diff --git a/code/analysis/investigating/profiling_clustering.R b/code/analysis/investigating/profiling_clustering.R new file mode 100644 index 0000000..1ebbb79 --- /dev/null +++ b/code/analysis/investigating/profiling_clustering.R @@ -0,0 +1,170 @@ +library(colSBM) +library(aricode) +library(here) + + +base_folder <- here("code", "results", "investigating", "profiling_clustering") + + +if (!dir.exists(base_folder)) { + dir.create(base_folder, recursive = TRUE) +} + +net_seed <- 0 +test_seeds <- c(12, 3) +epsilons <- c(0.1, 0.4) +vem_steps <- seq(10, 300, by = 40) + +conditions <- expand.grid( + seeds = test_seeds, + epsilons = epsilons, + vem_steps = vem_steps +) + + +base_alpha <- matrix(rep(0.3, 9L), nrow = 3L) +pi <- c(0.3, 0.2, 0.5) +rho <- c(0.55, 0.15, 0.3) +M <- 10L +nr <- c(rep(30L, M / 2L), rep(95L, M / 2L)) +nc <- c(rep(40L, M / 2L), rep(70L, M / 2L)) + +generate_net <- function(eps, net_seed = 0) { + set.seed(net_seed) + as_alpha <- base_alpha + matrix( + c( + eps, -eps / 2L, -eps / 2L, + -eps / 2L, eps, -eps / 2L, + -eps / 2L, -eps / 2L, eps + ), + nrow = 3L + ) + + cp_alpha <- base_alpha + matrix( + c( + 3L * eps / 2L, eps, eps / 2L, + eps, eps / 2L, 0L, + eps / 2L, 0L, -eps / 2L + ), + nrow = 3L + ) + + dis_alpha <- base_alpha + matrix( + c( + -eps / 2L, eps, eps, + eps, -eps / 2L, eps, + eps, eps, -eps / 2L + ), + nrow = 3L + ) + + collection <- c( + generate_bipartite_collection( + nr = nr, nc = nc, + pi = pi, rho = rho, + alpha = as_alpha, M = M + ), + generate_bipartite_collection( + nr = nr, nc = nc, + pi = pi, rho = rho, + alpha = cp_alpha, M = M + ), + generate_bipartite_collection( + nr = nr, nc = nc, + pi = pi, rho = rho, + alpha = dis_alpha, M = M + ) + ) + names(collection) <- c( + 0 + seq(0, M %/% 2), 0 + seq(M %/% 2 + 1, M - 1), + 10 + seq(0, M %/% 2), 10 + seq(M %/% 2 + 1, M - 1), + 20 + seq(0, M %/% 2), 20 + seq(M %/% 2 + 1, M - 1) + ) + + return(collection) +} + +list_collections <- lapply(epsilons, function(eps) { + generate_net(eps = eps, net_seed = net_seed) +}) +names(list_collections) <- epsilons + +true_clustering <- c(rep(1, M), rep(2, M), rep(3, M)) + +begin_time <- format(Sys.time(), "%s") +tmp_folder <- file.path(base_folder, paste0("tmp", begin_time)) + +if (!dir.exists(tmp_folder)) { + dir.create(tmp_folder, recursive = TRUE) +} + +results <- parallel::mclapply(seq_len(nrow(conditions)), function(idx) { + current_seed <- conditions[["seeds"]][idx] + eps <- conditions[["epsilons"]][idx] + max_vem_steps <- conditions[["vem_steps"]][idx] + + message("Condition ", idx, " on ", nrow(conditions)) + + collection <- list_collections[[as.character(eps)]] + + set.seed(current_seed) + start_time <- Sys.time() + clust <- clusterize_bipartite_networks( + netlist = collection, net_id = names(collection), + colsbm_model = "iid", fit_opts = list(max_vem_steps = max_vem_steps), + global_opts = list( + verbosity = 0L, + nb_cores = parallelly::availableCores(omit = 1L) + ) + ) + stop_time <- Sys.time() + + elapsed_time <- stop_time - start_time + + unlisted_best_partition <- extract_best_bipartite_partition(clust) + + if (!is.list(unlisted_best_partition)) { + unlisted_best_partition <- list(unlisted_best_partition) + } + + clustering_vec <- sort(unlist(lapply(seq_len(length(unlisted_best_partition)), function(idx) { + ids_nets <- as.numeric(unlisted_best_partition[[idx]]$net_id) + names(ids_nets) <- rep(idx, length(ids_nets)) + ids_nets + }))) + + cluster_membership <- as.numeric(names(clustering_vec)) + + ari <- try(ARI(cluster_membership, true_clustering)) + + if (inherits(ari, "try-error")) { + ari <- NA + } + + out <- data.frame( + eps = eps, seed = current_seed, + max_vem_steps = max_vem_steps, + ari = ari, + elapsed_time = elapsed_time, + start_time = start_time, + stop_time = stop_time, + clustering = matrix( + cluster_membership, + nrow = 1L + ) + ) + saveRDS(out, + file = file.path( + tmp_folder, + paste0("c_", idx, "_on_", nrow(conditions), ".Rds") + ) + ) + message("Finished condition ", idx) + out +}, +mc.cores = parallelly::availableCores(omit = 1L) +) + +to_save <- do.call(rbind, results) +filename_to_save <- paste0("impact_vem_", begin_time, ".Rds") +saveRDS(to_save, file = file.path(base_folder, filename_to_save))