diff --git a/code/applications/sub-dore/sous-collection/01_souscol.R b/code/applications/sub-dore/sous-collection/01_souscol.R new file mode 100644 index 0000000..085dea5 --- /dev/null +++ b/code/applications/sub-dore/sous-collection/01_souscol.R @@ -0,0 +1,147 @@ +library(colSBM) +library(dplyr) +library(tidyr) +library(stringr) +library(here) +library(future) +library(future.apply) + +plan(multicore) +# Arguments +arg <- commandArgs(trailingOnly = TRUE) + +model <- "iid" +seed <- 1234L + +if (length(arg) == 0L) { + message("No arguments provided, using default.") +} else { + if ("--model" %in% arg) { + model <- arg[(which(arg == "--model") + 1L)] + } else { + message("No model provided, defaulting to iid.") + } + if ("--seed" %in% arg) { + seed <- try(as.integer(arg[(which(arg == "--seed") + 1L)])) + } else { + message("No seed provided, defaulting to 1234.") + } +} + +# Arguments checks +allowed_model <- c("iid", "pi", "rho", "pirho") +stopifnot( + "Unknown model, should be : iid, pi, rho or pirho" = (model %in% allowed_model), + "Seed isn't castable to integer" = (is.integer(seed)) +) + +message( + "Début du clustering des données sous-collection avec le modèle ", model, + " et la seed ", seed +) + +set.seed(seed, "L'Ecuyer-CMRG") +base_data_folder <- file.path(here(), "code", "data", "dore") +save_folder <- file.path( + here(), "code", "results", "applications", + "sub-dore", "souscollection" +) + +if (!dir.exists(save_folder)) { + dir.create(save_folder, recursive = TRUE) +} + +collection_data <- file.path( + base_data_folder, + "dore-matrices.Rds" +) + +interaction_data <- read.table(file = file.path(base_data_folder, "interaction-data.txt"), sep = "\t", header = TRUE) + +seq_ids_network_aggreg <- unique(interaction_data$id_network_aggreg) +names_aggreg_networks <- sapply( + seq_ids_network_aggreg, + function(id) { + paste0( + unique(interaction_data[which(interaction_data$id_network_aggreg == id), ]$web), + collapse = "+" + ) + } +) +# Computation of incidence matrices +incidence_matrices <- lapply( + seq_ids_network_aggreg, + function(m) { + current_interaction_data <- interaction_data[which(interaction_data$id_network_aggreg == m), ] %>% + mutate( + plantaggreg = paste(plantorder, + plantfamily, plantgenus, plantspecies, + sep = "-" + ), + insectaggreg = paste(insectorder, + insectfamily, insectgenus, insectspecies, + sep = "-" + ) + ) + current_interaction_data <- table(current_interaction_data$plantaggreg, current_interaction_data$insectaggreg) + + current_incidence_matrix <- matrix(current_interaction_data, + ncol = ncol(current_interaction_data), dimnames = dimnames(current_interaction_data) + ) + + current_incidence_matrix[which(current_incidence_matrix > 0)] <- 1 + return(current_incidence_matrix) + } +) + +names(incidence_matrices) <- names_aggreg_networks + +cluster1 <- c( + "arroyo1982_1+arroyo1982_2+arroyo3", + "Struck1994", + "Albrecht2010_49yr+Albrecht2010_63yr+Albrecht2010_84yr+Albrecht2010_109yr+Albrecht2010_130yr", + "Chamberlain_HLU+Chamberlain_HLG+Chamberlain_OKU+Chamberlain_OKG+Chamberlain_WLU+Chamberlain_WLG+Chamberlain_SOU+Chamberlain_SOG", + "Devoto2005_PP+Devoto2005_AP", + "Devoto2005_LL+Devoto2005_CT", + "KatoMiura1996", + "Souza_chaco", + "Souza_vereda", + "Adedoja2019", + "Hackett2019_NZ_salt_marsh+Hackett2019_NZ_sand_dune+Hackett2019_NZ_scrub_coprosma" +) + +cluster2 <- c( + "medan2002ld", + "ramirez1992", + "Benadi2013_3(1340m)", + "CordenizPicanco2018_NatVeg" +) + +incidence_matrices <- incidence_matrices[names_aggreg_networks %in% c(cluster1, cluster2)] + +start_time <- format(Sys.time(), "%d-%m-%y_%H-%M-%S") + +list_collection <- clusterize_bipartite_networks( + netlist = incidence_matrices, + net_id = names(incidence_matrices), + colsbm_model = model, + nb_run = 3L, + global_opts = list( + nb_cores = parallelly::availableCores(omit = 1L), + verbosity = 3L, + plot_details = 0L, + backend = "future" + ), + fit_opts = list(max_vem_steps = 200L) +) + +save_file <- file.path( + save_folder, paste0( + "souscollection_collection_", + model, "_seed_", seed, + "_", start_time, ".Rds" + ) +) + +message("Clustering saved. Model", model) +saveRDS(list_collection, file = save_file) diff --git a/code/scripts/migale_application_souscollection.sh b/code/scripts/migale_application_souscollection.sh new file mode 100755 index 0000000..65933dd --- /dev/null +++ b/code/scripts/migale_application_souscollection.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +#$ -V +#$ -cwd +#$ -N baldock +#$ -m besa +#$ -t 1:16 +#$ -q short.q +#$ -pe thread 64 +#$ -M louis.lacoste+migale@agroparistech.fr +#$ -o logs/$JOB_NAME.$TASK_ID +#$ -e logs/$JOB_NAME.$TASK_ID + +# Creating log directory if it doesn't exists +BASE_DIR="/home/$USER/work/mia-stage-2024" +LOG_DIR=$(echo "$BASE_DIR/logs") + +if [ ! -d "$LOG_DIR" ]; then + mkdir -p $LOG_DIR +fi + +# Constant data +MODELARRAY=("iid" "pi" "rho" "pirho") +ID=$((SGE_TASK_ID - 1)) +MODEL=${MODELARRAY[$((ID % 4))]} + +SEED=$(($((ID + $((ID / 4)))) % 4)) + + +# Finding directory +APPLICATIONS_DIR=$(echo "$BASE_DIR/code/applications") + +echo $APPLICATIONS_DIR + +Rscript "${APPLICATIONS_DIR}/sub-dore/sous-collection/01_souscol.R" --model $MODEL --seed $SEED