From cb7f459341c86aced85ab74eb950ceb62d171327 Mon Sep 17 00:00:00 2001 From: Louis Date: Fri, 30 Jan 2026 14:33:49 +0100 Subject: [PATCH] Replacing R script by qmd --- analysis_benchmark_lbm_seq.R | 67 ------------ analysis_benchmark_lbm_seq.qmd | 191 +++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 67 deletions(-) delete mode 100644 analysis_benchmark_lbm_seq.R create mode 100644 analysis_benchmark_lbm_seq.qmd diff --git a/analysis_benchmark_lbm_seq.R b/analysis_benchmark_lbm_seq.R deleted file mode 100644 index f476768..0000000 --- a/analysis_benchmark_lbm_seq.R +++ /dev/null @@ -1,67 +0,0 @@ -library(here) -library(stringr) -library(tidyverse) -library(phyloseq) -library(biomformat) -source("utils.R") -the_data <- import_biom("data/mach/kinetic.biom") -per_taxa_networks <- collapse_otu_at_taxo(the_data) -otu_df <- sapply(per_taxa_networks, nrow) %>% - data.frame() %>% - rownames_to_column() %>% - rename(Nb_OTU = ".", Rank = "rowname") - - -flist <- list.files(here("results", "lbm-seq"), full.names = TRUE, pattern = ".Rds") - -para_flist <- grepv(pattern = "para.Rds", flist) -seq_flist <- grepv(pattern = "seq.Rds", flist) -notrans_flist <- grepv(pattern = "notrans.Rds", flist) - -bench_df <- do.call("rbind", lapply(flist, function(file) readRDS(file)$benchmark)) - -bench_df <- bench_df %>% - mutate(expr = as.character(expr)) %>% - separate_wider_regex(cols = "expr", patterns = c(Rank = "Rank[0-9]", type = "para|seq|notrans")) %>% - mutate(Rank = as.factor(Rank), type = as.factor(type)) %>% - left_join(otu_df, by = "Rank") %>% - mutate(Rank = as.factor(Rank), type = as.factor(type)) - - - -levels(bench_df$Rank) <- c("Phylum", "Class", "Order", "Family", "Genus") - -library(ggplot2) - -coeff <- 220000000000 - -ggplot(bench_df, aes(x = Rank, col = type)) + - geom_boxplot(aes(y = time)) + - geom_point(aes(y = coeff * Nb_OTU, size = Nb_OTU), shape = 13) + - scale_color_manual(values = c("#363634", "#009E73", "#CC79A7"), labels = c("No transfer", "Parallelized", "Sequential")) + - scale_y_continuous(sec.axis = sec_axis(~ . / coeff, name = "Number of OTUs")) + - labs(size = "Number of OTUs", color = "Algorithm Type", y = "Time (seconds)") + - theme_minimal() - -seq_results <- lapply(seq_flist, function(file) readRDS(file)$models) - -names(seq_results) <- paste0("Rep", seq_along(seq_results)) - -library(aricode) -seq_results <- transpose(seq_results) -lapply(seq_results, function(rank) { - rep1 <- rank$Rep1 - memb1 <- rep1$memberships[[which.max(rep1$ICL)]] - - - rep2 <- rank$Rep2 - memb2 <- rep2$memberships[[which.max(rep2$ICL)]] - - Z11 <- apply(memb1$Z1, 1, which.max) - Z12 <- apply(memb2$Z1, 1, which.max) - - Z21 <- apply(memb1$Z2, 1, which.max) - Z22 <- apply(memb2$Z2, 1, which.max) - - c(ARI(Z11, Z12), ARI(Z21, Z22)) -}) diff --git a/analysis_benchmark_lbm_seq.qmd b/analysis_benchmark_lbm_seq.qmd new file mode 100644 index 0000000..7a045fd --- /dev/null +++ b/analysis_benchmark_lbm_seq.qmd @@ -0,0 +1,191 @@ +--- +title: "Stats at Taxo" +format: html +--- + +```{r} +library(here) +library(stringr) +library(tidyverse) +library(phyloseq) +library(biomformat) +source("utils.R") +the_data <- import_biom("data/mach/kinetic.biom") +per_taxa_networks <- collapse_otu_at_taxo(the_data) +otu_df <- sapply(per_taxa_networks, nrow) %>% + data.frame() %>% + rownames_to_column() %>% + rename(Nb_OTU = ".", Rank = "rowname") + + +flist <- list.files(here("results", "lbm-seq"), full.names = TRUE, pattern = ".Rds") + +para_flist <- grepv(pattern = "para.Rds", flist) +seq_flist <- grepv(pattern = "seq.Rds", flist) +notrans_flist <- grepv(pattern = "notrans.Rds", flist) +``` + +```{r} +bench_df <- do.call("rbind", lapply(flist, function(file) readRDS(file)$benchmark)) + +bench_df <- bench_df %>% + mutate(expr = as.character(expr)) %>% + separate_wider_regex(cols = "expr", patterns = c(Rank = "Rank[0-9]", type = "para|seq|notrans")) %>% + mutate(Rank = as.factor(Rank), type = as.factor(type)) %>% + left_join(otu_df, by = "Rank") %>% + mutate(Rank = as.factor(Rank), type = as.factor(type)) + +levels(bench_df$Rank) <- c("Phylum", "Class", "Order", "Family", "Genus") +``` + +```{r} +library(ggplot2) + +coeff <- 220000000000 + +ggplot(bench_df, aes(x = Rank, col = type)) + + geom_boxplot(aes(y = time)) + + geom_point(aes(y = coeff * Nb_OTU, size = Nb_OTU), shape = 13) + + scale_color_manual(values = c("#363634", "#009E73", "#CC79A7"), labels = c("No transfer", "Parallelized", "Sequential")) + + scale_y_continuous(sec.axis = sec_axis(~ . / coeff, name = "Number of OTUs")) + + labs(size = "Number of OTUs", color = "Algorithm Type", y = "Time (seconds)") + + theme_minimal() +``` + +```{r} +load_result_list <- function(flist) { + results <- lapply(flist, function(file) readRDS(file)$models) + names(results) <- paste0("Rep", seq_along(results)) + results <- purrr::transpose(results) + return(results) +} + +library(aricode) +ARI_in_repet_Z1 <- function(result_list) { + lapply(result_list, function(rank) { + Z1_list <- lapply(rank, function(repet) { + Z11 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z1, 1, which.max) + }) + outer(X = Z1_list, Y = Z1_list, Vectorize(ARI)) + }) +} + +ARI_in_repet_Z2 <- function(result_list) { + lapply(result_list, function(rank) { + Z2_list <- lapply(rank, function(repet) { + Z2 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z2, 1, which.max) + }) + outer(X = Z2_list, Y = Z2_list, Vectorize(ARI)) + }) +} +``` + +```{r} +seq_results <- load_result_list(seq_flist) +para_results <- load_result_list(para_flist) +notrans_results <- load_result_list(notrans_flist) +``` + + +```{r} +ARI_in_repet_Z1(seq_results) +ARI_in_repet_Z1(para_results) +ARI_in_repet_Z1(notrans_results) +``` + +```{r} +ARI_in_repet_Z2(seq_results) +ARI_in_repet_Z2(para_results) +ARI_in_repet_Z2(notrans_results) +``` + +Tous les Z sont concordants entre les répétitions, je vais donc sélectionner une seule répétition de chaque pour l'analyse. + +# Analyse des résultats des LBM + +Ci-après on extrait les premiers modèles. +```{r} +seq_model <- purrr::transpose(seq_results)[[2]] +para_model <- purrr::transpose(para_results)[[1]] +notrans_model <- purrr::transpose(notrans_results)[[1]] +``` + +```{r} +extract_memberships <- function(model) { + memberships <- model$memberships[[which.max(model$ICL)]] + rownames(memberships[["Z1"]]) <- rownames(model$adj) + rownames(memberships[["Z2"]]) <- colnames(model$adj) + memberships + map_memberships <- list(Z1 = apply(memberships[["Z1"]], 1, which.max), Z2 = apply(memberships[["Z2"]], 1, which.max)) + map_memberships +} +``` + +## Comparaison des ARI par rang taxonomique + +```{r} +seq_memberships <- lapply(seq_model, extract_memberships) +para_memberships <- lapply(para_model, extract_memberships) +notrans_memberships <- lapply(notrans_model, extract_memberships) +``` + +```{r} +tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(seq_memberships)[["Z1"]], .f = ARI)) %>% t() %>% +knitr::kable(col.names = paste0("Rank", seq(2,5)), caption = "ARI for the methods for OTU memberships") +``` + +```{r} +tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(seq_memberships)[["Z2"]], .f = ARI)) %>% t() %>% +knitr::kable(col.names = paste0("Rank", seq(2,5)), caption = "ARI for the methods for sample memberships") +``` + +```{r} +library(ggalluvial) + + +membership_to_df <- function(membership_list, suffix = 1) { + lapply(membership_list, function(membership_vec) { + df <- data.frame(membership_vec) + colnames(df) <- paste0("Z", suffix) + df %>% + rownames_to_column(var = "Rank") %>% + separate_wider_delim(cols = "Rank", delim = ";_;", names_sep = "") + }) +} + +seq_df <- lapply(seq_along(seq_memberships), function(idx) { + membership_to_df(seq_memberships[[idx]], suffix = idx + 1) +}) + +seq_df_otu <- lapply(seq_df, function(df) { + df$Z1 +}) %>% # Below we join by the ranks + reduce(left_join) %>% + select(sort(names(.))) + + + +seq_df_sample <- lapply(seq_df, function(df) { + df$Z2 +}) %>% reduce(left_join, by = "Rank1") + + +seq_df_sample_lodes <- to_lodes_form(seq_df_sample, key = "x", axes = 2:5) +seq_df_otu_lodes <- to_lodes_form(seq_df_otu, key = "x", axes = 6:9) +``` + +```{r} +mach_metadata <- read.table(file = "data/mach/kinetic_sample_metadata.tsv") %>% rownames_to_column(var = "Rank1") + +seq_df_sample_lodes <- left_join(seq_df_sample_lodes, mach_metadata) + +ggplot(seq_df_sample_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) + + geom_alluvium(aes(fill = Weaned)) + + geom_stratum() + + geom_text(stat = "stratum") + +ggplot(seq_df_otu_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) + + geom_alluvium(aes(fill = Rank2)) + + geom_stratum() + + geom_text(stat = "stratum") +``` \ No newline at end of file