--- title: "Stats at Taxo" format: html --- ```{r} library(here) library(stringr) library(tidyverse) library(phyloseq) library(biomformat) source("utils.R") the_data <- import_biom("data/mach/kinetic.biom") per_taxa_networks <- collapse_otu_at_taxo(the_data) otu_df <- sapply(per_taxa_networks, nrow) %>% data.frame() %>% rownames_to_column() %>% rename(Nb_OTU = ".", Rank = "rowname") flist <- list.files(here("results", "lbm-seq"), full.names = TRUE, pattern = ".Rds") para_flist <- grepv(pattern = "para.Rds", flist) seq_flist <- grepv(pattern = "seq.Rds", flist) notrans_flist <- grepv(pattern = "notrans.Rds", flist) ``` ```{r} bench_df <- do.call("rbind", lapply(flist, function(file) readRDS(file)$benchmark)) bench_df <- bench_df %>% mutate(expr = as.character(expr)) %>% separate_wider_regex(cols = "expr", patterns = c(Rank = "Rank[0-9]", type = "para|seq|notrans")) %>% mutate(Rank = as.factor(Rank), type = as.factor(type)) %>% left_join(otu_df, by = "Rank") %>% mutate(Rank = as.factor(Rank), type = as.factor(type)) levels(bench_df$Rank) <- c("Phylum", "Class", "Order", "Family", "Genus") ``` ```{r} library(ggplot2) coeff <- 220000000000 ggplot(bench_df, aes(x = Rank, col = type)) + geom_boxplot(aes(y = time)) + geom_point(aes(y = coeff * Nb_OTU, size = Nb_OTU), shape = 13) + scale_color_manual(values = c("#363634", "#009E73", "#CC79A7"), labels = c("No transfer", "Parallelized", "Sequential")) + scale_y_continuous(sec.axis = sec_axis(~ . / coeff, name = "Number of OTUs")) + labs(size = "Number of OTUs", color = "Algorithm Type", y = "Time (seconds)") + theme_minimal() ``` ```{r} load_result_list <- function(flist) { results <- lapply(flist, function(file) readRDS(file)$models) names(results) <- paste0("Rep", seq_along(results)) results <- purrr::transpose(results) return(results) } library(aricode) ARI_in_repet_Z1 <- function(result_list) { lapply(result_list, function(rank) { Z1_list <- lapply(rank, function(repet) { Z11 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z1, 1, which.max) }) outer(X = Z1_list, Y = Z1_list, Vectorize(ARI)) }) } ARI_in_repet_Z2 <- function(result_list) { lapply(result_list, function(rank) { Z2_list <- lapply(rank, function(repet) { Z2 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z2, 1, which.max) }) outer(X = Z2_list, Y = Z2_list, Vectorize(ARI)) }) } ``` ```{r} seq_results <- load_result_list(seq_flist) para_results <- load_result_list(para_flist) notrans_results <- load_result_list(notrans_flist) ``` ```{r} ARI_in_repet_Z1(seq_results) ARI_in_repet_Z1(para_results) ARI_in_repet_Z1(notrans_results) ``` ```{r} ARI_in_repet_Z2(seq_results) ARI_in_repet_Z2(para_results) ARI_in_repet_Z2(notrans_results) ``` Tous les Z sont concordants entre les répétitions, je vais donc sélectionner une seule répétition de chaque pour l'analyse. # Analyse des résultats des LBM Ci-après on extrait les premiers modèles. ```{r} seq_model <- purrr::transpose(seq_results)[[2]] para_model <- purrr::transpose(para_results)[[1]] notrans_model <- purrr::transpose(notrans_results)[[1]] ``` ```{r} extract_memberships <- function(model) { memberships <- model$memberships[[which.max(model$ICL)]] rownames(memberships[["Z1"]]) <- rownames(model$adj) rownames(memberships[["Z2"]]) <- colnames(model$adj) memberships map_memberships <- list(Z1 = apply(memberships[["Z1"]], 1, which.max), Z2 = apply(memberships[["Z2"]], 1, which.max)) map_memberships } ``` ## Comparaison des ARI par rang taxonomique ```{r} seq_memberships <- lapply(seq_model, extract_memberships) para_memberships <- lapply(para_model, extract_memberships) notrans_memberships <- lapply(notrans_model, extract_memberships) ``` ```{r} tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(seq_memberships)[["Z1"]], .f = ARI)) %>% t() %>% knitr::kable(col.names = paste0("Rank", seq(2,5)), caption = "ARI for the methods for OTU memberships") ``` ```{r} tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(seq_memberships)[["Z2"]], .f = ARI)) %>% t() %>% knitr::kable(col.names = paste0("Rank", seq(2,5)), caption = "ARI for the methods for sample memberships") ``` ```{r} library(ggalluvial) membership_to_df <- function(membership_list, suffix = 1) { lapply(membership_list, function(membership_vec) { df <- data.frame(membership_vec) colnames(df) <- paste0("Z", suffix) df %>% rownames_to_column(var = "Rank") %>% separate_wider_delim(cols = "Rank", delim = ";_;", names_sep = "") }) } seq_df <- lapply(seq_along(seq_memberships), function(idx) { membership_to_df(seq_memberships[[idx]], suffix = idx + 1) }) seq_df_otu <- lapply(seq_df, function(df) { df$Z1 }) %>% # Below we join by the ranks reduce(left_join) %>% select(sort(names(.))) seq_df_sample <- lapply(seq_df, function(df) { df$Z2 }) %>% reduce(left_join, by = "Rank1") seq_df_sample_lodes <- to_lodes_form(seq_df_sample, key = "x", axes = 2:5) seq_df_otu_lodes <- to_lodes_form(seq_df_otu, key = "x", axes = 6:9) ``` ```{r} mach_metadata <- read.table(file = "data/mach/kinetic_sample_metadata.tsv") %>% rownames_to_column(var = "Rank1") seq_df_sample_lodes <- left_join(seq_df_sample_lodes, mach_metadata) ggplot(seq_df_sample_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) + geom_alluvium(aes(fill = Weaned)) + geom_stratum() + geom_text(stat = "stratum") ggplot(seq_df_otu_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) + geom_alluvium(aes(fill = Rank2)) + geom_stratum() + geom_text(stat = "stratum") ```