265 lines
No EOL
8.5 KiB
Text
265 lines
No EOL
8.5 KiB
Text
---
|
|
title: "Stats at Taxo"
|
|
format: html
|
|
embed-resources: true
|
|
---
|
|
|
|
```{r}
|
|
library(here)
|
|
library(stringr)
|
|
library(tidyverse)
|
|
library(phyloseq)
|
|
library(biomformat)
|
|
source("utils.R")
|
|
mach_data <- import_biom("data/mach/kinetic.biom")
|
|
the_data <- import_biom("data/chaillou/chaillou.biom")
|
|
per_taxa_networks <- collapse_otu_at_taxo(the_data)
|
|
otu_df <- sapply(per_taxa_networks, nrow) %>%
|
|
data.frame() %>%
|
|
rownames_to_column() %>%
|
|
rename(Nb_OTU = ".", Rank = "rowname")
|
|
|
|
result_folder <- here("results", "lbm-seq", "chaillou")
|
|
|
|
flist <- list.files(result_folder, full.names = TRUE, pattern = ".Rds")
|
|
|
|
para_flist <- grepv(pattern = "para.Rds", flist)
|
|
seq_flist <- grepv(pattern = "seq.Rds", flist)
|
|
notrans_flist <- grepv(pattern = "notrans.Rds", flist)
|
|
```
|
|
|
|
```{r}
|
|
bench_df <- do.call("rbind", lapply(flist, function(file) readRDS(file)$benchmark))
|
|
|
|
bench_df <- bench_df %>%
|
|
mutate(expr = as.character(expr)) %>%
|
|
separate_wider_regex(cols = "expr", patterns = c(Rank = "Rank[0-9]", type = "para|seq|notrans")) %>%
|
|
mutate(Rank = as.factor(Rank), type = as.factor(type)) %>%
|
|
left_join(otu_df, by = "Rank") %>%
|
|
mutate(Rank = as.factor(Rank), type = as.factor(type))
|
|
|
|
levels(bench_df$Rank) <- c("Phylum", "Class", "Order", "Family", "Genus")
|
|
```
|
|
|
|
```{r}
|
|
library(ggplot2)
|
|
|
|
coeff <- 220000000000
|
|
|
|
ggplot(bench_df, aes(x = Rank, col = type)) +
|
|
geom_boxplot(aes(y = time)) +
|
|
geom_point(aes(y = coeff * Nb_OTU, size = Nb_OTU), shape = 13) +
|
|
scale_color_manual(values = c("#363634", "#009E73", "#CC79A7"), labels = c("No transfer", "Parallelized", "Sequential")) +
|
|
scale_y_continuous(sec.axis = sec_axis(~ . / coeff, name = "Number of OTUs")) +
|
|
labs(size = "Number of OTUs", color = "Algorithm Type", y = "Time (seconds)") +
|
|
theme_minimal()
|
|
```
|
|
|
|
```{r}
|
|
load_result_list <- function(flist) {
|
|
results <- lapply(flist, function(file) readRDS(file)$models)
|
|
names(results) <- paste0("Rep", seq_along(results))
|
|
results <- purrr::transpose(results)
|
|
return(results)
|
|
}
|
|
|
|
library(aricode)
|
|
ARI_in_repet_Z1 <- function(result_list) {
|
|
lapply(result_list, function(rank) {
|
|
Z1_list <- lapply(rank, function(repet) {
|
|
Z11 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z1, 1, which.max)
|
|
})
|
|
outer(X = Z1_list, Y = Z1_list, Vectorize(ARI))
|
|
})
|
|
}
|
|
|
|
ARI_in_repet_Z2 <- function(result_list) {
|
|
lapply(result_list, function(rank) {
|
|
Z2_list <- lapply(rank, function(repet) {
|
|
Z2 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z2, 1, which.max)
|
|
})
|
|
outer(X = Z2_list, Y = Z2_list, Vectorize(ARI))
|
|
})
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
seq_results <- load_result_list(seq_flist)
|
|
para_results <- load_result_list(para_flist)
|
|
notrans_results <- load_result_list(notrans_flist)
|
|
```
|
|
|
|
|
|
```{r}
|
|
ARI_in_repet_Z1(seq_results)
|
|
ARI_in_repet_Z1(para_results)
|
|
ARI_in_repet_Z1(notrans_results)
|
|
```
|
|
|
|
```{r}
|
|
ARI_in_repet_Z2(seq_results)
|
|
ARI_in_repet_Z2(para_results)
|
|
ARI_in_repet_Z2(notrans_results)
|
|
```
|
|
|
|
Tous les Z sont concordants entre les répétitions, je vais donc sélectionner une seule répétition de chaque pour l'analyse.
|
|
|
|
# Analyse des résultats des LBM
|
|
|
|
Ci-après on extrait les premiers modèles.
|
|
```{r}
|
|
seq_model <- purrr::transpose(seq_results)[[1]]
|
|
para_model <- purrr::transpose(para_results)[[1]]
|
|
notrans_model <- purrr::transpose(notrans_results)[[1]]
|
|
```
|
|
|
|
```{r}
|
|
extract_memberships <- function(model) {
|
|
memberships <- model$memberships[[which.max(model$ICL)]]
|
|
rownames(memberships[["Z1"]]) <- rownames(model$adj)
|
|
rownames(memberships[["Z2"]]) <- colnames(model$adj)
|
|
memberships
|
|
map_memberships <- list(Z1 = apply(memberships[["Z1"]], 1, which.max), Z2 = apply(memberships[["Z2"]], 1, which.max))
|
|
map_memberships
|
|
}
|
|
```
|
|
|
|
## Comparaison des ARI par rang taxonomique
|
|
|
|
```{r}
|
|
seq_memberships <- lapply(seq_model, extract_memberships)
|
|
para_memberships <- lapply(para_model, extract_memberships)
|
|
notrans_memberships <- lapply(notrans_model, extract_memberships)
|
|
```
|
|
|
|
```{r}
|
|
tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(seq_memberships)[["Z1"]], .f = ARI)) %>%
|
|
t() %>%
|
|
knitr::kable(col.names = paste0("Rank", seq(2, 5)), caption = "ARI for the methods for OTU memberships")
|
|
```
|
|
|
|
```{r}
|
|
tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(seq_memberships)[["Z2"]], .f = ARI)) %>%
|
|
t() %>%
|
|
knitr::kable(col.names = paste0("Rank", seq(2, 5)), caption = "ARI for the methods for sample memberships")
|
|
```
|
|
|
|
```{r}
|
|
library(ggalluvial)
|
|
|
|
|
|
membership_to_df <- function(membership_list, suffix = 1) {
|
|
lapply(membership_list, function(membership_vec) {
|
|
df <- data.frame(membership_vec)
|
|
colnames(df) <- paste0("Z", suffix)
|
|
df %>%
|
|
rownames_to_column(var = "Rank") %>%
|
|
separate_wider_delim(cols = "Rank", delim = ";_;", names_sep = "")
|
|
})
|
|
}
|
|
|
|
dispatch_membership_to_df <- function(memberships) {
|
|
mdf <- lapply(seq_along(memberships), function(idx) {
|
|
membership_to_df(memberships[[idx]], suffix = idx + 1)
|
|
})
|
|
|
|
df_otu <- lapply(mdf, function(df) {
|
|
df$Z1
|
|
}) %>% # Below we join by the ranks
|
|
reduce(left_join) %>%
|
|
select(sort(names(.)))
|
|
|
|
df_sample <- lapply(mdf, function(df) {
|
|
df$Z2
|
|
}) %>% reduce(left_join, by = "Rank1")
|
|
|
|
|
|
return(list(df_otu = df_otu, df_sample = df_sample))
|
|
}
|
|
seq_df_otu_samples <- dispatch_membership_to_df(seq_memberships)
|
|
seq_df_otu <- seq_df_otu_samples$df_otu
|
|
seq_df_sample <- seq_df_otu_samples$df_sample
|
|
|
|
notrans_df_otu_samples <- dispatch_membership_to_df(notrans_memberships)
|
|
notrans_df_otu <- notrans_df_otu_samples$df_otu
|
|
notrans_df_sample <- notrans_df_otu_samples$df_sample
|
|
|
|
seq_df_sample_lodes <- to_lodes_form(seq_df_sample, key = "x", axes = 2:5)
|
|
seq_df_otu_lodes <- to_lodes_form(seq_df_otu, key = "x", axes = 6:9)
|
|
```
|
|
|
|
```{r}
|
|
mach_metadata <- read.table(file = "data/mach/kinetic_sample_metadata.tsv") %>% rownames_to_column(var = "Rank1")
|
|
chaillou_metadata <- read.table(file = "data/chaillou/sample_metadata.tsv") %>% rownames_to_column(var = "Rank1")
|
|
|
|
seq_df_sample_lodes <- left_join(seq_df_sample_lodes, chaillou_metadata)
|
|
|
|
ggplot(seq_df_sample_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) +
|
|
geom_alluvium(aes(fill = EnvType)) +
|
|
geom_stratum() +
|
|
geom_text(stat = "stratum")
|
|
|
|
ggplot(seq_df_otu_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) +
|
|
geom_alluvium(aes(fill = Rank2)) +
|
|
geom_stratum() +
|
|
geom_text(stat = "stratum")
|
|
```
|
|
|
|
```{r}
|
|
list_memberships <- function(df_otu) {
|
|
memb_mat <- df_otu |> select(starts_with("Z"))
|
|
memb_list <- lapply(seq_len(ncol(memb_mat)), function(i) {
|
|
memb_mat[, i] |>
|
|
unlist() |>
|
|
as.vector()
|
|
})
|
|
return(memb_list)
|
|
}
|
|
seq_memb_list <- list_memberships(seq_df_otu)
|
|
notrans_memb_list <- list_memberships(notrans_df_otu)
|
|
outer(seq_memb_list, seq_memb_list, Vectorize(ARI))
|
|
outer(notrans_memb_list, notrans_memb_list, Vectorize(ARI))
|
|
```
|
|
|
|
### ARI phylogénie et clustering
|
|
|
|
```{r}
|
|
phylo_df <- data.frame(merged = rownames(tail(per_taxa_networks, 1)[[1]])) %>% separate(col = "merged", into = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";_;")
|
|
|
|
phylo_df <- phylo_df %>% rename_all(~ str_c("Rank", which(colnames(phylo_df) == .)))
|
|
|
|
phylo_df <- phylo_df %>% mutate_all(as.factor)
|
|
|
|
phylo_memb_list <- lapply(seq_len(ncol(phylo_df)), function(i) {
|
|
phylo_df[, i] |>
|
|
unlist() |>
|
|
as.factor()
|
|
})
|
|
|
|
outer(phylo_memb_list, , Vectorize(ARI))
|
|
```
|
|
|
|
|
|
## ICL
|
|
|
|
```{r}
|
|
# Extract ICL
|
|
ICL_per_ranks <- function(model_results) {
|
|
model_rank_list <- lapply(model_results, function(model_per_rank) model_per_rank[[1]]) # Extract first rep per rank
|
|
return(sapply(model_rank_list, function(model) max(model$ICL, na.rm = TRUE)))
|
|
}
|
|
ICL_df <- data.frame(
|
|
"No trans" = ICL_per_ranks(notrans_results), "Seq" =
|
|
ICL_per_ranks(seq_results)
|
|
) %>%
|
|
t() %>%
|
|
as.data.frame()
|
|
```
|
|
|
|
```{r}
|
|
library(kableExtra)
|
|
ICL_df %>% mutate_all(cell_spec(., bold = ifelse(is..)))
|
|
|
|
|
|
kable(ICL_df)
|
|
``` |