Stats at Taxo

library(here)

here() starts at /home/louis/Documents/Thèse/Axe 3 - Inférence et Microbiote/human-microbiome-compendium

library(stringr)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ purrr     1.0.4
✔ forcats   1.0.1     ✔ readr     2.2.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(phyloseq)
library(biomformat)
source("utils.R")


Attachement du package : 'kableExtra'

L'objet suivant est masqué depuis 'package:dplyr':

    group_rows


Attachement du package : 'rlang'

Les objets suivants sont masqués depuis 'package:purrr':

    %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
    flatten_raw, invoke, splice

mach_data <- import_biom("data/mach/mach.biom")
the_data <- import_biom("data/chaillou/chaillou.biom")
per_taxa_networks <- collapse_otu_at_taxo(the_data)
otu_df <- sapply(per_taxa_networks, nrow) %>%
    data.frame() %>%
    rownames_to_column() %>%
    rename(Nb_OTU = ".", Rank = "rowname")

result_folder <- here("results", "lbm-seq", "chaillou")

flist <- list.files(result_folder, full.names = TRUE, pattern = ".Rds")

para_flist <- grepv(pattern = "para_", flist) |> sort(decreasing = TRUE)
seq_flist <- grepv(pattern = "seq_", flist) |> sort(decreasing = TRUE)
notrans_flist <- grepv(pattern = "notrans_", flist) |> sort(decreasing = TRUE)

# bench_df <- do.call("rbind", lapply(flist, function(file) readRDS(file)$benchmark))

# bench_df <- bench_df %>%
#     mutate(expr = as.character(expr)) %>%
#     separate_wider_regex(cols = "expr", patterns = c(Rank = "Rank[0-9]", type = "para|seq|notrans")) %>%
#     mutate(Rank = as.factor(Rank), type = as.factor(type)) %>%
#     left_join(otu_df, by = "Rank") %>%
#     mutate(Rank = as.factor(Rank), type = as.factor(type))

# levels(bench_df$Rank) <- c("Phylum", "Class", "Order", "Family", "Genus")

# library(ggplot2)

# coeff <- 220000000000

# ggplot(bench_df, aes(x = Rank, col = type)) +
#     geom_boxplot(aes(y = time)) +
#     geom_point(aes(y = coeff * Nb_OTU, size = Nb_OTU), shape = 13) +
#     scale_color_manual(values = c("#363634", "#009E73", "#CC79A7"), labels = c("No transfer", "Parallelized", "Sequential")) +
#     scale_y_continuous(sec.axis = sec_axis(~ . / coeff, name = "Number of OTUs")) +
#     labs(size = "Number of OTUs", color = "Algorithm Type", y = "Time (seconds)") +
#     theme_minimal()

load_result_list <- function(flist) {
    results <- lapply(flist, readRDS)
    # names(results) <- paste0("Rep", seq_along(results))
    # results <- purrr::transpose(results)
    return(results)
}

select_max <- function(results) {
    ICL_list <- lapply(results, function(rep) rep |> sapply(function(model) max(model$ICL, na.rm = TRUE)))
    max_id_list <- lapply(ICL_list, which.max)

    best_results <- lapply(seq_along(max_id_list), function(idx) results[[idx]][[max_id_list[[idx]]]])
    names(best_results) <- names(results)
    return(best_results)
}

library(aricode)
ARI_in_repet_Z1 <- function(result_list) {
    lapply(result_list, function(rank) {
        Z1_list <- lapply(rank, function(repet) {
            Z11 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z1, 1, which.max)
        })
        outer(X = Z1_list, Y = Z1_list, Vectorize(ARI))
    })
}

ARI_in_repet_Z2 <- function(result_list) {
    lapply(result_list, function(rank) {
        Z2_list <- lapply(rank, function(repet) {
            Z2 <- apply(repet$memberships[[which.max(repet$ICL)]]$Z2, 1, which.max)
        })
        outer(X = Z2_list, Y = Z2_list, Vectorize(ARI))
    })
}

seq_results <- purrr::transpose(load_result_list(seq_flist))
para_results <- purrr::transpose(load_result_list(para_flist))
notrans_results <- purrr::transpose(load_result_list(notrans_flist))

seq_model <- select_max(seq_results)

Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé

para_model <- select_max(para_results)

Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé

notrans_model <- select_max(notrans_results)

Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé
Warning in max(model$ICL, na.rm = TRUE): aucun argument pour max ; -Inf est
renvoyé

# ARI_in_repet_Z1(seq_results)
# ARI_in_repet_Z1(para_results)
# ARI_in_repet_Z1(notrans_results)

# ARI_in_repet_Z2(seq_results)
# ARI_in_repet_Z2(para_results)
# ARI_in_repet_Z2(notrans_results)

Tous les Z sont concordants entre les répétitions, je vais donc sélectionner une seule répétition de chaque pour l’analyse.

Analyse des résultats des LBM

Ci-après on extrait les premiers modèles.

# seq_model <- purrr::transpose(seq_results)[[1]]
# para_model <- purrr::transpose(para_results)[[1]]
# notrans_model <- purrr::transpose(notrans_results)[[1]]

extract_memberships <- function(model) {
    memberships <- model$memberships[[which.max(model$ICL)]]
    rownames(memberships[["Z1"]]) <- rownames(model$adj)
    rownames(memberships[["Z2"]]) <- colnames(model$adj)
    memberships
    map_memberships <- list(Z1 = apply(memberships[["Z1"]], 1, which.max), Z2 = apply(memberships[["Z2"]], 1, which.max))
    map_memberships
}

Comparaison des ARI par rang taxonomique

seq_memberships <- lapply(seq_model, extract_memberships)
para_memberships <- lapply(para_model, extract_memberships)
notrans_memberships <- lapply(notrans_model, extract_memberships)

tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z1"]], purrr::transpose(notrans_memberships)[["Z1"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z1"]], purrr::transpose(seq_memberships)[["Z1"]], .f = ARI)) %>%
    t() %>%
    knitr::kable(col.names = paste0("Rank", seq(2, 7)), caption = "ARI for the methods for OTU memberships")

ARI for the methods for OTU memberships
	Rank2	Rank3	Rank4	Rank5	Rank6	Rank7
Para v No transfer	1	0.9795031	1	0.9559121	0.5863686	0.7160487
Sequential v No transfer	1	0.9795031	1	0.9559121	0.6714081	0.6848977
Para v Sequential	1	1.0000000	1	1.0000000	0.5442963	0.7191279

tibble("Para v No transfer" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Sequential v No transfer" = map2_dbl(purrr::transpose(seq_memberships)[["Z2"]], purrr::transpose(notrans_memberships)[["Z2"]], .f = ARI), "Para v Sequential" = map2_dbl(purrr::transpose(para_memberships)[["Z2"]], purrr::transpose(seq_memberships)[["Z2"]], .f = ARI)) %>%
    t() %>%
    knitr::kable(col.names = paste0("Rank", seq(2, 7)), caption = "ARI for the methods for sample memberships")

ARI for the methods for sample memberships
	Rank2	Rank3	Rank4	Rank5	Rank6	Rank7
Para v No transfer	1	0.9279241	0.9279268	1	0.9874013	0.9049673
Sequential v No transfer	1	0.9279241	0.9279268	1	0.9874013	0.8838616
Para v Sequential	1	1.0000000	1.0000000	1	1.0000000	0.9786588

library(ggalluvial)


membership_to_df <- function(membership_list, suffix = 1) {
    lapply(membership_list, function(membership_vec) {
        df <- data.frame(membership_vec)
        colnames(df) <- paste0("Z", suffix)
        df %>%
            rownames_to_column(var = "Rank") %>%
            separate_wider_delim(cols = "Rank", delim = ";_;", names_sep = "")
    })
}

dispatch_membership_to_df <- function(memberships) {
    mdf <- lapply(seq_along(memberships), function(idx) {
        membership_to_df(memberships[[idx]], suffix = idx + 1)
    })

    df_otu <- lapply(mdf, function(df) {
        df$Z1
    }) %>% # Below we join by the ranks
        reduce(left_join) %>%
        select(sort(names(.)))

    df_sample <- lapply(mdf, function(df) {
        df$Z2
    }) %>% reduce(left_join, by = "Rank1")

    return(list(df_otu = df_otu, df_sample = df_sample))
}
seq_df_otu_samples <- dispatch_membership_to_df(seq_memberships)

Joining with `by = join_by(Rank1, Rank2)`
Joining with `by = join_by(Rank1, Rank2, Rank3)`
Joining with `by = join_by(Rank1, Rank2, Rank3, Rank4)`
Joining with `by = join_by(Rank1, Rank2, Rank3, Rank4, Rank5)`
Joining with `by = join_by(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6)`

seq_df_otu <- seq_df_otu_samples$df_otu
seq_df_sample <- seq_df_otu_samples$df_sample

notrans_df_otu_samples <- dispatch_membership_to_df(notrans_memberships)

Joining with `by = join_by(Rank1, Rank2)`
Joining with `by = join_by(Rank1, Rank2, Rank3)`
Joining with `by = join_by(Rank1, Rank2, Rank3, Rank4)`
Joining with `by = join_by(Rank1, Rank2, Rank3, Rank4, Rank5)`
Joining with `by = join_by(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6)`

notrans_df_otu <- notrans_df_otu_samples$df_otu
notrans_df_sample <- notrans_df_otu_samples$df_sample

seq_df_sample_lodes <- to_lodes_form(seq_df_sample, key = "x", axes = 2:7)
seq_df_otu_lodes <- to_lodes_form(seq_df_otu, key = "x", axes = 8:13)

mach_metadata <- read.table(file = "data/mach/kinetic_sample_metadata.tsv") %>% rownames_to_column(var = "Rank1")
chaillou_metadata <- read.table(file = "data/chaillou/sample_metadata.tsv") %>% rownames_to_column(var = "Rank1")

seq_df_sample_lodes <- left_join(seq_df_sample_lodes, chaillou_metadata)

Joining with `by = join_by(Rank1)`

ggplot(seq_df_sample_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) +
    geom_alluvium(aes(fill = EnvType)) +
    geom_stratum() +
    geom_text(stat = "stratum")

ggplot(seq_df_otu_lodes, aes(x = x, alluvium = alluvium, stratum = stratum, label = stratum)) +
    geom_alluvium(aes(fill = Rank2)) +
    geom_stratum() +
    geom_text(stat = "stratum")

list_memberships <- function(df_otu) {
    memb_mat <- df_otu |> select(starts_with("Z"))
    memb_list <- lapply(seq_len(ncol(memb_mat)), function(i) {
        memb_mat[, i] |>
            unlist() |>
            as.vector()
    })
    return(memb_list)
}
seq_memb_list <- list_memberships(seq_df_otu)
notrans_memb_list <- list_memberships(notrans_df_otu)
outer(seq_memb_list, seq_memb_list, Vectorize(ARI))

           [,1]      [,2]       [,3]       [,4]       [,5]       [,6]
[1,] 1.00000000 0.6051064 0.43632519 0.20999997 0.09186882 0.02762981
[2,] 0.60510641 1.0000000 0.66243617 0.32579079 0.13747406 0.03198990
[3,] 0.43632519 0.6624362 1.00000000 0.46895163 0.21157048 0.03628507
[4,] 0.20999997 0.3257908 0.46895163 1.00000000 0.35243896 0.05731858
[5,] 0.09186882 0.1374741 0.21157048 0.35243896 1.00000000 0.13103571
[6,] 0.02762981 0.0319899 0.03628507 0.05731858 0.13103571 1.00000000

outer(notrans_memb_list, notrans_memb_list, Vectorize(ARI))

           [,1]      [,2]       [,3]       [,4]       [,5]       [,6]
[1,] 1.00000000 0.6752629 0.43632519 0.21180060 0.08607997 0.02626304
[2,] 0.67526287 1.0000000 0.66114961 0.34665372 0.14483180 0.03557190
[3,] 0.43632519 0.6611496 1.00000000 0.47621526 0.21321859 0.03754886
[4,] 0.21180060 0.3466537 0.47621526 1.00000000 0.36450974 0.06849834
[5,] 0.08607997 0.1448318 0.21321859 0.36450974 1.00000000 0.12496635
[6,] 0.02626304 0.0355719 0.03754886 0.06849834 0.12496635 1.00000000

ARI phylogénie et clustering

phylo_df <- data.frame(merged = rownames(tail(per_taxa_networks, 1)[[1]])) %>% separate(col = "merged", into = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = ";_;")

phylo_df <- phylo_df %>% rename_all(~ str_c("Rank", which(colnames(phylo_df) == .)))

phylo_df <- phylo_df %>% mutate_all(as.factor)

phylo_memb_list <- lapply(seq_len(ncol(phylo_df)), function(i) {
    phylo_df[, i] |>
        unlist() |>
        as.factor()
})

outer(phylo_memb_list, seq_memb_list, Vectorize(ARI))

            [,1]         [,2]        [,3]         [,4]        [,5]
[1,] 0.000000000 0.000000e+00  0.00000000  0.000000000 0.000000000
[2,] 0.006843910 1.451703e-02  0.01617787  0.011256651 0.004928821
[3,] 0.007833661 2.300906e-02  0.02183041  0.021800120 0.012127447
[4,] 0.005860251 2.124732e-02  0.02071343  0.027231684 0.018577144
[5,] 0.010056545 2.440340e-02  0.02692696  0.044753926 0.032990443
[6,] 0.010034723 2.018662e-02  0.03331205  0.048235495 0.039256938
[7,] 0.004411921 4.525008e-05 -0.00275281 -0.004692503 0.002519763
              [,6]
[1,]  0.0000000000
[2,]  0.0008384133
[3,] -0.0008312780
[4,] -0.0008194597
[5,]  0.0014951018
[6,]  0.0059149398
[7,] -0.0006914548

ICL

# Extract ICL
ICL_per_ranks <- function(model_results) {
    model_rank_list <- lapply(model_results, function(model_per_rank) model_per_rank[[1]]) # Extract first rep per rank
    return(sapply(model_rank_list, function(model) max(model$ICL, na.rm = TRUE)))
}
ICL_df <- data.frame(
    "No trans" = ICL_per_ranks(notrans_results), "Seq" =
        ICL_per_ranks(seq_results)
) %>%
    t() %>%
    as.data.frame()

library(kableExtra)
kable(ICL_df)

	Rank2	Rank3	Rank4	Rank5	Rank6	Rank7
No.trans	-191896.8	-208538.4	-246763.0	-287639.8	-327344.1	-409702.2
Seq	-191896.8	-209577.4	-246726.3	-289685.9	-326594.8	-409348.6