anova-phylogenetique-projet.../Rnw/donnees-reelles.Rnw
2024-03-06 09:39:09 +01:00

255 lines
No EOL
9.8 KiB
Text
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

Ici nous appliquons les méthodes implémentées sur l'arbre de \cite{chenQuantitativeFrameworkCharacterizing2019}.
<< 'knitr_options', echo = FALSE>>=
knitr::opts_knit$set(cache = TRUE)
@
<< import_modules, echo = FALSE, include=FALSE>>=
necessary_packages <- c("phylotools", "phytools", "phylolm", "limma", "edgeR",
"here", "ggplot2", "patchwork", "dplyr", "tidyr", "UpSetR", "evemodel",
"compcodeR", "mvSLOUCH")
if (!all(necessary_packages %in% installed.packages())) {
install.packages(necessary_packages)
install.packages("remotes")
remotes::install_gitlab("sandve-lab/evemodel")
remotes::install_github("pbastide/compcodeR", ref = "phylolimma")
} else {
require(phylotools)
require(phytools)
require(phylolm)
require(limma)
require(edgeR)
require(here)
require(ggplot2)
require(dplyr)
require(tidyr)
require(UpSetR)
require(evemodel)
require(compcodeR)
require(mvSLOUCH)
require(patchwork)
}
source(here("R","utils.R"))
@
<< import_donnees, echo = FALSE>>=
### Data import
cdata <- readRDS(here("data", "data_TER", "data", "chen2019_rodents_cpd.rds"))
is.valid <- compcodeR:::check_phyloCompData(cdata)
if (!(is.valid == TRUE)) stop("Not a valid phyloCompData object.")
# Design
design_formula <- as.formula(~condition)
design_data <- compcodeR:::sample.annotations(cdata)[, "condition", drop = FALSE]
design_data$condition <- factor(design_data$condition)
design <- model.matrix(design_formula, design_data)
# Normalisation
nf <- edgeR::calcNormFactors(compcodeR:::count.matrix(cdata) / compcodeR:::length.matrix(cdata), method = "TMM")
lib.size <- colSums(compcodeR:::count.matrix(cdata) / compcodeR:::length.matrix(cdata)) * nf
data.norm <- sweep((compcodeR:::count.matrix(cdata) + 0.5) / compcodeR:::length.matrix(cdata), 2, lib.size + 1, "/")
data.norm <- data.norm * 1e6
# Transformation
data.trans <- log2(data.norm)
rownames(data.trans) <- rownames(compcodeR:::count.matrix(cdata))
@
\subsection*{Vanilla (ML et REML), Satterthwaite (ML et REML), LRT}
<< calcul_pvaleurs, echo = FALSE>>=
### Pvalues computation
pvalues_data = "chen2019pvalues.Rds"
if (!file.exists(here("data",pvalues_data))){
#  computing pvalues vec for all genes
pvalue_vec_vanilla <- sapply(seq(1, nrow(data.trans)), function(row_id) {
trait <- data.trans[row_id, ]
fit_phylo <- phylolm(trait ~ design_data$condition, phy = cdata@tree, measurement_error = TRUE)
compute_vanilla_pvalue(fit_phylo)
})
pvalue_vec_vanilla <- setNames(pvalue_vec_vanilla, rownames(data.trans))
pvalue_vec_vanilla_adj <- p.adjust(pvalue_vec_vanilla, method = "BH")
pvalue_vec_vanilla.REML <- sapply(seq(1, nrow(data.trans)), function(row_id) {
trait <- data.trans[row_id, ]
fit_phylo <- phylolm(trait ~ design_data$condition, phy = cdata@tree, REML = TRUE, measurement_error = TRUE)
compute_vanilla_pvalue(fit_phylo)
})
pvalue_vec_vanilla.REML <- setNames(pvalue_vec_vanilla.REML, rownames(data.trans))
pvalue_vec_vanilla_adj.REML <- p.adjust(pvalue_vec_vanilla.REML, method = "BH")
pvalue_vec_satterthwaite <- sapply(seq(1, nrow(data.trans)), function(row_id) {
trait <- data.trans[row_id, ]
fit_phylo <- phylolm(trait ~ design_data$condition, phy = cdata@tree, measurement_error = TRUE)
compute_satterthwaite_pvalue(fit_phylo, tree = cdata@tree)
})
pvalue_vec_satterthwaite <- setNames(pvalue_vec_satterthwaite, rownames(data.trans))
pvalue_vec_satterthwaite_adj <- p.adjust(pvalue_vec_satterthwaite, method = "BH")
pvalue_vec_lrt <- sapply(seq(1, nrow(data.trans)), function(row_id) {
trait <- data.trans[row_id, ]
fit_phylo <- phylolm(trait ~ design_data$condition, phy = cdata@tree, measurement_error = TRUE)
compute_lrt_pvalue(fit_phylo, tree = cdata@tree)
})
pvalue_vec_lrt <- setNames(pvalue_vec_lrt, rownames(data.trans))
pvalue_vec_lrt_adj <- p.adjust(pvalue_vec_lrt, method = "BH")
# REML
pvalue_vec_satterthwaite.REML <- sapply(seq(1, nrow(data.trans)), function(row_id) {
trait <- data.trans[row_id, ]
fit_phylo <- phylolm(trait ~ design_data$condition, phy = cdata@tree, REML = TRUE, measurement_error = TRUE)
compute_satterthwaite_pvalue(fit_phylo, tree = cdata@tree, REML = TRUE)
})
pvalue_vec_satterthwaite.REML <- setNames(pvalue_vec_satterthwaite.REML, rownames(data.trans))
pvalue_vec_satterthwaite_adj.REML <- p.adjust(pvalue_vec_satterthwaite.REML, method = "BH")
## Préparation du dataframe
pvalues_dataframe <- data.frame(
gene = rep(rownames(data.trans), 5),
pvalue = c(pvalue_vec_vanilla_adj,
pvalue_vec_vanilla_adj.REML,
pvalue_vec_satterthwaite_adj,
pvalue_vec_lrt_adj,
pvalue_vec_satterthwaite_adj.REML),
test_method = rep(c( "VanillaAdj", "VanillaAdjREML",
"SatterthwaiteAdj", "LRTAdj",
"SatterthwaiteAdjREML"), each = nrow(data.trans))
)
pvalues_dataframe$test_method <- as.factor(pvalues_dataframe$test_method)
pvalues_dataframe <- pvalues_dataframe %>% mutate(selected = ifelse(pvalue < 0.05, 1, 0))
save(pvalues_dataframe, file = here("data", pvalues_data))
} else {
load(here("data", pvalues_data))
}
@
\begin{figure}
<< graphique_all_pvalues, echo = FALSE>>=
all_plots <- plot_spacer()
for (test in unique(pvalues_dataframe$test_method)) {
all_plots <- all_plots + ggplot(pvalues_dataframe %>% filter(test_method == test)) +
aes(x = reorder(gene, -pvalue),
y = pvalue, color = as.factor(selected)) +
labs(color = "Selected", x = "Genes", y = "P-values")+
# geom_bar(stat = "identity", position = "dodge") +
geom_point()+
geom_hline(yintercept = 0.05) +
facet_grid(cols = vars(test_method)) +
theme(axis.text.x=element_blank(), axis.ticks.x = element_blank())
}
all_plots + patchwork::plot_layout(guides = "collect",
axis_titles = "collect", tag_level = "new") +
plot_annotation(title = "Selected genes by tested methods")
@
\end{figure}
Ici on réalise un \texttt{pivot wider} pour montrer les gènes sélectionnées par méthodes.
<< wide_data, echo = FALSE>>=
pvalues_dataframe_wide <- pvalues_dataframe[pvalues_dataframe$Adj,] %>%
pivot_wider(id_cols = gene,
names_from = test_method,
values_from = selected) %>%
data.frame()
@
\begin{figure}
<< upset_selection, echo = FALSE>>=
upset(pvalues_dataframe_wide,
nsets = 5,
mainbar.y.label = "Nombre de gènes en commun",
sets.x.label = "Nombre de gènes sélectionnés")
@
\end{figure}
\subsection*{EVEmodel}
<<'evemodel', echo = FALSE>>=
eve_data = "evechen2019pvalues.Rds"
if (!file.exists(here("data", eve_data))){
# TODO comparer avec le package evemodel, twothetatest
# Comparer avec OU lrt
# Arbre sans les replicats et les genes data
# remotes::install_gitlab("sandve-lab/evemodel")
# TODO Utiliser les infos de la ligne 83 du Rmd
cdataEVE <- readRDS(here("data", "data_TER", "data", "chen2019_rodents_cpd.rds"))
is.valid <- compcodeR:::check_phyloCompData(cdataEVE)
if (!(is.valid == TRUE)) stop('Not a valid phyloCompData object.')
tree_rep <- compcodeR:::getTree(cdataEVE)
tree_norep <- compcodeR:::getTreeEVE(cdataEVE)
theta_2_vec <- compcodeR:::getIsTheta2edge(cdataEVE, tree_norep)
#col_species <- tree_norep$tip.label[compcodeR:::sample.annotations(cdataEVE)$id.species]
col_species <- tree_norep$tip.label[cumsum(!duplicated(compcodeR:::sample.annotations(cdataEVE)$id.species))]
# Normalisation
nfEVE <- edgeR::calcNormFactors(compcodeR:::count.matrix(cdataEVE) / compcodeR:::length.matrix(cdataEVE), method = 'TMM')
lib.sizeEVE <- colSums(compcodeR:::count.matrix(cdataEVE) / compcodeR:::length.matrix(cdataEVE)) * nfEVE
data.normEVE <- sweep((compcodeR:::count.matrix(cdataEVE) + 0.5) / compcodeR:::length.matrix(cdataEVE), 2, lib.sizeEVE + 1, '/')
data.normEVE <- data.normEVE * 1e6
# Transformation
data.transEVE <- log2(data.normEVE)
rownames(data.transEVE) <- rownames(compcodeR:::count.matrix(cdataEVE))
# Analysis with EVE
evemodel.results_list <- evemodel::twoThetaTest(tree = tree_norep, gene.data = data.transEVE, isTheta2edge = theta_2_vec, colSpecies = col_species, upperBound = c(theta = Inf, sigma2 = Inf, alpha = log(2)/0.001/1))
result.table <- data.frame(pvalue = pchisq(evemodel.results_list$LRT, df = 1, lower.tail = FALSE), logFC = compcodeR:::getlogFCEVE(evemodel.results_list$twoThetaRes, theta_2_vec, tree_norep))
result.table$score <- 1 - result.table$pvalue
result.table$adjpvalue <- p.adjust(result.table$pvalue, 'BH')
rownames(result.table) <- rownames(compcodeR:::count.matrix(cdataEVE))
evemodel_dataframe <- data.frame(gene = rep(rownames(result.table)),
pvalue = c(result.table$adjpvalue),
test_method = rep("EVEAdj", each = nrow(result.table)))
save(evemodel_dataframe, file = here("data", eve_data))
} else {
load(file = here("data", eve_data))
}
evegenesNA <- (evemodel_dataframe%>% filter(is.na(pvalue)))$gene
evemodel_dataframe <- evemodel_dataframe %>%
filter(!is.na(pvalue)) %>%
mutate(selected = ifelse(pvalue < 0.05, 1, 0))
evemodel_dataframe$test_method <- as.factor(evemodel_dataframe$test_method)
@
<<'na-genes', result = 'asis'>>=
cat(paste("Il y a eu des NAs pour les gènes : ", paste0(evegenesNA, collapse = ";")))
@
\subsection{Toutes les méthodes}
<< pvalue_eve_upset, echo = FALSE>>=
pvalueseve_dataframe <- rbind(pvalues_dataframe, evemodel_dataframe)
pvalueseve_dataframe_wide <- pvalueseve_dataframe %>%
pivot_wider(id_cols = gene,
names_from = test_method,
values_from = selected, values_fill = 0) %>%
data.frame()
upset(pvalueseve_dataframe_wide,
nsets = 10,
mainbar.y.label = "Nombre de gènes en commun",
sets.x.label = "Nombre de gènes sélectionnés")
@