mirror of
https://github.com/Polarolouis/anova-phylogenetique-projet-msv.git
synced 2026-06-17 18:25:25 +02:00
253 lines
7.5 KiB
R
253 lines
7.5 KiB
R
# Phylocomparison tools
|
|
library(phylolm)
|
|
library(phylotools)
|
|
library(phytools)
|
|
library(phylolimma)
|
|
library(ape)
|
|
library(tidyverse)
|
|
|
|
# Plotting
|
|
library(ggplot2)
|
|
|
|
# Sourcing the utils
|
|
source("./R/utils.R")
|
|
|
|
# Seed the RNG so the simulated tree and the random grouping below are the
# same on every run.
set.seed(1234)

# Number of tips (species) in the simulated tree
nb_species <- 100

# Simulate the tree with a birth rate of 0.1 and no death
tree <- rphylo(nb_species, birth = 0.1, death = 0)

# Draw the tree with its node numbers: this is used to pick, by eye, the
# ancestor nodes that give groups of comparable size.
plotTree(tree, node.numbers = TRUE)

# The two ancestor nodes chosen to split the tree in two
ancestors <- c(102, 104)
K <- length(ancestors) # The number of groups

# Group assignment
## Following the phylogeny: one call to get_phylo_group() per tip index
phylo_matching_groups <- sapply(
  seq_len(nb_species),
  get_phylo_group,
  tree,
  ancestors = ancestors
)

## Uniformly at random among the K groups
random_groups <- sample(seq_len(K), size = nb_species, replace = TRUE)

## Randomly but with same size of groups (disabled alternative, kept for
## reference)
# sameSize_random_groups <- sample(1:K,
#   nb_species,
#   replace = TRUE,
#   prob = table(phylo_matching_groups)
# )
# group_sizes <- table(phylo_matching_groups)
|
|
|
|
|
|
# Saving images of tree
|
|
#' Plot a tree with its tips coloured by group
#'
#' Draws `tree` without tip labels, adds one filled point per tip whose
#' background colour is taken from `groups`, and writes the note
#' "This tree will be normalised." at coordinates (10, 0).
#'
#' @param tree The tree to draw; it is handed to `plot()`.
#' @param groups Values used as `bg` by `tiplabels()`, i.e. the fill colour
#'   of each tip marker.
#' @return `NULL`, invisibly. The function is called for its plotting side
#'   effect on the current graphics device.
plot_group_on_tree <- function(tree, groups) {
  # x.lim is fixed at 50 to leave room for the tip markers and the note
  plot(tree, show.tip.label = FALSE, x.lim = 50)
  tiplabels(bg = groups, pch = 21)
  # `labels` is spelled out in full instead of relying on partial matching
  text(x = 10, y = 0, labels = "This tree will be normalised.")
  invisible(NULL)
}
|
|
|
|
# Saving trees
# The PNG device cannot create missing directories, so make sure the output
# folder exists before opening it (no-op when it is already there).
dir.create("img", showWarnings = FALSE, recursive = TRUE)

# Tree coloured with the groups that follow the phylogeny
png(filename = "img/group_phylo_matching_tree.png")
plot_group_on_tree(tree, groups = phylo_matching_groups)
dev.off()

# Tree coloured with the randomly drawn groups
png(filename = "img/group_random_tree.png")
plot_group_on_tree(tree, groups = random_groups)
dev.off()

# Normalising tree edge length
# The first diagonal entry of vcv(tree) is used as the scaling factor, and
# every edge length is divided by it.
taille_tree <- diag(vcv(tree))[1]
tree$edge.length <- tree$edge.length / taille_tree
|
|
|
|
#' Returns pvalues for both F test for anova and phylogenetic anova
#'
#' @description
#' Fits `traits ~ groups` once with `lm()` (classical ANOVA) and once with
#' `phylolm()` (phylogenetic ANOVA), then tests the group effect in each fit.
#'
#' @param traits Vector of trait values used as the response.
#' @param groups Vector of group labels used as the predictor.
#'   NOTE(review): the labels are used as given; integer-coded labels are
#'   treated as a numeric covariate, which is only equivalent to a factor
#'   when there are two groups. Confirm before using more than two groups.
#' @param tree Tree passed to `phylolm()` as `phy`.
#' @param stoch_process Passed unchanged to `phylolm()` as `model`.
#' @param test_method One of `"vanilla"` (F test with the usual residual
#'   degrees of freedom), `"satterthwaite"` (F test whose denominator degrees
#'   of freedom come from `phylolimma:::ddf_satterthwaite()`), or
#'   `"likelihood_ratio"` (likelihood ratio test against an intercept-only
#'   phylogenetic model).
#' @param measurement_error Passed to `phylolm()` to tell it whether there is
#'   measurement error.
#' @return A list with two elements, `phyloanova_p_value` and
#'   `anova_p_value`.
phyloanova_anova_pvalues <- function(
    traits, groups, tree, stoch_process,
    test_method, measurement_error = TRUE) {
  # Fail early and clearly instead of hitting an undefined variable later
  supported_methods <- c("vanilla", "satterthwaite", "likelihood_ratio")
  if (!is.character(test_method) || length(test_method) != 1L ||
    !(test_method %in% supported_methods)) {
    stop(
      "`test_method` must be one of: ",
      paste(supported_methods, collapse = ", "),
      call. = FALSE
    )
  }

  # TODO Handle the stoch process and model for phylolm (OU etc)
  model <- stoch_process

  phyloanova_res <- phylolm(traits ~ groups,
    phy = tree,
    model = model,
    measurement_error = measurement_error
  )

  # Classical ANOVA: fit once and read the F statistic and its two degrees
  # of freedom from a single summary() call.
  anova_fstat <- summary(lm(traits ~ groups))$fstatistic
  anova_p_value <- pvalue_F_test(anova_fstat[1],
    df1 = anova_fstat[2], df2 = anova_fstat[3]
  )

  nb_coefficients <- length(phyloanova_res$coefficients)

  if (test_method == "likelihood_ratio") {
    # Under H0 there is no group effect: intercept-only model. Writing the
    # constant as an explicit column would duplicate the intercept and make
    # the design matrix singular.
    h0_phyloanova <- phylolm(traits ~ 1,
      phy = tree,
      model = model,
      measurement_error = measurement_error
    )

    lambda_ratio_stat <-
      -2 * (h0_phyloanova$logLik - phyloanova_res$logLik)

    # Degrees of freedom: number of coefficients dropped under H0
    lrt_df <- nb_coefficients - length(h0_phyloanova$coefficients)

    # NOTE(review): this uses the asymptotic chi-square approximation;
    # confirm it is adequate for the tree sizes simulated here.
    phyloanova_p_value <- pchisq(lambda_ratio_stat,
      df = lrt_df, lower.tail = FALSE
    )
  } else {
    # Degrees of freedom are taken from the fitted model and the data
    # rather than from global variables.
    df1 <- nb_coefficients - 1
    df2 <- length(traits) - nb_coefficients

    phyloanova_F_stat <- compute_F_statistic(
      r_squared = phyloanova_res$r.squared,
      df1 = df1,
      df2 = df2
    )

    if (test_method == "satterthwaite") {
      # For satterthwaite ddf computation
      phyloanova_res$REML <- FALSE
      df2 <- phylolimma:::ddf_satterthwaite(phyloanova_res, tree)
    }

    phyloanova_p_value <- pvalue_F_test(phyloanova_F_stat,
      df1 = df1, df2 = df2
    )
  }

  list(
    phyloanova_p_value = phyloanova_p_value,
    anova_p_value = anova_p_value
  )
}
|
|
|
|
#' Run one simulation replicate for both kinds of grouping
#'
#' Traits are simulated and tested first with the groups that follow the
#' phylogeny (`phylo_matching_groups`), then with the randomly drawn groups
#' (`random_groups`). Both group vectors and `tree` are read from the
#' enclosing environment.
#'
#' @param id Identifier of the replicate, copied into the `sim_id` column.
#' @param base_values,sigma2_phylo,sigma2_measure,stoch_process Forwarded to
#'   `compute_trait_values()`.
#' @param test_method Forwarded to `phyloanova_anova_pvalues()`.
#' @param risk_threshold A p-value strictly below this threshold counts as a
#'   rejection of H0.
#' @return A data frame with four rows (phylo-anova and anova, for matching
#'   then random groups) and columns `sim_id`, `test_method`, `group_type`,
#'   `pvalues` and `reject_H0`.
simulate_matching_and_random <- function(
    id, base_values,
    sigma2_phylo, sigma2_measure,
    stoch_process, test_method,
    risk_threshold = 0.05) {
  # phylolm is told about measurement error only when its variance is non-zero
  has_measurement_error <- (sigma2_measure != 0)

  # Simulate traits for one grouping, then test the group effect on them
  pvalues_for_grouping <- function(grouping) {
    simulated_traits <- compute_trait_values(
      groups = grouping,
      base_values = base_values, tree,
      sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
      stoch_process = stoch_process
    )

    phyloanova_anova_pvalues(
      traits = simulated_traits,
      groups = grouping, tree, stoch_process = stoch_process,
      test_method = test_method, measurement_error = has_measurement_error
    )
  }

  # Order matters for the random stream: matching groups first, then random
  per_grouping <- lapply(
    list(matching = phylo_matching_groups, random = random_groups),
    pvalues_for_grouping
  )

  # Concatenate pvalues
  pvalues <- c(
    unlist(per_grouping[["matching"]]),
    unlist(per_grouping[["random"]])
  )

  data.frame(
    sim_id = rep(id, 4),
    test_method = rep(c("phylo-anova", "anova"), 2),
    group_type = rep(c("matching", "random"), each = 2),
    pvalues = pvalues,
    reject_H0 = pvalues < risk_threshold
  )
}
|
|
|
|
# Parameters for the simulations

# Number of simulation replicates
N <- 500

# The base trait to add (one value per group)
base_values <- c(1, 3)

# A test rejects H0 when its p-value is below this threshold
risk_threshold <- 0.05

# Variances handed to the trait simulation
sigma2_phylo <- 1
sigma2_measure <- 0

# Model name handed to the trait simulation and to phylolm
stoch_process <- "BM"

# Available test methods: "vanilla", "satterthwaite", "likelihood_ratio"
test_method <- "satterthwaite"
|
|
#' Run `N` simulation replicates and stack their results
#'
#' Calls `simulate_matching_and_random()` once per replicate, with ids
#' `1, ..., N`, and row-binds the returned data frames.
#'
#' @param N Number of replicates. With `N = 0` no replicate is run and the
#'   `data` element of the result is `NULL`.
#' @param base_values,sigma2_phylo,sigma2_measure,stoch_process,test_method,risk_threshold
#'   Forwarded unchanged to `simulate_matching_and_random()`.
#' @return A list with two elements: `data`, the stacked replicate results,
#'   and `parameters`, a one-string summary of the settings.
simulate_data <- function(N, base_values, risk_threshold,
                          sigma2_phylo, sigma2_measure,
                          stoch_process, test_method) {
  # seq_len() yields an empty sequence for N = 0, whereas 1:N would iterate
  # over c(1, 0) and silently run two replicates.
  replicate_results <- lapply(seq_len(N), function(id) {
    simulate_matching_and_random(
      id = id, base_values = base_values,
      sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
      stoch_process = stoch_process,
      test_method = test_method,
      risk_threshold = risk_threshold
    )
  })
  simulated_data <- do.call("rbind", replicate_results)

  # Human-readable summary of the settings
  parameters <- paste0(
    " sigma2_measure = ", sigma2_measure,
    "; sigma2_phylo = ", sigma2_phylo,
    ";\nbase values = (", paste(c(base_values), collapse = ";"), ")",
    "; test method : ", test_method
  )

  list(data = simulated_data, parameters = parameters)
}
|
|
|
|
#' Bar chart of the H0 rejection rate per test method and group type
#'
#' For every combination of `test_method` and `group_type`, the mean of
#' `reject_H0` is computed and drawn as a dodged bar. The title also shows
#' `stoch_process` and `N`, both read from the enclosing environment and not
#' from the arguments.
#'
#' @param data Data frame with columns `test_method`, `group_type` and
#'   `reject_H0`.
#' @param parameters Text appended to the plot title.
#' @return The ggplot object.
plot_data <- function(data, parameters) {
  # `.groups = "drop"` returns an ungrouped summary and avoids the message
  # summarise() emits when it has to guess the output grouping.
  power_summary <- data %>%
    group_by(test_method, group_type) %>%
    summarize(power = mean(reject_H0), .groups = "drop")

  ggplot(
    power_summary,
    aes(x = test_method, y = power, fill = group_type)
  ) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_y_continuous(limits = c(0, 1)) +
    labs(
      title = paste0(
        "Power vs Tested Method (", stoch_process, ") | N = ", N, ";",
        parameters
      ),
      x = "Tested Method",
      y = "Power"
    ) +
    # Horizontal reference line at 0.95
    geom_hline(yintercept = 0.95) +
    theme_minimal()
}
|
|
|
|
# Vanilla: F test with the usual degrees of freedom
vanilla_results <- simulate_data(
  N = N,
  base_values = base_values,
  risk_threshold = risk_threshold,
  sigma2_phylo = sigma2_phylo,
  sigma2_measure = sigma2_measure,
  stoch_process = stoch_process,
  test_method = "vanilla"
)
vanilla_data <- vanilla_results[["data"]]
vanilla_parameters <- vanilla_results[["parameters"]]
plot_data(data = vanilla_data, parameters = vanilla_parameters)

# Satterthwaite: same simulation settings, Satterthwaite degrees of freedom
satterthwaite_results <- simulate_data(
  N = N,
  base_values = base_values,
  risk_threshold = risk_threshold,
  sigma2_phylo = sigma2_phylo,
  sigma2_measure = sigma2_measure,
  stoch_process = stoch_process,
  test_method = "satterthwaite"
)
satterthwaite_data <- satterthwaite_results[["data"]]
satterthwaite_parameters <- satterthwaite_results[["parameters"]]
plot_data(data = satterthwaite_data, parameters = satterthwaite_parameters)
|
|
|