anova-phylogenetique-projet.../R/anovaComparison.R

446 lines
15 KiB
R
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Phylocomparison tools
library(phylolm)
library(phylotools)
library(phytools)
library(phylolimma)
library(ape)
library(tidyverse)
# Plotting
library(ggplot2)
# Sourcing the utils
source("./R/utils.R")
# Fixing randomness for reproducibility
set.seed(1234)
# Parameters
nb_species <- 100
# Generating the phylo tree
tree <- rphylo(nb_species, birth = 0.1, death = 0)
# Group selections tries to have group of same size
plotTree(tree, node.numbers = TRUE)
# Here I chose two ancestors to split in two the tree
ancestors <- c(102, 104)
K <- length(ancestors) # The number of groups
# I assign the groups numbers
## Matching the phylogeny
phylo_matching_groups <- sapply(1:nb_species, function(tip) {
get_phylo_group(tip,
tree,
ancestors = ancestors
)
})
## Randomly
random_groups <- sample(1:K, nb_species, replace = TRUE)
## Randomly but with same size of groups
# sameSize_random_groups <- sample(1:K,
# nb_species,
# replace = TRUE,
# prob = table(phylo_matching_groups)
# )
# group_sizes <- table(phylo_matching_groups)
# Saving images of tree
plot_group_on_tree <- function(tree, groups) {
plot(tree, show.tip.label = FALSE, x.lim = 50)
tiplabels(bg = groups, pch = 21)
text(x = 10, y = 0, label = "This tree will be normalised.")
}
# Saving trees
png(file = "img/group_phylo_matching_tree.png")
plot_group_on_tree(tree, group = phylo_matching_groups)
dev.off()
png(file = "img/group_random_tree.png")
plot_group_on_tree(tree, group = random_groups)
dev.off()
# Normalising tree edge length
taille_tree <- diag(vcv(tree))[1]
tree$edge.length <- tree$edge.length / taille_tree
#' Returns pvalues for both F test for anova and phylogenetic anova
#'
#' @description
# TODO Describe
phyloanova_anova_pvalues <- function(
traits, groups, tree, stoch_process,
test_method, measurement_error = TRUE) {
# For phylo matching
anova_res <- lm(traits ~ groups)
# TODO Handle the stoch process and model for phylolm (OU etc)
model <- stoch_process
phyloanova_res <- phylolm(traits ~ groups,
phy = tree,
model = model,
measurement_error = measurement_error # To let phylolm know if there's measurement error
)
anova_res <- lm(traits ~ groups)
anova_F_stat <- summary(anova_res)$fstatistic[1]
anova_df1 <- summary(anova_res)$fstatistic[2]
anova_df2 <- summary(anova_res)$fstatistic[3]
anova_p_value <- pvalue_F_test(anova_F_stat,
df1 = anova_df1, df2 = anova_df2
)
if (test_method %in% c("vanilla", "satterthwaite")) {
phyloanova_F_stat <- compute_F_statistic(
r_squared = phyloanova_res$r.squared,
df1 = K - 1,
df2 = nb_species - K
)
df1 <- K - 1
df2 <- nb_species - K
if (test_method == "satterthwaite") {
# For satterthwaite ddf computation
df2 <- ddf_satterthwaite_sum(phyloanova_res, tree, REML = TRUE)$ddf
print(paste0("Satterthwaite ddf :", df2))
}
phyloanova_p_value <- pvalue_F_test(phyloanova_F_stat, df1 = df1, df2 = df2)
}
if (test_method == "lrt") {
# TODO Change method name to be less deceptive
h0_phyloanova <- phylolm(traits ~ 1,
phy = tree,
model = model,
measurement_error = measurement_error # To let phylolm know if there's measurement error
)
lambda_ratio_stat <- -2 * (h0_phyloanova$logLik - phyloanova_res$logLik)
# Computes the pvalue from the statistic
# df1 = K - 1
phyloanova_p_value <- 1 - pchisq(lambda_ratio_stat, K - 1)
}
list(
phyloanova_p_value = phyloanova_p_value,
anova_p_value = anova_p_value,
anova_df2 = nb_species - K,
phylo_df2 = df2
)
}
simulate_matching_and_random <- function(
id, base_values,
sigma2_phylo, sigma2_measure,
stoch_process, test_method,
risk_threshold = 0.05,
correct_hypothesis = "H1") {
# To be in the right configuration for typeI error
if (correct_hypothesis == "H0") {
base_values <- rep(0, length(base_values))
}
matching_phylo_traits <- compute_trait_values(
groups = phylo_matching_groups,
base_values = base_values, tree,
sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
stoch_process = stoch_process
)
matching_pval_df <- phyloanova_anova_pvalues(
traits = matching_phylo_traits,
groups = phylo_matching_groups, tree, stoch_process = stoch_process,
test_method = test_method, measurement_error = TRUE
)
matching_pvalues <- matching_pval_df[c(1, 2)]
matching_df2 <- matching_pval_df[c(3, 4)]
random_groups_traits <- compute_trait_values(
groups = random_groups,
base_values = base_values, tree,
sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
stoch_process = stoch_process
)
random_groups_pval_df2 <- phyloanova_anova_pvalues(
traits = random_groups_traits,
groups = random_groups, tree, stoch_process = stoch_process,
test_method = test_method, measurement_error = TRUE
)
random_groups_pvalues <- random_groups_pval_df2[c(1,2)]
random_groups_df2 <- random_groups_pval_df2[c(3,4)]
# Concatenate pvalues
pvalues <- c(unlist(matching_pvalues), unlist(random_groups_pvalues))
if (correct_hypothesis == "H1") {
correctly_selected <- pvalues < risk_threshold
}
if (correct_hypothesis == "H0") {
correctly_selected <- pvalues >= risk_threshold
}
return(
data.frame(
sim_id = rep(id, 4),
anova_model = rep(c("phylo-anova", "anova"), 2),
group_type = rep(c("matching", "random"), each = 2),
pvalues = pvalues,
correctly_selected = correctly_selected,
df2 = unlist(c(matching_df2, random_groups_df2))
)
)
}
# Parameters for the simulations
N <- 500
base_values <- c(1, 3) # The base trait to add
risk_threshold <- 0.05
sigma2_phylo <- 1
sigma2_measure <- 0
stoch_process <- "BM"
test_method <- "satterthwaite" # "vanilla" # "satterthwaite", "likelihood_ratio"
simulate_data <- function(
N, base_values, risk_threshold, sigma2_phylo,
sigma2_measure, stoch_process, test_method, correct_hypothesis = "H1") {
simulated_data <- do.call("rbind", lapply(1:N, function(id) {
simulate_matching_and_random(
id = id, base_values = base_values,
sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
stoch_process = stoch_process,
test_method = test_method,
risk_threshold = risk_threshold,
correct_hypothesis = correct_hypothesis
)
}))
parameters_string <- paste0(
" sigma2_measure = ", sigma2_measure,
"; sigma2_phylo = ", sigma2_phylo,
";\nbase values = (", paste(c(base_values), collapse = ";"), ")",
"; test method : ", test_method,
"; correct hypothesis : ", correct_hypothesis
)
return(list(data = simulated_data, parameters_string = parameters_string))
}
compare_methods <- function(
N, base_values, risk_threshold, sigma2_phylo,
sigma2_measure, stoch_process, methods_to_test = c("vanilla", "satterthwaite"), correct_hypothesis = "H1") {
if (any(!(methods_to_test %in% c("vanilla", "satterthwaite", "lrt")))) {
stop("Unknown method to test.")
}
#  Generating data for each method
# TODO Séparer les deux fonctions de simulation et d'inférence
# TODO Utiliser les mêmes données pour les méthodes
##  To compute power
full_power_data <-
do.call("rbind", lapply(methods_to_test, function(method) {
data <- simulate_data(
N = N,
base_values = base_values,
risk_threshold = risk_threshold,
sigma2_phylo = sigma2_phylo,
sigma2_measure = sigma2_measure,
test_method = method,
stoch_process = stoch_process,
correct_hypothesis = "H1"
)$data
#  Adding a column to identify the approximation method
data$tested_method <- method
data$metric_type <- "power"
data
}))
##  To compute type I error
full_typeI_data <-
do.call("rbind", lapply(methods_to_test, function(method) {
data <- simulate_data(
N = N,
base_values = base_values,
risk_threshold = risk_threshold,
sigma2_phylo = sigma2_phylo,
sigma2_measure = sigma2_measure,
test_method = method,
stoch_process = stoch_process,
correct_hypothesis = "H0"
)$data
#  Adding a column to identify the approximation method
data$tested_method <- method
data$metric_type <- "typeI"
data
}))
data <- rbind(full_power_data, full_typeI_data)
return(
list(
data = data,
sim_parameters = list(
N = N,
base_values = base_values,
risk_threshold = risk_threshold,
sigma2_phylo = sigma2_phylo,
sigma2_measure = sigma2_measure,
stoch_process = stoch_process
)
)
)
}
plot_simulation_data <- function(data, parameters_string, threshold = 0.95) {
plot_data <- data %>%
group_by(anova_model, group_type) %>%
summarize(power = mean(correctly_selected))
p <- ggplot(plot_data, aes(x = anova_model, y = power, fill = group_type)) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_continuous(limits = c(0, 1)) +
labs(
title = paste0("Power vs Tested Method (", stoch_process, ") | N = ", N, ";", parameters_string),
x = "Tested Method",
y = "Power"
) +
geom_hline(yintercept = threshold) +
theme_minimal()
p
return(p)
}
# # Vanilla
# vanilla_results <- simulate_data(N, base_values, risk_threshold, sigma2_phylo,
# sigma2_measure, stoch_process,
# test_method = "vanilla"
# )
# vanilla_data <- vanilla_results$data
# vanilla_parameters_string <- vanilla_results$parameters_string
# plot_simulation_data(vanilla_data, vanilla_parameters_string)
# vanilla_results_H0 <- simulate_data(N,
# base_values = c(1, 1), risk_threshold, sigma2_phylo,
# sigma2_measure, stoch_process,
# test_method = "vanilla",
# correct_hypothesis = "H0"
# )
# vanilla_data_H0 <- vanilla_results_H0$data
# vanilla_parameters_string_H0 <- vanilla_results_H0$parameters_string
# plot_simulation_data(vanilla_data_H0, vanilla_parameters_string_H0, threshold = 0.05)
# # Satterthwaite
# satterthwaite_results <- simulate_data(N, base_values, risk_threshold, sigma2_phylo,
# sigma2_measure = 1, stoch_process,
# test_method = "satterthwaite"
# )
# satterthwaite_data <- satterthwaite_results$data
# satterthwaite_parameters_string <- satterthwaite_results$parameters_string
# plot_simulation_data(satterthwaite_data, satterthwaite_parameters_string)
# satterthwaite_results_H0 <- simulate_data(N,
# base_values = c(1, 1), risk_threshold, sigma2_phylo,
# sigma2_measure = 1, stoch_process,
# test_method = "satterthwaite", correct_hypothesis = "H0"
# )
# satterthwaite_data_H0 <- satterthwaite_results_H0$data
# satterthwaite_parameters_string_H0 <- satterthwaite_results_H0$parameters_string
# plot_simulation_data(satterthwaite_data_H0, satterthwaite_parameters_string_H0, threshold = 0.05)
# # Likelihood ratio
# lrt_results <- simulate_data(N, base_values, risk_threshold, sigma2_phylo,
# sigma2_measure, stoch_process,
# test_method = "lrt"
# )
# lrt_data <- lrt_results$data
# lrt_parameters_string <- lrt_results$parameters_string
# plot_simulation_data(lrt_data, lrt_parameters_string)
plot_comparison <- function(data, sim_parameters) {
#  Retrieving simulation parameters
risk_threshold <- sim_parameters$risk_threshold
N <- sim_parameters$N
sigma2_measure <- sim_parameters$sigma2_measure
sigma2_phylo <- sim_parameters$sigma2_phylo
base_values <- sim_parameters$base_values
stoch_process <- sim_parameters$stoch_process
plot_title <- paste0(
"N = ", N, ";", " sigma2_measure = ", sigma2_measure,
"; sigma2_phylo = ", sigma2_phylo,
";\nbase values = (", paste(c(base_values), collapse = ","), ");",
"\nStoch process : ", stoch_process
)
#  Preparing plot data
plot_data <- data %>%
group_by(tested_method, anova_model, group_type, metric_type) %>%
summarize(metric = mean(correctly_selected))
#  Reversing the metric to really be typeI error (ie the prop of errors made)
plot_data[plot_data$metric_type == "typeI", ] <- plot_data[plot_data$metric_type == "typeI", ] %>% mutate(metric = 1 - metric)
# Adding a threshold
plot_data <- plot_data %>%
ungroup() %>%
mutate(
hline_risk_threshold = case_when(
plot_data$metric_type == "power" ~ -0.1,
plot_data$metric_type == "typeI" ~ risk_threshold
)
)
#  To be out of bounds
p <- ggplot(plot_data, aes(x = anova_model, y = metric, fill = group_type)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = round(metric, digits = 3)), vjust = -0.5, position = position_dodge(width = 0.9)) +
scale_y_continuous(limits = c(0, 1.2)) +
labs(
title = plot_title,
x = "Anova Method",
y = "Metric"
) +
theme_minimal()
p <- p + facet_grid(tested_method ~ metric_type)
p <- p + geom_hline(aes(yintercept = hline_risk_threshold))
return(p)
}
#  Comparing methods
# comparison <- compare_methods(N,
# base_values = c(1, 2), risk_threshold, sigma2_phylo = 0,
# sigma2_measure = 0.5, stoch_process, methods_to_test = c("vanilla", "satterthwaite", "lrt")
# )
# comparison_data <- comparison$data
# plot_comparison(comparison_data, sim_parameters = comparison$sim_parameters)
#  TODO Adapt to the current code
## Standardized parameters
total_variance <- 1.0 # sigma2_phylo + sigma2_error, fixed [as tree_height = 1]
heri <- c(0.0, 0.25, 0.5, 1.0) # heritability her = sigma2_phylo / total_variance. 0 means only noise. 1 means only phylo.
snr <- 1 # signal to noise ratio snr = size_effect / total_variance
## Try several parameter values
ggsave <- function(..., bg = "white") ggplot2::ggsave(..., bg = bg)
for (her in heri) {
sim <- compare_methods(N,
base_values = c(0, snr * total_variance), risk_threshold, sigma2_phylo = her * total_variance,
sigma2_measure = (1 - her) * total_variance, stoch_process, methods_to_test = c("vanilla", "satterthwaite", "lrt")
)
res_sim_plot <- plot_comparison(sim$data, sim$sim_parameters)
res_sim_plot
ggsave(paste0("img/simulation_BM_her_", her, ".png"), plot = res_sim_plot)
}