mirror of
https://github.com/Polarolouis/anova-phylogenetique-projet-msv.git
synced 2026-06-17 10:15:25 +02:00
294 lines
9.8 KiB
R
294 lines
9.8 KiB
R
# Phylocomparison tools
|
||
library(phylolm)
|
||
library(phylotools)
|
||
library(phytools)
|
||
library(phylolimma)
|
||
library(ape)
|
||
library(tidyverse)
|
||
|
||
# Plotting
|
||
library(ggplot2)
|
||
|
||
# Sourcing the utils
|
||
source("./R/utils.R")
|
||
|
||
# Fixing randomness for reproducibility
|
||
set.seed(1234)
|
||
|
||
# Parameters
|
||
nb_species <- 20
|
||
|
||
# Generating the phylo tree
|
||
tree <- rphylo(nb_species, birth = 0.1, death = 0)
|
||
|
||
# Group selections tries to have group of same size
|
||
plotTree(tree, node.numbers = TRUE)
|
||
|
||
# Here I chose two ancestors to split in two the tree
|
||
ancestors <- c(22, 23)
|
||
K <- length(ancestors) # The number of groups
|
||
|
||
# I assign the groups numbers
|
||
## Matching the phylogeny
|
||
phylo_matching_groups <- sapply(1:nb_species, function(tip) {
|
||
get_phylo_group(tip,
|
||
tree,
|
||
ancestors = ancestors
|
||
)
|
||
})
|
||
|
||
## Randomly
|
||
random_groups <- sample(1:K, nb_species, replace = TRUE)
|
||
|
||
## Randomly but with same size of groups
|
||
# sameSize_random_groups <- sample(1:K,
|
||
# nb_species,
|
||
# replace = TRUE,
|
||
# prob = table(phylo_matching_groups)
|
||
# )
|
||
# group_sizes <- table(phylo_matching_groups)
|
||
|
||
|
||
# Saving images of tree
|
||
plot_group_on_tree <- function(tree, groups) {
|
||
plot(tree, show.tip.label = FALSE, x.lim = 50)
|
||
tiplabels(bg = groups, pch = 21)
|
||
text(x = 10, y = 0, label = "This tree will be normalised.")
|
||
}
|
||
|
||
# Saving trees
|
||
png(file = "img/group_phylo_matching_tree.png")
|
||
plot_group_on_tree(tree, group = phylo_matching_groups)
|
||
dev.off()
|
||
|
||
png(file = "img/group_random_tree.png")
|
||
plot_group_on_tree(tree, group = random_groups)
|
||
dev.off()
|
||
|
||
# Normalising tree edge length
|
||
taille_tree <- diag(vcv(tree))[1]
|
||
tree$edge.length <- tree$edge.length / taille_tree
|
||
|
||
|
||
# TODO REMOVE
|
||
# simulate_matching_and_random <- function(
|
||
# id, base_values,
|
||
# sigma2_phylo, sigma2_measure,
|
||
# stoch_process, test_method,
|
||
# risk_threshold = 0.05,
|
||
# correct_hypothesis = "H1") {
|
||
# # To be in the right configuration for typeI error
|
||
# if (correct_hypothesis == "H0") {
|
||
# base_values <- rep(0, length(base_values))
|
||
# }
|
||
# matching_phylo_traits <- compute_trait_values(
|
||
# groups = phylo_matching_groups,
|
||
# base_values = base_values, tree,
|
||
# sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
|
||
# stoch_process = stoch_process
|
||
# )
|
||
# matching_pval_df <- phyloanova_anova_pvalues(
|
||
# traits = matching_phylo_traits,
|
||
# groups = phylo_matching_groups, tree, stoch_process = stoch_process,
|
||
# test_method = test_method, measurement_error = TRUE
|
||
# )
|
||
# matching_pvalues <- matching_pval_df[c(1, 2)]
|
||
|
||
# matching_df2 <- matching_pval_df[c(3, 4)]
|
||
|
||
# random_groups_traits <- compute_trait_values(
|
||
# groups = random_groups,
|
||
# base_values = base_values, tree,
|
||
# sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
|
||
# stoch_process = stoch_process
|
||
# )
|
||
|
||
# random_groups_pval_df2 <- phyloanova_anova_pvalues(
|
||
# traits = random_groups_traits,
|
||
# groups = random_groups, tree, stoch_process = stoch_process,
|
||
# test_method = test_method, measurement_error = TRUE
|
||
# )
|
||
# random_groups_pvalues <- random_groups_pval_df2[c(1, 2)]
|
||
|
||
# random_groups_df2 <- random_groups_pval_df2[c(3, 4)]
|
||
# # Concatenate pvalues
|
||
# pvalues <- c(unlist(matching_pvalues), unlist(random_groups_pvalues))
|
||
|
||
# if (correct_hypothesis == "H1") {
|
||
# correctly_selected <- pvalues < risk_threshold
|
||
# }
|
||
# if (correct_hypothesis == "H0") {
|
||
# correctly_selected <- pvalues >= risk_threshold
|
||
# }
|
||
# return(
|
||
# data.frame(
|
||
# sim_id = rep(id, 4),
|
||
# anova_model = rep(c("phylo-anova", "anova"), 2),
|
||
# group_type = rep(c("matching", "random"), each = 2),
|
||
# pvalues = pvalues,
|
||
# correctly_selected = correctly_selected,
|
||
# df2 = unlist(c(matching_df2, random_groups_df2))
|
||
# )
|
||
# )
|
||
# }
|
||
|
||
|
||
|
||
# compare_methods <- function(
|
||
# N, base_values, risk_threshold, sigma2_phylo,
|
||
# sigma2_measure, stoch_process, methods_to_test = c("vanilla", "satterthwaite"), correct_hypothesis = "H1") {
|
||
# if (any(!(methods_to_test %in% c("vanilla", "satterthwaite", "lrt")))) {
|
||
# stop("Unknown method to test.")
|
||
# }
|
||
# # Generating data for each method
|
||
# # TODO Utiliser les mêmes données pour les méthodes
|
||
# ## To compute power
|
||
# full_power_data <-
|
||
# do.call("rbind", lapply(methods_to_test, function(method) {
|
||
# data <- simulate_data(
|
||
# N = N,
|
||
# base_values = base_values,
|
||
# risk_threshold = risk_threshold,
|
||
# sigma2_phylo = sigma2_phylo,
|
||
# sigma2_measure = sigma2_measure,
|
||
# test_method = method,
|
||
# stoch_process = stoch_process,
|
||
# correct_hypothesis = "H1"
|
||
# )$data
|
||
# # Adding a column to identify the approximation method
|
||
# data$tested_method <- method
|
||
# data$metric_type <- "power"
|
||
# data
|
||
# }))
|
||
# ## To compute type I error
|
||
# full_typeI_data <-
|
||
# do.call("rbind", lapply(methods_to_test, function(method) {
|
||
# data <- simulate_data(
|
||
# N = N,
|
||
# base_values = base_values,
|
||
# risk_threshold = risk_threshold,
|
||
# sigma2_phylo = sigma2_phylo,
|
||
# sigma2_measure = sigma2_measure,
|
||
# test_method = method,
|
||
# stoch_process = stoch_process,
|
||
# correct_hypothesis = "H0"
|
||
# )$data
|
||
# # Adding a column to identify the approximation method
|
||
# data$tested_method <- method
|
||
# data$metric_type <- "typeI"
|
||
# data
|
||
# }))
|
||
|
||
# data <- rbind(full_power_data, full_typeI_data)
|
||
# return(
|
||
# list(
|
||
# data = data,
|
||
# sim_parameters = list(
|
||
# N = N,
|
||
# base_values = base_values,
|
||
# risk_threshold = risk_threshold,
|
||
# sigma2_phylo = sigma2_phylo,
|
||
# sigma2_measure = sigma2_measure,
|
||
# stoch_process = stoch_process
|
||
# )
|
||
# )
|
||
# )
|
||
# }
|
||
|
||
# plot_simulation_data <- function(data, parameters_string, threshold = 0.95) {
|
||
# plot_data <- data %>%
|
||
# group_by(anova_model, group_type) %>%
|
||
# summarize(power = mean(correctly_selected))
|
||
|
||
# p <- ggplot(plot_data, aes(x = anova_model, y = power, fill = group_type)) +
|
||
# geom_bar(stat = "identity", position = "dodge") +
|
||
# scale_y_continuous(limits = c(0, 1)) +
|
||
# labs(
|
||
# title = paste0("Power vs Tested Method (", stoch_process, ") | N = ", N, ";", parameters_string),
|
||
# x = "Tested Method",
|
||
# y = "Power"
|
||
# ) +
|
||
# geom_hline(yintercept = threshold) +
|
||
# theme_minimal()
|
||
# p
|
||
|
||
# return(p)
|
||
# }
|
||
|
||
plot_comparison <- function(data, sim_parameters) {
|
||
# Retrieving simulation parameters
|
||
risk_threshold <- sim_parameters$risk_threshold
|
||
N <- sim_parameters$N
|
||
sigma2_measure <- sim_parameters$sigma2_measure
|
||
sigma2_phylo <- sim_parameters$sigma2_phylo
|
||
base_values <- sim_parameters$base_values
|
||
stoch_process <- sim_parameters$stoch_process
|
||
|
||
plot_title <- paste0(
|
||
"N = ", N, ";", " sigma2_measure = ", sigma2_measure,
|
||
"; sigma2_phylo = ", sigma2_phylo,
|
||
";\nbase values = (", paste(c(base_values), collapse = ","), ");",
|
||
"\nStoch process : ", stoch_process
|
||
)
|
||
|
||
# Preparing plot data
|
||
plot_data <- data %>%
|
||
group_by(tested_method, anova_model, group_type, metric_type) %>%
|
||
summarize(metric = mean(correctly_selected))
|
||
# Reversing the metric to really be typeI error (ie the prop of errors made)
|
||
plot_data[plot_data$metric_type == "typeI", ] <- plot_data[plot_data$metric_type == "typeI", ] %>% mutate(metric = 1 - metric)
|
||
|
||
# Adding a threshold
|
||
plot_data <- plot_data %>%
|
||
ungroup() %>%
|
||
mutate(
|
||
hline_risk_threshold = case_when(
|
||
plot_data$metric_type == "power" ~ -0.1,
|
||
plot_data$metric_type == "typeI" ~ risk_threshold
|
||
)
|
||
)
|
||
# To be out of bounds
|
||
|
||
p <- ggplot(plot_data, aes(x = anova_model, y = metric, fill = group_type)) +
|
||
geom_bar(stat = "identity", position = "dodge") +
|
||
geom_text(aes(label = round(metric, digits = 3)), vjust = -0.5, position = position_dodge(width = 0.9)) +
|
||
scale_y_continuous(limits = c(0, 1.2)) +
|
||
labs(
|
||
title = plot_title,
|
||
x = "Anova Method",
|
||
y = "Metric"
|
||
) +
|
||
theme_minimal()
|
||
p <- p + facet_grid(tested_method ~ metric_type)
|
||
p <- p + geom_hline(aes(yintercept = hline_risk_threshold))
|
||
|
||
return(p)
|
||
}
|
||
|
||
# Comparing methods
|
||
# comparison <- compare_methods(N,
|
||
# base_values = c(1, 2), risk_threshold, sigma2_phylo = 0,
|
||
# sigma2_measure = 0.5, stoch_process, methods_to_test = c("vanilla", "satterthwaite", "lrt")
|
||
# )
|
||
# comparison_data <- comparison$data
|
||
|
||
# plot_comparison(comparison_data, sim_parameters = comparison$sim_parameters)
|
||
|
||
|
||
## Standardized parameters
|
||
total_variance <- 1.0 # sigma2_phylo + sigma2_error, fixed [as tree_height = 1]
|
||
heri <- c(0.0, 0.25, 0.5, 1.0) # heritability her = sigma2_phylo / total_variance. 0 means only noise. 1 means only phylo.
|
||
snr <- 1 # signal to noise ratio snr = size_effect / total_variance
|
||
|
||
## Try several parameter values
|
||
ggsave <- function(..., bg = "white") ggplot2::ggsave(..., bg = bg)
|
||
for (her in heri) {
|
||
sim <- compare_methods(N,
|
||
base_values = c(0, snr * total_variance), risk_threshold, sigma2_phylo = her * total_variance,
|
||
sigma2_measure = (1 - her) * total_variance, stoch_process, methods_to_test = c("vanilla", "satterthwaite", "lrt")
|
||
)
|
||
|
||
res_sim_plot <- plot_comparison(sim$data, sim$sim_parameters)
|
||
res_sim_plot
|
||
ggsave(paste0("img/simulation_BM_her_", her, ".png"), plot = res_sim_plot)
|
||
}
|