Began reimplementing properly

2026-06-17 10:15:25 +02:00 · 2024-01-11 18:13:26 +01:00 · 2024-01-11 18:13:26 +01:00 · 6e80eff1a4
commit 6e80eff1a4
parent b5f8b6ea26
9 changed files with 209 additions and 2 deletions
--- a/R/anovaComparison.R
+++ b/R/anovaComparison.R
@ -0,0 +1,139 @@
+# Phylocomparison tools
+library(phylolm)
+library(phylotools)
+library(phytools)
+library(phylolimma)
+library(ape)
+
+# Plotting
+library(ggplot2)
+
+# Sourcing the utils
+source("./R/utils.R")
+
+# Fixing randomness for reproducibility
+set.seed(1234)
+
+# Parameters
+nb_species <- 100
+
+# Generating the phylo tree
+tree <- rphylo(nb_species, birth = 0.1, death = 0)
+
+# Group selections tries to have group of same size
+plotTree(tree, node.numbers = TRUE)
+
+# Here I chose two ancestors to split in two the tree
+ancestors <- c(102, 104)
+K <- length(ancestors) # The number of groups
+
+# I assign the groups numbers
+## Matching the phylogeny
+phylo_matching_groups <- sapply(1:nb_species, function(tip) {
+    get_phylo_group(tip,
+        tree,
+        ancestors = ancestors
+    )
+})
+
+## Randomly
+random_groups <- sample(1:K, nb_species, replace = TRUE)
+
+## Randomly but with same size of groups
+# sameSize_random_groups <- sample(1:K,
+#     nb_species,
+#     replace = TRUE,
+#     prob = table(phylo_matching_groups)
+# )
+# group_sizes <- table(phylo_matching_groups)
+
+
+# Saving images of tree
+plot_group_on_tree <- function(tree, groups) {
+    plot(tree, show.tip.label = FALSE, x.lim = 50)
+    tiplabels(bg = groups, pch = 21)
+    text(x = 10, y = 0, label = "This tree will be normalised.")
+}
+
+# Saving trees
+png(file = "img/group_phylo_matching_tree.png")
+plot_group_on_tree(tree, group = phylo_matching_groups)
+dev.off()
+
+png(file = "img/group_random_tree.png")
+plot_group_on_tree(tree, group = random_groups)
+dev.off()
+
+# Normalising tree edge length
+taille_tree <- diag(vcv(tree))[1]
+tree$edge.length <- tree$edge.length / taille_tree
+
+# Compute traits
+# Parameters
+base_values <- c(1, 2.1) # The base trait to add
+sigma2_phylo <- 1
+sigma2_measure <- 0
+stoch_process <- "BM"
+hypo_test <- "satterthwaite" #"vanilla" # "satterthwaite", "likelihood_ratio"
+
+
+matching_phylo_traits <- compute_trait_values(
+    groups = phylo_matching_groups,
+    base_values = base_values, tree,
+    sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
+    stoch_process = stoch_process
+)
+
+random_traits <- compute_trait_values(
+    groups = random_groups,
+    base_values = base_values, tree,
+    sigma2_phylo = sigma2_phylo, sigma2_measure = sigma2_measure,
+    stoch_process = stoch_process
+)
+
+# For phylo matching
+matching_anova <- lm(matching_phylo_traits ~ phylo_matching_groups)
+
+# TODO Handling the stoch process and model for phylolm
+model <- stoch_process
+
+matching_phyloanova <- phylolm(matching_phylo_traits ~ phylo_matching_groups,
+    phy = tree,
+    model = model,
+    measurement_error = TRUE # To let phylolm know if there's measurement error
+)
+
+matching_phyloanova$REML = FALSE
+
+if (hypo_test %in% c("vanilla", "satterthwaite")) {
+    phyloanova_F_stat <- compute_F_statistic(
+        r_squared = matching_phyloanova$r.squared,
+        df1 = K - 1,
+        df2 = nb_species - K
+    )
+
+    df1 <- K - 1
+    df2 <- nb_species - K
+
+    if (hypo_test == "satterthwaite") {
+        df2 <- phylolimma:::ddf_satterthwaite(matching_phyloanova, tree)
+    }
+
+    p_value <- 1 - pf(phyloanova_F_stat, df1 = df1, df2 = df2)
+}
+
+if (hypo_test == "likelihood_ratio") {
+    # How to obtain the loglikehood under H0 ?
+    # TODO Find the correct way to do this
+    # I assume that under H0 this is like saying everyone is from the same group
+    h0_matching_phyloanova <- phylolm(matching_phylo_traits ~ rep(1, nb_species),
+        phy = tree,
+        model = model,
+        measurement_error = (sigma2_measure != 0) # To let phylolm know if there's measurement error
+    )
+    # But this gives a LAPACK error, the system is not inversible.
+
+    lambda_ratio_stat <- -2(h0_matching_phyloanova$logLik - matching_phyloanova$logLik)
+}
+
+p_value
--- a/R/utils.R
+++ b/R/utils.R
@ -0,0 +1,68 @@
+#' Get the group number of a tip of the given tree
+#'
+#' @description
+#' Returns the group number (based on the number of ancestors provided)
+#' for the given tip
+get_phylo_group <- function(tip, tree, ancestors = c(102, 104)) {
+    # Sanity checks
+    if (!all(ancestors %in% tree$edge[, 1])) {
+        stop("At least one ancestor is not in the given tree")
+    }
+    if (!paste0("t", tip) %in% tree$tip.label) {
+        stop("Provided tip is not in the tree.")
+    }
+
+    for (ancestor_id in seq_along(ancestors)) {
+        if (tip %in% getDescendants(tree, ancestors[ancestor_id])) {
+            # If the tip is a descendant of this ancestor return its index
+            return(ancestor_id)
+        }
+    }
+
+    # If we reach this line the tip is not part of the given ancestors
+    warning(paste0(
+        "The tip ", tip,
+        " is not a descendant of the provided ancestors"
+    ))
+}
+
+#' Compute trait values for the given groups
+#'
+#' @description
+#' For each value of group, the base value is matched in the order of
+#' the vector (1st value for 1st group and etc).
+#' Then the phylogenetic trait corresponding from the tree is computed.
+#' And finally the error (gaussian centered) is computed.
+#' The sum is returned
+compute_trait_values <- function(
+    groups,
+    base_values,
+    tree,
+    sigma2_phylo,
+    sigma2_measure,
+    stoch_process = "BM") {
+    # Here we compute the base values for each of the species
+    trait <- rowSums(sapply(seq_along(base_values), function(i) base_values[i] * (groups == i)))
+
+    # The phylogenetic induced value
+    trait_phylo <- rTrait(1, tree,
+        stoch_process,
+        parameters = list(sigma2 = sigma2_phylo)
+    )
+
+    # The measure error
+    trait_error <- rnorm(length(groups), mean = 0, sd = sqrt(sigma2_measure))
+
+    return(trait + trait_phylo + trait_error)
+}
+
+#' Computes the F_statistic from the r_squared
+#'
+#' @description
+#' Use the formula between r_squared and df1 (K-1), df2 (n - K) to return the
+#' F statistic to test against the Fisher distribution.
+compute_F_statistic <- function(r_squared, df1, df2) {
+    # df1 = k - 1, le nombre de prédicteur
+    # df2 = n - k, n le nombre d'observation
+    return(r_squared / (1 - r_squared) * df2 / df1)
+}
--- a/img/group_nonphylo_tree.png
+++ b/img/group_nonphylo_tree.png
--- a/img/group_phylo_matching_tree.png
+++ b/img/group_phylo_matching_tree.png
--- a/img/group_phylo_tree.png
+++ b/img/group_phylo_tree.png
--- a/img/group_random_tree.png
+++ b/img/group_random_tree.png
--- a/img/simulation_power_pure_BM.png
+++ b/img/simulation_power_pure_BM.png
--- a/img/simulation_power_pure_both_error.png
+++ b/img/simulation_power_pure_both_error.png
--- a/simulations/functions-anova.R
+++ b/simulations/functions-anova.R
@ -6,8 +6,8 @@ overall_p <- function(my_model) {
 }

 compute_F_statistic <- function(r_squared, df1, df2) {
-    # df1 = k, le nombre de prédicteur
-    # df2 = n - (k+1), n le nombre d'observation
+    # df1 = k - 1, le nombre de prédicteur
+    # df2 = n - k, n le nombre d'observation
    return(r_squared / (1 - r_squared) * df2 / df1)
 }