anova-phylogenetique-projet.../data/data_TER/01_data_format_Chen2019.R

# Copyright (C) {2023} {PB, MG, AC}
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

# Program to download and format the data from:
# Chen, J., Swofford, R., Johnson, J., Cummings, B. B., Rogel, N., Lindblad-Toh,
# K., Haerty, W., Palma, F. di, & Regev, A. (2019). A quantitative framework for
# characterizing the evolutionary history of mammalian gene expression.
# Genome Research, 29(1), 53–63. https://doi.org/10.1101/gr.237636.118

## Install correct version of compcodeR
# devtools::install_github("csoneson/compcodeR", ref = "phylocomp")

library(here)
library(compcodeR)

################################################################################
## Read and format count and length data
################################################################################
# Read the raw counts
raw_counts_full <- read.table(here("data/data_TER/data/rawCounts.txt"))
raw_counts <- raw_counts_full[,-c(1,2)]
raw_counts <- round(raw_counts) # expected RSEM values -> counts
## format sample names
vectrep <- paste0(".",sapply(1:dim(raw_counts)[2],function(i) length(strsplit(colnames(raw_counts),"[.]")[[i]]))-1)
colnames(raw_counts) <- paste0(colnames(raw_counts),replace(vectrep,vectrep==".1",""))


# Read length information
leng_full <- read.table(here("data/data_TER/data/rawLengths.txt"))
leng <- leng_full[,-c(1,2)]
## format sample names
vectrep <- paste0(".",sapply(1:dim(leng)[2],function(i) length(strsplit(colnames(leng),"[.]")[[i]]))-1)
colnames(leng) <- paste0(colnames(leng),replace(vectrep,vectrep==".1",""))

# remove NA
raw_counts_noNA <- raw_counts[complete.cases(raw_counts),]
length_noNA <- leng[complete.cases(raw_counts),colnames(raw_counts_noNA)]

# remove zero counts
zero_counts <- rowSums(raw_counts_noNA) == 0
raw_counts_noNA <- raw_counts_noNA[!zero_counts, ]
length_noNA <- length_noNA[!zero_counts, ]

# # remove counts when at least one zero
# zero_counts <- apply(raw_counts_noNA, 1, function(x) any(x == 0))
# raw_counts_noNA <- raw_counts_noNA[!zero_counts, ]
# length_noNA <- length_noNA[!zero_counts, ]

################################################################################
## Read and format condition data
################################################################################

mammalsconds <- read.csv2(here("data/data_TER/data_raw/speciesgroups.txt"), sep = "\t")
mammalsconds
## "primates" : 2= primates, 1= non primates
## "rodents" : 2 = rodents , 1 = non rodents

## for primates only...
condName <- "primates"
# Species condition (1 = non primates, 2 = primates)
condstp <- mammalsconds$primates
names(condstp) <- mammalsconds$species
# Sample Condition
idSpe <- sapply(1:dim(raw_counts)[2],function(i) strsplit(colnames(raw_counts)[i],"[.]")[[1]][1])
primates <- condstp[idSpe]
names(primates) <- colnames(raw_counts)
# Format
primates <- factor(primates)
colData <- data.frame(primates)
colData$primates <- factor(colData$primates)
colData$species <- sapply(1:length(rownames(colData)),function(i) strsplit(rownames(colData),"[.]")[[i]][1])


## for rodents only... (plus favorable pour philoebayes)
condName <- "rodents"
# Species condition (1 = non rodents, 2 = rodents)
condstp <- mammalsconds$rodents
names(condstp) <- mammalsconds$species
# Sample Condition
rodents <- condstp[idSpe]
names(rodents) <- colnames(raw_counts)
# Format
colData$rodents <- factor(rodents)

################################################################################
## Read and format Tree ## COPIE COLLE DE STERN CA MARCHE PO
################################################################################
library(ape)
## Get the tree
tree <- read.tree(file = here("data/data_TER/data_raw/mammals_tree_trimmed_hassler2020.tree"))
tree$node.label <- NULL
tree$tip.label
plot(tree)

## Species names
# Match
tree_data_cor <- match(tree$tip.label, colData$species)
data_tree_cor <- match(colData$species, tree$tip.label)
# Species in the tree NOT in data
tree$tip.label[is.na(tree_data_cor)]
# Species in data NOT in the tree
colData$species[is.na(data_tree_cor)]

## Format Tree
# Get rid of species not in data
tree <- drop.tip(tree, tip = tree$tip.label[is.na(tree_data_cor)])
plot(tree)
# function to add replicates
#createTreeWithReplicates <- function(tree){
  tree_rep <- tree
  for (tip_label in tree$tip.label) {
    print(tip_label)
    replis <- colnames(raw_counts_noNA)[grepl(tip_label, colnames(raw_counts_noNA))]
    print(replis)
    for (rep in replis) {
      print(rep)
      tree_rep <- phytools::bind.tip(tree_rep, tip.label = rep,
                                     where = which(tree_rep$tip.label == tip_label))
      plot(tree_rep)
    }
  }
  # Remove original tips
#  return(ape::drop.tip(tree_rep, tree$tip.label))
#}
tree_rep <- ape::drop.tip(tree_rep, tree$tip.label)
#tree_rep <- createTreeWithReplicates(tree)
# Plot
plot(tree_rep)

## Reorder data
corr <- match(tree_rep$tip.label, rownames(colData))
colData <- colData[corr, ]
raw_counts_noNA <- raw_counts_noNA[, corr]
length_noNA <- length_noNA[, corr]

## Remove if zero count for all replicates of a species
zero_counts <- apply(raw_counts_noNA, 1, function(x) any(tapply(t(x), colData$species, FUN = sum) == 0))
raw_counts_noNA <- raw_counts_noNA[!zero_counts, ]
length_noNA <- length_noNA[!zero_counts, ]


# # ################################################################################
# # ## Read and format Tree ## MODIF ANAELLE, MARCHE PO POUR CompcodeR mais j'ai pas compris pk
# # ################################################################################
# library(ape)
# ## Get the tree
# tree <- read.tree(file = here("data_raw/mammals_tree_trimmed_hassler2020.tree"))
# tree$node.label <- NULL
# tree$tip.label
# plot(tree)
#
# ## Arbre initial
# col <- colnames(raw_counts)
# col <- sub("[.][[:digit:]]+", "", col)
#
# tree_rep <- tree # Création de l'arbre avec les réplicats
#
# for (i in 1:length(tree$tip.label)){
#   print(i)
#   label <- tree$tip.label[i]
#   print(label)
#   nb_replicates <- length(which(col == label))
#   print(nb_replicates)
#   if(nb_replicates>1){               # Pour éviter de rentrer ds la boucle si pas besoin (et éviter une erreur)
#     for (j in (nb_replicates-1):1){  # J'ai inversé l'ordre d'ajout des réplicats pour qu'ils soient ds le bon ordre ds l'arbre
#       new_replicate = paste(label,j,sep = ".")
#       tree_rep <- phytools::bind.tip(tree_rep,new_replicate,edge.length = NULL, which(tree_rep$tip.label==label),position=0)
#     }
#   }
# }
# plot(tree_rep)
#
#
# ## Reorder data
# corr <- match(tree_rep$tip.label, rownames(colData))
# colData <- colData[corr, ]
# raw_counts_noNA <- raw_counts_noNA[, corr]
# length_noNA <- length_noNA[, corr]
#
#


################################################################################
# Format for compcodeR
################################################################################
## rename key column "condition" in colData
colnames(colData)[grep(condName, colnames(colData))] <- "condition"
## id species
colData$id.species <- colData$species
## create object
cpd <- phyloCompData(tree = tree_rep,
                     count.matrix = raw_counts_noNA,
                     sample.annotations = colData,
                     info.parameters = list(dataset = "chen2019_rodents_cpd", uID = "1"),
                     length.matrix = as.matrix(length_noNA))
## check obj conformity
check_phyloCompData(cpd)

## save data to rds file
saveRDS(cpd, file = here("data/data_TER/data", "chen2019_rodents_cpd.rds"))