mirror of
https://github.com/Polarolouis/anova-phylogenetique-projet-msv.git
synced 2026-06-17 18:25:25 +02:00
Ajout de vrais données
This commit is contained in:
parent
820716a4cd
commit
54b37e47f6
16 changed files with 55942 additions and 0 deletions
213
data/data_TER/01_data_format_Chen2019.R
Normal file
213
data/data_TER/01_data_format_Chen2019.R
Normal file
|
|
@ -0,0 +1,213 @@
|
||||||
|
# Copyright (C) {2023} {PB, MG, AC}
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License along
|
||||||
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
# Program to download and format the data from:
|
||||||
|
# Chen, J., Swofford, R., Johnson, J., Cummings, B. B., Rogel, N., Lindblad-Toh,
|
||||||
|
# K., Haerty, W., Palma, F. di, & Regev, A. (2019). A quantitative framework for
|
||||||
|
# characterizing the evolutionary history of mammalian gene expression.
|
||||||
|
# Genome Research, 29(1), 53–63. https://doi.org/10.1101/gr.237636.118
|
||||||
|
|
||||||
|
## Install correct version of compcodeR
|
||||||
|
# devtools::install_github("csoneson/compcodeR", ref = "phylocomp")
|
||||||
|
|
||||||
|
library(here)
|
||||||
|
library(compcodeR)
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
## Read and format count and length data
|
||||||
|
################################################################################
|
||||||
|
# Read the raw counts
|
||||||
|
raw_counts_full <- read.table(here("data/data_TER/data/rawCounts.txt"))
|
||||||
|
raw_counts <- raw_counts_full[,-c(1,2)]
|
||||||
|
raw_counts <- round(raw_counts) # expected RSEM values -> counts
|
||||||
|
## format sample names
|
||||||
|
vectrep <- paste0(".",sapply(1:dim(raw_counts)[2],function(i) length(strsplit(colnames(raw_counts),"[.]")[[i]]))-1)
|
||||||
|
colnames(raw_counts) <- paste0(colnames(raw_counts),replace(vectrep,vectrep==".1",""))
|
||||||
|
|
||||||
|
|
||||||
|
# Read length information
|
||||||
|
leng_full <- read.table(here("data/data_TER/data/rawLengths.txt"))
|
||||||
|
leng <- leng_full[,-c(1,2)]
|
||||||
|
## format sample names
|
||||||
|
vectrep <- paste0(".",sapply(1:dim(leng)[2],function(i) length(strsplit(colnames(leng),"[.]")[[i]]))-1)
|
||||||
|
colnames(leng) <- paste0(colnames(leng),replace(vectrep,vectrep==".1",""))
|
||||||
|
|
||||||
|
# remove NA
|
||||||
|
raw_counts_noNA <- raw_counts[complete.cases(raw_counts),]
|
||||||
|
length_noNA <- leng[complete.cases(raw_counts),colnames(raw_counts_noNA)]
|
||||||
|
|
||||||
|
# remove zero counts
|
||||||
|
zero_counts <- rowSums(raw_counts_noNA) == 0
|
||||||
|
raw_counts_noNA <- raw_counts_noNA[!zero_counts, ]
|
||||||
|
length_noNA <- length_noNA[!zero_counts, ]
|
||||||
|
|
||||||
|
# # remove counts when at least one zero
|
||||||
|
# zero_counts <- apply(raw_counts_noNA, 1, function(x) any(x == 0))
|
||||||
|
# raw_counts_noNA <- raw_counts_noNA[!zero_counts, ]
|
||||||
|
# length_noNA <- length_noNA[!zero_counts, ]
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
## Read and format condition data
|
||||||
|
################################################################################
|
||||||
|
|
||||||
|
mammalsconds <- read.csv2(here("data/data_TER/data_raw/speciesgroups.txt"), sep = "\t")
|
||||||
|
mammalsconds
|
||||||
|
## "primates" : 2= primates, 1= non primates
|
||||||
|
## "rodents" : 2 = rodents , 1 = non rodents
|
||||||
|
|
||||||
|
## for primates only...
|
||||||
|
condName <- "primates"
|
||||||
|
# Species condition (1 = non primates, 2 = primates)
|
||||||
|
condstp <- mammalsconds$primates
|
||||||
|
names(condstp) <- mammalsconds$species
|
||||||
|
# Sample Condition
|
||||||
|
idSpe <- sapply(1:dim(raw_counts)[2],function(i) strsplit(colnames(raw_counts)[i],"[.]")[[1]][1])
|
||||||
|
primates <- condstp[idSpe]
|
||||||
|
names(primates) <- colnames(raw_counts)
|
||||||
|
# Format
|
||||||
|
primates <- factor(primates)
|
||||||
|
colData <- data.frame(primates)
|
||||||
|
colData$primates <- factor(colData$primates)
|
||||||
|
colData$species <- sapply(1:length(rownames(colData)),function(i) strsplit(rownames(colData),"[.]")[[i]][1])
|
||||||
|
|
||||||
|
|
||||||
|
## for rodents only... (plus favorable pour philoebayes)
|
||||||
|
condName <- "rodents"
|
||||||
|
# Species condition (1 = non rodents, 2 = rodents)
|
||||||
|
condstp <- mammalsconds$rodents
|
||||||
|
names(condstp) <- mammalsconds$species
|
||||||
|
# Sample Condition
|
||||||
|
rodents <- condstp[idSpe]
|
||||||
|
names(rodents) <- colnames(raw_counts)
|
||||||
|
# Format
|
||||||
|
colData$rodents <- factor(rodents)
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
## Read and format Tree ## COPIE COLLE DE STERN CA MARCHE PO
|
||||||
|
################################################################################
|
||||||
|
library(ape)
|
||||||
|
## Get the tree
|
||||||
|
tree <- read.tree(file = here("data/data_TER/data_raw/mammals_tree_trimmed_hassler2020.tree"))
|
||||||
|
tree$node.label <- NULL
|
||||||
|
tree$tip.label
|
||||||
|
plot(tree)
|
||||||
|
|
||||||
|
## Species names
|
||||||
|
# Match
|
||||||
|
tree_data_cor <- match(tree$tip.label, colData$species)
|
||||||
|
data_tree_cor <- match(colData$species, tree$tip.label)
|
||||||
|
# Species in the tree NOT in data
|
||||||
|
tree$tip.label[is.na(tree_data_cor)]
|
||||||
|
# Species in data NOT in the tree
|
||||||
|
colData$species[is.na(data_tree_cor)]
|
||||||
|
|
||||||
|
## Format Tree
|
||||||
|
# Get rid of species not in data
|
||||||
|
tree <- drop.tip(tree, tip = tree$tip.label[is.na(tree_data_cor)])
|
||||||
|
plot(tree)
|
||||||
|
# function to add replicates
|
||||||
|
#createTreeWithReplicates <- function(tree){
|
||||||
|
tree_rep <- tree
|
||||||
|
for (tip_label in tree$tip.label) {
|
||||||
|
print(tip_label)
|
||||||
|
replis <- colnames(raw_counts_noNA)[grepl(tip_label, colnames(raw_counts_noNA))]
|
||||||
|
print(replis)
|
||||||
|
for (rep in replis) {
|
||||||
|
print(rep)
|
||||||
|
tree_rep <- phytools::bind.tip(tree_rep, tip.label = rep,
|
||||||
|
where = which(tree_rep$tip.label == tip_label))
|
||||||
|
plot(tree_rep)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# Remove original tips
|
||||||
|
# return(ape::drop.tip(tree_rep, tree$tip.label))
|
||||||
|
#}
|
||||||
|
tree_rep <- ape::drop.tip(tree_rep, tree$tip.label)
|
||||||
|
#tree_rep <- createTreeWithReplicates(tree)
|
||||||
|
# Plot
|
||||||
|
plot(tree_rep)
|
||||||
|
|
||||||
|
## Reorder data
|
||||||
|
corr <- match(tree_rep$tip.label, rownames(colData))
|
||||||
|
colData <- colData[corr, ]
|
||||||
|
raw_counts_noNA <- raw_counts_noNA[, corr]
|
||||||
|
length_noNA <- length_noNA[, corr]
|
||||||
|
|
||||||
|
## Remove if zero count for all replicates of a species
|
||||||
|
zero_counts <- apply(raw_counts_noNA, 1, function(x) any(tapply(t(x), colData$species, FUN = sum) == 0))
|
||||||
|
raw_counts_noNA <- raw_counts_noNA[!zero_counts, ]
|
||||||
|
length_noNA <- length_noNA[!zero_counts, ]
|
||||||
|
|
||||||
|
|
||||||
|
# # ################################################################################
|
||||||
|
# # ## Read and format Tree ## MODIF ANAELLE, MARCHE PO POUR CompcodeR mais j'ai pas compris pk
|
||||||
|
# # ################################################################################
|
||||||
|
# library(ape)
|
||||||
|
# ## Get the tree
|
||||||
|
# tree <- read.tree(file = here("data_raw/mammals_tree_trimmed_hassler2020.tree"))
|
||||||
|
# tree$node.label <- NULL
|
||||||
|
# tree$tip.label
|
||||||
|
# plot(tree)
|
||||||
|
#
|
||||||
|
# ## Arbre initial
|
||||||
|
# col <- colnames(raw_counts)
|
||||||
|
# col <- sub("[.][[:digit:]]+", "", col)
|
||||||
|
#
|
||||||
|
# tree_rep <- tree # Création de l'arbre avec les réplicats
|
||||||
|
#
|
||||||
|
# for (i in 1:length(tree$tip.label)){
|
||||||
|
# print(i)
|
||||||
|
# label <- tree$tip.label[i]
|
||||||
|
# print(label)
|
||||||
|
# nb_replicates <- length(which(col == label))
|
||||||
|
# print(nb_replicates)
|
||||||
|
# if(nb_replicates>1){ # Pour éviter de rentrer ds la boucle si pas besoin (et éviter une erreur)
|
||||||
|
# for (j in (nb_replicates-1):1){ # J'ai inversé l'ordre d'ajout des réplicats pour qu'ils soient ds le bon ordre ds l'arbre
|
||||||
|
# new_replicate = paste(label,j,sep = ".")
|
||||||
|
# tree_rep <- phytools::bind.tip(tree_rep,new_replicate,edge.length = NULL, which(tree_rep$tip.label==label),position=0)
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# }
|
||||||
|
# plot(tree_rep)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# ## Reorder data
|
||||||
|
# corr <- match(tree_rep$tip.label, rownames(colData))
|
||||||
|
# colData <- colData[corr, ]
|
||||||
|
# raw_counts_noNA <- raw_counts_noNA[, corr]
|
||||||
|
# length_noNA <- length_noNA[, corr]
|
||||||
|
#
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# Format for compcodeR
|
||||||
|
################################################################################
|
||||||
|
## rename key column "condition" in colData
|
||||||
|
colnames(colData)[grep(condName, colnames(colData))] <- "condition"
|
||||||
|
## id species
|
||||||
|
colData$id.species <- colData$species
|
||||||
|
## create object
|
||||||
|
cpd <- phyloCompData(tree = tree_rep,
|
||||||
|
count.matrix = raw_counts_noNA,
|
||||||
|
sample.annotations = colData,
|
||||||
|
info.parameters = list(dataset = "chen2019_rodents_cpd", uID = "1"),
|
||||||
|
length.matrix = as.matrix(length_noNA))
|
||||||
|
## check obj conformity
|
||||||
|
check_phyloCompData(cpd)
|
||||||
|
|
||||||
|
## save data to rds file
|
||||||
|
saveRDS(cpd, file = here("data/data_TER/data", "chen2019_rodents_cpd.rds"))
|
||||||
1202
data/data_TER/Analyse_Méthodes_compcodeR.Rmd
Normal file
1202
data/data_TER/Analyse_Méthodes_compcodeR.Rmd
Normal file
File diff suppressed because it is too large
Load diff
BIN
data/data_TER/data/chen2019_primates_cpd.rds
Normal file
BIN
data/data_TER/data/chen2019_primates_cpd.rds
Normal file
Binary file not shown.
BIN
data/data_TER/data/chen2019_rodents_cpd.rds
Normal file
BIN
data/data_TER/data/chen2019_rodents_cpd.rds
Normal file
Binary file not shown.
BIN
data/data_TER/data/chen2019_rodentssub_cpd.rds
Normal file
BIN
data/data_TER/data/chen2019_rodentssub_cpd.rds
Normal file
Binary file not shown.
10899
data/data_TER/data/rawCounts.txt
Normal file
10899
data/data_TER/data/rawCounts.txt
Normal file
File diff suppressed because it is too large
Load diff
10899
data/data_TER/data/rawEffLengths.txt
Normal file
10899
data/data_TER/data/rawEffLengths.txt
Normal file
File diff suppressed because it is too large
Load diff
10899
data/data_TER/data/rawFPKM.txt
Normal file
10899
data/data_TER/data/rawFPKM.txt
Normal file
File diff suppressed because it is too large
Load diff
10899
data/data_TER/data/rawLengths.txt
Normal file
10899
data/data_TER/data/rawLengths.txt
Normal file
File diff suppressed because it is too large
Load diff
10899
data/data_TER/data/rawTPM.txt
Normal file
10899
data/data_TER/data/rawTPM.txt
Normal file
File diff suppressed because it is too large
Load diff
11
data/data_TER/data/updateroundmatrix.R
Normal file
11
data/data_TER/data/updateroundmatrix.R
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
tab <- read.table("rawCounts.txt")
|
||||||
|
head(tab)
|
||||||
|
tab2 <- tab[,-c(1,2)]
|
||||||
|
head(tab2)
|
||||||
|
dim(tab2)
|
||||||
|
tab3 <- tab2[complete.cases(tab2),]
|
||||||
|
head(tab3)
|
||||||
|
tab4 <- round(tab3)
|
||||||
|
head(tab4)
|
||||||
|
write.csv(tab4,file="rounded_matrix.csv")
|
||||||
|
getwd()
|
||||||
BIN
data/data_TER/data_raw/._speciesgroups.txt
Normal file
BIN
data/data_TER/data_raw/._speciesgroups.txt
Normal file
Binary file not shown.
1
data/data_TER/data_raw/mammals_tree_chen2019.tree
Normal file
1
data/data_TER/data_raw/mammals_tree_chen2019.tree
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
(opossum:0.3489,(armadillo:0.1678,((cow:0.236,(dog:0.0894,ferret:0.1124):0.0816):0.0354,((rabbit:0.215,(rat:0.0915,(musCaroli:0.029,(musSpretus:0.0125,musMusculus:0.0125):0.0165):0.0567):0.2744):0.013,(marmoset:0.0683,(rhesus:0.0379,(orangutan:0.0185,(gorilla:0.0088,(human:0.0064,(chimp:0.0027,bonobo:0.0027):0.0037):0.0021):0.0094):0.0146):0.0208):0.0879):0.0212):0.0225):0.2446);
|
||||||
1
data/data_TER/data_raw/mammals_tree_hassler2020.tree
Normal file
1
data/data_TER/data_raw/mammals_tree_hassler2020.tree
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1 @@
|
||||||
|
(opossum:147.1,(armadillo:98.7,((cow:85.4,(ferret:55.9,dog:55.9):29.5):10.8,((rabbit:89,(rat:29.4,(musCaroli:4.1,musMusculus:4.1,musSpretus:4.1):25.3):59.6):2.9,(marmoset:52.4,(rhesus:35.3,(orangutan:18.4,(gorilla:11.5,(human:8.8,(bonobo:4.5,chimp:4.5):4.3):2.7):6.9):16.9):17.1):39.5):4.3):2.5):48.4);
|
||||||
18
data/data_TER/data_raw/speciesgroups.txt
Normal file
18
data/data_TER/data_raw/speciesgroups.txt
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
nb.spcies species species.id primates rodents laurasians lagomorphs
|
||||||
|
1 human ENSG000 2 1 1 NA
|
||||||
|
2 cow ENSBTAG 1 1 2 NA
|
||||||
|
3 marmoset ENSCJAG 1 1 1 NA
|
||||||
|
4 dog ENSCAFG 1 1 2 NA
|
||||||
|
5 armadillo ENSDNOG 1 1 2 NA
|
||||||
|
6 gorilla ENSGGOG 2 1 1 NA
|
||||||
|
7 musMusculus ENSMUSG 1 2 1 NA
|
||||||
|
8 musCaroli ENSMUSG 1 2 1 NA
|
||||||
|
9 musSpretus ENSMUSG 1 2 1 NA
|
||||||
|
10 opossum ENSMODG 1 1 NA NA
|
||||||
|
11 ferret ENSMPUG 1 1 NA NA
|
||||||
|
12 rabbit ENSOCUG 1 1 1 NA
|
||||||
|
13 bonobo ENSPTRG 2 1 1 NA
|
||||||
|
14 chimp ENSPTRG 2 1 1 NA
|
||||||
|
15 orangutan ENSPPYG 2 1 1 NA
|
||||||
|
16 rhesus ENSMMUG 2 1 1 NA
|
||||||
|
17 rat ENSRNOG 1 2 1 NA
|
||||||
Loading…
Add table
Reference in a new issue