All analysis updates

This commit is contained in:
Louis Lacoste 2024-06-17 10:49:53 +02:00
parent af625836b6
commit 395a7beaed
22 changed files with 2304 additions and 548 deletions

View file

@ -18,6 +18,12 @@ library(latex2exp)
list_collection <- readRDS("{{clustering}}")
unlisted_best_partition <- unlist(extract_best_bipartite_partition(list_collection))
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
```
Le clustering donne le critère suivant : $BICL = `r BICL`$
```{r}
graph_size_df <- build_graph_size_dataframe(unlisted_best_partition)
summarized_graph_size_df <- graph_size_df %>%
group_by(collection_id) %>%

File diff suppressed because one or more lines are too long

View file

@ -1,6 +1,6 @@
---
format: html
title: Clustering avec `colSBM` des données
title: Clustering avec `colSBM` des données Doré et al.
execute:
echo: false
warning: false
@ -12,13 +12,20 @@ execute:
library(colSBM)
library(here)
library(stringr)
library(tidyr)
library(dplyr)
library(aricode)
library(reshape2)
library(ggplot2)
library(ggalluvial)
```
```{r}
root_app_folder <- file.path(here(), "code", "applications")
source(file.path(root_app_folder, "utils.R"))
data_folder <- file.path(here(), "code", "results", "applications", "dore")
files_vec <- get_recent_files(data_folder, n = 16)
files_vec <- get_recent_files(data_folder, n = 16L)
files_vec <- identify_models(files_vec)
list_clustering <- files_vec
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
@ -26,6 +33,22 @@ list_clustering <- files_vec
## Analyse par modèle
Les clustering donne le critère suivant :
```{r}
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- unlist(
extract_best_bipartite_partition(list_collection)
)
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
BICL
})
knitr::kable(vec_bicl,
caption = "BICL par modèle", col.names = "$BICL$",
row.names = TRUE
)
```
:::{.panel-tabset}
```{r write_tabs}
@ -43,4 +66,97 @@ for (clustering_idx in seq_len(length(list_clustering))) {
}
```
:::
:::
## Comparaison des modèles
### Table des clustering
```{r}
clustering_df <- sapply(seq_len(length(list_clustering)), function(idx) {
extract_clustering(readRDS(list_clustering[[idx]]))
})
clust_to_compare_order <- crossing(
f = seq_len(ncol(clustering_df)),
s = seq_len(ncol(clustering_df))
) %>%
filter(!(f == s) & f < s)
clustering_partitions_df <- do.call("rbind", lapply(seq_len(length(list_clustering)), function(idx) {
clust_vec <- extract_clustering(readRDS(list_clustering[[idx]]))
net_id <- names(clust_vec)
clust_vec <- unname(clust_vec)
clust_idx <- idx
data.frame(net_id = net_id, cluster = clust_vec, clustering_id = clust_idx)
}))
clustering_partitions_df <- clustering_partitions_df %>%
# mutate(net_id = str_trunc(net_id, width = 20L)) %>%
mutate_all(as.factor)
```
```{r ARI}
ari_dist_matrix <- outer(
X = seq_len(ncol(clustering_df)),
Y = seq_len(ncol(clustering_df)),
Vectorize(function(x, y) {
f_clust <- clustering_df[, x]
s_clust <- clustering_df[, y]
ARI(f_clust, s_clust)
})
)
ggplot(ari_dist_matrix %>% melt()) +
aes(x = Var1, y = Var2) +
geom_raster(aes(fill = value)) +
scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
scale_x_reverse() +
geom_text(aes(label = round(value, digits = 4))) +
labs(
x = "id clusterings",
y = "id clusterings", title = "ARI entre les clusterings"
) +
theme_bw()
```
```{r table cooccurrence}
list_tables_coocc <- lapply(seq_len(nrow(clust_to_compare_order)), function(row_id) {
f <- clust_to_compare_order[[row_id, 1]]
s <- clust_to_compare_order[[row_id, 2]]
table(clustering_df[, f], clustering_df[, s])
})
```
```{r function plot tables}
plot_table <- function(data, f = NULL, s = NULL) {
melted_data <- data %>%
melt() %>%
mutate(
Var1 = as.factor(Var1),
Var2 = as.factor(Var2)
)
ggplot(melted_data) +
aes(y = Var1, x = Var2) +
geom_raster(aes(fill = value)) +
scale_fill_gradient(low = "white", high = "red", limits = c(0, NA)) +
scale_x_discrete() +
geom_text(data = subset(melted_data, value != 0), aes(label = value)) +
labs(
x = paste0("numéros clusters pour clustering : ", s),
y = paste0("numéros clusters pour clustering : ", f), title = paste0("Co-occurence entre les clusterings ", f, " et ", s)
) +
theme_bw()
}
```
```{r tables coocc plot}
#| output: asis
for (id in seq_len(length(list_tables_coocc))) {
print(plot_table(list_tables_coocc,
f = clust_to_compare_order[[id, 1]],
s = clust_to_compare_order[[id, 2]]
))
}
```

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,149 @@
---
format: html
title: Clustering avec `colSBM` des données Doré et al. seulement sur les larges réseaux
execute:
echo: false
warning: false
---
# Analyse
```{r}
library(colSBM)
library(here)
library(dplyr)
library(stringr)
```
```{r}
root_app_folder <- file.path(here(), "code", "applications")
source(file.path(root_app_folder, "utils.R"))
data_folder <- file.path(here(), "code", "results", "applications", "dore_no_small")
# Full data
list_full_files <- file.path(
data_folder,
list.files(data_folder,
pattern = "full"
)
)
names(list_full_files) <- stringr::str_extract(
string = list_full_files,
pattern = "(iid|pirho|pi|rho)"
)
lapply(seq_len(length(list_full_files)), function(idx) {
min_size <- readRDS(list_full_files[[idx]])[[1]]
clustering <- readRDS(list_full_files[[idx]])[[2]]
min_filepath <- gsub("full_", "min_size_", list_full_files[[idx]])
clustering_filepath <- gsub("full_", "clustering_", list_full_files[[idx]])
saveRDS(min_size, min_filepath)
saveRDS(clustering, clustering_filepath)
})
```
```{r}
#| output: false
files_vec <- get_recent_files(data_folder, pattern = "clustering")
min_vec <- get_recent_files(data_folder, pattern = "min")
files_vec <- identify_models(files_vec)
min_vec <- identify_models(min_vec)
# Importing data
base_data_folder <- file.path(here(), "code", "data", "dore")
interaction_data <- read.table(file = file.path(base_data_folder, "interaction-data.txt"), sep = "\t", header = TRUE)
seq_ids_network_aggreg <- unique(interaction_data$id_network_aggreg)
names_aggreg_networks <- sapply(
seq_ids_network_aggreg,
function(id) {
paste0(
unique(interaction_data[which(interaction_data$id_network_aggreg == id), ]$web),
collapse = "+"
)
}
)
# Computation of incidence matrices
incidence_matrices <- lapply(
seq_ids_network_aggreg,
function(m) {
current_interaction_data <- interaction_data[which(interaction_data$id_network_aggreg == m), ] %>%
mutate(
plantaggreg = paste(plantorder,
plantfamily, plantgenus, plantspecies,
sep = "-"
),
insectaggreg = paste(insectorder,
insectfamily, insectgenus, insectspecies,
sep = "-"
)
)
current_interaction_data <- table(current_interaction_data$plantaggreg, current_interaction_data$insectaggreg)
current_incidence_matrix <- matrix(current_interaction_data,
ncol = ncol(current_interaction_data), dimnames = dimnames(current_interaction_data)
)
current_incidence_matrix[which(current_incidence_matrix > 0)] <- 1
return(current_incidence_matrix)
}
)
names(incidence_matrices) <- names_aggreg_networks
min_sizes <- lapply(min_vec, function(file) readRDS(file))
nb_networks <- sapply(min_sizes, function(min_size) {
length(incidence_matrices[
(colSums(sapply(incidence_matrices, dim) > min_size) == 2L)
])
})
min_sizes_df <- do.call(rbind.data.frame, min_sizes)
min_sizes_df <- cbind(min_sizes_df, nb_networks)
colnames(min_sizes_df) <- c("nr >", "nc >", "nb_networks")
list_clustering <- files_vec
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
```
Les données ont été filtrées pour retenir les réseaux de taille supérieure à
celles du tableau ci-dessous, aboutissant au nombre de réseaux de la colonne de
droite :
`r knitr::kable(min_sizes_df, caption = "Table des tailles minimales retenues par modèle")`
## Analyse par modèle
Les clustering donne le critère suivant :
```{r}
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- unlist(
extract_best_bipartite_partition(list_collection)
)
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
BICL
})
knitr::kable(vec_bicl, caption = "BICL par modèle", col.names = "$BICL$", row.names = TRUE)
```
:::{.panel-tabset}
```{r write_tabs}
#| warning: false
#| output: asis
# Generate content for each model using knit_expand
for (clustering_idx in seq_len(length(list_clustering))) {
clustering <- list_clustering[clustering_idx]
model <- names(clustering)
expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
cat(res, sep = "\n")
cat("\n")
}
```
:::

View file

@ -0,0 +1,156 @@
---
format: html
title: Clustering avec `colSBM` des données Doré et al. seulement sur les larges réseaux
execute:
echo: false
warning: false
---
# Analyse
```{r}
library(colSBM)
library(here)
library(dplyr)
library(stringr)
```
```{r}
root_app_folder <- file.path(here(), "code", "applications")
source(file.path(root_app_folder, "utils.R"))
data_folder <- file.path(here(), "code", "results", "applications", "dore_no_small")
# Full data
list_full_files <- file.path(
data_folder,
list.files(data_folder,
pattern = "full"
)
)
names(list_full_files) <- stringr::str_extract(
string = list_full_files,
pattern = "(iid|pirho|pi|rho)"
)
lapply(seq_len(length(list_full_files)), function(idx) {
min_size <- readRDS(list_full_files[[idx]])[[1]]
clustering <- readRDS(list_full_files[[idx]])[[2]]
min_filepath <- gsub("full_", "min_size_", list_full_files[[idx]])
clustering_filepath <- gsub("full_", "clustering_", list_full_files[[idx]])
saveRDS(min_size, min_filepath)
saveRDS(clustering, clustering_filepath)
})
```
```{r}
#| output: false
files_vec <- get_recent_files(data_folder, pattern = "clustering")
min_vec <- get_recent_files(data_folder, pattern = "min")
files_vec <- identify_models(files_vec)
min_vec <- identify_models(min_vec)
# Importing data
base_data_folder <- file.path(here(), "code", "data", "dore")
interaction_data <- read.table(file = file.path(base_data_folder, "interaction-data.txt"), sep = "\t", header = TRUE)
seq_ids_network_aggreg <- unique(interaction_data$id_network_aggreg)
names_aggreg_networks <- sapply(
seq_ids_network_aggreg,
function(id) {
paste0(
unique(interaction_data[which(interaction_data$id_network_aggreg == id), ]$web),
collapse = "+"
)
}
)
# Computation of incidence matrices
incidence_matrices <- lapply(
seq_ids_network_aggreg,
function(m) {
current_interaction_data <- interaction_data[which(interaction_data$id_network_aggreg == m), ] %>%
mutate(
plantaggreg = paste(plantorder,
plantfamily, plantgenus, plantspecies,
sep = "-"
),
insectaggreg = paste(insectorder,
insectfamily, insectgenus, insectspecies,
sep = "-"
)
)
current_interaction_data <- table(current_interaction_data$plantaggreg, current_interaction_data$insectaggreg)
current_incidence_matrix <- matrix(current_interaction_data,
ncol = ncol(current_interaction_data), dimnames = dimnames(current_interaction_data)
)
current_incidence_matrix[which(current_incidence_matrix > 0)] <- 1
return(current_incidence_matrix)
}
)
names(incidence_matrices) <- names_aggreg_networks
min_sizes <- lapply(min_vec, function(file) readRDS(file))
nb_networks <- sapply(min_sizes, function(min_size) {
length(incidence_matrices[
(colSums(sapply(incidence_matrices, dim) > min_size) == 2L)
])
})
min_sizes_df <- do.call(rbind.data.frame, min_sizes)
min_sizes_df <- cbind(min_sizes_df, nb_networks)
colnames(min_sizes_df) <- c("nr >", "nc >", "nb_networks")
list_clustering <- files_vec
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
```
Les données ont été filtrées pour retenir les réseaux de taille supérieure à
celles du tableau ci-dessous, aboutissant au nombre de réseaux de la colonne de
droite :
`r knitr::kable(min_sizes_df, caption = "Table des tailles minimales retenues par modèle")`
## Analyse par modèle
Les clustering donne le critère suivant :
```{r}
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- unlist(
extract_best_bipartite_partition(list_collection)
)
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
BICL
})
knitr::kable(vec_bicl, caption = "BICL par modèle", col.names = "$BICL$", row.names = TRUE)
```
:::{.panel-tabset}
```{r write_tabs}
#| warning: false
#| output: asis
# Generate content for each model using knit_expand
for (clustering_idx in seq_len(length(list_clustering))) {
clustering <- list_clustering[clustering_idx]
model <- names(clustering)
expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
cat(res, sep = "\n")
cat("\n")
}
```
:::

File diff suppressed because one or more lines are too long

View file

@ -1,115 +1,59 @@
---
title: Analysis of herbivores data
format: html
title: Clustering avec `colSBM` des données herbivores
execute:
echo: false
freeze: auto
warning: false
---
```{r import_data}
# Analyse
```{r}
library(colSBM)
library(here)
library(stringr)
```
```{r}
root_app_folder <- file.path(here(), "code", "applications")
source(file.path(root_app_folder, "utils.R"))
data_folder <- file.path(here(), "code", "results", "applications", "herbivores")
files_vec <- get_recent_files(data_folder)
filepath <- files_vec[1]
list_collection <- readRDS(filepath)
files_vec <- identify_models(files_vec)
list_clustering <- files_vec
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
```
```{r libs_and_data}
library(here)
## Analyse par modèle
library(colSBM)
library(dplyr)
library(tidyr)
library(plotly)
library(ggplot2)
library(ggforce)
library(ggrepel)
library(latex2exp)
```
```{r dataformatting}
unlisted_best_partition <- unlist(extract_best_bipartite_partition(list_collection))
graph_size_df <- build_graph_size_dataframe(unlisted_best_partition)
summarized_graph_size_df <- graph_size_df %>%
group_by(collection_id) %>%
select(-net_id) %>%
summarise(
M = n(),
nr_mean = mean(nr),
nr_sd = ifelse(length(nr) > 1, sd(nr), 0),
nc_mean = mean(nc),
nc_sd = ifelse(length(nc) > 1, sd(nc), 0),
Qr = first(Qr),
Qc = first(Qc)
Les clustering donne le critère suivant :
```{r}
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- unlist(
extract_best_bipartite_partition(list_collection)
)
```
### Analyse des tailles de cluster
```{r collection_blocks_plot_size}
#| layout-ncol: 2
#| plotly: true
filtered_summarized_graph_size_df <- summarized_graph_size_df
ggplot(filtered_summarized_graph_size_df) +
aes(x = Qc, y = Qr, color = collection_id, label = M) +
geom_point() + # , position = position_jitter(h = 0.15, w = 0.05)
geom_text_repel() +
xlab(TeX("$Q_c$")) +
ylab(TeX("$Q_r$")) +
coord_fixed()
plot_ly(x = filtered_summarized_graph_size_df$Qc,
y = filtered_summarized_graph_size_df$Qr,
z = filtered_summarized_graph_size_df$M,
type = "scatter3d", mode = "markers",
color = filtered_summarized_graph_size_df$collection_id) %>%
layout(scene = list(xaxis = list(title = "Q_c"),
yaxis = list(title = "Q_r")))
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
BICL
})
knitr::kable(vec_bicl, caption = "BICL par modèle", col.names = "$BICL$", row.names = TRUE)
```
### Analyse des tailles de réseaux
:::{.panel-tabset}
```{r size_plot}
#| layout-ncol: 2
ggplot(filtered_summarized_graph_size_df) +
aes(x = nc_mean, y = nr_mean, color = collection_id) +
geom_point(size = 4) + # , position = position_jitter(h = 0.15, w = 0.05)
geom_ellipse(aes(
x0 = nc_mean, y0 = nr_mean,
a = 0.5 * nc_sd,
b = 0.5 * nr_sd,
fill = collection_id, angle = 0
), alpha = 0.1) +
geom_text_repel(aes(label = collection_id)) +
xlab(TeX("$n_c$")) +
ylab(TeX("$n_r$")) +
coord_fixed() +
ggtitle("Distribution des tailles moyennes et écart-types")
```{r write_tabs}
#| warning: false
#| output: asis
max_nr <- 200
max_nc <- 200
ggplot(filtered_summarized_graph_size_df %>% filter(nr_mean <= max_nr, nc_mean <= max_nc)) +
aes(x = nc_mean, y = nr_mean, color = collection_id) +
geom_point(size = 4) + # , position = position_jitter(h = 0.15, w = 0.05)
geom_ellipse(aes(
x0 = nc_mean, y0 = nr_mean,
a = 0.5 * nc_sd,
b = 0.5 * nr_sd,
fill = collection_id, angle = 0
), alpha = 0.1) +
geom_text_repel(aes(label = collection_id)) +
xlab(TeX("$n_c$")) +
ylab(TeX("$n_r$")) +
coord_fixed() +
ggtitle(TeX(paste0("Distribution des tailles moyennes et écart-types,", "$n_r <=", max_nr, "$", " et ", "$n_c <=", max_nc, "$")))
filtered_graph_size <- graph_size_df %>% filter(M > 1)
ggplot(filtered_graph_size) +
aes(x = collection_id, y = nr, fill = collection_id) +
geom_boxplot()
ggplot(filtered_graph_size) +
aes(x = collection_id, y = nc, fill = collection_id) +
geom_boxplot()
# Generate content for each model using knit_expand
for (clustering_idx in seq_len(length(list_clustering))) {
clustering <- list_clustering[clustering_idx]
model <- names(clustering)
expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
cat(res, sep = "\n")
cat("\n")
}
```
:::

View file

@ -7,12 +7,12 @@
#' warning if the number wanted `n` is larger than the number of files.
#'
#' @return A vector of size `n` with the file path.
get_recent_files <- function(data_folder, n = 4) {
get_recent_files <- function(data_folder, n = 4, pattern = NULL) {
files_info <- file.info(file.path(data_folder, list.files(data_folder,
include.dirs = FALSE, pattern = "*dore_collection_[a-z]*_seed"
include.dirs = FALSE, pattern = pattern
)))
files_info[["filepath"]] <- file.path(data_folder, list.files(data_folder,
include.dirs = FALSE, pattern = "*dore_collection_[a-z]*_seed"
include.dirs = FALSE, pattern = pattern
))
files_info <- sort_by(files_info, files_info[["ctime"]], decreasing = TRUE)
if (n > nrow(files_info)) {
@ -48,3 +48,15 @@ build_graph_size_dataframe <- function(collection_list) {
)
}))
}
extract_clustering <- function(clustering) {
partition <- colSBM::extract_best_bipartite_partition(
l = clustering,
unnest = TRUE
)
unlist(sapply(seq_len(length(partition)), function(idx) {
clust_vec <- rep(idx, partition[[idx]][["M"]])
names(clust_vec) <- partition[[idx]][["net_id"]]
clust_vec
}))
}