analysis : a whole lot of things

This commit is contained in:
Louis Lacoste 2024-07-08 15:47:04 +02:00
parent 129a5295df
commit 44bda5e683
11 changed files with 24253 additions and 5547 deletions

File diff suppressed because it is too large Load diff

View file

@ -12,13 +12,25 @@ library(plotly)
library(ggplot2)
library(ggforce)
library(ggrepel)
# library(ggtree) # devtools::install_github("YuLab-SMU/ggtree")
library(ggdendro)
library(phylogram)
library(plotly)
library(latex2exp)
```
```{r}
list_collection <- readRDS("{{clustering}}")
unlisted_best_partition <- unlist( extract_best_partition(list_collection))
unlisted_best_partition <- extract_best_partition(list_collection)
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
if (is.list(unlisted_best_partition)) {
BICL <- sum(sapply(
unlisted_best_partition,
function(col) col[["BICL"]]
))
} else {
BICL <- unlisted_best_partition[["BICL"]]
unlisted_best_partition <- list(unlisted_best_partition)
}
```
Le clustering donne le critère suivant : $BICL = `r BICL`$
@ -40,6 +52,14 @@ summarized_graph_size_df <- graph_size_df %>%
```
:::{.panel-tabset}
#### Parcours du clustering
```{r}
parcours_tree <- extract_clustering_tree(list_collection)
dendro_parcours_tree <- phylogram::as.dendrogram(parcours_tree)
p <- ggdendrogram(dendro_parcours_tree)
plotly::ggplotly(p)
```
#### Réseaux par cluster
```{r}
#| output: asis
@ -142,7 +162,7 @@ for (i in seq_len(length(unlisted_best_partition))) {
p <- plot(
current_col,
type = "meso",
values = FALSE, mixture = TRUE
values = F, mixture = F
) +
ggtitle(paste0("Structure\ncollection ", i))
print(p)

File diff suppressed because one or more lines are too long

View file

@ -40,13 +40,26 @@ Les clustering donne le critère suivant :
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- unlist(
extract_best_partition(list_collection)
extract_best_partition(list_collection)
)
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
if (is.list(unlisted_best_partition)) {
BICL <- sum(sapply(
unlisted_best_partition,
function(col) col[["BICL"]]
))
} else {
BICL <- unlisted_best_partition[["BICL"]]
}
BICL
})
names(vec_bicl) <- names(files_vec)
col_order <- order(vec_bicl, decreasing = TRUE)
vec_bicl <- sort(vec_bicl, decreasing = TRUE)
list_clustering <- list_clustering[col_order]
knitr::kable(vec_bicl,
caption = "BICL par modèle", col.names = "$BICL$",
row.names = TRUE

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,186 @@
---
format:
html:
embed-resources: true
title: Clustering avec `colSBM` des données Baldock et al.
execute:
echo: false
warning: false
---
# Analyse
```{r}
library(colSBM)
library(here)
library(stringr)
library(tidyr)
library(dplyr)
library(aricode)
library(reshape2)
library(ggplot2)
```
```{r}
root_app_folder <- file.path(here(), "code", "applications")
source(file.path(root_app_folder, "utils.R"))
data_folder <- file.path(here(), "code", "results", "applications", "sub-dore", "baldock")
files_vec <- get_recent_files(data_folder, n = 16L)
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,4}")
names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
list_clustering <- files_vec
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
```
## Analyse par modèle
Les clustering donne le critère suivant :
```{r}
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- extract_best_partition(list_collection)
if (is.list(unlisted_best_partition)) {
BICL <- sum(sapply(
unlisted_best_partition,
function(col) col[["BICL"]]
))
} else {
BICL <- unlisted_best_partition[["BICL"]]
}
BICL
})
names(vec_bicl) <- names(files_vec)
col_order <- order(vec_bicl, decreasing = TRUE)
vec_bicl <- sort(vec_bicl, decreasing = TRUE)
list_clustering <- list_clustering[col_order]
knitr::kable(vec_bicl,
caption = "BICL par modèle", col.names = "$BICL$",
row.names = TRUE
)
```
:::{.panel-tabset}
```{r write_tabs}
#| warning: false
#| output: asis
# Generate content for each model using knit_expand
for (clustering_idx in seq_len(length(list_clustering))) {
clustering <- list_clustering[clustering_idx]
model <- names(clustering)
expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
cat(res, sep = "\n")
cat("\n")
}
```
:::
## Comparaison des modèles
### Table des clustering
```{r}
clustering_df <- sapply(seq_len(length(list_clustering)), function(idx) {
clust <- readRDS(list_clustering[[idx]])
extract_clustering(clust)
})
clust_to_compare_order <- crossing(
f = seq_len(ncol(clustering_df)),
s = seq_len(ncol(clustering_df))
) %>%
filter(!(f == s) & f < s)
clustering_partitions_df <- do.call("rbind", lapply(seq_len(length(list_clustering)), function(idx) {
clust <- readRDS(list_clustering[[idx]])
clust_vec <- extract_clustering(clust)
net_id <- names(clust_vec)
clust_vec <- unname(clust_vec)
clust_idx <- idx
data.frame(net_id = net_id, cluster = clust_vec, clustering_id = clust_idx)
}))
clustering_partitions_df <- clustering_partitions_df %>%
# mutate(net_id = str_trunc(net_id, width = 20L)) %>%
mutate_all(as.factor)
```
```{r ARI}
ari_dist_matrix <- outer(
X = seq_len(ncol(clustering_df)),
Y = seq_len(ncol(clustering_df)),
Vectorize(function(x, y) {
f_clust <- clustering_df[, x]
s_clust <- clustering_df[, y]
ARI(f_clust, s_clust)
})
)
ari_plot_data <- ari_dist_matrix %>%
melt() %>%
mutate(Var1 = names(list_clustering)[Var1], Var2 = names(list_clustering)[Var2])
ggplot(ari_plot_data) +
aes(x = Var1, y = Var2) +
geom_raster(aes(fill = value)) +
scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
geom_text(aes(label = round(value, digits = 2))) +
labs(
x = "id clusterings",
y = "id clusterings", title = "ARI entre les clusterings"
) +
theme_bw()
```
<details>
<summary> Tables de cooccurrence </summary>
```{r table cooccurrence}
list_tables_coocc <- lapply(seq_len(nrow(clust_to_compare_order)), function(row_id) {
f <- clust_to_compare_order[[row_id, 1]]
s <- clust_to_compare_order[[row_id, 2]]
table(clustering_df[, f], clustering_df[, s])
})
```
```{r function plot tables}
plot_table <- function(data, f = NULL, s = NULL) {
melted_data <- data %>%
melt() %>%
mutate(
Var1 = as.factor(Var1),
Var2 = as.factor(Var2)
)
ggplot(melted_data) +
aes(y = Var1, x = Var2) +
geom_raster(aes(fill = value)) +
scale_fill_gradient(low = "white", high = "red", limits = c(0, NA)) +
scale_x_discrete() +
geom_text(data = subset(melted_data, value != 0), aes(label = value)) +
labs(
x = paste0("numéros clusters pour clustering : ", s),
y = paste0("numéros clusters pour clustering : ", f), title = paste0("Co-occurence entre les clusterings ", f, " et ", s)
) +
theme_bw()
}
```
```{r tables coocc plot}
#| output: asis
for (id in seq_len(length(list_tables_coocc))) {
print(plot_table(list_tables_coocc[[id]],
f = names(list_clustering)[clust_to_compare_order[[id, 1]]],
s = names(list_clustering)[clust_to_compare_order[[id, 2]]]
))
}
```
</details>

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,185 @@
---
format:
html:
embed-resources: true
title: Clustering avec `colSBM` des données de sous-collections.
execute:
echo: false
warning: false
---
# Analyse
```{r}
library(colSBM)
library(here)
library(stringr)
library(tidyr)
library(dplyr)
library(aricode)
library(reshape2)
library(ggplot2)
```
```{r}
root_app_folder <- file.path(here(), "code", "applications")
source(file.path(root_app_folder, "utils.R"))
data_folder <- file.path(here(), "code", "results", "applications", "sub-dore", "souscollection")
files_vec <- get_recent_files(data_folder, n = 16L)
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,4}")
names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
list_clustering <- files_vec
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
```
## Analyse par modèle
Les clustering donne le critère suivant :
```{r}
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- extract_best_partition(list_collection)
if (is.list(unlisted_best_partition)) {
BICL <- sum(sapply(
unlisted_best_partition,
function(col) col[["BICL"]]
))
} else {
BICL <- unlisted_best_partition[["BICL"]]
}
BICL
})
names(vec_bicl) <- names(files_vec)
col_order <- order(vec_bicl, decreasing = TRUE)
vec_bicl <- sort(vec_bicl, decreasing = TRUE)
list_clustering <- list_clustering[col_order]
knitr::kable(vec_bicl,
caption = "BICL par modèle", col.names = "$BICL$",
row.names = TRUE
)
```
:::{.panel-tabset}
```{r write_tabs}
#| warning: false
#| output: asis
# Generate content for each model using knit_expand
for (clustering_idx in seq_len(length(list_clustering))) {
clustering <- list_clustering[clustering_idx]
model <- names(clustering)
expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
cat(res, sep = "\n")
cat("\n")
}
```
:::
## Comparaison des pénalités
## Comparaison des modèles
### Table des clustering
```{r}
clustering_df <- sapply(seq_len(length(list_clustering)), function(idx) {
clust <- readRDS(list_clustering[[idx]])
extract_clustering(clust)
})
clust_to_compare_order <- crossing(
f = seq_len(ncol(clustering_df)),
s = seq_len(ncol(clustering_df))
) %>%
filter(!(f == s) & f < s)
clustering_partitions_df <- do.call("rbind", lapply(seq_len(length(list_clustering)), function(idx) {
clust <- readRDS(list_clustering[[idx]])
clust_vec <- extract_clustering(clust)
net_id <- names(clust_vec)
clust_vec <- unname(clust_vec)
clust_idx <- idx
data.frame(net_id = net_id, cluster = clust_vec, clustering_id = clust_idx)
}))
clustering_partitions_df <- clustering_partitions_df %>%
# mutate(net_id = str_trunc(net_id, width = 20L)) %>%
mutate_all(as.factor)
```
```{r ARI}
ari_dist_matrix <- outer(
X = seq_len(ncol(clustering_df)),
Y = seq_len(ncol(clustering_df)),
Vectorize(function(x, y) {
f_clust <- clustering_df[, x]
s_clust <- clustering_df[, y]
ARI(f_clust, s_clust)
})
)
ari_plot_data <- ari_dist_matrix %>%
melt() %>%
mutate(Var1 = names(list_clustering)[Var1], Var2 = names(list_clustering)[Var2])
ggplot(ari_plot_data) +
aes(x = Var1, y = Var2) +
geom_raster(aes(fill = value)) +
scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
geom_text(aes(label = round(value, digits = 2))) +
labs(
x = "id clusterings",
y = "id clusterings", title = "ARI entre les clusterings"
) +
theme_bw()
```
```{r table cooccurrence}
list_tables_coocc <- lapply(seq_len(nrow(clust_to_compare_order)), function(row_id) {
f <- clust_to_compare_order[[row_id, 1]]
s <- clust_to_compare_order[[row_id, 2]]
table(clustering_df[, f], clustering_df[, s])
})
```
```{r function plot tables}
plot_table <- function(data, f = NULL, s = NULL) {
melted_data <- data %>%
melt() %>%
mutate(
Var1 = as.factor(Var1),
Var2 = as.factor(Var2)
)
ggplot(melted_data) +
aes(y = Var1, x = Var2) +
geom_raster(aes(fill = value)) +
scale_fill_gradient(low = "white", high = "red", limits = c(0, NA)) +
scale_x_discrete() +
geom_text(data = subset(melted_data, value != 0), aes(label = value)) +
labs(
x = paste0("numéros clusters pour clustering : ", s),
y = paste0("numéros clusters pour clustering : ", f), title = paste0("Co-occurence entre les clusterings ", f, " et ", s)
) +
theme_bw()
}
```
```{r tables coocc plot}
#| output: asis
for (id in seq_len(length(list_tables_coocc))) {
print(plot_table(list_tables_coocc[[id]],
f = names(list_clustering)[clust_to_compare_order[[id, 1]]],
s = names(list_clustering)[clust_to_compare_order[[id, 2]]]
))
}
```

View file

@ -35,6 +35,17 @@ identify_models <- function(files_vec, pattern = "(iid|pirho|pi|rho)") {
}
build_graph_size_dataframe <- function(collection_list) {
if (!is.list(collection_list)) {
return(data.frame(
collection_id = factor(1L),
M = collection_list[["M"]],
net_id = factor(collection_list[["net_id"]]),
nr = collection_list[["n"]][[1]],
nc = collection_list[["n"]][[2]],
Qr = collection_list[["Q"]][[1]],
Qc = collection_list[["Q"]][[2]]
))
}
do.call("rbind", lapply(seq_len(length(collection_list)), function(idx) {
collection <- collection_list[[idx]]
data.frame(
@ -54,9 +65,17 @@ extract_clustering <- function(clustering) {
l = clustering,
unnest = TRUE
)
unlist(sapply(seq_len(length(partition)), function(idx) {
if (!is.list(partition)) {
partition <- list(partition)
}
out <- unlist(sapply(seq_len(length(partition)), function(idx) {
clust_vec <- rep(idx, partition[[idx]][["M"]])
names(clust_vec) <- partition[[idx]][["net_id"]]
clust_vec
}))
if (is.matrix(out)) {
nm <- rownames(out)
out <- setNames(c(out), nm)
}
return(out)
}

View file

@ -28,7 +28,7 @@ root_app_folder <- file.path(here(), "code", "applications")
source(file.path(root_app_folder, "utils.R"))
data_folder <- file.path(here(), "code", "results", "simulations", "clustering", "synthetic")
files_vec <- get_recent_files(data_folder, n = 16L, pattern = ".Rds")
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)")
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,6}")
names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
list_clustering <- files_vec
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
@ -41,7 +41,7 @@ Les clustering donne le critère suivant :
vec_bicl <- sapply(list_clustering, function(clustering) {
list_collection <- readRDS(clustering)
unlisted_best_partition <- unlist(
extract_best_partition(list_collection)
extract_best_partition(list_collection)
)
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
BICL