analysis : a whole lot of things
This commit is contained in:
parent
129a5295df
commit
44bda5e683
11 changed files with 24253 additions and 5547 deletions
1363
code/analysis/simulations/model_selection.html
Normal file
1363
code/analysis/simulations/model_selection.html
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -12,13 +12,25 @@ library(plotly)
|
|||
library(ggplot2)
|
||||
library(ggforce)
|
||||
library(ggrepel)
|
||||
# library(ggtree) # devtools::install_github("YuLab-SMU/ggtree")
|
||||
library(ggdendro)
|
||||
library(phylogram)
|
||||
library(plotly)
|
||||
library(latex2exp)
|
||||
```
|
||||
```{r}
|
||||
list_collection <- readRDS("{{clustering}}")
|
||||
unlisted_best_partition <- unlist( extract_best_partition(list_collection))
|
||||
unlisted_best_partition <- extract_best_partition(list_collection)
|
||||
|
||||
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
|
||||
if (is.list(unlisted_best_partition)) {
|
||||
BICL <- sum(sapply(
|
||||
unlisted_best_partition,
|
||||
function(col) col[["BICL"]]
|
||||
))
|
||||
} else {
|
||||
BICL <- unlisted_best_partition[["BICL"]]
|
||||
unlisted_best_partition <- list(unlisted_best_partition)
|
||||
}
|
||||
```
|
||||
|
||||
Le clustering donne le critère suivant : $BICL = `r BICL`$
|
||||
|
|
@ -40,6 +52,14 @@ summarized_graph_size_df <- graph_size_df %>%
|
|||
```
|
||||
|
||||
:::{.panel-tabset}
|
||||
#### Parcours du clustering
|
||||
```{r}
|
||||
parcours_tree <- extract_clustering_tree(list_collection)
|
||||
dendro_parcours_tree <- phylogram::as.dendrogram(parcours_tree)
|
||||
p <- ggdendrogram(dendro_parcours_tree)
|
||||
plotly::ggplotly(p)
|
||||
```
|
||||
|
||||
#### Réseaux par cluster
|
||||
```{r}
|
||||
#| output: asis
|
||||
|
|
@ -142,7 +162,7 @@ for (i in seq_len(length(unlisted_best_partition))) {
|
|||
p <- plot(
|
||||
current_col,
|
||||
type = "meso",
|
||||
values = FALSE, mixture = TRUE
|
||||
values = F, mixture = F
|
||||
) +
|
||||
ggtitle(paste0("Structure\ncollection ", i))
|
||||
print(p)
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -40,13 +40,26 @@ Les clustering donne le critère suivant :
|
|||
vec_bicl <- sapply(list_clustering, function(clustering) {
|
||||
list_collection <- readRDS(clustering)
|
||||
unlisted_best_partition <- unlist(
|
||||
extract_best_partition(list_collection)
|
||||
extract_best_partition(list_collection)
|
||||
)
|
||||
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
|
||||
if (is.list(unlisted_best_partition)) {
|
||||
BICL <- sum(sapply(
|
||||
unlisted_best_partition,
|
||||
function(col) col[["BICL"]]
|
||||
))
|
||||
} else {
|
||||
BICL <- unlisted_best_partition[["BICL"]]
|
||||
}
|
||||
BICL
|
||||
})
|
||||
|
||||
names(vec_bicl) <- names(files_vec)
|
||||
|
||||
col_order <- order(vec_bicl, decreasing = TRUE)
|
||||
vec_bicl <- sort(vec_bicl, decreasing = TRUE)
|
||||
|
||||
list_clustering <- list_clustering[col_order]
|
||||
|
||||
knitr::kable(vec_bicl,
|
||||
caption = "BICL par modèle", col.names = "$BICL$",
|
||||
row.names = TRUE
|
||||
|
|
|
|||
7406
code/applications/sub-dore/baldock/02_baldock_analysis.html
Normal file
7406
code/applications/sub-dore/baldock/02_baldock_analysis.html
Normal file
File diff suppressed because one or more lines are too long
186
code/applications/sub-dore/baldock/02_baldock_analysis.qmd
Normal file
186
code/applications/sub-dore/baldock/02_baldock_analysis.qmd
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
---
|
||||
format:
|
||||
html:
|
||||
embed-resources: true
|
||||
title: Clustering avec `colSBM` des données Baldock et al.
|
||||
execute:
|
||||
echo: false
|
||||
warning: false
|
||||
---
|
||||
|
||||
# Analyse
|
||||
|
||||
```{r}
|
||||
library(colSBM)
|
||||
library(here)
|
||||
library(stringr)
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(aricode)
|
||||
library(reshape2)
|
||||
|
||||
library(ggplot2)
|
||||
```
|
||||
|
||||
```{r}
|
||||
root_app_folder <- file.path(here(), "code", "applications")
|
||||
source(file.path(root_app_folder, "utils.R"))
|
||||
data_folder <- file.path(here(), "code", "results", "applications", "sub-dore", "baldock")
|
||||
files_vec <- get_recent_files(data_folder, n = 16L)
|
||||
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,4}")
|
||||
names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
|
||||
list_clustering <- files_vec
|
||||
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
|
||||
```
|
||||
|
||||
## Analyse par modèle
|
||||
|
||||
Les clustering donne le critère suivant :
|
||||
```{r}
|
||||
vec_bicl <- sapply(list_clustering, function(clustering) {
|
||||
list_collection <- readRDS(clustering)
|
||||
unlisted_best_partition <- extract_best_partition(list_collection)
|
||||
if (is.list(unlisted_best_partition)) {
|
||||
BICL <- sum(sapply(
|
||||
unlisted_best_partition,
|
||||
function(col) col[["BICL"]]
|
||||
))
|
||||
} else {
|
||||
BICL <- unlisted_best_partition[["BICL"]]
|
||||
}
|
||||
BICL
|
||||
})
|
||||
|
||||
names(vec_bicl) <- names(files_vec)
|
||||
|
||||
col_order <- order(vec_bicl, decreasing = TRUE)
|
||||
vec_bicl <- sort(vec_bicl, decreasing = TRUE)
|
||||
|
||||
list_clustering <- list_clustering[col_order]
|
||||
|
||||
knitr::kable(vec_bicl,
|
||||
caption = "BICL par modèle", col.names = "$BICL$",
|
||||
row.names = TRUE
|
||||
)
|
||||
```
|
||||
|
||||
:::{.panel-tabset}
|
||||
|
||||
```{r write_tabs}
|
||||
#| warning: false
|
||||
#| output: asis
|
||||
|
||||
# Generate content for each model using knit_expand
|
||||
for (clustering_idx in seq_len(length(list_clustering))) {
|
||||
clustering <- list_clustering[clustering_idx]
|
||||
model <- names(clustering)
|
||||
expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
|
||||
res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
|
||||
cat(res, sep = "\n")
|
||||
cat("\n")
|
||||
}
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
## Comparaison des modèles
|
||||
|
||||
### Table des clustering
|
||||
|
||||
```{r}
|
||||
clustering_df <- sapply(seq_len(length(list_clustering)), function(idx) {
|
||||
clust <- readRDS(list_clustering[[idx]])
|
||||
extract_clustering(clust)
|
||||
})
|
||||
|
||||
clust_to_compare_order <- crossing(
|
||||
f = seq_len(ncol(clustering_df)),
|
||||
s = seq_len(ncol(clustering_df))
|
||||
) %>%
|
||||
filter(!(f == s) & f < s)
|
||||
|
||||
clustering_partitions_df <- do.call("rbind", lapply(seq_len(length(list_clustering)), function(idx) {
|
||||
clust <- readRDS(list_clustering[[idx]])
|
||||
clust_vec <- extract_clustering(clust)
|
||||
net_id <- names(clust_vec)
|
||||
clust_vec <- unname(clust_vec)
|
||||
clust_idx <- idx
|
||||
data.frame(net_id = net_id, cluster = clust_vec, clustering_id = clust_idx)
|
||||
}))
|
||||
|
||||
clustering_partitions_df <- clustering_partitions_df %>%
|
||||
# mutate(net_id = str_trunc(net_id, width = 20L)) %>%
|
||||
mutate_all(as.factor)
|
||||
|
||||
```
|
||||
|
||||
```{r ARI}
|
||||
ari_dist_matrix <- outer(
|
||||
X = seq_len(ncol(clustering_df)),
|
||||
Y = seq_len(ncol(clustering_df)),
|
||||
Vectorize(function(x, y) {
|
||||
f_clust <- clustering_df[, x]
|
||||
s_clust <- clustering_df[, y]
|
||||
ARI(f_clust, s_clust)
|
||||
})
|
||||
)
|
||||
ari_plot_data <- ari_dist_matrix %>%
|
||||
melt() %>%
|
||||
mutate(Var1 = names(list_clustering)[Var1], Var2 = names(list_clustering)[Var2])
|
||||
ggplot(ari_plot_data) +
|
||||
aes(x = Var1, y = Var2) +
|
||||
geom_raster(aes(fill = value)) +
|
||||
scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
|
||||
geom_text(aes(label = round(value, digits = 2))) +
|
||||
labs(
|
||||
x = "id clusterings",
|
||||
y = "id clusterings", title = "ARI entre les clusterings"
|
||||
) +
|
||||
theme_bw()
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary> Tables de cooccurrence </summary>
|
||||
|
||||
```{r table cooccurrence}
|
||||
list_tables_coocc <- lapply(seq_len(nrow(clust_to_compare_order)), function(row_id) {
|
||||
f <- clust_to_compare_order[[row_id, 1]]
|
||||
s <- clust_to_compare_order[[row_id, 2]]
|
||||
table(clustering_df[, f], clustering_df[, s])
|
||||
})
|
||||
```
|
||||
|
||||
```{r function plot tables}
|
||||
plot_table <- function(data, f = NULL, s = NULL) {
|
||||
melted_data <- data %>%
|
||||
melt() %>%
|
||||
mutate(
|
||||
Var1 = as.factor(Var1),
|
||||
Var2 = as.factor(Var2)
|
||||
)
|
||||
|
||||
ggplot(melted_data) +
|
||||
aes(y = Var1, x = Var2) +
|
||||
geom_raster(aes(fill = value)) +
|
||||
scale_fill_gradient(low = "white", high = "red", limits = c(0, NA)) +
|
||||
scale_x_discrete() +
|
||||
geom_text(data = subset(melted_data, value != 0), aes(label = value)) +
|
||||
labs(
|
||||
x = paste0("numéros clusters pour clustering : ", s),
|
||||
y = paste0("numéros clusters pour clustering : ", f), title = paste0("Co-occurence entre les clusterings ", f, " et ", s)
|
||||
) +
|
||||
theme_bw()
|
||||
}
|
||||
```
|
||||
|
||||
```{r tables coocc plot}
|
||||
#| output: asis
|
||||
|
||||
for (id in seq_len(length(list_tables_coocc))) {
|
||||
print(plot_table(list_tables_coocc[[id]],
|
||||
f = names(list_clustering)[clust_to_compare_order[[id, 1]]],
|
||||
s = names(list_clustering)[clust_to_compare_order[[id, 2]]]
|
||||
))
|
||||
}
|
||||
```
|
||||
</details>
|
||||
8050
code/applications/sub-dore/sous-collection/02_souscol_analysis.html
Normal file
8050
code/applications/sub-dore/sous-collection/02_souscol_analysis.html
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,185 @@
|
|||
---
|
||||
format:
|
||||
html:
|
||||
embed-resources: true
|
||||
title: Clustering avec `colSBM` des données de sous-collections.
|
||||
execute:
|
||||
echo: false
|
||||
warning: false
|
||||
---
|
||||
|
||||
# Analyse
|
||||
|
||||
```{r}
|
||||
library(colSBM)
|
||||
library(here)
|
||||
library(stringr)
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(aricode)
|
||||
library(reshape2)
|
||||
|
||||
library(ggplot2)
|
||||
```
|
||||
|
||||
```{r}
|
||||
root_app_folder <- file.path(here(), "code", "applications")
|
||||
source(file.path(root_app_folder, "utils.R"))
|
||||
data_folder <- file.path(here(), "code", "results", "applications", "sub-dore", "souscollection")
|
||||
files_vec <- get_recent_files(data_folder, n = 16L)
|
||||
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,4}")
|
||||
names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
|
||||
list_clustering <- files_vec
|
||||
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
|
||||
```
|
||||
|
||||
## Analyse par modèle
|
||||
|
||||
Les clustering donne le critère suivant :
|
||||
```{r}
|
||||
vec_bicl <- sapply(list_clustering, function(clustering) {
|
||||
list_collection <- readRDS(clustering)
|
||||
unlisted_best_partition <- extract_best_partition(list_collection)
|
||||
if (is.list(unlisted_best_partition)) {
|
||||
BICL <- sum(sapply(
|
||||
unlisted_best_partition,
|
||||
function(col) col[["BICL"]]
|
||||
))
|
||||
} else {
|
||||
BICL <- unlisted_best_partition[["BICL"]]
|
||||
}
|
||||
BICL
|
||||
})
|
||||
|
||||
names(vec_bicl) <- names(files_vec)
|
||||
|
||||
col_order <- order(vec_bicl, decreasing = TRUE)
|
||||
vec_bicl <- sort(vec_bicl, decreasing = TRUE)
|
||||
|
||||
list_clustering <- list_clustering[col_order]
|
||||
|
||||
knitr::kable(vec_bicl,
|
||||
caption = "BICL par modèle", col.names = "$BICL$",
|
||||
row.names = TRUE
|
||||
)
|
||||
```
|
||||
|
||||
:::{.panel-tabset}
|
||||
|
||||
```{r write_tabs}
|
||||
#| warning: false
|
||||
#| output: asis
|
||||
|
||||
# Generate content for each model using knit_expand
|
||||
for (clustering_idx in seq_len(length(list_clustering))) {
|
||||
clustering <- list_clustering[clustering_idx]
|
||||
model <- names(clustering)
|
||||
expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
|
||||
res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
|
||||
cat(res, sep = "\n")
|
||||
cat("\n")
|
||||
}
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
## Comparaison des pénalités
|
||||
|
||||
|
||||
## Comparaison des modèles
|
||||
|
||||
### Table des clustering
|
||||
|
||||
```{r}
|
||||
clustering_df <- sapply(seq_len(length(list_clustering)), function(idx) {
|
||||
clust <- readRDS(list_clustering[[idx]])
|
||||
extract_clustering(clust)
|
||||
})
|
||||
|
||||
clust_to_compare_order <- crossing(
|
||||
f = seq_len(ncol(clustering_df)),
|
||||
s = seq_len(ncol(clustering_df))
|
||||
) %>%
|
||||
filter(!(f == s) & f < s)
|
||||
|
||||
clustering_partitions_df <- do.call("rbind", lapply(seq_len(length(list_clustering)), function(idx) {
|
||||
clust <- readRDS(list_clustering[[idx]])
|
||||
clust_vec <- extract_clustering(clust)
|
||||
net_id <- names(clust_vec)
|
||||
clust_vec <- unname(clust_vec)
|
||||
clust_idx <- idx
|
||||
data.frame(net_id = net_id, cluster = clust_vec, clustering_id = clust_idx)
|
||||
}))
|
||||
|
||||
clustering_partitions_df <- clustering_partitions_df %>%
|
||||
# mutate(net_id = str_trunc(net_id, width = 20L)) %>%
|
||||
mutate_all(as.factor)
|
||||
|
||||
```
|
||||
|
||||
```{r ARI}
|
||||
ari_dist_matrix <- outer(
|
||||
X = seq_len(ncol(clustering_df)),
|
||||
Y = seq_len(ncol(clustering_df)),
|
||||
Vectorize(function(x, y) {
|
||||
f_clust <- clustering_df[, x]
|
||||
s_clust <- clustering_df[, y]
|
||||
ARI(f_clust, s_clust)
|
||||
})
|
||||
)
|
||||
ari_plot_data <- ari_dist_matrix %>%
|
||||
melt() %>%
|
||||
mutate(Var1 = names(list_clustering)[Var1], Var2 = names(list_clustering)[Var2])
|
||||
ggplot(ari_plot_data) +
|
||||
aes(x = Var1, y = Var2) +
|
||||
geom_raster(aes(fill = value)) +
|
||||
scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
|
||||
geom_text(aes(label = round(value, digits = 2))) +
|
||||
labs(
|
||||
x = "id clusterings",
|
||||
y = "id clusterings", title = "ARI entre les clusterings"
|
||||
) +
|
||||
theme_bw()
|
||||
```
|
||||
|
||||
```{r table cooccurrence}
|
||||
list_tables_coocc <- lapply(seq_len(nrow(clust_to_compare_order)), function(row_id) {
|
||||
f <- clust_to_compare_order[[row_id, 1]]
|
||||
s <- clust_to_compare_order[[row_id, 2]]
|
||||
table(clustering_df[, f], clustering_df[, s])
|
||||
})
|
||||
```
|
||||
|
||||
```{r function plot tables}
|
||||
plot_table <- function(data, f = NULL, s = NULL) {
|
||||
melted_data <- data %>%
|
||||
melt() %>%
|
||||
mutate(
|
||||
Var1 = as.factor(Var1),
|
||||
Var2 = as.factor(Var2)
|
||||
)
|
||||
|
||||
ggplot(melted_data) +
|
||||
aes(y = Var1, x = Var2) +
|
||||
geom_raster(aes(fill = value)) +
|
||||
scale_fill_gradient(low = "white", high = "red", limits = c(0, NA)) +
|
||||
scale_x_discrete() +
|
||||
geom_text(data = subset(melted_data, value != 0), aes(label = value)) +
|
||||
labs(
|
||||
x = paste0("numéros clusters pour clustering : ", s),
|
||||
y = paste0("numéros clusters pour clustering : ", f), title = paste0("Co-occurence entre les clusterings ", f, " et ", s)
|
||||
) +
|
||||
theme_bw()
|
||||
}
|
||||
```
|
||||
|
||||
```{r tables coocc plot}
|
||||
#| output: asis
|
||||
|
||||
for (id in seq_len(length(list_tables_coocc))) {
|
||||
print(plot_table(list_tables_coocc[[id]],
|
||||
f = names(list_clustering)[clust_to_compare_order[[id, 1]]],
|
||||
s = names(list_clustering)[clust_to_compare_order[[id, 2]]]
|
||||
))
|
||||
}
|
||||
```
|
||||
|
|
@ -35,6 +35,17 @@ identify_models <- function(files_vec, pattern = "(iid|pirho|pi|rho)") {
|
|||
}
|
||||
|
||||
build_graph_size_dataframe <- function(collection_list) {
|
||||
if (!is.list(collection_list)) {
|
||||
return(data.frame(
|
||||
collection_id = factor(1L),
|
||||
M = collection_list[["M"]],
|
||||
net_id = factor(collection_list[["net_id"]]),
|
||||
nr = collection_list[["n"]][[1]],
|
||||
nc = collection_list[["n"]][[2]],
|
||||
Qr = collection_list[["Q"]][[1]],
|
||||
Qc = collection_list[["Q"]][[2]]
|
||||
))
|
||||
}
|
||||
do.call("rbind", lapply(seq_len(length(collection_list)), function(idx) {
|
||||
collection <- collection_list[[idx]]
|
||||
data.frame(
|
||||
|
|
@ -54,9 +65,17 @@ extract_clustering <- function(clustering) {
|
|||
l = clustering,
|
||||
unnest = TRUE
|
||||
)
|
||||
unlist(sapply(seq_len(length(partition)), function(idx) {
|
||||
if (!is.list(partition)) {
|
||||
partition <- list(partition)
|
||||
}
|
||||
out <- unlist(sapply(seq_len(length(partition)), function(idx) {
|
||||
clust_vec <- rep(idx, partition[[idx]][["M"]])
|
||||
names(clust_vec) <- partition[[idx]][["net_id"]]
|
||||
clust_vec
|
||||
}))
|
||||
if (is.matrix(out)) {
|
||||
nm <- rownames(out)
|
||||
out <- setNames(c(out), nm)
|
||||
}
|
||||
return(out)
|
||||
}
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -28,7 +28,7 @@ root_app_folder <- file.path(here(), "code", "applications")
|
|||
source(file.path(root_app_folder, "utils.R"))
|
||||
data_folder <- file.path(here(), "code", "results", "simulations", "clustering", "synthetic")
|
||||
files_vec <- get_recent_files(data_folder, n = 16L, pattern = ".Rds")
|
||||
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)")
|
||||
files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,6}")
|
||||
names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
|
||||
list_clustering <- files_vec
|
||||
# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
|
||||
|
|
@ -41,7 +41,7 @@ Les clustering donne le critère suivant :
|
|||
vec_bicl <- sapply(list_clustering, function(clustering) {
|
||||
list_collection <- readRDS(clustering)
|
||||
unlisted_best_partition <- unlist(
|
||||
extract_best_partition(list_collection)
|
||||
extract_best_partition(list_collection)
|
||||
)
|
||||
BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
|
||||
BICL
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue