analysis : a whole lot of things

2024-07-08 15:47:04 +02:00 · 2024-07-08 15:47:04 +02:00 · 44bda5e683
commit 44bda5e683
parent 129a5295df
11 changed files with 24253 additions and 5547 deletions
--- a/code/analysis/simulations/model_selection.html
+++ b/code/analysis/simulations/model_selection.html
--- a/code/applications/base_analysis.qmd
+++ b/code/applications/base_analysis.qmd
@ -12,13 +12,25 @@ library(plotly)
 library(ggplot2)
 library(ggforce)
 library(ggrepel)
+# library(ggtree) # devtools::install_github("YuLab-SMU/ggtree")
+library(ggdendro)
+library(phylogram)
+library(plotly)
 library(latex2exp)
 ```
 ```{r}
 list_collection <- readRDS("{{clustering}}")
-unlisted_best_partition <- unlist( extract_best_partition(list_collection))
+unlisted_best_partition <- extract_best_partition(list_collection)

-BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
+if (is.list(unlisted_best_partition)) {
+    BICL <- sum(sapply(
+        unlisted_best_partition,
+        function(col) col[["BICL"]]
+    ))
+} else {
+    BICL <- unlisted_best_partition[["BICL"]]
+    unlisted_best_partition <- list(unlisted_best_partition)
+}
 ```

 Le clustering donne le critère suivant : $BICL = `r BICL`$
@ -40,6 +52,14 @@ summarized_graph_size_df <- graph_size_df %>%
 ```

 :::{.panel-tabset}
+#### Parcours du clustering
+```{r}
+parcours_tree <- extract_clustering_tree(list_collection)
+dendro_parcours_tree <- phylogram::as.dendrogram(parcours_tree)
+p <- ggdendrogram(dendro_parcours_tree)
+plotly::ggplotly(p)
+```
+
 #### Réseaux par cluster
 ```{r}
 #| output: asis
@ -142,7 +162,7 @@ for (i in seq_len(length(unlisted_best_partition))) {
        p <- plot(
            current_col,
            type = "meso",
-            values = FALSE, mixture = TRUE
+            values = F, mixture = F
        ) +
            ggtitle(paste0("Structure\ncollection ", i))
        print(p)
--- a/code/applications/dore/02_dore_analysis.html
+++ b/code/applications/dore/02_dore_analysis.html
--- a/code/applications/dore/02_dore_analysis.qmd
+++ b/code/applications/dore/02_dore_analysis.qmd
@ -40,13 +40,26 @@ Les clustering donne le critère suivant :
 vec_bicl <- sapply(list_clustering, function(clustering) {
    list_collection <- readRDS(clustering)
    unlisted_best_partition <- unlist(
-         extract_best_partition(list_collection)
+        extract_best_partition(list_collection)
    )
-    BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
+    if (is.list(unlisted_best_partition)) {
+        BICL <- sum(sapply(
+            unlisted_best_partition,
+            function(col) col[["BICL"]]
+        ))
+    } else {
+        BICL <- unlisted_best_partition[["BICL"]]
+    }
    BICL
 })

 names(vec_bicl) <- names(files_vec)
+
+col_order <- order(vec_bicl, decreasing = TRUE)
+vec_bicl <- sort(vec_bicl, decreasing = TRUE)
+
+list_clustering <- list_clustering[col_order]
+
 knitr::kable(vec_bicl,
    caption = "BICL par modèle", col.names = "$BICL$",
    row.names = TRUE
--- a/code/applications/sub-dore/baldock/02_baldock_analysis.html
+++ b/code/applications/sub-dore/baldock/02_baldock_analysis.html
--- a/code/applications/sub-dore/baldock/02_baldock_analysis.qmd
+++ b/code/applications/sub-dore/baldock/02_baldock_analysis.qmd
@ -0,0 +1,186 @@
+---
+format:
+  html:
+    embed-resources: true
+title: Clustering avec `colSBM` des données Baldock et al.
+execute: 
+  echo: false
+  warning: false
+---
+
+# Analyse
+
+```{r}
+library(colSBM)
+library(here)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(aricode)
+library(reshape2)
+
+library(ggplot2)
+```
+
+```{r}
+root_app_folder <- file.path(here(), "code", "applications")
+source(file.path(root_app_folder, "utils.R"))
+data_folder <- file.path(here(), "code", "results", "applications", "sub-dore", "baldock")
+files_vec <- get_recent_files(data_folder, n = 16L)
+files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,4}")
+names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
+list_clustering <- files_vec
+# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
+```
+
+## Analyse par modèle
+
+Les clustering donne le critère suivant :
+```{r}
+vec_bicl <- sapply(list_clustering, function(clustering) {
+    list_collection <- readRDS(clustering)
+    unlisted_best_partition <- extract_best_partition(list_collection)
+    if (is.list(unlisted_best_partition)) {
+        BICL <- sum(sapply(
+            unlisted_best_partition,
+            function(col) col[["BICL"]]
+        ))
+    } else {
+        BICL <- unlisted_best_partition[["BICL"]]
+    }
+    BICL
+})
+
+names(vec_bicl) <- names(files_vec)
+
+col_order <- order(vec_bicl, decreasing = TRUE)
+vec_bicl <- sort(vec_bicl, decreasing = TRUE)
+
+list_clustering <- list_clustering[col_order]
+
+knitr::kable(vec_bicl,
+    caption = "BICL par modèle", col.names = "$BICL$",
+    row.names = TRUE
+)
+```
+
+:::{.panel-tabset}
+
+```{r write_tabs}
+#| warning: false
+#| output: asis
+
+# Generate content for each model using knit_expand
+for (clustering_idx in seq_len(length(list_clustering))) {
+    clustering <- list_clustering[clustering_idx]
+    model <- names(clustering)
+    expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
+    res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
+    cat(res, sep = "\n")
+    cat("\n")
+}
+```
+
+:::
+
+## Comparaison des modèles
+
+### Table des clustering
+
+```{r}
+clustering_df <- sapply(seq_len(length(list_clustering)), function(idx) {
+    clust <- readRDS(list_clustering[[idx]])
+    extract_clustering(clust)
+})
+
+clust_to_compare_order <- crossing(
+    f = seq_len(ncol(clustering_df)),
+    s = seq_len(ncol(clustering_df))
+) %>%
+    filter(!(f == s) & f < s)
+
+clustering_partitions_df <- do.call("rbind", lapply(seq_len(length(list_clustering)), function(idx) {
+    clust <- readRDS(list_clustering[[idx]])
+    clust_vec <- extract_clustering(clust)
+    net_id <- names(clust_vec)
+    clust_vec <- unname(clust_vec)
+    clust_idx <- idx
+    data.frame(net_id = net_id, cluster = clust_vec, clustering_id = clust_idx)
+}))
+
+clustering_partitions_df <- clustering_partitions_df %>%
+    # mutate(net_id = str_trunc(net_id, width = 20L)) %>%
+    mutate_all(as.factor)
+
+```
+
+```{r ARI}
+ari_dist_matrix <- outer(
+    X = seq_len(ncol(clustering_df)),
+    Y = seq_len(ncol(clustering_df)),
+    Vectorize(function(x, y) {
+        f_clust <- clustering_df[, x]
+        s_clust <- clustering_df[, y]
+        ARI(f_clust, s_clust)
+    })
+)
+ari_plot_data <- ari_dist_matrix %>%
+    melt() %>%
+    mutate(Var1 = names(list_clustering)[Var1], Var2 = names(list_clustering)[Var2])
+ggplot(ari_plot_data) +
+    aes(x = Var1, y = Var2) +
+    geom_raster(aes(fill = value)) +
+    scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
+    geom_text(aes(label = round(value, digits = 2))) +
+    labs(
+        x = "id clusterings",
+        y = "id clusterings", title = "ARI entre les clusterings"
+    ) +
+    theme_bw()
+```
+
+<details>
+<summary> Tables de cooccurrence </summary>
+
+```{r table cooccurrence}
+list_tables_coocc <- lapply(seq_len(nrow(clust_to_compare_order)), function(row_id) {
+    f <- clust_to_compare_order[[row_id, 1]]
+    s <- clust_to_compare_order[[row_id, 2]]
+    table(clustering_df[, f], clustering_df[, s])
+})
+```
+
+```{r function plot tables}
+plot_table <- function(data, f = NULL, s = NULL) {
+    melted_data <- data %>%
+        melt() %>%
+        mutate(
+            Var1 = as.factor(Var1),
+            Var2 = as.factor(Var2)
+        )
+
+    ggplot(melted_data) +
+        aes(y = Var1, x = Var2) +
+        geom_raster(aes(fill = value)) +
+        scale_fill_gradient(low = "white", high = "red", limits = c(0, NA)) +
+        scale_x_discrete() +
+        geom_text(data = subset(melted_data, value != 0), aes(label = value)) +
+        labs(
+            x = paste0("numéros clusters pour clustering : ", s),
+            y = paste0("numéros clusters pour clustering : ", f), title = paste0("Co-occurence entre les clusterings ", f, " et ", s)
+        ) +
+        theme_bw()
+}
+```
+
+```{r tables coocc plot}
+#| output: asis
+
+for (id in seq_len(length(list_tables_coocc))) {
+    print(plot_table(list_tables_coocc[[id]],
+        f = names(list_clustering)[clust_to_compare_order[[id, 1]]],
+        s = names(list_clustering)[clust_to_compare_order[[id, 2]]]
+    ))
+}
+```
+</details>
--- a/code/applications/sub-dore/sous-collection/02_souscol_analysis.html
+++ b/code/applications/sub-dore/sous-collection/02_souscol_analysis.html
--- a/code/applications/sub-dore/sous-collection/02_souscol_analysis.qmd
+++ b/code/applications/sub-dore/sous-collection/02_souscol_analysis.qmd
@ -0,0 +1,185 @@
+---
+format:
+  html:
+    embed-resources: true
+title: Clustering avec `colSBM` des données de sous-collections.
+execute: 
+  echo: false
+  warning: false
+---
+
+# Analyse
+
+```{r}
+library(colSBM)
+library(here)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(aricode)
+library(reshape2)
+
+library(ggplot2)
+```
+
+```{r}
+root_app_folder <- file.path(here(), "code", "applications")
+source(file.path(root_app_folder, "utils.R"))
+data_folder <- file.path(here(), "code", "results", "applications", "sub-dore", "souscollection")
+files_vec <- get_recent_files(data_folder, n = 16L)
+files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,4}")
+names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
+list_clustering <- files_vec
+# list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
+```
+
+## Analyse par modèle
+
+Les clustering donne le critère suivant :
+```{r}
+vec_bicl <- sapply(list_clustering, function(clustering) {
+    list_collection <- readRDS(clustering)
+    unlisted_best_partition <- extract_best_partition(list_collection)
+    if (is.list(unlisted_best_partition)) {
+        BICL <- sum(sapply(
+            unlisted_best_partition,
+            function(col) col[["BICL"]]
+        ))
+    } else {
+        BICL <- unlisted_best_partition[["BICL"]]
+    }
+    BICL
+})
+
+names(vec_bicl) <- names(files_vec)
+
+col_order <- order(vec_bicl, decreasing = TRUE)
+vec_bicl <- sort(vec_bicl, decreasing = TRUE)
+
+list_clustering <- list_clustering[col_order]
+
+knitr::kable(vec_bicl,
+    caption = "BICL par modèle", col.names = "$BICL$",
+    row.names = TRUE
+)
+```
+
+:::{.panel-tabset}
+
+```{r write_tabs}
+#| warning: false
+#| output: asis
+
+# Generate content for each model using knit_expand
+for (clustering_idx in seq_len(length(list_clustering))) {
+    clustering <- list_clustering[clustering_idx]
+    model <- names(clustering)
+    expanded_content <- knitr::knit_expand(file = file.path(root_app_folder, "base_analysis.qmd"), clustering = clustering, model = model)
+    res <- knitr::knit_child(text = expanded_content, quiet = TRUE)
+    cat(res, sep = "\n")
+    cat("\n")
+}
+```
+
+:::
+
+## Comparaison des pénalités
+
+
+## Comparaison des modèles
+
+### Table des clustering
+
+```{r}
+clustering_df <- sapply(seq_len(length(list_clustering)), function(idx) {
+    clust <- readRDS(list_clustering[[idx]])
+    extract_clustering(clust)
+})
+
+clust_to_compare_order <- crossing(
+    f = seq_len(ncol(clustering_df)),
+    s = seq_len(ncol(clustering_df))
+) %>%
+    filter(!(f == s) & f < s)
+
+clustering_partitions_df <- do.call("rbind", lapply(seq_len(length(list_clustering)), function(idx) {
+    clust <- readRDS(list_clustering[[idx]])
+    clust_vec <- extract_clustering(clust)
+    net_id <- names(clust_vec)
+    clust_vec <- unname(clust_vec)
+    clust_idx <- idx
+    data.frame(net_id = net_id, cluster = clust_vec, clustering_id = clust_idx)
+}))
+
+clustering_partitions_df <- clustering_partitions_df %>%
+    # mutate(net_id = str_trunc(net_id, width = 20L)) %>%
+    mutate_all(as.factor)
+
+```
+
+```{r ARI}
+ari_dist_matrix <- outer(
+    X = seq_len(ncol(clustering_df)),
+    Y = seq_len(ncol(clustering_df)),
+    Vectorize(function(x, y) {
+        f_clust <- clustering_df[, x]
+        s_clust <- clustering_df[, y]
+        ARI(f_clust, s_clust)
+    })
+)
+ari_plot_data <- ari_dist_matrix %>%
+    melt() %>%
+    mutate(Var1 = names(list_clustering)[Var1], Var2 = names(list_clustering)[Var2])
+ggplot(ari_plot_data) +
+    aes(x = Var1, y = Var2) +
+    geom_raster(aes(fill = value)) +
+    scale_fill_gradient(low = "white", high = "red", limits = c(0, 1)) +
+    geom_text(aes(label = round(value, digits = 2))) +
+    labs(
+        x = "id clusterings",
+        y = "id clusterings", title = "ARI entre les clusterings"
+    ) +
+    theme_bw()
+```
+
+```{r table cooccurrence}
+list_tables_coocc <- lapply(seq_len(nrow(clust_to_compare_order)), function(row_id) {
+    f <- clust_to_compare_order[[row_id, 1]]
+    s <- clust_to_compare_order[[row_id, 2]]
+    table(clustering_df[, f], clustering_df[, s])
+})
+```
+
+```{r function plot tables}
+plot_table <- function(data, f = NULL, s = NULL) {
+    melted_data <- data %>%
+        melt() %>%
+        mutate(
+            Var1 = as.factor(Var1),
+            Var2 = as.factor(Var2)
+        )
+
+    ggplot(melted_data) +
+        aes(y = Var1, x = Var2) +
+        geom_raster(aes(fill = value)) +
+        scale_fill_gradient(low = "white", high = "red", limits = c(0, NA)) +
+        scale_x_discrete() +
+        geom_text(data = subset(melted_data, value != 0), aes(label = value)) +
+        labs(
+            x = paste0("numéros clusters pour clustering : ", s),
+            y = paste0("numéros clusters pour clustering : ", f), title = paste0("Co-occurence entre les clusterings ", f, " et ", s)
+        ) +
+        theme_bw()
+}
+```
+
+```{r tables coocc plot}
+#| output: asis
+
+for (id in seq_len(length(list_tables_coocc))) {
+    print(plot_table(list_tables_coocc[[id]],
+        f = names(list_clustering)[clust_to_compare_order[[id, 1]]],
+        s = names(list_clustering)[clust_to_compare_order[[id, 2]]]
+    ))
+}
+```
--- a/code/applications/utils.R
+++ b/code/applications/utils.R
@ -35,6 +35,17 @@ identify_models <- function(files_vec, pattern = "(iid|pirho|pi|rho)") {
 }

 build_graph_size_dataframe <- function(collection_list) {
+    if (!is.list(collection_list)) {
+        return(data.frame(
+            collection_id = factor(1L),
+            M = collection_list[["M"]],
+            net_id = factor(collection_list[["net_id"]]),
+            nr = collection_list[["n"]][[1]],
+            nc = collection_list[["n"]][[2]],
+            Qr = collection_list[["Q"]][[1]],
+            Qc = collection_list[["Q"]][[2]]
+        ))
+    }
    do.call("rbind", lapply(seq_len(length(collection_list)), function(idx) {
        collection <- collection_list[[idx]]
        data.frame(
@ -54,9 +65,17 @@ extract_clustering <- function(clustering) {
        l = clustering,
        unnest = TRUE
    )
-    unlist(sapply(seq_len(length(partition)), function(idx) {
+    if (!is.list(partition)) {
+        partition <- list(partition)
+    }
+    out <- unlist(sapply(seq_len(length(partition)), function(idx) {
        clust_vec <- rep(idx, partition[[idx]][["M"]])
        names(clust_vec) <- partition[[idx]][["net_id"]]
        clust_vec
    }))
+    if (is.matrix(out)) {
+        nm <- rownames(out)
+        out <- setNames(c(out), nm)
+    }
+    return(out)
 }
--- a/code/results/applications/sub-dore/souscollection/souscollection_collection_iid_seed_1234_01-07-24_13-35-33.Rds
+++ b/code/results/applications/sub-dore/souscollection/souscollection_collection_iid_seed_1234_01-07-24_13-35-33.Rds
--- a/code/simulations/clustering/02_synthetic_clustering_analysis.qmd
+++ b/code/simulations/clustering/02_synthetic_clustering_analysis.qmd
@ -28,7 +28,7 @@ root_app_folder <- file.path(here(), "code", "applications")
 source(file.path(root_app_folder, "utils.R"))
 data_folder <- file.path(here(), "code", "results", "simulations", "clustering", "synthetic")
 files_vec <- get_recent_files(data_folder, n = 16L, pattern = ".Rds")
-files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)")
+files_vec <- identify_models(files_vec, pattern = "(iid|pirho|pi|rho)_seed_[0-9]{1,6}")
 names(files_vec) <- names(files_vec) %>% str_replace_all(pattern = "_", "~")
 list_clustering <- files_vec
 # list_collection <- readRDS(file.path(data_folder, "dore_collection_iid_24-05-24_18-07-50.Rds"))
@ -41,7 +41,7 @@ Les clustering donne le critère suivant :
 vec_bicl <- sapply(list_clustering, function(clustering) {
    list_collection <- readRDS(clustering)
    unlisted_best_partition <- unlist(
-         extract_best_partition(list_collection)
+        extract_best_partition(list_collection)
    )
    BICL <- sum(sapply(unlisted_best_partition, function(col) col$BICL))
    BICL