# reforme-enseignement/04_CAH_clust_par_dominantes.R

library(data.table)
library(mltools)
library(dplyr)
library(tidyr)
library(here)
library(ggplot2)
library(ggdendro)
library(factoextra)

# Fix the RNG seed so that any random step of the script is reproducible.
set.seed(1234)

# Load the preprocessed data set.
# `read.csv()` already returns a data frame: wrapping it in
# `data.frame(..., stringsAsFactors = TRUE)` does not convert the character
# columns of an existing data frame, so the wrapper was a no-op and is dropped.
# The character -> factor conversion is performed explicitly later on.
data <- read.csv(
    file.path(here(), "data", "02_preprocessed_data.csv"),
    header = TRUE
)

# Folder receiving one CSV of clustering results per dominante.
data_folder_path <- file.path(here(), "data", "04_CAH_par_dominante")

# `recursive = TRUE` also creates missing parent folders instead of failing.
if (!dir.exists(data_folder_path)) {
    dir.create(data_folder_path, recursive = TRUE)
}

# Rename the trailing columns, starting at the 5th one.
# The names are positional, so the column count is checked first: with a
# different number of columns the assignment would recycle or truncate the
# names (with a mere warning) and mislabel the data.
new_col_names <- c(
    "1AC-OUVERTURE", "1AC-MI",
    "2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC3",
    "2A-UEchoix-S2-UC4",
    "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
    "2A-Projet-S2"
)
renamed_idx <- seq(5L, length.out = length(new_col_names))
if (ncol(data) != max(renamed_idx)) {
    stop(
        "Le fichier prétraité devrait contenir ", max(renamed_idx),
        " colonnes, ", ncol(data), " trouvée(s).",
        call. = FALSE
    )
}
colnames(data)[renamed_idx] <- new_col_names

# Convert every character column to a factor.
data <- data %>%
    mutate(across(where(is.character), as.factor))

# Columns that are one-hot encoded before clustering.
selected_cols <- c(
    "dominante3A", "parcours", "domaine2A", "1AC-MI",
    "2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC4",
    "2A-UEchoix-S2-UC3", "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
    "2A-Projet-S2", "1AC-OUVERTURE"
)

# Fail early, with an explicit message, when a column used further down
# (`ine` or one of the selected columns) is absent from the data.
missing_cols <- setdiff(c("ine", selected_cols), colnames(data))
if (length(missing_cols) > 0) {
    stop(
        "Colonnes manquantes dans les données : ",
        paste(missing_cols, collapse = ", "),
        call. = FALSE
    )
}

# Minimum headcount (exclusive) for a dominante to be clustered.
seuil_effectif <- 20L

# Keep the dominantes whose headcount is strictly above the threshold.
# `table()` ignores missing values, whereas summing `x == dominante` returned
# NA for every dominante as soon as one value was missing, which silently
# emptied the selection.
dominante_counts <- table(data[["dominante3A"]])
frequent_dominantes <-
    names(dominante_counts)[as.vector(dominante_counts) > seuil_effectif]

# Preserve the order of first appearance in the data; `NA` never matches
# `%in%`, so missing dominantes are dropped.
observed_dominantes <- unique(as.character(data[["dominante3A"]]))
vec_dominantes <-
    observed_dominantes[observed_dominantes %in% frequent_dominantes]

# For each retained dominante: one-hot encode the selected columns, build an
# average-linkage hierarchical clustering on binary distances, display the
# dendrogram and the cluster-number diagnostic, ask for the number of
# clusters, then export the rows with their cluster assignment.
for (dominante in vec_dominantes) {
    message("Dominante ", dominante)
    data_dominante <- data %>% filter(dominante3A == dominante)
    onehot_data <- one_hot(
        as.data.table(data_dominante),
        cols = selected_cols,
        sparsifyNAs = TRUE
    )

    # Column 1 is left out of the clustering features (presumably the `ine`
    # identifier -- TODO confirm against the preprocessed CSV layout).
    features <- onehot_data[, -1]

    # The binary distance works well on one-hot encoded data.
    # Named `dist_matrix` so that it does not shadow `stats::dist()`.
    dist_matrix <- dist(x = features, method = "binary")

    hclust_avg <- hclust(dist_matrix, method = "average")

    dhc <- as.dendrogram(hclust_avg)

    plotdata <- dendro_data(dhc, type = "rectangle")
    p <- ggplot(segment(plotdata)) +
        geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
        coord_flip() +
        scale_y_reverse(expand = c(0.2, 0)) +
        ggtitle(paste0("Dominante : ", dominante))
    # Inside a `for` loop nothing is auto-printed: the plot must be printed
    # explicitly to be displayed.
    print(p + theme_dendro())

    # Diagnostic for the number of clusters, computed on the same features,
    # distance and linkage as the clustering that is cut below (the defaults
    # of `hcut()` are Euclidean distance and Ward linkage).
    print(
        fviz_nbclust(
            features,
            FUNcluster = hcut,
            diss = dist_matrix,
            hc_method = "average",
            hc_metric = "binary"
        ) +
            ggtitle(paste0("Dominante : ", dominante))
    )

    # Ask for the number of clusters until a usable value is given.
    # Without an interactive session `readline()` returns "" immediately, so
    # stop with a clear error instead of looping forever.
    repeat {
        answer <- readline(prompt = paste0(
            "Nb clusters ",
            dominante,
            " : "
        ))
        nb_clust_dominante <- suppressWarnings(as.integer(answer))
        if (!is.na(nb_clust_dominante) &&
            nb_clust_dominante >= 1L &&
            nb_clust_dominante <= nrow(data_dominante)) {
            break
        }
        if (!interactive()) {
            stop(
                "Nombre de clusters invalide pour la dominante ", dominante,
                " (session non interactive).",
                call. = FALSE
            )
        }
        message(
            "Valeur invalide : saisir un entier entre 1 et ",
            nrow(data_dominante), "."
        )
    }

    cut_avg <- cutree(hclust_avg, k = nb_clust_dominante)
    names(cut_avg) <- data_dominante[["ine"]]
    # Explicit print: cluster sizes would otherwise not be shown in a loop.
    print(table(cut_avg))

    data_dominante[["cluster"]] <- cut_avg

    write.csv(data_dominante, file.path(
        data_folder_path,
        paste0("04_", dominante, "_cah_results.csv")
    ), row.names = FALSE)
}