library(data.table)
library(mltools)
library(dplyr)
library(tidyr)
library(here)
library(ggplot2)
library(ggdendro)
library(factoextra)

# Hierarchical agglomerative clustering, run separately for every
# "dominante3A" value whose headcount exceeds a threshold.
#
# For each retained dominante the script:
#   1. one-hot encodes the selected categorical columns,
#   2. builds an average-linkage hierarchy on the binary distance,
#   3. prints the dendrogram and the fviz_nbclust() diagnostic plot,
#   4. asks the user for the number of clusters,
#   5. writes the rows of that dominante, with a "cluster" column, to
#      data/04_CAH_par_dominante/04_<dominante>_cah_results.csv.

set.seed(1234)

# Setup ----

data <- read.csv(
  file.path(here(), "data", "02_preprocessed_data.csv"),
  header = TRUE
)

data_folder_path <- file.path(here(), "data", "04_CAH_par_dominante")
if (!dir.exists(data_folder_path)) {
  dir.create(data_folder_path, recursive = TRUE)
}

# Columns 5 and onwards are renamed positionally, so the number of columns
# in the file must match the number of names exactly.
course_cols <- c(
  "1AC-OUVERTURE",
  "1AC-MI",
  "2A-UEchoix-S1-UC1",
  "2A-UEchoix-S1-UC2",
  "2A-UEchoix-S2-UC3",
  "2A-UEchoix-S2-UC4",
  "2A-UEchoix-S2-UC5",
  "2A-UEchoix-S2-UC6",
  "2A-Projet-S2"
)
if (ncol(data) != 4L + length(course_cols)) {
  stop(
    "Expected ", 4L + length(course_cols), " columns in the input file, ",
    "found ", ncol(data), ".",
    call. = FALSE
  )
}
colnames(data)[5:ncol(data)] <- course_cols

# one_hot() encodes factor columns, so every character column is converted.
data <- data %>%
  mutate(across(where(is.character), as.factor))

selected_cols <- c(
  "dominante3A",
  "parcours",
  "domaine2A",
  "1AC-MI",
  "2A-UEchoix-S1-UC1",
  "2A-UEchoix-S1-UC2",
  "2A-UEchoix-S2-UC4",
  "2A-UEchoix-S2-UC3",
  "2A-UEchoix-S2-UC5",
  "2A-UEchoix-S2-UC6",
  "2A-Projet-S2",
  "1AC-OUVERTURE"
)

seuil_effectif <- 20L

# Dominantes above the threshold ----

# Keep the dominantes whose headcount is strictly greater than the threshold,
# in order of first appearance. Missing values are ignored when counting so
# that a single NA does not turn every count into NA.
dominante_values <- as.character(data[["dominante3A"]])
dominante_levels <- unique(dominante_values[!is.na(dominante_values)])
dominante_counts <- vapply(
  dominante_levels,
  function(dominante) sum(dominante_values == dominante, na.rm = TRUE),
  integer(1)
)
vec_dominantes <- dominante_levels[dominante_counts > seuil_effectif]

# Ask for a number of clusters until the answer is an integer in [1, n_obs].
ask_nb_clusters <- function(dominante, n_obs) {
  if (!interactive()) {
    stop(
      "The number of clusters must be entered in an interactive session.",
      call. = FALSE
    )
  }
  repeat {
    answer <- readline(prompt = paste0("Nb clusters ", dominante, " : "))
    nb_clust <- suppressWarnings(as.integer(answer))
    if (!is.na(nb_clust) && nb_clust >= 1L && nb_clust <= n_obs) {
      return(nb_clust)
    }
    message("Please enter an integer between 1 and ", n_obs, ".")
  }
}

# Clustering per dominante ----

for (dominante in vec_dominantes) {
  message("Dominante ", dominante)

  data_dominante <- data %>%
    filter(dominante3A == dominante)

  onehot_data <- one_hot(
    as.data.table(data_dominante),
    cols = selected_cols,
    sparsifyNAs = TRUE
  )

  # The first column is left out of the feature set.
  # NOTE(review): presumably this is the student identifier ("ine") - confirm.
  onehot_features <- onehot_data[, -1]

  # The binary distance works well on one-hot encoded data.
  dist_binary <- dist(x = onehot_features, method = "binary")
  hclust_avg <- hclust(dist_binary, method = "average")

  dhc <- as.dendrogram(hclust_avg)
  plotdata <- dendro_data(dhc, type = "rectangle")
  p <- ggplot(segment(plotdata)) +
    geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
    coord_flip() +
    scale_y_reverse(expand = c(0.2, 0)) +
    ggtitle(paste0("Dominante : ", dominante))
  # Values are not auto-printed inside a loop: the plot must be printed
  # explicitly to be displayed.
  print(p + theme_dendro())

  # Same feature set as the hierarchy above, so that the excluded first
  # column does not influence the suggested number of clusters.
  print(
    fviz_nbclust(onehot_features, FUNcluster = hcut) +
      ggtitle(paste0("Dominante : ", dominante))
  )

  nb_clust_dominante <- ask_nb_clusters(dominante, nrow(data_dominante))

  cut_avg <- cutree(hclust_avg, k = nb_clust_dominante)
  names(cut_avg) <- data_dominante[["ine"]]
  # Show the cluster sizes (explicit print needed inside the loop).
  print(table(cut_avg))

  data_dominante[["cluster"]] <- cut_avg
  write.csv(
    data_dominante,
    file.path(
      data_folder_path,
      paste0("04_", dominante, "_cah_results.csv")
    ),
    row.names = FALSE
  )
}