# 03_CAH_clust.R ----
# Agglomerative hierarchical clustering (CAH) of the preprocessed data:
# one-hot encode the categorical columns, compute a binary distance, build
# an average-linkage tree, plot it, and store a 6-cluster assignment.
#
# Recovered from patch df54cf9 ("CAH file", Louis, 2024-05-04), which
# creates this file; the patch wrapper and "+" diff markers were dropped.

library(data.table)
library(mltools)
library(dplyr)
library(tidyr)
library(here)
library(ggplot2)
library(ggdendro)
library(factoextra)

# Load data ----

# Text columns are read directly as factors; one_hot() below is given factor
# columns to encode.
data <- read.csv(
  here("data", "02_preprocessed_data.csv"),
  header = TRUE,
  stringsAsFactors = TRUE
)

# Columns 5 onwards are renamed positionally, so the file must contain
# exactly four leading columns followed by these nine.
course_cols <- c(
  "1AC-OUVERTURE", "1AC-MI",
  "2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC3",
  "2A-UEchoix-S2-UC4",
  "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
  "2A-Projet-S2"
)
first_course_idx <- 5L

# Fail loudly instead of silently recycling/mislabelling column names.
stopifnot(ncol(data) == first_course_idx - 1L + length(course_cols))
colnames(data)[first_course_idx:ncol(data)] <- course_cols

# One-hot encoding ----

selected_cols <- c(
  "parcours", "domaine2A", "1AC-MI",
  "2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC4",
  "2A-UEchoix-S2-UC3", "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
  "2A-Projet-S2", "1AC-OUVERTURE"
)

# "ine" is used further down to label the cluster assignments; a missing
# column would otherwise yield NULL and leave the clusters unnamed.
stopifnot(all(c("ine", selected_cols) %in% colnames(data)))

onehot_data <- one_hot(
  as.data.table(data),
  cols = selected_cols,
  sparsifyNAs = TRUE
)

# Clustering input: everything except the first three columns.
# NOTE(review): positional selection; presumably the first three columns of
# onehot_data are identifiers rather than indicator columns. TODO confirm.
cluster_features <- onehot_data[, -c(1, 2, 3)]

# Hierarchical clustering ----

# Author's note: the "binary" distance works well on this data.
# The object keeps its historical name `dist_eucl`, but it holds the binary
# distance computed by stats::dist(), not a Euclidean one.
dist_eucl <- dist(x = cluster_features, method = "binary")

hclust_avg <- hclust(dist_eucl, method = "average")

dhc <- as.dendrogram(hclust_avg)

# Dendrogram drawn with ggplot2 from the segments extracted by ggdendro.
plotdata <- dendro_data(dhc, type = "rectangle")
p <- ggplot(segment(plotdata)) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
  coord_flip() +
  scale_y_reverse(expand = c(0.2, 0))
p + theme_dendro()

# Number-of-clusters diagnostic, evaluated on the same model as above
# (binary distance, average linkage). Without `diss`, `hc_method` and
# `hc_metric`, fviz_nbclust()/hcut() fall back to Euclidean distance and
# Ward linkage, i.e. a different clustering from the one that is cut below.
fviz_nbclust(
  cluster_features,
  FUNcluster = hcut,
  diss = dist_eucl,
  k.max = 30,
  hc_method = "average",
  hc_metric = "binary"
)

# Cluster assignment ----

n_clusters <- 6L
cut_avg <- cutree(hclust_avg, k = n_clusters)
names(cut_avg) <- data[["ine"]]
table(cut_avg)

onehot_data[["cluster"]] <- cut_avg