diff --git a/03_CAH_clust.R b/03_CAH_clust_toutes_dominantes.R similarity index 79% rename from 03_CAH_clust.R rename to 03_CAH_clust_toutes_dominantes.R index 08ddfaa..ca0780e 100644 --- a/03_CAH_clust.R +++ b/03_CAH_clust_toutes_dominantes.R @@ -6,7 +6,7 @@ library(here) library(ggplot2) library(ggdendro) library(factoextra) - +set.seed(1234) data <- data.frame(read.csv(file.path(here(), "data", "02_preprocessed_data.csv"), header = TRUE ), stringsAsFactors = TRUE) @@ -23,7 +23,7 @@ data <- data %>% mutate_if(sapply(data, is.character), as.factor) selected_cols <- c( - "parcours", "domaine2A", "1AC-MI", + "dominante3A", "parcours", "domaine2A", "1AC-MI", "2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC4", "2A-UEchoix-S2-UC3", "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6", "2A-Projet-S2", "1AC-OUVERTURE" @@ -32,7 +32,7 @@ selected_cols <- c( onehot_data <- one_hot(as.data.table(data), cols = selected_cols, sparsifyNAs = TRUE) #  Fonctionne bien avec binary -dist_eucl <- dist(x = onehot_data[, -c(1, 2, 3)], method = "binary") +dist_eucl <- dist(x = onehot_data[, - 1], method = "binary") hclust_avg <- hclust(dist_eucl, method = "average") @@ -45,10 +45,12 @@ p <- ggplot(segment(plotdata)) + scale_y_reverse(expand = c(0.2, 0)) p + theme_dendro() -fviz_nbclust(onehot_data[, -c(1,2,3)], FUNcluster = hcut, k.max = 30) +fviz_nbclust(onehot_data[, -c(1, 2, 3)], FUNcluster = hcut, k.max = 30, method = "wss") cut_avg <- cutree(hclust_avg, k = 6) names(cut_avg) <- data[["ine"]] table(cut_avg) -onehot_data[["cluster"]] <- cut_avg \ No newline at end of file +data[["cluster"]] <- cut_avg + +write.csv(data, file.path(here(), "data", "03_cah_results.csv"), row.names = FALSE) diff --git a/04_CAH_clust_par_dominantes.R b/04_CAH_clust_par_dominantes.R new file mode 100644 index 0000000..ec939a8 --- /dev/null +++ b/04_CAH_clust_par_dominantes.R @@ -0,0 +1,89 @@ +library(data.table) +library(mltools) +library(dplyr) +library(tidyr) +library(here) +library(ggplot2) +library(ggdendro) +library(factoextra) + +set.seed(1234) +data <- data.frame(read.csv(file.path(here(), "data", "02_preprocessed_data.csv"), + header = TRUE +), stringsAsFactors = TRUE) + +data_folder_path <- file.path(here(), "data", "04_CAH_par_dominante") + +if (!dir.exists(data_folder_path)) { + dir.create(data_folder_path) +} + +colnames(data)[5:ncol(data)] <- c( + "1AC-OUVERTURE", "1AC-MI", + "2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC3", + "2A-UEchoix-S2-UC4", + "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6", + "2A-Projet-S2" +) + +data <- data %>% + mutate_if(sapply(data, is.character), as.factor) + +selected_cols <- c( + "dominante3A", "parcours", "domaine2A", "1AC-MI", + "2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC4", + "2A-UEchoix-S2-UC3", "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6", + "2A-Projet-S2", "1AC-OUVERTURE" +) + +seuil_effectif <- 20L + +#  On récupère les dominantes supérieures au seuil +vec_dominantes <- na.omit(sapply(unique(data[["dominante3A"]]), function(dominante) { + ifelse(sum(data[["dominante3A"]] == dominante) > seuil_effectif, + as.character(dominante), NA + ) +})) + +for (dominante in vec_dominantes) { + message("Dominante ", dominante) + data_dominante <- data %>% filter(dominante3A == dominante) + onehot_data <- one_hot(as.data.table(data_dominante), cols = selected_cols, sparsifyNAs = TRUE) + + #  Fonctionne bien avec binary + dist <- dist(x = onehot_data[, -1], method = "binary") + + hclust_avg <- hclust(dist, method = "average") + + dhc <- as.dendrogram(hclust_avg) + + plotdata <- dendro_data(dhc, type = "rectangle") + p <- ggplot(segment(plotdata)) + + geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + + coord_flip() + + scale_y_reverse(expand = c(0.2, 0)) + + ggtitle(paste0("Dominante : ", dominante)) + p + theme_dendro() + + print(fviz_nbclust(onehot_data, + FUNcluster = hcut + )+ + ggtitle(paste0("Dominante : ", dominante))) + + nb_clust_dominante <- as.integer(readline(prompt = paste0( + "Nb clusters ", + dominante, + " : " + ))) + + cut_avg <- cutree(hclust_avg, k = nb_clust_dominante) + names(cut_avg) <- data_dominante[["ine"]] + table(cut_avg) + + data_dominante[["cluster"]] <- cut_avg + + write.csv(data_dominante, file.path( + data_folder_path, + paste0("04_", dominante, "_cah_results.csv") + ), row.names = FALSE) +}