Ajout scripts CAH
This commit is contained in:
parent
be61e5f6f8
commit
58a4599213
2 changed files with 96 additions and 5 deletions
|
|
@ -6,7 +6,7 @@ library(here)
|
||||||
library(ggplot2)
|
library(ggplot2)
|
||||||
library(ggdendro)
|
library(ggdendro)
|
||||||
library(factoextra)
|
library(factoextra)
|
||||||
|
set.seed(1234)
|
||||||
data <- data.frame(read.csv(file.path(here(), "data", "02_preprocessed_data.csv"),
|
data <- data.frame(read.csv(file.path(here(), "data", "02_preprocessed_data.csv"),
|
||||||
header = TRUE
|
header = TRUE
|
||||||
), stringsAsFactors = TRUE)
|
), stringsAsFactors = TRUE)
|
||||||
|
|
@ -23,7 +23,7 @@ data <- data %>%
|
||||||
mutate_if(sapply(data, is.character), as.factor)
|
mutate_if(sapply(data, is.character), as.factor)
|
||||||
|
|
||||||
selected_cols <- c(
|
selected_cols <- c(
|
||||||
"parcours", "domaine2A", "1AC-MI",
|
"dominante3A", "parcours", "domaine2A", "1AC-MI",
|
||||||
"2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC4",
|
"2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC4",
|
||||||
"2A-UEchoix-S2-UC3", "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
|
"2A-UEchoix-S2-UC3", "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
|
||||||
"2A-Projet-S2", "1AC-OUVERTURE"
|
"2A-Projet-S2", "1AC-OUVERTURE"
|
||||||
|
|
@ -32,7 +32,7 @@ selected_cols <- c(
|
||||||
onehot_data <- one_hot(as.data.table(data), cols = selected_cols, sparsifyNAs = TRUE)
|
onehot_data <- one_hot(as.data.table(data), cols = selected_cols, sparsifyNAs = TRUE)
|
||||||
|
|
||||||
# Fonctionne bien avec binary
|
# Fonctionne bien avec binary
|
||||||
dist_eucl <- dist(x = onehot_data[, -c(1, 2, 3)], method = "binary")
|
dist_eucl <- dist(x = onehot_data[, - 1], method = "binary")
|
||||||
|
|
||||||
hclust_avg <- hclust(dist_eucl, method = "average")
|
hclust_avg <- hclust(dist_eucl, method = "average")
|
||||||
|
|
||||||
|
|
@ -45,10 +45,12 @@ p <- ggplot(segment(plotdata)) +
|
||||||
scale_y_reverse(expand = c(0.2, 0))
|
scale_y_reverse(expand = c(0.2, 0))
|
||||||
p + theme_dendro()
|
p + theme_dendro()
|
||||||
|
|
||||||
fviz_nbclust(onehot_data[, -c(1,2,3)], FUNcluster = hcut, k.max = 30)
|
fviz_nbclust(onehot_data[, -c(1, 2, 3)], FUNcluster = hcut, k.max = 30, method = "wss")
|
||||||
|
|
||||||
cut_avg <- cutree(hclust_avg, k = 6)
|
cut_avg <- cutree(hclust_avg, k = 6)
|
||||||
names(cut_avg) <- data[["ine"]]
|
names(cut_avg) <- data[["ine"]]
|
||||||
table(cut_avg)
|
table(cut_avg)
|
||||||
|
|
||||||
onehot_data[["cluster"]] <- cut_avg
|
data[["cluster"]] <- cut_avg
|
||||||
|
|
||||||
|
write.csv(data, file.path(here(), "data", "03_cah_results.csv"), row.names = FALSE)
|
||||||
89
04_CAH_clust_par_dominantes.R
Normal file
89
04_CAH_clust_par_dominantes.R
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
library(data.table)
|
||||||
|
library(mltools)
|
||||||
|
library(dplyr)
|
||||||
|
library(tidyr)
|
||||||
|
library(here)
|
||||||
|
library(ggplot2)
|
||||||
|
library(ggdendro)
|
||||||
|
library(factoextra)
|
||||||
|
|
||||||
|
set.seed(1234)
|
||||||
|
data <- data.frame(read.csv(file.path(here(), "data", "02_preprocessed_data.csv"),
|
||||||
|
header = TRUE
|
||||||
|
), stringsAsFactors = TRUE)
|
||||||
|
|
||||||
|
data_folder_path <- file.path(here(), "data", "04_CAH_par_dominante")
|
||||||
|
|
||||||
|
if (!dir.exists(data_folder_path)) {
|
||||||
|
dir.create(data_folder_path)
|
||||||
|
}
|
||||||
|
|
||||||
|
colnames(data)[5:ncol(data)] <- c(
|
||||||
|
"1AC-OUVERTURE", "1AC-MI",
|
||||||
|
"2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC3",
|
||||||
|
"2A-UEchoix-S2-UC4",
|
||||||
|
"2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
|
||||||
|
"2A-Projet-S2"
|
||||||
|
)
|
||||||
|
|
||||||
|
data <- data %>%
|
||||||
|
mutate_if(sapply(data, is.character), as.factor)
|
||||||
|
|
||||||
|
selected_cols <- c(
|
||||||
|
"dominante3A", "parcours", "domaine2A", "1AC-MI",
|
||||||
|
"2A-UEchoix-S1-UC1", "2A-UEchoix-S1-UC2", "2A-UEchoix-S2-UC4",
|
||||||
|
"2A-UEchoix-S2-UC3", "2A-UEchoix-S2-UC5", "2A-UEchoix-S2-UC6",
|
||||||
|
"2A-Projet-S2", "1AC-OUVERTURE"
|
||||||
|
)
|
||||||
|
|
||||||
|
seuil_effectif <- 20L
|
||||||
|
|
||||||
|
# On récupère les dominantes supérieures au seuil
|
||||||
|
vec_dominantes <- na.omit(sapply(unique(data[["dominante3A"]]), function(dominante) {
|
||||||
|
ifelse(sum(data[["dominante3A"]] == dominante) > seuil_effectif,
|
||||||
|
as.character(dominante), NA
|
||||||
|
)
|
||||||
|
}))
|
||||||
|
|
||||||
|
for (dominante in vec_dominantes) {
|
||||||
|
message("Dominante ", dominante)
|
||||||
|
data_dominante <- data %>% filter(dominante3A == dominante)
|
||||||
|
onehot_data <- one_hot(as.data.table(data_dominante), cols = selected_cols, sparsifyNAs = TRUE)
|
||||||
|
|
||||||
|
# Fonctionne bien avec binary
|
||||||
|
dist <- dist(x = onehot_data[, -1], method = "binary")
|
||||||
|
|
||||||
|
hclust_avg <- hclust(dist, method = "average")
|
||||||
|
|
||||||
|
dhc <- as.dendrogram(hclust_avg)
|
||||||
|
|
||||||
|
plotdata <- dendro_data(dhc, type = "rectangle")
|
||||||
|
p <- ggplot(segment(plotdata)) +
|
||||||
|
geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) +
|
||||||
|
coord_flip() +
|
||||||
|
scale_y_reverse(expand = c(0.2, 0)) +
|
||||||
|
ggtitle(paste0("Dominante : ", dominante))
|
||||||
|
p + theme_dendro()
|
||||||
|
|
||||||
|
print(fviz_nbclust(onehot_data,
|
||||||
|
FUNcluster = hcut
|
||||||
|
)+
|
||||||
|
ggtitle(paste0("Dominante : ", dominante)))
|
||||||
|
|
||||||
|
nb_clust_dominante <- as.integer(readline(prompt = paste0(
|
||||||
|
"Nb clusters ",
|
||||||
|
dominante,
|
||||||
|
" : "
|
||||||
|
)))
|
||||||
|
|
||||||
|
cut_avg <- cutree(hclust_avg, k = nb_clust_dominante)
|
||||||
|
names(cut_avg) <- data_dominante[["ine"]]
|
||||||
|
table(cut_avg)
|
||||||
|
|
||||||
|
data_dominante[["cluster"]] <- cut_avg
|
||||||
|
|
||||||
|
write.csv(data_dominante, file.path(
|
||||||
|
data_folder_path,
|
||||||
|
paste0("04_", dominante, "_cah_results.csv")
|
||||||
|
), row.names = FALSE)
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue