171 lines
No EOL
5.6 KiB
R
171 lines
No EOL
5.6 KiB
R
library(readxl)
|
||
library(dplyr)
|
||
library(here)
|
||
|
||
# Reference workbook: Supplement_diplome-1.xls, line 4.
# Folder holding the workbooks to aggregate (relative to the working
# directory), then the `prenom` column and row positions
# (presumably the first-name cell of the reference workbook -- to confirm).
folder_path <- file.path("data", "Bulletins promotion 2023")
colonne_prenom <- 50
ligne_prenom <- 5
|
||
|
||
# Read the first sheet of every file found in `folder_path` (no header row)
# and stack, in listing order, the sheets that have exactly 63 columns.
# Sheets with any other width are left out of `df`.
df <- local({
  expected_ncol <- 63L

  sheets <- lapply(list.files(folder_path), function(filename) {
    workbook <- file.path(folder_path, filename)
    as.data.frame(read_excel(workbook, sheet = 1L, col_names = FALSE))
  })

  has_expected_width <- vapply(
    sheets,
    function(sheet) ncol(sheet) == expected_ncol,
    logical(1)
  )

  do.call("rbind", sheets[has_expected_width])
})
|
||
|
||
# Disabled alternative reader, kept for reference:
# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
#   current_data <- as.data.frame(read_excel(file.path(folder_path,
#                                                      filename),
#                                            sheet = 1L,
#                                            col_names = FALSE))
#   if (ncol(current_data) != 63L) {
#     current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
#     current_data <- current_data %>% relocate(starts_with("missing"))
#     colnames(current_data) <- paste0("...", seq(1,63))
#     current_data
#   }
# }))
|
||
|
||
|
||
# Names of the files in `folder_path` whose first sheet does not have exactly
# 63 columns.
#
# The result is always a character vector named by its own values (empty when
# every file has 63 columns). vapply() keeps the type stable: sapply() with a
# function that returns NULL for accepted files yields a list padded with NULL
# as soon as one file is accepted, and a character vector only when all files
# are rejected.
not_selected_files <- local({
  filenames <- list.files(folder_path)

  is_rejected <- vapply(
    filenames,
    function(filename) {
      current_data <- read_excel(
        file.path(folder_path, filename),
        sheet = 1L,
        col_names = FALSE
      )
      ncol(current_data) != 63L
    },
    logical(1),
    USE.NAMES = FALSE
  )

  rejected <- filenames[is_rejected]
  # Same names sapply() attached when it returned a character vector.
  setNames(rejected, rejected)
})
|
||
|
||
## Adjust the values below for other table layouts.

# Fixed column positions. By sorting the data frames beforehand we get the
# same column positions everywhere (give or take a few NAs).
col_domaine2A <- 15L
col_parcours <- 23L
col_dominante3A <- 36L
col_ECTS <- 63

# Everything is located relative to the INE label: one (row, col) pair per
# cell of `df` equal to "Numéro INE".
indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)

# Offsets relative to the INE label cell.
# (row, col) offset of the actual INE value:
dec_ine_reel <- c(1, 6)
# Row offsets of the domaine2A, dominante3A and parcours cells:
dec_row_domaine2A_ine <- 29L
dec_row_dominante3A_ine <- 31L
dec_row_parcours_ine <- 32L
# Row and column offsets of the first course line:
dec_row_cours_ine <- 35
dec_col_cours_ine <- -23
|
||
|
||
# Build one long data frame with one row per course line of every record
# found in `df`, then stack the records with rbind().
#
# Each row of `indices_texte_ine` is the (row, col) position of an INE label
# cell; every other value is read at a fixed offset from that cell. Columns of
# the result: ine, cours, ects, annee, type_annee, bloc, parcours, domaine2A,
# dominante3A. A record left with no course line is skipped with a warning
# (rbind() ignores the NULL it returns).
full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
  # Position of the INE label of this record.
  current_row <- indices_texte_ine[idx, 1]
  current_col <- indices_texte_ine[idx, 2]

  # A record ends where the next label starts; the last record runs to the end
  # of `df` and is cleaned up afterwards. Scalar condition, so plain if/else.
  if (idx != nrow(indices_texte_ine)) {
    next_row <- indices_texte_ine[idx + 1, 1]
  } else {
    next_row <- nrow(df)
  }

  # Actual INE value.
  ine <- df[[
    current_row + dec_ine_reel[[1]],
    current_col + dec_ine_reel[[2]]
  ]]

  # Record-level attributes.
  parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
  domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
  dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]

  # Rows holding the course lines of this record. Rows past the end of `df`
  # come back as NA and are removed by the blank-line filter below.
  course_rows <- seq(
    current_row + dec_row_cours_ine,
    next_row + dec_row_cours_ine - 1
  )
  cours <- df[course_rows, current_col + dec_col_cours_ine]
  ects <- df[course_rows, col_ECTS]

  longdata <- data.frame(ine = ine, cours = cours, ects = ects)

  # Drop blank lines, repeated column headers and bare "Ing" lines. `%in%`
  # never yields NA, so the masks are safe to negate even with missing cells.
  is_blank <- is.na(longdata[["cours"]]) & is.na(longdata[["ects"]])
  longdata <- longdata[!is_blank, ]
  is_header <- longdata[["cours"]] %in% "Intitulé du cours" &
    longdata[["ects"]] %in% "Ects"
  longdata <- longdata[!is_header, ]
  longdata <- longdata[!(longdata[["cours"]] %in% "Ing"), ]

  # Each "Ing - <n>A" line opens a cycle that runs until the next such line
  # (or the last line). Lines before the first cycle keep "Unknown" / "".
  cycle_starts <- grep("Ing - [1-3]A", longdata[["cours"]])
  cycle_bounds <- c(cycle_starts, nrow(longdata))
  # Academic-year labels are read from column 8 of the same rows; the k-th
  # label found is paired with the k-th cycle (NA when there are fewer labels).
  annee_cycle_ing <- grep(
    "20[0-9][0-9]-[0-9][0-9]",
    df[course_rows, 8],
    value = TRUE
  )
  type_annee <- rep("Unknown", nrow(longdata))
  annee <- rep("", nrow(longdata))
  for (k in seq_along(cycle_starts)) {
    cycle_span <- seq(cycle_bounds[k], cycle_bounds[k + 1])
    type_annee[cycle_span] <- longdata[["cours"]][cycle_bounds[k]]
    annee[cycle_span] <- annee_cycle_ing[k]
  }
  longdata[["annee"]] <- annee
  longdata[["type_annee"]] <- type_annee

  # The cycle lines themselves are not courses.
  is_cycle_line <- grepl("Ing - [0-9]{1}A", longdata[["cours"]])
  longdata <- longdata[!is_cycle_line, ]

  # Nothing left: skip the record instead of failing on the scalar column
  # assignments below, which cannot be recycled into a zero-row data frame.
  if (nrow(longdata) == 0L) {
    warning(
      "No course line found for the INE label at row ", current_row,
      "; record skipped.",
      call. = FALSE
    )
    return(NULL)
  }

  # A line with a non-missing ECTS value names a block; the following lines
  # inherit that name until the next block line.
  bloc <- longdata[["cours"]]
  bloc[is.na(longdata[["ects"]])] <- NA
  # Forward fill: for every line, the index of the latest named line so far
  # (0 when there is none yet, which maps to the leading NA below).
  last_named <- cummax(ifelse(!is.na(bloc), seq_along(bloc), 0L))
  longdata[["bloc"]] <- c(NA, bloc)[last_named + 1L]

  # Record-level attributes, repeated on every line.
  longdata[["parcours"]] <- parcours
  longdata[["domaine2A"]] <- domaine2A
  longdata[["dominante3A"]] <- dominante3A

  for (factor_col in c("cours", "ine", "type_annee", "annee", "bloc")) {
    longdata[[factor_col]] <- as.factor(longdata[[factor_col]])
  }
  longdata
}))
|
||
|
||
# Export the aggregated table, without row names, to the `data` folder located
# under the path returned by here().
write.csv(
  x = full,
  file = file.path(here(), "data", "01_aggregated_dataframe_suppdiplome.csv"),
  row.names = FALSE
)