# reforme-enseignement/extract_data.R
#
# NOTE: the hosting page flagged this file as containing invisible Unicode
# characters (probably non-breaking spaces inside comments). They look like
# ordinary spaces but may be processed differently by some tools.

library(readxl)
library(dplyr)
library(here)
# Folder holding the transcript workbooks, relative to the working directory.
folder_path <- file.path("data", "Bulletins promotion 2023")
# Row / column of the first-name cell (presumably; inferred from the names).
# Original note: "Supplement_diplome-1.xls, line 4" -- TODO confirm how that
# relates to the value 5 used here.
ligne_prenom <- 5
colonne_prenom <- 50
# Read the first sheet of every file found in `folder_path` (no header row)
# and stack the sheets that have exactly 63 columns into `df`. Sheets with any
# other column count yield NULL, which rbind() ignores.
df <- local({
  workbook_paths <- list.files(folder_path, full.names = TRUE)
  sheets <- lapply(workbook_paths, function(path) {
    sheet <- as.data.frame(
      read_excel(path, sheet = 1L, col_names = FALSE)
    )
    if (ncol(sheet) == 63L) sheet else NULL
  })
  do.call("rbind", sheets)
})
# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
# current_data <- as.data.frame(read_excel(file.path(folder_path,
# filename),
# sheet = 1L,
# col_names = FALSE))
# if (ncol(current_data) != 63L) {
# current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
# current_data <- current_data %>% relocate(starts_with("missing"))
# colnames(current_data) <- paste0("...", seq(1,63))
# current_data
# }
# }))
# Names of the files in `folder_path` whose first sheet does not have exactly
# 63 columns, i.e. the files that are not stacked into `df`.
#
# Result: a character vector named by file name; it is empty when every file
# has 63 columns. vapply() keeps the result type stable: sapply() would return
# a list padded with NULL entries as soon as one file passes the check, and a
# character vector otherwise.
not_selected_files <- local({
  filenames <- list.files(folder_path)
  is_rejected <- vapply(filenames, function(filename) {
    current_data <- read_excel(
      file.path(folder_path, filename),
      sheet = 1L,
      col_names = FALSE
    )
    ncol(current_data) != 63L
  }, logical(1))
  # Keep the file names as element names, as sapply() did for character input.
  setNames(filenames, filenames)[is_rejected]
})
## To be adjusted for other table layouts.
# Every lookup below is anchored on the cells containing the INE label.
# Result: a matrix with one (row, col) pair per matching cell of `df`.
indices_texte_ine <- which(
  df == "Numéro INE",
  arr.ind = TRUE
)
# Because the data frames were filtered beforehand, these columns sit at the
# same position in every file (give or take a few NAs).
col_parcours <- 23L
col_domaine2A <- 15L
col_dominante3A <- 36L
col_ECTS <- 63

# Offsets relative to the INE label cell.
# (row offset, column offset) from the label to the actual INE value:
dec_ine_reel <- c(1, 6)
# Column / row offset from the label to the start of the course list:
dec_col_cours_ine <- -23
dec_row_cours_ine <- 35
# Row offsets from the label to the parcours / domaine2A / dominante3A cells:
dec_row_parcours_ine <- 32L
dec_row_domaine2A_ine <- 29L
dec_row_dominante3A_ine <- 31L
# Build the long-format table `full`: one row per course line of every
# transcript stacked in `df`.
#
# Each transcript is located through its "Numéro INE" label cell (one row of
# `indices_texte_ine` per transcript); every other cell is read at a fixed
# offset from that anchor using the `dec_*` / `col_*` values.
#
# Each iteration returns a data frame with the columns ine, cours, ects, annee,
# type_annee, bloc, parcours, domaine2A, dominante3A, or NULL (with a warning)
# when no course row is left after cleaning. rbind() ignores NULL entries.
full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
  # Column scanned for the "20xx-yy" academic-year labels.
  col_annee <- 8L

  # Anchor cell of this transcript.
  current_row <- indices_texte_ine[[idx, 1L]]
  current_col <- indices_texte_ine[[idx, 2L]]
  # The transcript extends up to the next anchor. For the last one we run to
  # the end of the table and clean up afterwards.
  if (idx != nrow(indices_texte_ine)) {
    next_row <- indices_texte_ine[[idx + 1L, 1L]]
  } else {
    next_row <- nrow(df)
  }

  # Scalar cells read at fixed offsets from the anchor.
  ine <- df[[
    current_row + dec_ine_reel[[1]],
    current_col + dec_ine_reel[[2]]
  ]]
  parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
  domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
  dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]

  # Rows of the course list. For the last transcript this range runs past the
  # end of `df`; those rows read as NA and are dropped just below.
  rows_cours <- seq(
    current_row + dec_row_cours_ine,
    next_row + dec_row_cours_ine - 1
  )
  cours <- df[rows_cours, current_col + dec_col_cours_ine]
  ects <- df[rows_cours, col_ECTS]
  longdata <- data.frame(ine = ine, cours = cours, ects = ects)

  # Drop rows where both the course label and the ECTS cell are empty.
  is_empty <- is.na(longdata[["cours"]]) & is.na(longdata[["ects"]])
  longdata <- longdata[!is_empty, ]

  # Drop the repeated column-header rows and the bare "Ing" heading. `%in%`
  # never yields NA, so rows with a missing cell are kept.
  is_header <- longdata[["cours"]] %in% "Intitulé du cours" &
    longdata[["ects"]] %in% "Ects"
  is_ing_heading <- longdata[["cours"]] %in% "Ing"
  longdata <- longdata[!(is_header | is_ing_heading), ]

  # Rows matching "Ing - 1A" .. "Ing - 3A" open a cycle: every row from one
  # such header to the next (or to the last row) gets that header's label.
  # The k-th year label found in `col_annee` is paired with the k-th header
  # -- assumes both appear in the same order; verify against the workbooks.
  id_cycle_ing <- c(
    which(grepl("Ing - [1-3]A", longdata[["cours"]])),
    nrow(longdata)
  )
  annee_cycle_ing <- grep(
    "20[0-9][0-9]-[0-9][0-9]",
    df[rows_cours, col_annee],
    value = TRUE
  )
  type_annee <- rep("Unknown", nrow(longdata))
  annee <- rep("", nrow(longdata))
  # `k`, not `idx`: do not shadow the transcript index of the enclosing call.
  for (k in seq_len(length(id_cycle_ing) - 1L)) {
    span <- seq(id_cycle_ing[k], id_cycle_ing[k + 1L])
    type_annee[span] <- longdata[["cours"]][id_cycle_ing[k]]
    annee[span] <- annee_cycle_ing[k]
  }
  longdata[["annee"]] <- annee
  longdata[["type_annee"]] <- type_annee

  # The cycle header rows themselves are removed from the course list.
  longdata <- longdata[!grepl("Ing - [0-9]{1}A", longdata[["cours"]]), ]

  # Nothing left: skip this transcript instead of failing on the scalar
  # column assignments below (they require at least one row).
  if (nrow(longdata) == 0L) {
    warning(
      "No course rows found for INE ", ine, "; transcript skipped.",
      call. = FALSE
    )
    return(NULL)
  }

  # Blocks: a row with a non-NA ECTS value names a block (its `cours` label);
  # the rows below inherit it until the next such row (forward fill). Rows
  # before the first block keep NA. The cumsum() form also works for a
  # single-row table, where a `2:nrow()` loop would count down and fail.
  bloc <- rep(NA_character_, nrow(longdata))
  indices_blocs <- which(!is.na(longdata[["ects"]]))
  bloc[indices_blocs] <- as.character(longdata[["cours"]][indices_blocs])
  n_seen <- cumsum(!is.na(bloc))
  longdata[["bloc"]] <- c(NA_character_, bloc[!is.na(bloc)])[n_seen + 1L]

  # Transcript-level attributes, repeated on every row.
  longdata[["parcours"]] <- parcours
  longdata[["domaine2A"]] <- domaine2A
  longdata[["dominante3A"]] <- dominante3A

  for (col in c("cours", "ine", "type_annee", "annee", "bloc")) {
    longdata[[col]] <- as.factor(longdata[[col]])
  }
  longdata
}))
# Export the aggregated table as CSV into the "data" folder under the project
# root returned by here(), without a row-name column.
write.csv(
  x = full,
  file = file.path(here(), "data", "aggregated_dataframe_suppdiplome.csv"),
  row.names = FALSE
)