# reforme-enseignement/extract_data.R

library(readxl)
library(dplyr)
library(here)

# Reference workbook: Supplement_diplome-1.xls, row 4.
# NOTE(review): the note above cites row 4 while `ligne_prenom` is 5 — confirm
# which one is intended.
colonne_prenom <- 50
ligne_prenom <- 5

# Folder holding the workbooks to aggregate (relative to the working directory).
folder_path <- file.path("data", "Bulletins promotion 2023")

# Read the first sheet of every file found in `folder_path` without treating
# any row as a header, and stack the sheets that have exactly 63 columns.
# Sheets with any other width yield NULL, which rbind() ignores.
df <- do.call(rbind, lapply(
    list.files(folder_path),
    function(filename) {
        sheet <- read_excel(
            file.path(folder_path, filename),
            sheet = 1L,
            col_names = FALSE
        )
        sheet <- as.data.frame(sheet)
        if (ncol(sheet) != 63L) {
            return(NULL)
        }
        sheet
    }
))

# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
#     current_data <- as.data.frame(read_excel(file.path(folder_path,
#     filename),
#     sheet = 1L,
#     col_names = FALSE))
#     if (ncol(current_data) != 63L) {
#         current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
#         current_data <- current_data %>% relocate(starts_with("missing"))
#         colnames(current_data) <- paste0("...", seq(1,63))
#         current_data
#     }
# }))


# Names of the files in `folder_path` whose first sheet does not have exactly
# 63 columns, i.e. the files rejected by the 63-column test used to build `df`.
#
# The result is always a character vector named by its own values (empty when
# every file has 63 columns). The previous sapply() call returned a named list
# padded with NULLs when only some files were rejected and a character vector
# when all of them were, so its type depended on the data.
not_selected_files <- local({
    filenames <- list.files(folder_path)
    has_other_width <- vapply(
        filenames,
        function(filename) {
            sheet <- read_excel(
                file.path(folder_path, filename),
                sheet = 1L,
                col_names = FALSE
            )
            ncol(sheet) != 63L
        },
        logical(1)
    )
    rejected <- filenames[has_other_width]
    setNames(rejected, rejected)
})

## To be adjusted for other tables.
# Everything is located relative to the cell holding the INE label: one
# (row, col) pair per occurrence of the label in `df`.
indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)

# Offsets relative to the INE label cell.
dec_ine_reel <- c(1, 6) # (row, column) offset of the actual INE value
dec_row_cours_ine <- 35
dec_col_cours_ine <- -23
dec_row_parcours_ine <- 32L
dec_row_dominante3A_ine <- 31L
dec_row_domaine2A_ine <- 29L

# Because the data frames were sorted beforehand, these columns sit at the
# same positions in every section (give or take a few NAs).
col_ECTS <- 63
col_dominante3A <- 36L
col_parcours <- 23L
col_domaine2A <- 15L

# Build one long-format table with one row per course line.
#
# For every "Numéro INE" label located in `df` (see `indices_texte_ine`), the
# rows of that section are extracted, header/separator rows are dropped, and
# each remaining row is annotated with the INE, the academic year and cycle
# label it falls under, the block it belongs to, and the section-level
# parcours / domaine2A / dominante3A values. Sections are then stacked with
# rbind(). A section left with no rows after cleaning is skipped with a
# warning (it contributes NULL, which rbind() ignores).
full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
    # Position of the INE label for this section.
    current_row <- indices_texte_ine[[idx, 1L]]
    current_col <- indices_texte_ine[[idx, 2L]]

    # The section runs up to the next INE label. The last section runs to the
    # end of the table; the rows read past the end come back as NA and are
    # removed by the cleaning steps below.
    if (idx < nrow(indices_texte_ine)) {
        next_row <- indices_texte_ine[[idx + 1L, 1L]]
    } else {
        next_row <- nrow(df)
    }

    # Section-level values, read at fixed offsets from the INE label.
    ine <- df[[
        current_row + dec_ine_reel[[1]],
        current_col + dec_ine_reel[[2]]
    ]]
    parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
    domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
    dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]

    # Rows of `df` holding this section's course lines.
    course_rows <- seq(
        current_row + dec_row_cours_ine,
        next_row + dec_row_cours_ine - 1
    )
    # Column searched for year strings matching 20YY-YY.
    col_annee <- 8L

    longdata <- data.frame(
        ine = ine,
        cours = df[course_rows, current_col + dec_col_cours_ine],
        ects = df[course_rows, col_ECTS]
    )

    # Drop rows where both the course name and the ECTS value are missing.
    is_blank <- is.na(longdata[["cours"]]) & is.na(longdata[["ects"]])
    longdata <- longdata[!is_blank, ]

    # Drop the repeated column-header rows.
    row_remove_intit_cours <- which(
        longdata[["cours"]] == "Intitulé du cours" & longdata[["ects"]] == "Ects"
    )
    if (length(row_remove_intit_cours) > 0L) {
        longdata <- longdata[-row_remove_intit_cours, ]
    }
    # Drop the bare "Ing" rows.
    row_remove_ing <- which(longdata[["cours"]] == "Ing")
    if (length(row_remove_ing) > 0L) {
        longdata <- longdata[-row_remove_ing, ]
    }

    # Rows matching "Ing - 1A" .. "Ing - 3A" open a cycle year. Every row from
    # one such header to the next (or to the end of the section) gets that
    # header's label and the matching year string found in `col_annee`.
    id_cycle_ing <- c(
        grep("Ing - [1-3]A", longdata[["cours"]]),
        nrow(longdata)
    )
    annee_cycle_ing <- grep(
        "20[0-9][0-9]-[0-9][0-9]",
        df[course_rows, col_annee],
        value = TRUE
    )
    type_annee <- rep("Unknown", nrow(longdata))
    annee <- rep("", nrow(longdata))
    for (cycle in seq_len(length(id_cycle_ing) - 1L)) {
        span <- seq(id_cycle_ing[cycle], id_cycle_ing[cycle + 1L])
        type_annee[span] <- longdata[["cours"]][id_cycle_ing[cycle]]
        annee[span] <- annee_cycle_ing[cycle]
    }
    longdata[["annee"]] <- annee
    longdata[["type_annee"]] <- type_annee

    # The cycle header rows themselves are not courses.
    row_remove_annee_cycle_ing <- grep("Ing - [0-9]{1}A", longdata[["cours"]])
    if (length(row_remove_annee_cycle_ing) > 0L) {
        longdata <- longdata[-row_remove_annee_cycle_ing, ]
    }

    # Nothing left for this section: skip it instead of failing further down
    # when scalar values are assigned to a zero-row data frame.
    if (nrow(longdata) == 0L) {
        warning(
            "No course rows found for INE ", ine, "; section skipped.",
            call. = FALSE
        )
        return(NULL)
    }

    # Block assignment: a row with both an ECTS value and a course name starts
    # a block named after that course; every other row inherits the name of
    # the closest block start above it (NA before the first block start).
    # Vectorized forward fill, valid for sections of any length including a
    # single row (the former `2:nrow(longdata)` loop failed there).
    is_bloc_start <- !is.na(longdata[["ects"]]) & !is.na(longdata[["cours"]])
    noms_blocs <- longdata[["cours"]][is_bloc_start]
    longdata[["bloc"]] <- c(NA, noms_blocs)[cumsum(is_bloc_start) + 1L]

    # Add parcours, domaine2A, dominante3A (one value per section).
    longdata[["parcours"]] <- parcours
    longdata[["domaine2A"]] <- domaine2A
    longdata[["dominante3A"]] <- dominante3A

    for (column in c("cours", "ine", "type_annee", "annee", "bloc")) {
        longdata[[column]] <- as.factor(longdata[[column]])
    }
    longdata
}))

# Write the aggregated table to <here()>/data as CSV, without row names.
write.csv(
    x = full,
    file = file.path(here(), "data", "aggregated_dataframe_suppdiplome.csv"),
    row.names = FALSE
)