library(readxl)
library(dplyr)
library(here)

# Aggregates per-student course listings from a folder of Excel workbooks
# into one long data frame (one row per course line) and writes it to
# data/aggregated_dataframe_suppdiplome.csv.
#
# Every lookup is positional: cells are located relative to the cell that
# contains the text "Numéro INE", using the offsets declared below.

# Supplement_diplome-1.xls, row 4
# NOTE(review): these two constants are not used in the visible part of the
# script; they are kept in case other code relies on them.
ligne_prenom <- 5
colonne_prenom <- 50

folder_path <- file.path("data", "Bulletins promotion 2023")

# Only workbooks whose first sheet has exactly this many columns are kept.
expected_ncol <- 63L

# Read every workbook once (first sheet, no header row) and reuse the result
# both for the aggregated table and for the list of rejected files.
bulletin_files <- list.files(folder_path)
bulletins <- lapply(bulletin_files, function(filename) {
  as.data.frame(read_excel(
    file.path(folder_path, filename),
    sheet = 1L,
    col_names = FALSE
  ))
})
has_expected_ncol <- vapply(bulletins, ncol, integer(1L)) == expected_ncol

# Stack the accepted sheets on top of each other (NULL when none is accepted).
df <- do.call("rbind", bulletins[has_expected_ncol])

# Alternative kept for reference: pad narrower sheets with leading NA columns
# instead of discarding them.
# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
#   current_data <- as.data.frame(read_excel(file.path(folder_path,
#                                                      filename),
#                                            sheet = 1L,
#                                            col_names = FALSE))
#   if (ncol(current_data) != 63L) {
#     current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
#     current_data <- current_data %>% relocate(starts_with("missing"))
#     colnames(current_data) <- paste0("...", seq(1,63))
#     current_data
#   }
# }))

# Names of the workbooks rejected because of their column count, as a
# character vector named by file name (same content as `unlist()` of the
# former sapply() result, without the NULL placeholders).
not_selected_files <- setNames(
  bulletin_files[!has_expected_ncol],
  bulletin_files[!has_expected_ncol]
)

if (is.null(df)) {
  stop(
    "No workbook with ", expected_ncol, " columns found in '", folder_path,
    "'; nothing to aggregate.",
    call. = FALSE
  )
}

## Adjust the constants below for other spreadsheet layouts.
# All positions are derived from the location of the INE label cell.
indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)
# which(arr.ind = TRUE) enumerates matches column by column; sort by row so
# that "the next marker" really is the next student section in the table.
indices_texte_ine <- indices_texte_ine[
  order(indices_texte_ine[, 1L]), ,
  drop = FALSE
]
if (nrow(indices_texte_ine) == 0L) {
  warning("No \"Numéro INE\" cell found; the output will be empty.",
    call. = FALSE
  )
}

# Because sheets were filtered on their column count beforehand, the columns
# sit at the same positions in every sheet (up to a few NAs).
col_parcours <- 23L
col_domaine2A <- 15L
col_dominante3A <- 36L
col_ECTS <- 63
# Column scanned for academic-year labels such as "2021-22".
col_annee <- 8

# Offsets relative to the INE label cell.
dec_ine_reel <- c(1, 6) # (row, column) offset of the actual INE value
dec_col_cours_ine <- -23
dec_row_cours_ine <- 35
dec_row_parcours_ine <- 32L
dec_row_domaine2A_ine <- 29L
dec_row_dominante3A_ine <- 31L

# Carry the last non-NA value forward; leading NAs stay NA.
fill_down <- function(x) {
  known <- !is.na(x)
  c(NA, x[known])[cumsum(known) + 1L]
}

full <- do.call("rbind", lapply(
  seq_len(nrow(indices_texte_ine)),
  function(idx) {
    # Locate this student's INE label.
    current_row <- indices_texte_ine[[idx, 1L]]
    current_col <- indices_texte_ine[[idx, 2L]]
    # The section ends where the next student's begins. For the last student
    # we run to the end of the table; the resulting out-of-range rows come
    # back as NA and are dropped below.
    next_row <- if (idx != nrow(indices_texte_ine)) {
      indices_texte_ine[[idx + 1L, 1L]]
    } else {
      nrow(df)
    }

    ine <- df[[
      current_row + dec_ine_reel[[1]],
      current_col + dec_ine_reel[[2]]
    ]]

    # Track / second-year domain / third-year major cells.
    parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
    domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
    dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]

    # Course lines of this student.
    course_rows <- seq(
      current_row + dec_row_cours_ine,
      next_row + dec_row_cours_ine - 1
    )
    cours <- df[course_rows, current_col + dec_col_cours_ine]
    ects <- df[course_rows, col_ECTS]

    # Drop empty lines, repeated table headers and bare "Ing" lines.
    # `%in%` never yields NA, so NA cells are simply kept, exactly as with
    # which(... == ...).
    is_empty <- is.na(cours) & is.na(ects)
    is_table_header <- cours %in% "Intitulé du cours" & ects %in% "Ects"
    is_ing <- cours %in% "Ing"
    keep <- !(is_empty | is_table_header | is_ing)
    cours <- cours[keep]
    ects <- ects[keep]

    # Each "Ing - 1A/2A/3A" line opens a cycle that runs until the next such
    # line (or the end). The n-th cycle is paired with the n-th academic-year
    # label found in the year column; lines before the first cycle keep the
    # "Unknown" / "" defaults.
    is_cycle_start <- grepl("Ing - [1-3]A", cours)
    cycle_id <- cumsum(is_cycle_start)
    in_cycle <- cycle_id > 0L
    cycle_labels <- cours[is_cycle_start]
    annee_cycle_ing <- grep(
      "20[0-9][0-9]-[0-9][0-9]",
      df[course_rows, col_annee],
      value = TRUE
    )
    type_annee <- rep("Unknown", length(cours))
    annee <- rep("", length(cours))
    type_annee[in_cycle] <- cycle_labels[cycle_id[in_cycle]]
    annee[in_cycle] <- annee_cycle_ing[cycle_id[in_cycle]]

    # The cycle title lines themselves are not courses: remove them.
    is_cycle_title <- grepl("Ing - [0-9]{1}A", cours)
    cours <- cours[!is_cycle_title]
    ects <- ects[!is_cycle_title]
    annee <- annee[!is_cycle_title]
    type_annee <- type_annee[!is_cycle_title]

    # Nothing left for this student: contribute no rows (rbind skips NULL).
    if (length(cours) == 0L) {
      return(NULL)
    }

    # Lines carrying an ECTS value name a block; every following line without
    # an ECTS value belongs to that block.
    has_ects <- !is.na(ects)
    bloc <- rep(NA, length(cours))
    bloc[has_ects] <- cours[has_ects]
    bloc <- fill_down(bloc)

    # Scalars (ine, parcours, ...) are recycled over all course lines.
    data.frame(
      ine = as.factor(ine),
      cours = as.factor(cours),
      ects = ects,
      annee = as.factor(annee),
      type_annee = as.factor(type_annee),
      bloc = as.factor(bloc),
      parcours = parcours,
      domaine2A = domaine2A,
      dominante3A = dominante3A,
      stringsAsFactors = FALSE
    )
  }
))

write.csv(full,
  file.path(
    here(),
    "data",
    "aggregated_dataframe_suppdiplome.csv"
  ),
  row.names = FALSE
)