Having almost a final form for extraction

2024-04-30 17:42:08 +02:00 · 2024-04-30 17:42:08 +02:00 · 6952fd0228
commit 6952fd0228
parent 94b4e8f836
3 changed files with 587 additions and 41 deletions
--- a/extract_data.R
+++ b/extract_data.R
@ -1,6 +1,6 @@
 library(readxl)
 library(dplyr)
-library(tidyr)
+library(here)
 # Supplement_diplome-1.xls ligne 4
 ligne_prenom <- 5
@ -9,66 +9,88 @@ colonne_prenom <- 50
 folder_path <- file.path("data", "Bulletins promotion 2023")
 df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
-    current_data <- as.data.frame(read_excel(file.path(folder_path, filename), sheet = 1L, col_names = FALSE))
+    current_data <- as.data.frame(read_excel(
        file.path(
            folder_path,
            filename
        ),
        sheet = 1L,
        col_names = FALSE
    ))
    if (ncol(current_data) == 63L) {
        return(current_data)
    }
 }))
-
+# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
-indices_numero_ine <- which(df == "Numéro INE", arr.ind = TRUE)
+#     current_data <- as.data.frame(read_excel(file.path(folder_path,
-decalage_ine <- c(1, 6)
+#     filename),
-
+#     sheet = 1L,
-#  Indices Total ECTS validés
+#     col_names = FALSE))
-which(df == "Total Ects validés", arr.ind = TRUE)
+#     if (ncol(current_data) != 63L) {
 #         current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
 #         current_data <- current_data %>% relocate(starts_with("missing"))
 #         colnames(current_data) <- paste0("...", seq(1,63))
 #         current_data
 #     }
 # }))
-#  Indices Ing - 1A
+not_selected_files <- sapply(list.files(folder_path), function(filename) {
    current_data <- as.data.frame(read_excel(
        file.path(
            folder_path,
            filename
        ),
        sheet = 1L,
        col_names = FALSE
    ))
    if (ncol(current_data) != 63L) {
        return(filename)
    }
 })
-indices_cours_Ing_1A <- which(df == "Ing - 1A", arr.ind = TRUE)
+##  A Ajuster pour d'autres tableaux
-indices_cours_Ing_2A <- which(df == "Ing - 2A", arr.ind = TRUE)
+#  Les repérages se base sur la localisation de l'INE
-indices_cours_Ing_3A <- which(df == "Ing - 3A", arr.ind = TRUE)
+indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)
 indices_stage_fin_etude <- which((df == "Stage de fin d'études") | (df == "Stage de fin d'études de 3ème année"), arr.ind = TRUE)
 #  En triant les dataframes au préalables on a les mêmes positions de colonnes
 #  (A quelques NAs près)
 col_parcours <- 23L
 col_domaine2A <- 15L
 col_dominante3A <- 36L
 col_ECTS <- 63
-# Décalages
+# Décalages par rapport au texte de l'INE
-
+dec_ine_reel <- c(1, 6)
 dec_col_cours_ine <- -23
 dec_row_cours_ine <- 35
 dec_row_parcours_ine <- 32L
 dec_row_domaine2A_ine <- 29L
 dec_row_dominante3A_ine <- 31L
-dec_col_annee <- -13
+full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
 dec_row_parcours_ing1A <- -4
 dec_col_parcours_ing1A <- 2
 dec_ECTS <- 42
 get_row_to_remove_cours <- function(cours) {
    vec_cours <- cours
    which((is.na(vec_cours) | (vec_cours == "Intitulé du cours") |
        (vec_cours == "Ing - 1A") | (vec_cours == "Ing - 2A") |
        (vec_cours == "Ing - 3A")))
 }
 full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) {
    # Trouver l'INE
-    current_row <- indices_numero_ine[idx, 1]
+    current_row <- indices_texte_ine[idx, 1]
-    current_col <- indices_numero_ine[idx, 2]
+    current_col <- indices_texte_ine[idx, 2]
    #  Ici si on est au dernier indice on va au bout du tableau et on nettoiera
    # après
-    next_row <- ifelse(idx != nrow(indices_numero_ine),
+    next_row <- ifelse(idx != nrow(indices_texte_ine),
-        indices_numero_ine[idx + 1, 1],
+        indices_texte_ine[idx + 1, 1],
        nrow(df)
    )
    # La colonne ne bouge pas
    next_col <- current_col
    ine <- df[[
-        current_row + decalage_ine[[1]],
+        current_row + dec_ine_reel[[1]],
-        current_col + decalage_ine[[2]]
+        current_col + dec_ine_reel[[2]]
    ]]
    # Parcours
    parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
    domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
    dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]
    #  Cours
    cours <- df[seq(
        current_row + dec_row_cours_ine,
@ -118,8 +140,8 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
    indices_blocs <- which(!is.na(longdata[["ects"]]))
    noms_blocs <- longdata[indices_blocs, "cours"]
    # Créer une nouvelle colonne "bloc" en remplissant les valeurs manquantes
-    longdata$bloc <- NA
+    longdata[["bloc"]] <- NA
-    longdata$bloc[indices_blocs] <- noms_blocs
+    longdata[["bloc"]][indices_blocs] <- noms_blocs
    # Remplir les valeurs manquantes dans la colonne "bloc" en utilisant une boucle
    for (i in 2:nrow(longdata)) {
        if (is.na(longdata[["bloc"]][i])) {
@ -127,8 +149,23 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
        }
    }
    #  Ajout parcours, domaine2A, dominante3A
    longdata[["parcours"]] <- parcours
    longdata[["domaine2A"]] <- domaine2A
    longdata[["dominante3A"]] <- dominante3A
    longdata[["cours"]] <- as.factor(longdata[["cours"]])
    longdata[["ine"]] <- as.factor(longdata[["ine"]])
    longdata[["type_annee"]] <- as.factor(longdata[["type_annee"]])
    longdata[["annee"]] <- as.factor(longdata[["annee"]])
    longdata[["bloc"]] <- as.factor(longdata[["bloc"]])
    longdata
 }))
-
+write.csv(full,
-df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]),]
+    file.path(
        here(), "data",
        "aggregated_dataframe_suppdiplome.csv"
    ),
    row.names = FALSE
 )
--- a/point.Rmd
+++ b/point.Rmd
@ -0,0 +1,95 @@
 ```{r, echo = FALSE}
 knitr::opts_chunk$set(fig.width=12)
 ```
 ```{r packages, echo = FALSE, include = FALSE}
 library(dplyr)
 library(tidyr)
 library(ggplot2)
 library(here)
 ```
 ```{r import donnees, echo = FALSE}
 full <- read.csv(file.path(
        here(), "data",
        "aggregated_dataframe_suppdiplome.csv"
    ))
 full[["ine"]] <- as.factor(full[["ine"]])
 full[["cours"]] <- as.factor(full[["cours"]])
 full[["ects"]] <- as.numeric(full[["ects"]])
 full[["annee"]] <- as.factor(full[["annee"]])
 full[["type_annee"]] <- as.factor(full[["type_annee"]])
 full[["bloc"]] <- as.factor(full[["bloc"]])
 full[["parcours"]] <- as.factor(full[["parcours"]])
 full[["domaine2A"]] <- as.factor(full[["domaine2A"]])
 full[["dominante3A"]] <- as.factor(full[["dominante3A"]])
 ```
 ```{r differents_dataframe, echo = FALSE}
 df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]), ]
 df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 1"), ]
 df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 2"), ]
 df_count <- df_ue_choix %>%
    group_by(cours, bloc) %>%
    summarise(n = n()) %>%
    ungroup() %>%
    mutate(freq = n / sum(n))
 par_dominante_effectif <- full %>% group_by(dominante3A) %>% 
    select(-c(annee,type_annee,bloc,cours,ects)) %>%
    distinct() %>% count(sort = TRUE)
 par_domaine_effectif <- full %>% group_by(domaine2A) %>% 
    select(-c(annee,type_annee,bloc,cours,ects)) %>%
    distinct() %>% count(sort = TRUE)
 par_parcours_effectif <- full %>% group_by(parcours) %>% 
    select(-c(annee,type_annee,bloc,cours,ects)) %>%
    distinct() %>% count(sort = TRUE)
 ```
 ```{r, echo = FALSE}
 ggplot(df_count %>% filter(n > 20)) +
    aes(x = reorder(cours, n), y = reorder(n, n)) +
    geom_bar(stat = "identity", width = 1, aes(fill = .data$bloc)) +
    scale_x_discrete() +
    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
    coord_flip()
 ```
 ```{r, echo = FALSE}
 ggplot(par_domaine_effectif) +
    aes(x = reorder(domaine2A, n), y = reorder(n, n)) +
    geom_bar(stat = "identity", width = 1, aes(fill = .data$domaine2A)) +
    scale_x_discrete() +
    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
    coord_flip() + 
    theme(legend.position = "none")
 ```
 ```{r, echo = FALSE}
 ggplot(par_dominante_effectif) +
    aes(x = reorder(dominante3A, n), y = reorder(n, n)) +
    geom_bar(stat = "identity", width = 1, aes(fill = .data$dominante3A)) +
    scale_x_discrete() +
    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
    coord_flip() + 
    theme(legend.position = "none")
 ```
 ```{r , echo = FALSE}
 ggplot(par_parcours_effectif) +
    aes(x = reorder(parcours, n), y = reorder(n, n)) +
    geom_bar(stat = "identity", width = 1, aes(fill = .data$parcours)) +
    scale_x_discrete() +
    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
    coord_flip() + 
    theme(legend.position = "none")
 ```
--- a/point.html
+++ b/point.html