Having almost a final form for extraction

2024-04-30 17:42:08 +02:00 · 2024-04-30 17:42:08 +02:00 · 6952fd0228
commit 6952fd0228
parent 94b4e8f836
3 changed files with 587 additions and 41 deletions
--- a/extract_data.R
+++ b/extract_data.R
@ -1,6 +1,6 @@
 library(readxl)
 library(dplyr)
-library(tidyr)
+library(here)

 # Supplement_diplome-1.xls ligne 4
 ligne_prenom <- 5
@ -9,66 +9,88 @@ colonne_prenom <- 50
 folder_path <- file.path("data", "Bulletins promotion 2023")

 df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
-    current_data <- as.data.frame(read_excel(file.path(folder_path, filename), sheet = 1L, col_names = FALSE))
+    current_data <- as.data.frame(read_excel(
+        file.path(
+            folder_path,
+            filename
+        ),
+        sheet = 1L,
+        col_names = FALSE
+    ))
    if (ncol(current_data) == 63L) {
        return(current_data)
    }
 }))

-
-indices_numero_ine <- which(df == "Numéro INE", arr.ind = TRUE)
-decalage_ine <- c(1, 6)
-
-#  Indices Total ECTS validés
-which(df == "Total Ects validés", arr.ind = TRUE)
+# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
+#     current_data <- as.data.frame(read_excel(file.path(folder_path,
+#     filename),
+#     sheet = 1L,
+#     col_names = FALSE))
+#     if (ncol(current_data) != 63L) {
+#         current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
+#         current_data <- current_data %>% relocate(starts_with("missing"))
+#         colnames(current_data) <- paste0("...", seq(1,63))
+#         current_data
+#     }
+# }))


-#  Indices Ing - 1A
+not_selected_files <- sapply(list.files(folder_path), function(filename) {
+    current_data <- as.data.frame(read_excel(
+        file.path(
+            folder_path,
+            filename
+        ),
+        sheet = 1L,
+        col_names = FALSE
+    ))
+    if (ncol(current_data) != 63L) {
+        return(filename)
+    }
+})

-indices_cours_Ing_1A <- which(df == "Ing - 1A", arr.ind = TRUE)
-indices_cours_Ing_2A <- which(df == "Ing - 2A", arr.ind = TRUE)
-indices_cours_Ing_3A <- which(df == "Ing - 3A", arr.ind = TRUE)
-indices_stage_fin_etude <- which((df == "Stage de fin d'études") | (df == "Stage de fin d'études de 3ème année"), arr.ind = TRUE)
+##  A Ajuster pour d'autres tableaux
+#  Les repérages se base sur la localisation de l'INE
+indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)

+#  En triant les dataframes au préalables on a les mêmes positions de colonnes
+#  (A quelques NAs près)
+col_parcours <- 23L
+col_domaine2A <- 15L
+col_dominante3A <- 36L
 col_ECTS <- 63

-# Décalages
-
+# Décalages par rapport au texte de l'INE
+dec_ine_reel <- c(1, 6)
 dec_col_cours_ine <- -23
 dec_row_cours_ine <- 35
+dec_row_parcours_ine <- 32L
+dec_row_domaine2A_ine <- 29L
+dec_row_dominante3A_ine <- 31L

-dec_col_annee <- -13
-
-dec_row_parcours_ing1A <- -4
-dec_col_parcours_ing1A <- 2
-
-dec_ECTS <- 42
-
-get_row_to_remove_cours <- function(cours) {
-    vec_cours <- cours
-    which((is.na(vec_cours) | (vec_cours == "Intitulé du cours") |
-        (vec_cours == "Ing - 1A") | (vec_cours == "Ing - 2A") |
-        (vec_cours == "Ing - 3A")))
-}
-
-
-full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) {
+full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
    # Trouver l'INE
-    current_row <- indices_numero_ine[idx, 1]
-    current_col <- indices_numero_ine[idx, 2]
+    current_row <- indices_texte_ine[idx, 1]
+    current_col <- indices_texte_ine[idx, 2]

    #  Ici si on est au dernier indice on va au bout du tableau et on nettoiera
    # après
-    next_row <- ifelse(idx != nrow(indices_numero_ine),
-        indices_numero_ine[idx + 1, 1],
+    next_row <- ifelse(idx != nrow(indices_texte_ine),
+        indices_texte_ine[idx + 1, 1],
        nrow(df)
    )
    # La colonne ne bouge pas
    next_col <- current_col
    ine <- df[[
-        current_row + decalage_ine[[1]],
-        current_col + decalage_ine[[2]]
+        current_row + dec_ine_reel[[1]],
+        current_col + dec_ine_reel[[2]]
    ]]
+    # Parcours
+    parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
+    domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
+    dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]
+
    #  Cours
    cours <- df[seq(
        current_row + dec_row_cours_ine,
@ -118,8 +140,8 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
    indices_blocs <- which(!is.na(longdata[["ects"]]))
    noms_blocs <- longdata[indices_blocs, "cours"]
    # Créer une nouvelle colonne "bloc" en remplissant les valeurs manquantes
-    longdata$bloc <- NA
-    longdata$bloc[indices_blocs] <- noms_blocs
+    longdata[["bloc"]] <- NA
+    longdata[["bloc"]][indices_blocs] <- noms_blocs
    # Remplir les valeurs manquantes dans la colonne "bloc" en utilisant une boucle
    for (i in 2:nrow(longdata)) {
        if (is.na(longdata[["bloc"]][i])) {
@ -127,8 +149,23 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
        }
    }

+    #  Ajout parcours, domaine2A, dominante3A
+    longdata[["parcours"]] <- parcours
+    longdata[["domaine2A"]] <- domaine2A
+    longdata[["dominante3A"]] <- dominante3A
+
+    longdata[["cours"]] <- as.factor(longdata[["cours"]])
+    longdata[["ine"]] <- as.factor(longdata[["ine"]])
+    longdata[["type_annee"]] <- as.factor(longdata[["type_annee"]])
+    longdata[["annee"]] <- as.factor(longdata[["annee"]])
+    longdata[["bloc"]] <- as.factor(longdata[["bloc"]])
    longdata
 }))

-
-df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]),]
+write.csv(full,
+    file.path(
+        here(), "data",
+        "aggregated_dataframe_suppdiplome.csv"
+    ),
+    row.names = FALSE
+)
--- a/point.Rmd
+++ b/point.Rmd
@ -0,0 +1,95 @@
+```{r, echo = FALSE}
+knitr::opts_chunk$set(fig.width=12)
+```
+```{r packages, echo = FALSE, include = FALSE}
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+library(here)
+```
+
+```{r import donnees, echo = FALSE}
+full <- read.csv(file.path(
+        here(), "data",
+        "aggregated_dataframe_suppdiplome.csv"
+    ))
+
+full[["ine"]] <- as.factor(full[["ine"]])
+full[["cours"]] <- as.factor(full[["cours"]])
+full[["ects"]] <- as.numeric(full[["ects"]])
+full[["annee"]] <- as.factor(full[["annee"]])
+full[["type_annee"]] <- as.factor(full[["type_annee"]])
+full[["bloc"]] <- as.factor(full[["bloc"]])
+full[["parcours"]] <- as.factor(full[["parcours"]])
+full[["domaine2A"]] <- as.factor(full[["domaine2A"]])
+full[["dominante3A"]] <- as.factor(full[["dominante3A"]])
+
+
+
+
+```
+
+```{r differents_dataframe, echo = FALSE}
+df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]), ]
+df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 1"), ]
+df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 2"), ]
+
+df_count <- df_ue_choix %>%
+    group_by(cours, bloc) %>%
+    summarise(n = n()) %>%
+    ungroup() %>%
+    mutate(freq = n / sum(n))
+
+par_dominante_effectif <- full %>% group_by(dominante3A) %>% 
+    select(-c(annee,type_annee,bloc,cours,ects)) %>%
+    distinct() %>% count(sort = TRUE)
+
+par_domaine_effectif <- full %>% group_by(domaine2A) %>% 
+    select(-c(annee,type_annee,bloc,cours,ects)) %>%
+    distinct() %>% count(sort = TRUE)
+
+par_parcours_effectif <- full %>% group_by(parcours) %>% 
+    select(-c(annee,type_annee,bloc,cours,ects)) %>%
+    distinct() %>% count(sort = TRUE)
+
+```
+```{r, echo = FALSE}
+
+ggplot(df_count %>% filter(n > 20)) +
+    aes(x = reorder(cours, n), y = reorder(n, n)) +
+    geom_bar(stat = "identity", width = 1, aes(fill = .data$bloc)) +
+    scale_x_discrete() +
+    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
+    coord_flip()
+```
+
+```{r, echo = FALSE}
+
+ggplot(par_domaine_effectif) +
+    aes(x = reorder(domaine2A, n), y = reorder(n, n)) +
+    geom_bar(stat = "identity", width = 1, aes(fill = .data$domaine2A)) +
+    scale_x_discrete() +
+    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
+    coord_flip() + 
+    theme(legend.position = "none")
+```
+```{r, echo = FALSE}
+
+ggplot(par_dominante_effectif) +
+    aes(x = reorder(dominante3A, n), y = reorder(n, n)) +
+    geom_bar(stat = "identity", width = 1, aes(fill = .data$dominante3A)) +
+    scale_x_discrete() +
+    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
+    coord_flip() + 
+    theme(legend.position = "none")
+```
+
+```{r , echo = FALSE}
+ggplot(par_parcours_effectif) +
+    aes(x = reorder(parcours, n), y = reorder(n, n)) +
+    geom_bar(stat = "identity", width = 1, aes(fill = .data$parcours)) +
+    scale_x_discrete() +
+    theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
+    coord_flip() + 
+    theme(legend.position = "none")
+```
--- a/point.html
+++ b/point.html