From 6952fd0228fb5fef1423f880faca67e4654aad2b Mon Sep 17 00:00:00 2001 From: Louis Lacoste Date: Tue, 30 Apr 2024 17:42:08 +0200 Subject: [PATCH] Having almost a final form for extraction --- extract_data.R | 119 +++++++++----- point.Rmd | 95 ++++++++++++ point.html | 414 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 587 insertions(+), 41 deletions(-) create mode 100644 point.Rmd create mode 100644 point.html diff --git a/extract_data.R b/extract_data.R index 526e089..708e31a 100644 --- a/extract_data.R +++ b/extract_data.R @@ -1,6 +1,6 @@ library(readxl) library(dplyr) -library(tidyr) +library(here) # Supplement_diplome-1.xls ligne 4 ligne_prenom <- 5 @@ -9,66 +9,88 @@ colonne_prenom <- 50 folder_path <- file.path("data", "Bulletins promotion 2023") df <- do.call("rbind", lapply(list.files(folder_path), function(filename) { - current_data <- as.data.frame(read_excel(file.path(folder_path, filename), sheet = 1L, col_names = FALSE)) + current_data <- as.data.frame(read_excel( + file.path( + folder_path, + filename + ), + sheet = 1L, + col_names = FALSE + )) if (ncol(current_data) == 63L) { return(current_data) } })) - -indices_numero_ine <- which(df == "Numéro INE", arr.ind = TRUE) -decalage_ine <- c(1, 6) - -#  Indices Total ECTS validés -which(df == "Total Ects validés", arr.ind = TRUE) +# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) { +# current_data <- as.data.frame(read_excel(file.path(folder_path, +# filename), +# sheet = 1L, +# col_names = FALSE)) +# if (ncol(current_data) != 63L) { +# current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data)) +# current_data <- current_data %>% relocate(starts_with("missing")) +# colnames(current_data) <- paste0("...", seq(1,63)) +# current_data +# } +# })) -#  Indices Ing - 1A +not_selected_files <- sapply(list.files(folder_path), function(filename) { + current_data <- as.data.frame(read_excel( + file.path( + folder_path, + filename + ), + sheet = 1L, + col_names = FALSE + )) + if (ncol(current_data) != 63L) { + return(filename) + } +}) -indices_cours_Ing_1A <- which(df == "Ing - 1A", arr.ind = TRUE) -indices_cours_Ing_2A <- which(df == "Ing - 2A", arr.ind = TRUE) -indices_cours_Ing_3A <- which(df == "Ing - 3A", arr.ind = TRUE) -indices_stage_fin_etude <- which((df == "Stage de fin d'études") | (df == "Stage de fin d'études de 3ème année"), arr.ind = TRUE) +##  A Ajuster pour d'autres tableaux +#  Les repérages se base sur la localisation de l'INE +indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE) +#  En triant les dataframes au préalables on a les mêmes positions de colonnes +#  (A quelques NAs près) +col_parcours <- 23L +col_domaine2A <- 15L +col_dominante3A <- 36L col_ECTS <- 63 -# Décalages - +# Décalages par rapport au texte de l'INE +dec_ine_reel <- c(1, 6) dec_col_cours_ine <- -23 dec_row_cours_ine <- 35 +dec_row_parcours_ine <- 32L +dec_row_domaine2A_ine <- 29L +dec_row_dominante3A_ine <- 31L -dec_col_annee <- -13 - -dec_row_parcours_ing1A <- -4 -dec_col_parcours_ing1A <- 2 - -dec_ECTS <- 42 - -get_row_to_remove_cours <- function(cours) { - vec_cours <- cours - which((is.na(vec_cours) | (vec_cours == "Intitulé du cours") | - (vec_cours == "Ing - 1A") | (vec_cours == "Ing - 2A") | - (vec_cours == "Ing - 3A"))) -} - - -full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) { +full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) { # Trouver l'INE - current_row <- indices_numero_ine[idx, 1] - current_col <- indices_numero_ine[idx, 2] + current_row <- indices_texte_ine[idx, 1] + current_col <- indices_texte_ine[idx, 2] #  Ici si on est au dernier indice on va au bout du tableau et on nettoiera # après - next_row <- ifelse(idx != nrow(indices_numero_ine), - indices_numero_ine[idx + 1, 1], + next_row <- ifelse(idx != nrow(indices_texte_ine), + indices_texte_ine[idx + 1, 1], nrow(df) ) # La colonne ne bouge pas next_col <- current_col ine <- df[[ - current_row + decalage_ine[[1]], - current_col + decalage_ine[[2]] + current_row + dec_ine_reel[[1]], + current_col + dec_ine_reel[[2]] ]] + # Parcours + parcours <- df[current_row + dec_row_parcours_ine, col_parcours] + domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A] + dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A] + #  Cours cours <- df[seq( current_row + dec_row_cours_ine, @@ -118,8 +140,8 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) indices_blocs <- which(!is.na(longdata[["ects"]])) noms_blocs <- longdata[indices_blocs, "cours"] # Créer une nouvelle colonne "bloc" en remplissant les valeurs manquantes - longdata$bloc <- NA - longdata$bloc[indices_blocs] <- noms_blocs + longdata[["bloc"]] <- NA + longdata[["bloc"]][indices_blocs] <- noms_blocs # Remplir les valeurs manquantes dans la colonne "bloc" en utilisant une boucle for (i in 2:nrow(longdata)) { if (is.na(longdata[["bloc"]][i])) { @@ -127,8 +149,23 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) } } + #  Ajout parcours, domaine2A, dominante3A + longdata[["parcours"]] <- parcours + longdata[["domaine2A"]] <- domaine2A + longdata[["dominante3A"]] <- dominante3A + + longdata[["cours"]] <- as.factor(longdata[["cours"]]) + longdata[["ine"]] <- as.factor(longdata[["ine"]]) + longdata[["type_annee"]] <- as.factor(longdata[["type_annee"]]) + longdata[["annee"]] <- as.factor(longdata[["annee"]]) + longdata[["bloc"]] <- as.factor(longdata[["bloc"]]) longdata })) - -df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]),] +write.csv(full, + file.path( + here(), "data", + "aggregated_dataframe_suppdiplome.csv" + ), + row.names = FALSE +) \ No newline at end of file diff --git a/point.Rmd b/point.Rmd new file mode 100644 index 0000000..6d62489 --- /dev/null +++ b/point.Rmd @@ -0,0 +1,95 @@ +```{r, echo = FALSE} +knitr::opts_chunk$set(fig.width=12) +``` +```{r packages, echo = FALSE, include = FALSE} +library(dplyr) +library(tidyr) +library(ggplot2) +library(here) +``` + +```{r import donnees, echo = FALSE} +full <- read.csv(file.path( + here(), "data", + "aggregated_dataframe_suppdiplome.csv" + )) + +full[["ine"]] <- as.factor(full[["ine"]]) +full[["cours"]] <- as.factor(full[["cours"]]) +full[["ects"]] <- as.numeric(full[["ects"]]) +full[["annee"]] <- as.factor(full[["annee"]]) +full[["type_annee"]] <- as.factor(full[["type_annee"]]) +full[["bloc"]] <- as.factor(full[["bloc"]]) +full[["parcours"]] <- as.factor(full[["parcours"]]) +full[["domaine2A"]] <- as.factor(full[["domaine2A"]]) +full[["dominante3A"]] <- as.factor(full[["dominante3A"]]) + + + + +``` + +```{r differents_dataframe, echo = FALSE} +df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]), ] +df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 1"), ] +df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 2"), ] + +df_count <- df_ue_choix %>% + group_by(cours, bloc) %>% + summarise(n = n()) %>% + ungroup() %>% + mutate(freq = n / sum(n)) + +par_dominante_effectif <- full %>% group_by(dominante3A) %>% + select(-c(annee,type_annee,bloc,cours,ects)) %>% + distinct() %>% count(sort = TRUE) + +par_domaine_effectif <- full %>% group_by(domaine2A) %>% + select(-c(annee,type_annee,bloc,cours,ects)) %>% + distinct() %>% count(sort = TRUE) + +par_parcours_effectif <- full %>% group_by(parcours) %>% + select(-c(annee,type_annee,bloc,cours,ects)) %>% + distinct() %>% count(sort = TRUE) + +``` +```{r, echo = FALSE} + +ggplot(df_count %>% filter(n > 20)) + + aes(x = reorder(cours, n), y = reorder(n, n)) + + geom_bar(stat = "identity", width = 1, aes(fill = .data$bloc)) + + scale_x_discrete() + + theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) + + coord_flip() +``` + +```{r, echo = FALSE} + +ggplot(par_domaine_effectif) + + aes(x = reorder(domaine2A, n), y = reorder(n, n)) + + geom_bar(stat = "identity", width = 1, aes(fill = .data$domaine2A)) + + scale_x_discrete() + + theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) + + coord_flip() + + theme(legend.position = "none") +``` +```{r, echo = FALSE} + +ggplot(par_dominante_effectif) + + aes(x = reorder(dominante3A, n), y = reorder(n, n)) + + geom_bar(stat = "identity", width = 1, aes(fill = .data$dominante3A)) + + scale_x_discrete() + + theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) + + coord_flip() + + theme(legend.position = "none") +``` + +```{r , echo = FALSE} +ggplot(par_parcours_effectif) + + aes(x = reorder(parcours, n), y = reorder(n, n)) + + geom_bar(stat = "identity", width = 1, aes(fill = .data$parcours)) + + scale_x_discrete() + + theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) + + coord_flip() + + theme(legend.position = "none") +``` \ No newline at end of file diff --git a/point.html b/point.html new file mode 100644 index 0000000..4d89560 --- /dev/null +++ b/point.html @@ -0,0 +1,414 @@ + + + + + + + + + + + + + +point.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
## Warning: NAs introduits lors de la conversion automatique
+
## `summarise()` has grouped output by 'cours'. You can override using the
+## `.groups` argument.
+

+

+

+ + + + +
+ + + + + + + + + + + + + + +