Having almost a final form for extraction

This commit is contained in:
Louis Lacoste 2024-04-30 17:42:08 +02:00
parent 94b4e8f836
commit 6952fd0228
3 changed files with 587 additions and 41 deletions

View file

@ -1,6 +1,6 @@
library(readxl)
library(dplyr)
library(tidyr)
library(here)
# Supplement_diplome-1.xls ligne 4
ligne_prenom <- 5
@ -9,66 +9,88 @@ colonne_prenom <- 50
folder_path <- file.path("data", "Bulletins promotion 2023")
df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
current_data <- as.data.frame(read_excel(file.path(folder_path, filename), sheet = 1L, col_names = FALSE))
current_data <- as.data.frame(read_excel(
file.path(
folder_path,
filename
),
sheet = 1L,
col_names = FALSE
))
if (ncol(current_data) == 63L) {
return(current_data)
}
}))
indices_numero_ine <- which(df == "Numéro INE", arr.ind = TRUE)
decalage_ine <- c(1, 6)
#  Indices Total ECTS validés
which(df == "Total Ects validés", arr.ind = TRUE)
# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
# current_data <- as.data.frame(read_excel(file.path(folder_path,
# filename),
# sheet = 1L,
# col_names = FALSE))
# if (ncol(current_data) != 63L) {
# current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
# current_data <- current_data %>% relocate(starts_with("missing"))
# colnames(current_data) <- paste0("...", seq(1,63))
# current_data
# }
# }))
#  Indices Ing - 1A
not_selected_files <- sapply(list.files(folder_path), function(filename) {
current_data <- as.data.frame(read_excel(
file.path(
folder_path,
filename
),
sheet = 1L,
col_names = FALSE
))
if (ncol(current_data) != 63L) {
return(filename)
}
})
indices_cours_Ing_1A <- which(df == "Ing - 1A", arr.ind = TRUE)
indices_cours_Ing_2A <- which(df == "Ing - 2A", arr.ind = TRUE)
indices_cours_Ing_3A <- which(df == "Ing - 3A", arr.ind = TRUE)
indices_stage_fin_etude <- which((df == "Stage de fin d'études") | (df == "Stage de fin d'études de 3ème année"), arr.ind = TRUE)
##  A Ajuster pour d'autres tableaux
#  Les repérages se base sur la localisation de l'INE
indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)
#  En triant les dataframes au préalables on a les mêmes positions de colonnes
#  (A quelques NAs près)
col_parcours <- 23L
col_domaine2A <- 15L
col_dominante3A <- 36L
col_ECTS <- 63
# Décalages
# Décalages par rapport au texte de l'INE
dec_ine_reel <- c(1, 6)
dec_col_cours_ine <- -23
dec_row_cours_ine <- 35
dec_row_parcours_ine <- 32L
dec_row_domaine2A_ine <- 29L
dec_row_dominante3A_ine <- 31L
dec_col_annee <- -13
dec_row_parcours_ing1A <- -4
dec_col_parcours_ing1A <- 2
dec_ECTS <- 42
get_row_to_remove_cours <- function(cours) {
vec_cours <- cours
which((is.na(vec_cours) | (vec_cours == "Intitulé du cours") |
(vec_cours == "Ing - 1A") | (vec_cours == "Ing - 2A") |
(vec_cours == "Ing - 3A")))
}
full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) {
full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
# Trouver l'INE
current_row <- indices_numero_ine[idx, 1]
current_col <- indices_numero_ine[idx, 2]
current_row <- indices_texte_ine[idx, 1]
current_col <- indices_texte_ine[idx, 2]
#  Ici si on est au dernier indice on va au bout du tableau et on nettoiera
# après
next_row <- ifelse(idx != nrow(indices_numero_ine),
indices_numero_ine[idx + 1, 1],
next_row <- ifelse(idx != nrow(indices_texte_ine),
indices_texte_ine[idx + 1, 1],
nrow(df)
)
# La colonne ne bouge pas
next_col <- current_col
ine <- df[[
current_row + decalage_ine[[1]],
current_col + decalage_ine[[2]]
current_row + dec_ine_reel[[1]],
current_col + dec_ine_reel[[2]]
]]
# Parcours
parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]
#  Cours
cours <- df[seq(
current_row + dec_row_cours_ine,
@ -118,8 +140,8 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
indices_blocs <- which(!is.na(longdata[["ects"]]))
noms_blocs <- longdata[indices_blocs, "cours"]
# Créer une nouvelle colonne "bloc" en remplissant les valeurs manquantes
longdata$bloc <- NA
longdata$bloc[indices_blocs] <- noms_blocs
longdata[["bloc"]] <- NA
longdata[["bloc"]][indices_blocs] <- noms_blocs
# Remplir les valeurs manquantes dans la colonne "bloc" en utilisant une boucle
for (i in 2:nrow(longdata)) {
if (is.na(longdata[["bloc"]][i])) {
@ -127,8 +149,23 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
}
}
#  Ajout parcours, domaine2A, dominante3A
longdata[["parcours"]] <- parcours
longdata[["domaine2A"]] <- domaine2A
longdata[["dominante3A"]] <- dominante3A
longdata[["cours"]] <- as.factor(longdata[["cours"]])
longdata[["ine"]] <- as.factor(longdata[["ine"]])
longdata[["type_annee"]] <- as.factor(longdata[["type_annee"]])
longdata[["annee"]] <- as.factor(longdata[["annee"]])
longdata[["bloc"]] <- as.factor(longdata[["bloc"]])
longdata
}))
df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]),]
write.csv(full,
file.path(
here(), "data",
"aggregated_dataframe_suppdiplome.csv"
),
row.names = FALSE
)

95
point.Rmd Normal file
View file

@ -0,0 +1,95 @@
```{r, echo = FALSE}
knitr::opts_chunk$set(fig.width=12)
```
```{r packages, echo = FALSE, include = FALSE}
library(dplyr)
library(tidyr)
library(ggplot2)
library(here)
```
```{r import donnees, echo = FALSE}
full <- read.csv(file.path(
here(), "data",
"aggregated_dataframe_suppdiplome.csv"
))
full[["ine"]] <- as.factor(full[["ine"]])
full[["cours"]] <- as.factor(full[["cours"]])
full[["ects"]] <- as.numeric(full[["ects"]])
full[["annee"]] <- as.factor(full[["annee"]])
full[["type_annee"]] <- as.factor(full[["type_annee"]])
full[["bloc"]] <- as.factor(full[["bloc"]])
full[["parcours"]] <- as.factor(full[["parcours"]])
full[["domaine2A"]] <- as.factor(full[["domaine2A"]])
full[["dominante3A"]] <- as.factor(full[["dominante3A"]])
```
```{r differents_dataframe, echo = FALSE}
df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]), ]
df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 1"), ]
df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 2"), ]
df_count <- df_ue_choix %>%
group_by(cours, bloc) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(freq = n / sum(n))
par_dominante_effectif <- full %>% group_by(dominante3A) %>%
select(-c(annee,type_annee,bloc,cours,ects)) %>%
distinct() %>% count(sort = TRUE)
par_domaine_effectif <- full %>% group_by(domaine2A) %>%
select(-c(annee,type_annee,bloc,cours,ects)) %>%
distinct() %>% count(sort = TRUE)
par_parcours_effectif <- full %>% group_by(parcours) %>%
select(-c(annee,type_annee,bloc,cours,ects)) %>%
distinct() %>% count(sort = TRUE)
```
```{r, echo = FALSE}
ggplot(df_count %>% filter(n > 20)) +
aes(x = reorder(cours, n), y = reorder(n, n)) +
geom_bar(stat = "identity", width = 1, aes(fill = .data$bloc)) +
scale_x_discrete() +
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
coord_flip()
```
```{r, echo = FALSE}
ggplot(par_domaine_effectif) +
aes(x = reorder(domaine2A, n), y = reorder(n, n)) +
geom_bar(stat = "identity", width = 1, aes(fill = .data$domaine2A)) +
scale_x_discrete() +
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
coord_flip() +
theme(legend.position = "none")
```
```{r, echo = FALSE}
ggplot(par_dominante_effectif) +
aes(x = reorder(dominante3A, n), y = reorder(n, n)) +
geom_bar(stat = "identity", width = 1, aes(fill = .data$dominante3A)) +
scale_x_discrete() +
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
coord_flip() +
theme(legend.position = "none")
```
```{r , echo = FALSE}
ggplot(par_parcours_effectif) +
aes(x = reorder(parcours, n), y = reorder(n, n)) +
geom_bar(stat = "identity", width = 1, aes(fill = .data$parcours)) +
scale_x_discrete() +
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
coord_flip() +
theme(legend.position = "none")
```

414
point.html Normal file

File diff suppressed because one or more lines are too long