Having almost a final form for extraction
This commit is contained in:
parent
94b4e8f836
commit
6952fd0228
3 changed files with 587 additions and 41 deletions
119
extract_data.R
119
extract_data.R
|
|
@ -1,6 +1,6 @@
|
||||||
library(readxl)
|
library(readxl)
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
library(tidyr)
|
library(here)
|
||||||
|
|
||||||
# Supplement_diplome-1.xls ligne 4
|
# Supplement_diplome-1.xls ligne 4
|
||||||
ligne_prenom <- 5
|
ligne_prenom <- 5
|
||||||
|
|
@ -9,66 +9,88 @@ colonne_prenom <- 50
|
||||||
folder_path <- file.path("data", "Bulletins promotion 2023")
|
folder_path <- file.path("data", "Bulletins promotion 2023")
|
||||||
|
|
||||||
df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
|
df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
|
||||||
current_data <- as.data.frame(read_excel(file.path(folder_path, filename), sheet = 1L, col_names = FALSE))
|
current_data <- as.data.frame(read_excel(
|
||||||
|
file.path(
|
||||||
|
folder_path,
|
||||||
|
filename
|
||||||
|
),
|
||||||
|
sheet = 1L,
|
||||||
|
col_names = FALSE
|
||||||
|
))
|
||||||
if (ncol(current_data) == 63L) {
|
if (ncol(current_data) == 63L) {
|
||||||
return(current_data)
|
return(current_data)
|
||||||
}
|
}
|
||||||
}))
|
}))
|
||||||
|
|
||||||
|
# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
|
||||||
indices_numero_ine <- which(df == "Numéro INE", arr.ind = TRUE)
|
# current_data <- as.data.frame(read_excel(file.path(folder_path,
|
||||||
decalage_ine <- c(1, 6)
|
# filename),
|
||||||
|
# sheet = 1L,
|
||||||
# Indices Total ECTS validés
|
# col_names = FALSE))
|
||||||
which(df == "Total Ects validés", arr.ind = TRUE)
|
# if (ncol(current_data) != 63L) {
|
||||||
|
# current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
|
||||||
|
# current_data <- current_data %>% relocate(starts_with("missing"))
|
||||||
|
# colnames(current_data) <- paste0("...", seq(1,63))
|
||||||
|
# current_data
|
||||||
|
# }
|
||||||
|
# }))
|
||||||
|
|
||||||
|
|
||||||
# Indices Ing - 1A
|
not_selected_files <- sapply(list.files(folder_path), function(filename) {
|
||||||
|
current_data <- as.data.frame(read_excel(
|
||||||
|
file.path(
|
||||||
|
folder_path,
|
||||||
|
filename
|
||||||
|
),
|
||||||
|
sheet = 1L,
|
||||||
|
col_names = FALSE
|
||||||
|
))
|
||||||
|
if (ncol(current_data) != 63L) {
|
||||||
|
return(filename)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
indices_cours_Ing_1A <- which(df == "Ing - 1A", arr.ind = TRUE)
|
## A Ajuster pour d'autres tableaux
|
||||||
indices_cours_Ing_2A <- which(df == "Ing - 2A", arr.ind = TRUE)
|
# Les repérages se basent sur la localisation de l'INE
|
||||||
indices_cours_Ing_3A <- which(df == "Ing - 3A", arr.ind = TRUE)
|
indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)
|
||||||
indices_stage_fin_etude <- which((df == "Stage de fin d'études") | (df == "Stage de fin d'études de 3ème année"), arr.ind = TRUE)
|
|
||||||
|
|
||||||
|
# En triant les dataframes au préalable on a les mêmes positions de colonnes
|
||||||
|
# (A quelques NAs près)
|
||||||
|
col_parcours <- 23L
|
||||||
|
col_domaine2A <- 15L
|
||||||
|
col_dominante3A <- 36L
|
||||||
col_ECTS <- 63
|
col_ECTS <- 63
|
||||||
|
|
||||||
# Décalages
|
# Décalages par rapport au texte de l'INE
|
||||||
|
dec_ine_reel <- c(1, 6)
|
||||||
dec_col_cours_ine <- -23
|
dec_col_cours_ine <- -23
|
||||||
dec_row_cours_ine <- 35
|
dec_row_cours_ine <- 35
|
||||||
|
dec_row_parcours_ine <- 32L
|
||||||
|
dec_row_domaine2A_ine <- 29L
|
||||||
|
dec_row_dominante3A_ine <- 31L
|
||||||
|
|
||||||
dec_col_annee <- -13
|
full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
|
||||||
|
|
||||||
dec_row_parcours_ing1A <- -4
|
|
||||||
dec_col_parcours_ing1A <- 2
|
|
||||||
|
|
||||||
dec_ECTS <- 42
|
|
||||||
|
|
||||||
get_row_to_remove_cours <- function(cours) {
|
|
||||||
vec_cours <- cours
|
|
||||||
which((is.na(vec_cours) | (vec_cours == "Intitulé du cours") |
|
|
||||||
(vec_cours == "Ing - 1A") | (vec_cours == "Ing - 2A") |
|
|
||||||
(vec_cours == "Ing - 3A")))
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) {
|
|
||||||
# Trouver l'INE
|
# Trouver l'INE
|
||||||
current_row <- indices_numero_ine[idx, 1]
|
current_row <- indices_texte_ine[idx, 1]
|
||||||
current_col <- indices_numero_ine[idx, 2]
|
current_col <- indices_texte_ine[idx, 2]
|
||||||
|
|
||||||
# Ici si on est au dernier indice on va au bout du tableau et on nettoiera
|
# Ici si on est au dernier indice on va au bout du tableau et on nettoiera
|
||||||
# après
|
# après
|
||||||
next_row <- ifelse(idx != nrow(indices_numero_ine),
|
next_row <- ifelse(idx != nrow(indices_texte_ine),
|
||||||
indices_numero_ine[idx + 1, 1],
|
indices_texte_ine[idx + 1, 1],
|
||||||
nrow(df)
|
nrow(df)
|
||||||
)
|
)
|
||||||
# La colonne ne bouge pas
|
# La colonne ne bouge pas
|
||||||
next_col <- current_col
|
next_col <- current_col
|
||||||
ine <- df[[
|
ine <- df[[
|
||||||
current_row + decalage_ine[[1]],
|
current_row + dec_ine_reel[[1]],
|
||||||
current_col + decalage_ine[[2]]
|
current_col + dec_ine_reel[[2]]
|
||||||
]]
|
]]
|
||||||
|
# Parcours
|
||||||
|
parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
|
||||||
|
domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
|
||||||
|
dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]
|
||||||
|
|
||||||
# Cours
|
# Cours
|
||||||
cours <- df[seq(
|
cours <- df[seq(
|
||||||
current_row + dec_row_cours_ine,
|
current_row + dec_row_cours_ine,
|
||||||
|
|
@ -118,8 +140,8 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
|
||||||
indices_blocs <- which(!is.na(longdata[["ects"]]))
|
indices_blocs <- which(!is.na(longdata[["ects"]]))
|
||||||
noms_blocs <- longdata[indices_blocs, "cours"]
|
noms_blocs <- longdata[indices_blocs, "cours"]
|
||||||
# Créer une nouvelle colonne "bloc" en remplissant les valeurs manquantes
|
# Créer une nouvelle colonne "bloc" en remplissant les valeurs manquantes
|
||||||
longdata$bloc <- NA
|
longdata[["bloc"]] <- NA
|
||||||
longdata$bloc[indices_blocs] <- noms_blocs
|
longdata[["bloc"]][indices_blocs] <- noms_blocs
|
||||||
# Remplir les valeurs manquantes dans la colonne "bloc" en utilisant une boucle
|
# Remplir les valeurs manquantes dans la colonne "bloc" en utilisant une boucle
|
||||||
for (i in 2:nrow(longdata)) {
|
for (i in 2:nrow(longdata)) {
|
||||||
if (is.na(longdata[["bloc"]][i])) {
|
if (is.na(longdata[["bloc"]][i])) {
|
||||||
|
|
@ -127,8 +149,23 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Ajout parcours, domaine2A, dominante3A
|
||||||
|
longdata[["parcours"]] <- parcours
|
||||||
|
longdata[["domaine2A"]] <- domaine2A
|
||||||
|
longdata[["dominante3A"]] <- dominante3A
|
||||||
|
|
||||||
|
longdata[["cours"]] <- as.factor(longdata[["cours"]])
|
||||||
|
longdata[["ine"]] <- as.factor(longdata[["ine"]])
|
||||||
|
longdata[["type_annee"]] <- as.factor(longdata[["type_annee"]])
|
||||||
|
longdata[["annee"]] <- as.factor(longdata[["annee"]])
|
||||||
|
longdata[["bloc"]] <- as.factor(longdata[["bloc"]])
|
||||||
longdata
|
longdata
|
||||||
}))
|
}))
|
||||||
|
|
||||||
|
write.csv(full,
|
||||||
df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]),]
|
file.path(
|
||||||
|
here(), "data",
|
||||||
|
"aggregated_dataframe_suppdiplome.csv"
|
||||||
|
),
|
||||||
|
row.names = FALSE
|
||||||
|
)
|
||||||
95
point.Rmd
Normal file
95
point.Rmd
Normal file
|
|
@ -0,0 +1,95 @@
|
||||||
|
```{r, echo = FALSE}
|
||||||
|
knitr::opts_chunk$set(fig.width=12)
|
||||||
|
```
|
||||||
|
```{r packages, echo = FALSE, include = FALSE}
|
||||||
|
library(dplyr)
|
||||||
|
library(tidyr)
|
||||||
|
library(ggplot2)
|
||||||
|
library(here)
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r import donnees, echo = FALSE}
|
||||||
|
full <- read.csv(file.path(
|
||||||
|
here(), "data",
|
||||||
|
"aggregated_dataframe_suppdiplome.csv"
|
||||||
|
))
|
||||||
|
|
||||||
|
full[["ine"]] <- as.factor(full[["ine"]])
|
||||||
|
full[["cours"]] <- as.factor(full[["cours"]])
|
||||||
|
full[["ects"]] <- as.numeric(full[["ects"]])
|
||||||
|
full[["annee"]] <- as.factor(full[["annee"]])
|
||||||
|
full[["type_annee"]] <- as.factor(full[["type_annee"]])
|
||||||
|
full[["bloc"]] <- as.factor(full[["bloc"]])
|
||||||
|
full[["parcours"]] <- as.factor(full[["parcours"]])
|
||||||
|
full[["domaine2A"]] <- as.factor(full[["domaine2A"]])
|
||||||
|
full[["dominante3A"]] <- as.factor(full[["dominante3A"]])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r differents_dataframe, echo = FALSE}
|
||||||
|
df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]), ]
|
||||||
|
# Drop the "Semestre 1" header rows. A logical mask is used instead of
# `-which(...)`: when nothing matches, `-integer(0)` would select zero rows
# and silently empty the whole data frame. `%in%` never yields NA, so rows
# with a missing `cours` are kept, as before.
df_ue_choix <- df_ue_choix[
  !(df_ue_choix[["cours"]] %in% "UE à choix Semestre 1"),
]
|
||||||
|
# Drop the "Semestre 2" header rows. A logical mask is used instead of
# `-which(...)`: when nothing matches, `-integer(0)` would select zero rows
# and silently empty the whole data frame. `%in%` never yields NA, so rows
# with a missing `cours` are kept, as before.
df_ue_choix <- df_ue_choix[
  !(df_ue_choix[["cours"]] %in% "UE à choix Semestre 2"),
]
|
||||||
|
|
||||||
|
df_count <- df_ue_choix %>%
|
||||||
|
group_by(cours, bloc) %>%
|
||||||
|
# One row per (cours, bloc) with its number of occurrences; drop the grouping
# explicitly so no "regrouping output" message leaks into the knitted report.
summarise(n = n(), .groups = "drop") %>%
|
||||||
|
ungroup() %>%
|
||||||
|
mutate(freq = n / sum(n))
|
||||||
|
|
||||||
|
par_dominante_effectif <- full %>% group_by(dominante3A) %>%
|
||||||
|
select(-c(annee,type_annee,bloc,cours,ects)) %>%
|
||||||
|
distinct() %>% count(sort = TRUE)
|
||||||
|
|
||||||
|
par_domaine_effectif <- full %>% group_by(domaine2A) %>%
|
||||||
|
select(-c(annee,type_annee,bloc,cours,ects)) %>%
|
||||||
|
distinct() %>% count(sort = TRUE)
|
||||||
|
|
||||||
|
par_parcours_effectif <- full %>% group_by(parcours) %>%
|
||||||
|
select(-c(annee,type_annee,bloc,cours,ects)) %>%
|
||||||
|
distinct() %>% count(sort = TRUE)
|
||||||
|
|
||||||
|
```
|
||||||
|
```{r, echo = FALSE}
|
||||||
|
|
||||||
|
ggplot(df_count %>% filter(n > 20)) +
|
||||||
|
# Courses ordered by count on x; the raw count goes on y so bar lengths are
# proportional to n (reorder(n, n) made y a factor, i.e. evenly spaced ranks).
aes(x = reorder(cours, n), y = n) +
|
||||||
|
geom_bar(stat = "identity", width = 1, aes(fill = .data$bloc)) +
|
||||||
|
scale_x_discrete() +
|
||||||
|
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||||
|
coord_flip()
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r, echo = FALSE}
|
||||||
|
|
||||||
|
ggplot(par_domaine_effectif) +
|
||||||
|
# Domains ordered by headcount on x; the raw count goes on y so bar lengths
# are proportional to n (reorder(n, n) made y a factor of evenly spaced ranks).
aes(x = reorder(domaine2A, n), y = n) +
|
||||||
|
geom_bar(stat = "identity", width = 1, aes(fill = .data$domaine2A)) +
|
||||||
|
scale_x_discrete() +
|
||||||
|
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||||
|
coord_flip() +
|
||||||
|
theme(legend.position = "none")
|
||||||
|
```
|
||||||
|
```{r, echo = FALSE}
|
||||||
|
|
||||||
|
ggplot(par_dominante_effectif) +
|
||||||
|
# Majors ordered by headcount on x; the raw count goes on y so bar lengths
# are proportional to n (reorder(n, n) made y a factor of evenly spaced ranks).
aes(x = reorder(dominante3A, n), y = n) +
|
||||||
|
geom_bar(stat = "identity", width = 1, aes(fill = .data$dominante3A)) +
|
||||||
|
scale_x_discrete() +
|
||||||
|
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||||
|
coord_flip() +
|
||||||
|
theme(legend.position = "none")
|
||||||
|
```
|
||||||
|
|
||||||
|
```{r , echo = FALSE}
|
||||||
|
ggplot(par_parcours_effectif) +
|
||||||
|
# Tracks ordered by headcount on x; the raw count goes on y so bar lengths
# are proportional to n (reorder(n, n) made y a factor of evenly spaced ranks).
aes(x = reorder(parcours, n), y = n) +
|
||||||
|
geom_bar(stat = "identity", width = 1, aes(fill = .data$parcours)) +
|
||||||
|
scale_x_discrete() +
|
||||||
|
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||||
|
coord_flip() +
|
||||||
|
theme(legend.position = "none")
|
||||||
|
```
|
||||||
414
point.html
Normal file
414
point.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Reference in a new issue