Having almost a final form for extraction
This commit is contained in:
parent
94b4e8f836
commit
6952fd0228
3 changed files with 587 additions and 41 deletions
119
extract_data.R
119
extract_data.R
|
|
@ -1,6 +1,6 @@
|
|||
library(readxl)
|
||||
library(dplyr)
|
||||
library(tidyr)
|
||||
library(here)
|
||||
|
||||
# Supplement_diplome-1.xls ligne 4
|
||||
ligne_prenom <- 5
|
||||
|
|
@ -9,66 +9,88 @@ colonne_prenom <- 50
|
|||
folder_path <- file.path("data", "Bulletins promotion 2023")
|
||||
|
||||
df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
|
||||
current_data <- as.data.frame(read_excel(file.path(folder_path, filename), sheet = 1L, col_names = FALSE))
|
||||
current_data <- as.data.frame(read_excel(
|
||||
file.path(
|
||||
folder_path,
|
||||
filename
|
||||
),
|
||||
sheet = 1L,
|
||||
col_names = FALSE
|
||||
))
|
||||
if (ncol(current_data) == 63L) {
|
||||
return(current_data)
|
||||
}
|
||||
}))
|
||||
|
||||
|
||||
indices_numero_ine <- which(df == "Numéro INE", arr.ind = TRUE)
|
||||
decalage_ine <- c(1, 6)
|
||||
|
||||
# Indices Total ECTS validés
|
||||
which(df == "Total Ects validés", arr.ind = TRUE)
|
||||
# df <- do.call("rbind", lapply(list.files(folder_path), function(filename) {
|
||||
# current_data <- as.data.frame(read_excel(file.path(folder_path,
|
||||
# filename),
|
||||
# sheet = 1L,
|
||||
# col_names = FALSE))
|
||||
# if (ncol(current_data) != 63L) {
|
||||
# current_data[,paste0("missing",seq(1,63-ncol(current_data)))] <- rep(NA, 63-ncol(current_data))
|
||||
# current_data <- current_data %>% relocate(starts_with("missing"))
|
||||
# colnames(current_data) <- paste0("...", seq(1,63))
|
||||
# current_data
|
||||
# }
|
||||
# }))
|
||||
|
||||
|
||||
# Indices Ing - 1A
|
||||
not_selected_files <- sapply(list.files(folder_path), function(filename) {
|
||||
current_data <- as.data.frame(read_excel(
|
||||
file.path(
|
||||
folder_path,
|
||||
filename
|
||||
),
|
||||
sheet = 1L,
|
||||
col_names = FALSE
|
||||
))
|
||||
if (ncol(current_data) != 63L) {
|
||||
return(filename)
|
||||
}
|
||||
})
|
||||
|
||||
indices_cours_Ing_1A <- which(df == "Ing - 1A", arr.ind = TRUE)
|
||||
indices_cours_Ing_2A <- which(df == "Ing - 2A", arr.ind = TRUE)
|
||||
indices_cours_Ing_3A <- which(df == "Ing - 3A", arr.ind = TRUE)
|
||||
indices_stage_fin_etude <- which((df == "Stage de fin d'études") | (df == "Stage de fin d'études de 3ème année"), arr.ind = TRUE)
|
||||
## A Ajuster pour d'autres tableaux
|
||||
# Les repérages se base sur la localisation de l'INE
|
||||
indices_texte_ine <- which(df == "Numéro INE", arr.ind = TRUE)
|
||||
|
||||
# En triant les dataframes au préalables on a les mêmes positions de colonnes
|
||||
# (A quelques NAs près)
|
||||
col_parcours <- 23L
|
||||
col_domaine2A <- 15L
|
||||
col_dominante3A <- 36L
|
||||
col_ECTS <- 63
|
||||
|
||||
# Décalages
|
||||
|
||||
# Décalages par rapport au texte de l'INE
|
||||
dec_ine_reel <- c(1, 6)
|
||||
dec_col_cours_ine <- -23
|
||||
dec_row_cours_ine <- 35
|
||||
dec_row_parcours_ine <- 32L
|
||||
dec_row_domaine2A_ine <- 29L
|
||||
dec_row_dominante3A_ine <- 31L
|
||||
|
||||
dec_col_annee <- -13
|
||||
|
||||
dec_row_parcours_ing1A <- -4
|
||||
dec_col_parcours_ing1A <- 2
|
||||
|
||||
dec_ECTS <- 42
|
||||
|
||||
get_row_to_remove_cours <- function(cours) {
|
||||
vec_cours <- cours
|
||||
which((is.na(vec_cours) | (vec_cours == "Intitulé du cours") |
|
||||
(vec_cours == "Ing - 1A") | (vec_cours == "Ing - 2A") |
|
||||
(vec_cours == "Ing - 3A")))
|
||||
}
|
||||
|
||||
|
||||
full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx) {
|
||||
full <- do.call("rbind", lapply(seq_len(nrow(indices_texte_ine)), function(idx) {
|
||||
# Trouver l'INE
|
||||
current_row <- indices_numero_ine[idx, 1]
|
||||
current_col <- indices_numero_ine[idx, 2]
|
||||
current_row <- indices_texte_ine[idx, 1]
|
||||
current_col <- indices_texte_ine[idx, 2]
|
||||
|
||||
# Ici si on est au dernier indice on va au bout du tableau et on nettoiera
|
||||
# après
|
||||
next_row <- ifelse(idx != nrow(indices_numero_ine),
|
||||
indices_numero_ine[idx + 1, 1],
|
||||
next_row <- ifelse(idx != nrow(indices_texte_ine),
|
||||
indices_texte_ine[idx + 1, 1],
|
||||
nrow(df)
|
||||
)
|
||||
# La colonne ne bouge pas
|
||||
next_col <- current_col
|
||||
ine <- df[[
|
||||
current_row + decalage_ine[[1]],
|
||||
current_col + decalage_ine[[2]]
|
||||
current_row + dec_ine_reel[[1]],
|
||||
current_col + dec_ine_reel[[2]]
|
||||
]]
|
||||
# Parcours
|
||||
parcours <- df[current_row + dec_row_parcours_ine, col_parcours]
|
||||
domaine2A <- df[current_row + dec_row_domaine2A_ine, col_domaine2A]
|
||||
dominante3A <- df[current_row + dec_row_dominante3A_ine, col_dominante3A]
|
||||
|
||||
# Cours
|
||||
cours <- df[seq(
|
||||
current_row + dec_row_cours_ine,
|
||||
|
|
@ -118,8 +140,8 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
|
|||
indices_blocs <- which(!is.na(longdata[["ects"]]))
|
||||
noms_blocs <- longdata[indices_blocs, "cours"]
|
||||
# Créer une nouvelle colonne "bloc" en remplissant les valeurs manquantes
|
||||
longdata$bloc <- NA
|
||||
longdata$bloc[indices_blocs] <- noms_blocs
|
||||
longdata[["bloc"]] <- NA
|
||||
longdata[["bloc"]][indices_blocs] <- noms_blocs
|
||||
# Remplir les valeurs manquantes dans la colonne "bloc" en utilisant une boucle
|
||||
for (i in 2:nrow(longdata)) {
|
||||
if (is.na(longdata[["bloc"]][i])) {
|
||||
|
|
@ -127,8 +149,23 @@ full <- do.call("rbind", lapply(seq_len(nrow(indices_numero_ine)), function(idx)
|
|||
}
|
||||
}
|
||||
|
||||
# Ajout parcours, domaine2A, dominante3A
|
||||
longdata[["parcours"]] <- parcours
|
||||
longdata[["domaine2A"]] <- domaine2A
|
||||
longdata[["dominante3A"]] <- dominante3A
|
||||
|
||||
longdata[["cours"]] <- as.factor(longdata[["cours"]])
|
||||
longdata[["ine"]] <- as.factor(longdata[["ine"]])
|
||||
longdata[["type_annee"]] <- as.factor(longdata[["type_annee"]])
|
||||
longdata[["annee"]] <- as.factor(longdata[["annee"]])
|
||||
longdata[["bloc"]] <- as.factor(longdata[["bloc"]])
|
||||
longdata
|
||||
}))
|
||||
|
||||
|
||||
df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]),]
|
||||
write.csv(full,
|
||||
file.path(
|
||||
here(), "data",
|
||||
"aggregated_dataframe_suppdiplome.csv"
|
||||
),
|
||||
row.names = FALSE
|
||||
)
|
||||
95
point.Rmd
Normal file
95
point.Rmd
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
```{r, echo = FALSE}
|
||||
knitr::opts_chunk$set(fig.width=12)
|
||||
```
|
||||
```{r packages, echo = FALSE, include = FALSE}
|
||||
library(dplyr)
|
||||
library(tidyr)
|
||||
library(ggplot2)
|
||||
library(here)
|
||||
```
|
||||
|
||||
```{r import donnees, echo = FALSE}
|
||||
full <- read.csv(file.path(
|
||||
here(), "data",
|
||||
"aggregated_dataframe_suppdiplome.csv"
|
||||
))
|
||||
|
||||
full[["ine"]] <- as.factor(full[["ine"]])
|
||||
full[["cours"]] <- as.factor(full[["cours"]])
|
||||
full[["ects"]] <- as.numeric(full[["ects"]])
|
||||
full[["annee"]] <- as.factor(full[["annee"]])
|
||||
full[["type_annee"]] <- as.factor(full[["type_annee"]])
|
||||
full[["bloc"]] <- as.factor(full[["bloc"]])
|
||||
full[["parcours"]] <- as.factor(full[["parcours"]])
|
||||
full[["domaine2A"]] <- as.factor(full[["domaine2A"]])
|
||||
full[["dominante3A"]] <- as.factor(full[["dominante3A"]])
|
||||
|
||||
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r differents_dataframe, echo = FALSE}
|
||||
df_ue_choix <- full[grepl("UE à choix *", full[["bloc"]]), ]
|
||||
df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 1"), ]
|
||||
df_ue_choix <- df_ue_choix[-which(df_ue_choix[["cours"]] == "UE à choix Semestre 2"), ]
|
||||
|
||||
df_count <- df_ue_choix %>%
|
||||
group_by(cours, bloc) %>%
|
||||
summarise(n = n()) %>%
|
||||
ungroup() %>%
|
||||
mutate(freq = n / sum(n))
|
||||
|
||||
par_dominante_effectif <- full %>% group_by(dominante3A) %>%
|
||||
select(-c(annee,type_annee,bloc,cours,ects)) %>%
|
||||
distinct() %>% count(sort = TRUE)
|
||||
|
||||
par_domaine_effectif <- full %>% group_by(domaine2A) %>%
|
||||
select(-c(annee,type_annee,bloc,cours,ects)) %>%
|
||||
distinct() %>% count(sort = TRUE)
|
||||
|
||||
par_parcours_effectif <- full %>% group_by(parcours) %>%
|
||||
select(-c(annee,type_annee,bloc,cours,ects)) %>%
|
||||
distinct() %>% count(sort = TRUE)
|
||||
|
||||
```
|
||||
```{r, echo = FALSE}
|
||||
|
||||
ggplot(df_count %>% filter(n > 20)) +
|
||||
aes(x = reorder(cours, n), y = reorder(n, n)) +
|
||||
geom_bar(stat = "identity", width = 1, aes(fill = .data$bloc)) +
|
||||
scale_x_discrete() +
|
||||
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||
coord_flip()
|
||||
```
|
||||
|
||||
```{r, echo = FALSE}
|
||||
|
||||
ggplot(par_domaine_effectif) +
|
||||
aes(x = reorder(domaine2A, n), y = reorder(n, n)) +
|
||||
geom_bar(stat = "identity", width = 1, aes(fill = .data$domaine2A)) +
|
||||
scale_x_discrete() +
|
||||
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||
coord_flip() +
|
||||
theme(legend.position = "none")
|
||||
```
|
||||
```{r, echo = FALSE}
|
||||
|
||||
ggplot(par_dominante_effectif) +
|
||||
aes(x = reorder(dominante3A, n), y = reorder(n, n)) +
|
||||
geom_bar(stat = "identity", width = 1, aes(fill = .data$dominante3A)) +
|
||||
scale_x_discrete() +
|
||||
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||
coord_flip() +
|
||||
theme(legend.position = "none")
|
||||
```
|
||||
|
||||
```{r , echo = FALSE}
|
||||
ggplot(par_parcours_effectif) +
|
||||
aes(x = reorder(parcours, n), y = reorder(n, n)) +
|
||||
geom_bar(stat = "identity", width = 1, aes(fill = .data$parcours)) +
|
||||
scale_x_discrete() +
|
||||
theme(axis.text.y = element_text(angle = 0, vjust = .5, hjust = 1)) +
|
||||
coord_flip() +
|
||||
theme(legend.position = "none")
|
||||
```
|
||||
414
point.html
Normal file
414
point.html
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Reference in a new issue