From 0809cb6d65d9ff05c21020ba3c60b367753ae405 Mon Sep 17 00:00:00 2001 From: Louis Date: Thu, 6 Nov 2025 10:50:37 +0100 Subject: [PATCH] Ajout semaine 45 --- suivi/2025-45/2025-45.qmd | 110 ++++++++++++++++++++++++++ suivi/2025-45/references.bib | 144 +++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 suivi/2025-45/2025-45.qmd create mode 100644 suivi/2025-45/references.bib diff --git a/suivi/2025-45/2025-45.qmd b/suivi/2025-45/2025-45.qmd new file mode 100644 index 0000000..d8fe795 --- /dev/null +++ b/suivi/2025-45/2025-45.qmd @@ -0,0 +1,110 @@ +--- +title: "Bilan semaine 45 2025 : 03 novembre - 06 novembre" +categories: [colBiSBM, inférence, GNN] +date: 2025 11 03 +date-modified: last-modified +bibliography: references.bib +--- + +## TODO List + +- Finir le papier : + - Re-structurer le plan, mon plan, Donnet et Barbillon, échelle méso et comparaison inter réseau et noeuds non partagés. + - ✅ Partie Baldock: Ajouter l'ordre des modèles préférés et vérifier mais BICLsep < BICL pirho < BICL iid + - ✅ Toutes les simus en annexe. Envoyer Info transfer en annexe et remplacer par Network partitioning + +- Maitriser graph-tool de Peixoto pour essayer d'utiliser l'arbre taxonomique sur graphe de cooccurrence inféré par SparCC + +- Maitriser SparCC + +- Faire LBM sur niveau taxonomique grossier, initialiser avec le résultat pour un niveau plus fin et ainsi de suite. + +- Clustering unipartite j'ai cassé une fonction de distance à vérifier et réparer + +- Pour clustering de collections sur données ~~réelles~~ : + → L'intuition de Pierre semble être confirmée, les dissimilarités semblent arrêter de varier sensiblement pour de grandes valeurs $(Q_1,Q_2)$. + +- 👶 (délégué à stagiaire) Clustering sur Doré : + - Regarder pour les couples date+nom les études et le nombre de réseaux analysables (Possible demander à Élisa) + - ⌛ Chamberlain et al semble intéressant à regarder ! 
Voir le Rmarkdown + - Clusteriser sur la base des noms et voir parmi les réseaux européens (désagrégés ?) + - Si M > 10, alors voir si je retrouve les mêmes résultats que dans les études. + + - Regarder *Largest gap* sur réseaux Doré + + - Essayer *clustering* sur `supinfo` + +### Inférence et microbes + +#### Bibliographie: à lire, à faire + +- Lire article multi-niveaux Saint-Clair +- 🆕⌛ Papier Julie Negative Binomiale +- 🆕 🔎 Trouver des papiers: + - LBM Negative Binomial + - Network inference through sample comparison + +#### Réflexion + +- easy16s : se renseigner sur + - $\alpha$, $\beta$ diversité + - Heatmap +- Regarder **SPARTA** Rennes +- Écrire et étudier les modèles pour différents niveaux taxonomiques. +- 🆕 Regarder NetCoMi +- 🆕 Regarder OneNet car agrégation plus robuste +- 🆕 Réfléchir au sens d'agréger les données ou de les diviser + +#### Écrire et faire tourner + +- Lancer *colBiSBM* sur $OTU\times Sample$ → problème du chargement en mémoire des données à voir +- Lancer *colSBM* sur $OTU\times OTU$ +- TabNet pratiquer les [exercices](https://github.com/cregouby/Tutoriel_torch) +- 🆕 SparCC à différents niveaux +- 🆕 SBM à différents niveaux +- 🆕⌛ Tree-PLN à différents niveaux + + +#### Causalité + +Plus sur le temps long, à regarder + +- GT causalité +- Daria Bystrova lire présentation @bystrovaCausalDiscovery (Meek rules, V-structure) + +## A discuter + +- 🆕 Voir pour des Réseaux / GDR où aller +- 🆕 Chercher des cours à suivre + +## Biblio à faire + +- Regarder Transport optimal graphes bipartis. + + +## Lectures en cours 📚 + +### HDR Vincent Brault + +- ⌛ Chap 2 : Creuser l'idée de maximiser l'énergie libre, très intéressant regarder le critère CARI et lire Robert et al 2021. Actuellement p32 du manuscrit +- Chap 3 + +### OT +- ⌛ @mazeletUnsupervisedLearningOptimal Intéressant pour le transport optimal entre graphes de tailles différentes | Regarder si régularisation entropique ne marche pas bien pour le graphe. 
+- ⌛ @nennaLecture2Entropic Pour comprendre le problème d'OT régularisé par l'entropie. +- ⌛ @nennaLecture1Monge + +### Inférence de graphes + +- ⌛ @aitchisonStatisticalAnalysisCompositional1982a, en cours + +- ❗📖 @payneFiniteMixturesMultivariate2023 sur MixMPLN + +### Causalité + +- ❗📖 @bystrovaCausalDiscovery + +### Largest Gaps + +- ❗📖 @braultFastConsistentAlgorithm2023 +- ❗📖 @channarondClassificationEstimationStochastic2012 le papier qui introduit le *Largest Gaps* \ No newline at end of file diff --git a/suivi/2025-45/references.bib b/suivi/2025-45/references.bib new file mode 100644 index 0000000..1de96d9 --- /dev/null +++ b/suivi/2025-45/references.bib @@ -0,0 +1,144 @@ +@article{mazeletUnsupervisedLearningOptimal, + title = {Unsupervised {{Learning}} for {{Optimal Transport}} Plan Prediction between Unbalanced Graphs}, + author = {Mazelet, Sonia and Flamary, Rémi and Thirion, Bertrand}, + abstract = {Optimal transport between graphs, based on Gromov-Wasserstein and other extensions, is a powerful tool for comparing and aligning graph structures. However, solving the associated non-convex optimization problems is computationally expensive, which limits the scalability of these methods to large graphs. In this work, we present Unbalanced Learning of Optimal Transport (ULOT), a deep learning method that predicts optimal transport plans between two graphs. Our method is trained by minimizing the fused unbalanced Gromov-Wasserstein (FUGW) loss. We propose a novel neural architecture with cross-attention that is conditioned on the FUGW tradeoff hyperparameters. We evaluate ULOT on synthetic stochastic block model (SBM) graphs and on real cortical surface data obtained from fMRI. ULOT predicts transport plans with competitive loss up to two orders of magnitude faster than classical solvers. Furthermore, the predicted plan can be used as a warm start for classical solvers to accelerate their convergence. 
Finally, the predicted transport plan is fully differentiable with respect to the graph inputs and FUGW hyperparameters, enabling the optimization of functionals of the ULOT plan.}, + langid = {english}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-06-11T09:08:09.864Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/HPZEYMM9/Mazelet et al. - Unsupervised Learning for Optimal Transport plan prediction between unbalanced graphs.pdf} +} + +@article{nennaLecture2Entropic, + title = {Lecture 2: {{Entropic Optimal Transport}}}, + author = {Nenna, Luca}, + langid = {english}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-06-11T16:06:28.547Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/WGFIISDB/Nenna - Lecture 2 Entropic Optimal Transport.pdf} +} + +@article{nennaLecture1Monge, + title = {Lecture 1 {{Monge}} and {{Kantorovich}} Problems: From Primal to Dual}, + author = {Nenna, Luca}, + langid = {english}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-06-13T09:24:13.832Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/7LVQPD6D/Nenna - Lecture 1 Monge and Kantorovich problems from primal to dual.pdf} +} + +@article{Morton2021.11.09.467939, + title = {Scalable Estimation of Microbial Co-Occurrence Networks with {{Variational Autoencoders}}}, + author = {Morton, James T. 
and Silverman, Justin and Tikhonov, Gleb and Lähdesmäki, Harri and Bonneau, Rich}, + date = {2021}, + journaltitle = {bioRxiv : the preprint server for biology}, + shortjournal = {bioRxiv}, + eprint = {https://www.biorxiv.org/content/early/2021/11/11/2021.11.09.467939.full.pdf}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2021.11.09.467939}, + url = {https://www.biorxiv.org/content/early/2021/11/11/2021.11.09.467939}, + abstract = {Estimating microbe-microbe interactions is critical for understanding the ecological laws governing microbial communities. Rapidly decreasing sequencing costs have promised new opportunities to estimate microbe-microbe interactions across thousands of uncultured, unknown microbes. However, typical microbiome datasets are very high dimensional and accurate estimation of microbial correlations requires tens of thousands of samples, exceeding the computational capabilities of existing methodologies. Furthermore, the vast majority of microbiome studies collect compositional metagenomics data which enforces a negative bias when computing microbe-microbe correlations. The Multinomial Logistic Normal (MLN) distribution has been shown to be effective at inferring microbe-microbe correlations, however scalable Bayesian inference of these distributions has remained elusive. Here, we show that carefully constructed Variational Autoencoders (VAEs) augmented with the Isometric Log-ratio (ILR) transform can estimate low-rank MLN distributions thousands of times faster than existing methods. These VAEs can be trained on tens of thousands of samples, enabling co-occurrence inference across tens of thousands of microbes without regularization. 
The latent embedding distances computed from these VAEs are competitive with existing beta-diversity methods across a variety of mouse and human microbiome classification and regression tasks, with notable improvements on longitudinal studies. Competing Interest Statement: The authors have declared no competing interest.}, + elocation-id = {2021.11.09.467939}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-06-30T14:17:29.518Z} +} +@article{aitchisonStatisticalAnalysisCompositional1982a, + title = {The {{Statistical Analysis}} of {{Compositional Data}}}, + author = {Aitchison, J.}, + date = {1982}, + journaltitle = {Journal of the Royal Statistical Society. Series B (Methodological)}, + volume = {44}, + number = {2}, + eprint = {2345821}, + eprinttype = {jstor}, + pages = {139--177}, + publisher = {[Royal Statistical Society, Oxford University Press]}, + issn = {0035-9246}, + url = {https://www.jstor.org/stable/2345821}, + urldate = {2025-05-07}, + abstract = {The simplex plays an important role as sample space in many practical situations where compositional data, in the form of proportions of some whole, require interpretation. It is argued that the statistical analysis of such data has proved difficult because of a lack both of concepts of independence and of rich enough parametric classes of distributions in the simplex. A variety of independence hypotheses are introduced and interrelated, and new classes of transformed-normal distributions in the simplex are provided as models within which the independence hypotheses can be tested through standard theory of parametric hypothesis testing. 
The new concepts and statistical methodology are illustrated by a number of applications.}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-05-07T07:43:38.485Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/S97URH4Y/Aitchison - 1982 - The Statistical Analysis of Compositional Data.pdf} +} +@online{payneFiniteMixturesMultivariate2023, + title = {Finite {{Mixtures}} of {{Multivariate Poisson-Log Normal Factor Analyzers}} for {{Clustering Count Data}}}, + author = {Payne, Andrea and Silva, Anjali and Rothstein, Steven J. and McNicholas, Paul D. and Subedi, Sanjeena}, + date = {2023-11-13}, + eprint = {2311.07762}, + eprinttype = {arXiv}, + eprintclass = {stat}, + doi = {10.48550/arXiv.2311.07762}, + url = {http://arxiv.org/abs/2311.07762}, + urldate = {2025-07-02}, + abstract = {A mixture of multivariate Poisson-log normal factor analyzers is introduced by imposing constraints on the covariance matrix, which resulted in flexible models for clustering purposes. In particular, a class of eight parsimonious mixture models based on the mixtures of factor analyzers model are introduced. Variational Gaussian approximation is used for parameter estimation, and information criteria are used for model selection. The proposed models are explored in the context of clustering discrete data arising from RNA sequencing studies. Using real and simulated data, the models are shown to give favourable clustering performance. The GitHub R package for this work is available at https://github.com/anjalisilva/mixMPLNFA and is released under the open-source MIT license.}, + pubstate = {prepublished}, + keywords = {/unread,Statistics - Computation,Statistics - Machine Learning,Statistics - Methodology}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-07-02T09:31:47.579Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/BXVPEIDD/Payne et al. 
- 2023 - Finite Mixtures of Multivariate Poisson-Log Normal Factor Analyzers for Clustering Count Data.pdf;/home/louis/snap/zotero-snap/common/Zotero/storage/L5DAS5C2/2311.html} +} +@unpublished{bystrovaCausalDiscovery, + title = {Causal Discovery}, + author = {Bystrova, Daria}, + langid = {english}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-07-02T09:34:39.476Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/NQE5DY92/Bystrova - Causal discovery.pdf} +} + +@online{braultFastConsistentAlgorithm2023, + title = {Fast and {{Consistent Algorithm}} for the {{Latent Block Model}}}, + author = {Brault, Vincent and Channarond, Antoine}, + date = {2023-03-09}, + eprint = {1610.09005}, + eprinttype = {arXiv}, + eprintclass = {math}, + doi = {10.48550/arXiv.1610.09005}, + url = {http://arxiv.org/abs/1610.09005}, + urldate = {2025-07-09}, + abstract = {The latent block model is used to simultaneously rank the rows and columns of a matrix to reveal a block structure. The algorithms used for estimation are often time consuming. However, recent work shows that the log-likelihood ratios are equivalent under the complete and observed (with unknown labels) models and the groups posterior distribution to converge as the size of the data increases to a Dirac mass located at the actual groups configuration. Based on these observations, the algorithm Largest Gaps is proposed in this paper to perform clustering using only the marginals of the matrix, when the number of blocks is very small with respect to the size of the whole matrix in the case of binary data. In addition, a model selection method is incorporated with a proof of its consistency. 
Thus, this paper shows that studying simplistic configurations (few blocks compared to the size of the matrix or very contrasting blocks) with complex algorithms is useless since the marginals already give very good parameter and classification estimates.}, + langid = {english}, + pubstate = {prepublished}, + keywords = {/unread,Mathematics - Statistics Theory,Statistics - Computation,Statistics - Statistics Theory}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-07-09T13:58:53.533Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/5LXC6Y68/Brault et Channarond - 2023 - Fast and Consistent Algorithm for the Latent Block Model.pdf} +} + +@article{braultGeneralisationLalgorithmeLargest, + title = {Généralisation de l'algorithme Largest Gaps pour le modèle des blocs latents non-paramétrique}, + author = {Brault, Vincent and Channarond, Antoine and Robert, Valérie}, + abstract = {The latent block model assumes there exists a distribution for each crossing between an object cluster and a variable cluster of a data table ; the cells are supposed to be independent conditionally to the choice of these clusters. To estimate the model parameters, most of algorithms are time consuming. Brault and Channarond (2016) proposed to adapt the Largest Gaps algorithm which consists in using the margins. They thus obtained a procedure which estimates all the model parameters consistently but requires a large number of observations. In this talk, we will extend the procedure to the case of any distribution having a second order moment by using an EM algorithm estimation.}, + langid = {french}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-07-09T12:29:43.098Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/GIUNC4L3/Brault et al. 
- Généralisation de l'algorithme Largest Gaps pour le modèle des blocs latents non-paramétrique.pdf} +} + +@article{channarondClassificationEstimationStochastic2012, + title = {Classification and Estimation in the {{Stochastic Blockmodel}} Based on the Empirical Degrees}, + author = {Channarond, Antoine and Daudin, Jean-Jacques and Robin, Stéphane}, + date = {2012-01-01}, + journaltitle = {Electronic Journal of Statistics}, + shortjournal = {Electron. J. Statist.}, + volume = {6}, + publisher = {Institute of Mathematical Statistics}, + issn = {1935-7524}, + doi = {10.1214/12-ejs753}, + url = {https://projecteuclid.org/journals/electronic-journal-of-statistics/volume-6/issue-none/Classification-and-estimation-in-the-Stochastic-Blockmodel-based-on-the/10.1214/12-EJS753.full}, + urldate = {2025-07-09}, + abstract = {The Stochastic Blockmodel [16] is a mixture model for heterogeneous network data. Unlike the usual statistical framework, new nodes give additional information about the previous ones in this model. Thereby the distribution of the degrees concentrates in points conditionally on the node class. We show under a mild assumption that classification, estimation and model selection can actually be achieved with no more than the empirical degree data. We provide an algorithm able to process very large networks and consistent estimators based on it. In particular, we prove a bound of the probability of misclassification of at least one node, including when the number of classes grows.}, + issue = {none}, + langid = {english}, + keywords = {/unread}, + annotation = {Read\_Status: New\\ + Read\_Status\_Date: 2025-07-09T13:59:33.921Z}, + file = {/home/louis/snap/zotero-snap/common/Zotero/storage/8TL8AJ2G/Channarond et al. - 2012 - Classification and estimation in the Stochastic Blockmodel based on the empirical degrees.pdf} +}