Skip to content

Commit a679dc3

Browse files
authored
Merge pull request #53 from bigbio/feat/diann-2.5.0
feat: add DIA-NN 2.5.0 support with model fine-tuning documentation
2 parents f9298e8 + 51a3000 commit a679dc3

File tree

13 files changed

+196
-9
lines changed

13 files changed

+196
-9
lines changed

AGENTS.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ This is **non-negotiable**. All code must pass formatting and style checks befor
2424

2525
- Built with Nextflow DSL2
2626
- DIA-NN for peptide/protein identification and quantification
27-
- Supports DIA-NN v1.8.1, v2.1.0, and v2.2.0 (latest)
27+
- Supports DIA-NN v1.8.1, v2.1.0, v2.2.0, v2.3.2, and v2.5.0 (latest)
2828
- QuantUMS quantification method (DIA-NN >= 1.9.2)
2929
- Parquet-native output with decoy reporting (DIA-NN >= 2.0)
3030
- MSstats-compatible output generation (via quantms-utils conversion, no MSstats analysis)
@@ -44,7 +44,7 @@ This is **non-negotiable**. All code must pass formatting and style checks befor
4444
- **nf-test**: Testing framework (config: `nf-test.config`)
4545
- **nf-core tools**: Pipeline standards and linting
4646
- **Containers**: Docker/Singularity/Apptainer/Podman (Conda deprecated)
47-
- **DIA-NN**: Primary search engine (versions 1.8.1 through 2.2.0)
47+
- **DIA-NN**: Primary search engine (versions 1.8.1 through 2.5.0)
4848

4949
### Key Configuration Files
5050

@@ -116,6 +116,9 @@ The pipeline executes the following steps:
116116
| Parquet output format | 2.0 | (automatic in 2.0+) |
117117
| Decoy reporting | 2.0 | `--report_decoys true` |
118118
| Native .raw on Linux | 2.1.0 | (automatic) |
119+
| InfinDIA | 2.3.0 | `--enable_infin_dia` |
120+
| DDA support | 2.3.2 | `--dda true` |
121+
| DL model fine-tuning & selection | 2.5.0 | `--extra_args` |
119122

120123
---
121124

@@ -194,6 +197,8 @@ These apply on top of test profiles to override the DIA-NN container version:
194197
| `diann_v1_8_1` | `biocontainers/diann:v1.8.1_cv1` | none |
195198
| `diann_v2_1_0` | `ghcr.io/bigbio/diann:2.1.0` | GHCR |
196199
| `diann_v2_2_0` | `ghcr.io/bigbio/diann:2.2.0` | GHCR |
200+
| `diann_v2_3_2` | `ghcr.io/bigbio/diann:2.3.2` | GHCR |
201+
| `diann_v2_5_0` | `ghcr.io/bigbio/diann:2.5.0` | GHCR |
197202

198203
### CI Workflows
199204

CITATION.cff

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
cff-version: 1.2.0
2+
title: "quantmsdiann"
3+
message: "If you use quantmsdiann, please cite the quantms paper."
4+
type: software
5+
license: MIT
6+
repository-code: https://github.com/bigbio/quantmsdiann
7+
url: https://quantmsdiann.quantms.org
8+
keywords:
9+
- proteomics
10+
- mass-spectrometry
11+
- dia
12+
- data-independent-acquisition
13+
- dia-nn
14+
- nextflow
15+
preferred-citation:
16+
type: article
17+
title: "quantms: a cloud-based pipeline for quantitative proteomics enables the reanalysis of public proteomics data"
18+
journal: "Nature Methods"
19+
year: 2024
20+
volume: "21"
21+
start: 1603
22+
end: 1607
23+
doi: "10.1038/s41592-024-02343-1"
24+
authors:
25+
- family-names: Dai
26+
given-names: Chengxin
27+
- family-names: Pfeuffer
28+
given-names: Julianus
29+
- family-names: Wang
30+
given-names: Hong
31+
- family-names: Zheng
32+
given-names: Ping
33+
- family-names: Käll
34+
given-names: Lukas
35+
- family-names: Sachsenberg
36+
given-names: Timo
37+
- family-names: Demichev
38+
given-names: Vadim
39+
- family-names: Bai
40+
given-names: Mingze
41+
- family-names: Kohlbacher
42+
given-names: Oliver
43+
- family-names: Perez-Riverol
44+
given-names: Yasset

conf/diann_versions/v2_5_0.config

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/*
2+
* DIA-NN 2.5.0 container override (private ghcr.io)
3+
* Major protein ID improvements (+70%), DL model selection flags,
4+
* new --aa-eq flag for amino acid equivalence.
5+
*/
6+
params.diann_version = '2.5.0'
7+
8+
process {
9+
withLabel: diann {
10+
container = 'ghcr.io/bigbio/diann:2.5.0'
11+
}
12+
}
13+
14+
// Container engine is selected via -profile (docker/singularity), not here

docs/parameters.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,27 @@ This document lists every pipeline parameter organised by category. Default valu
6262
| `--light_models` | boolean | `false` | Enable `--light-models` for 10x faster in-silico library generation. Requires DIA-NN >= 2.0. |
6363
| `--export_quant` | boolean | `false` | Enable `--export-quant` for fragment-level parquet data export. Requires DIA-NN >= 2.0. |
6464
| `--site_ms1_quant` | boolean | `false` | Enable `--site-ms1-quant` to use MS1 apex intensities for PTM site quantification. Requires DIA-NN >= 2.0. |
65+
| `--aa_eq` | boolean | `false` | Treat I&L, Q&E, N&D as equivalent amino acids during reannotation. Essential for entrapment FDR benchmarks. Maps to `--aa-eq`. |
66+
67+
### DIA-NN 2.5.0 flags (via `--extra_args`)
68+
69+
The following DIA-NN 2.5.0 flags are not exposed as pipeline parameters but can be passed via `--extra_args`. See [Fine-Tuning Deep Learning Models](usage.md#fine-tuning-deep-learning-models-dia-nn-20) for the complete workflow.
70+
71+
| DIA-NN flag | Description |
72+
| ------------------------ | ------------------------------------------------------------------------------------------------ |
73+
| `--tokens <file>` | Tokenizer dictionary mapping modified residues to neural network token IDs (0-255). |
74+
| `--rt-model <file>` | Fine-tuned retention time prediction model (`.pt` PyTorch file). |
75+
| `--fr-model <file>` | Fine-tuned fragment ion prediction model (`.pt` file). Quality-sensitive — verify vs base model. |
76+
| `--im-model <file>` | Fine-tuned ion mobility prediction model (`.pt` file). |
77+
| `--tune-lib <file>` | Spectral library for fine-tuning (`.parquet`). Requires `--tune-rt` and/or `--tune-im`. |
78+
| `--tune-rt` | Fine-tune the RT deep learning predictor. Requires `--tune-lib`. |
79+
| `--tune-im` | Fine-tune the IM deep learning predictor. Requires `--tune-lib`. |
80+
| `--tune-fr` | Fine-tune the fragmentation predictor. Requires `--tune-lib`. Use with caution. |
81+
| `--tune-lr <X>` | Fine-tuning learning rate (default: 0.0005). |
82+
| `--tune-restrict-layers` | Keep RNN layer weights fixed during fine-tuning (except cysteine embeddings). |
83+
| `--tune-level <N>` | Limit fine-tuning to a specific model distillation level (0, 1, or 2). |
84+
85+
> **Note:** `--parent` and `--aa-eq` are blocked from `--extra_args`. `--aa-eq` is managed as the pipeline parameter `--aa_eq`, while `--parent` is container-managed (it controls the DIA-NN model path) — overriding it would break model discovery.
6586
6687
## 6. Mass Accuracy & Calibration
6788

docs/usage.md

Lines changed: 88 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ The default DIA-NN version is 1.8.1. To use a different version:
120120
| 2.1.0 | `-profile diann_v2_1_0` | Native .raw support, reduced memory |
121121
| 2.2.0 | `-profile diann_v2_2_0` | Speed optimizations |
122122
| 2.3.2 | `-profile diann_v2_3_2` | DDA support, InfinDIA |
123+
| 2.5.0 | `-profile diann_v2_5_0` | +70% protein IDs, model fine-tuning |
123124

124125
Example: `nextflow run bigbio/quantmsdiann -profile test_dia,docker,diann_v2_2_0`
125126

@@ -318,12 +319,13 @@ process {
318319

319320
The pipeline supports multiple DIA-NN versions via built-in Nextflow profiles. Each profile sets `params.diann_version` and overrides the container image for all `diann`-labelled processes.
320321

321-
| Profile | DIA-NN Version | Container | Key features |
322-
| -------------- | -------------- | ------------------------------------------ | -------------------------------------------------------------- |
323-
| `diann_v1_8_1` | 1.8.1 | `docker.io/biocontainers/diann:v1.8.1_cv1` | Default. Public BioContainers image. TSV output. |
324-
| `diann_v2_1_0` | 2.1.0 | `ghcr.io/bigbio/diann:2.1.0` | Parquet output. Native .raw on Linux. QuantUMS (`--quantums`). |
325-
| `diann_v2_2_0` | 2.2.0 | `ghcr.io/bigbio/diann:2.2.0` | Speed optimizations (up to 1.6x on HPC). Parquet output. |
326-
| `diann_v2_3_2` | 2.3.2 | `ghcr.io/bigbio/diann:2.3.2` | DDA support (`--dda`), InfinDIA, up to 9 variable mods. |
322+
| Profile | DIA-NN Version | Container | Key features |
323+
| -------------- | -------------- | ------------------------------------------ | --------------------------------------------------------------- |
324+
| `diann_v1_8_1` | 1.8.1 | `docker.io/biocontainers/diann:v1.8.1_cv1` | Default. Public BioContainers image. TSV output. |
325+
| `diann_v2_1_0` | 2.1.0 | `ghcr.io/bigbio/diann:2.1.0` | Parquet output. Native .raw on Linux. QuantUMS (`--quantums`). |
326+
| `diann_v2_2_0` | 2.2.0 | `ghcr.io/bigbio/diann:2.2.0` | Speed optimizations (up to 1.6x on HPC). Parquet output. |
327+
| `diann_v2_3_2` | 2.3.2 | `ghcr.io/bigbio/diann:2.3.2` | DDA support (`--dda`), InfinDIA, up to 9 variable mods. |
328+
| `diann_v2_5_0` | 2.5.0 | `ghcr.io/bigbio/diann:2.5.0` | Up to 70% more protein IDs. DL model fine-tuning and selection. |
327329

328330
**Version-dependent features:** Some parameters are only available with newer DIA-NN versions. The pipeline handles version compatibility automatically:
329331

@@ -348,6 +350,86 @@ nextflow run bigbio/quantmsdiann \
348350
> [!NOTE]
349351
> DIA-NN 2.x images are hosted on `ghcr.io/bigbio` and may require authentication for private registries. The `diann_v2_1_0` and `diann_v2_2_0` profiles force Docker mode by default; for Singularity, override with your own config.
350352

353+
## Fine-Tuning Deep Learning Models (DIA-NN 2.0+)
354+
355+
DIA-NN uses deep learning models to predict retention time (RT), ion mobility (IM), and fragment ion intensities. For non-standard modifications, fine-tuning these models on real data can substantially improve detection.
356+
357+
**When to fine-tune:** Fine-tuning is beneficial for custom chemical labels (e.g., mTRAQ, dimethyl), exotic PTMs, or unmodified cysteines. Standard modifications (Phospho, Oxidation, Acetylation, Deamidation, diGlycine) do not require fine-tuning — DIA-NN's built-in models already handle them well.
358+
359+
### How fine-tuning works
360+
361+
DIA-NN's neural networks encode each amino acid and modification as a "token" — an integer ID (0-255) mapped in a dictionary file (`dict.txt`). The default dictionary ships with DIA-NN and covers common modifications. When you fine-tune, DIA-NN:
362+
363+
1. Reads a spectral library containing empirically observed peptides with the modifications of interest
364+
2. Learns how those modifications affect RT, IM, and fragmentation patterns
365+
3. Outputs new model files (`.pt` PyTorch format) and an expanded dictionary (`dict.txt`) that includes tokens for the new modifications
366+
367+
The fine-tuned models are then used in place of the defaults when generating predicted spectral libraries.
368+
369+
> [!NOTE]
370+
> **`--tune-lib` cannot be combined with `--gen-spec-lib` in a single DIA-NN invocation** ([confirmed in DIA-NN #1499](https://github.com/vdemichev/DiaNN/issues/1499)). Fine-tuning and library generation are separate DIA-NN commands. This means the workflow currently requires two pipeline runs.
371+
372+
### Current workflow (manual fine-tuning)
373+
374+
**Run 1 — Generate the tuning library:**
375+
376+
Run quantmsdiann normally. The empirical library produced by the ASSEMBLE_EMPIRICAL_LIBRARY step (after preliminary analysis) serves as the tuning library. This library contains empirically observed RT, IM, and fragment intensities for peptides bearing the modifications of interest.
377+
378+
```bash
379+
# First run: standard pipeline to produce empirical library
380+
nextflow run bigbio/quantmsdiann \
381+
-profile diann_v2_5_0,docker \
382+
--input sdrf.tsv --database db.fasta --outdir results_run1
383+
# Output: results_run1/library_generation/assemble_empirical_library/empirical_library.parquet
384+
```
385+
386+
**Fine-tune models (outside the pipeline):**
387+
388+
```bash
389+
# Fine-tune RT and IM models using the empirical library
390+
diann --tune-lib /abs/path/to/empirical_library.parquet --tune-rt --tune-im
391+
392+
# Optionally also fine-tune the fragmentation model (quality-sensitive — verify vs base model)
393+
diann --tune-lib /abs/path/to/empirical_library.parquet --tune-rt --tune-im --tune-fr
394+
```
395+
396+
DIA-NN will output (named after the input library):
397+
398+
- `empirical_library.dict.txt` — expanded tokenizer dictionary with new modification tokens
399+
- `empirical_library.rt.d0.pt` (+ `.d1.pt`, `.d2.pt`) — fine-tuned RT models (3 distillation levels)
400+
- `empirical_library.im.d0.pt` (+ `.d1.pt`, `.d2.pt`) — fine-tuned IM models
401+
- `empirical_library.fr.d0.pt` (+ `.d1.pt`, `.d2.pt`) — fine-tuned fragment models (if `--tune-fr`)
402+
403+
Additional tuning parameters: `--tune-lr` (learning rate, default 0.0005), `--tune-restrict-layers` (fix RNN weights), `--tune-level` (limit to a specific distillation level 0/1/2).
404+
405+
**Run 2 — Re-run the pipeline with fine-tuned models:**
406+
407+
```bash
408+
# Second run: use tuned models for in-silico library generation and all downstream steps
409+
nextflow run bigbio/quantmsdiann \
410+
-profile diann_v2_5_0,docker \
411+
--input sdrf.tsv --database db.fasta \
412+
--extra_args "--tokens /abs/path/to/empirical_library.dict.txt --rt-model /abs/path/to/empirical_library.rt.d0.pt --im-model /abs/path/to/empirical_library.im.d0.pt" \
413+
--outdir results_run2
414+
```
415+
416+
The `--tokens`, `--rt-model`, and `--im-model` flags are passed to all DIA-NN steps via `--extra_args`, so the in-silico library generation uses the fine-tuned models to produce better-predicted spectra for the non-standard modifications.
417+
418+
> [!IMPORTANT]
419+
> Use **absolute paths** for model files. The `--parent` flag is blocked by the pipeline (it controls the container's DIA-NN installation path).
420+
421+
### Future: integrated fine-tuning step
422+
423+
We are exploring adding an optional `FINE_TUNE_MODELS` step directly in the pipeline, which would eliminate the need for two separate runs. The integrated workflow would be:
424+
425+
```
426+
INSILICO_LIBRARY → PRELIMINARY_ANALYSIS → ASSEMBLE_EMPIRICAL_LIBRARY
427+
→ [FINE_TUNE_MODELS] → INSILICO_LIBRARY (with tuned models)
428+
→ INDIVIDUAL_ANALYSIS → FINAL_QUANTIFICATION
429+
```
430+
431+
This would be gated by a `--enable_fine_tuning` parameter. [@vdemichev](https://github.com/vdemichev): would this approach work correctly — using the empirical library from assembly as `--tune-lib`, then regenerating the in-silico library with the tuned models before proceeding to individual analysis? Or would you recommend a different integration point?
432+
351433
## Verbose Module Output
352434

353435
By default, only final result files are published. For debugging or detailed inspection, the `verbose_modules` profile publishes all intermediate files from every DIA-NN step:

lib/BlockedFlags.groovy

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ class BlockedFlags {
3232
// --var-mod/--fixed-mod/--monitor-mod/--channels/--lib-fixed-mod/--original-mods: injected from diann_config.cfg
3333
// --dda: auto-detected from SDRF or set via --dda param
3434
// --proteoforms/--peptidoforms/--no-peptidoforms: controlled by scoring_mode param
35+
// --parent: container-managed (DIA-NN model path), overriding breaks model discovery
36+
// --aa-eq: controlled by aa_eq param
3537
private static final List<String> COMMON = [
3638
'--temp', '--threads', '--verbose', '--lib', '--f', '--fasta',
37-
'--monitor-mod', '--var-mod', '--fixed-mod', '--dda',
39+
'--monitor-mod', '--var-mod', '--fixed-mod', '--dda', '--parent', '--aa-eq',
3840
'--channels', '--lib-fixed-mod', '--original-mods',
3941
'--proteoforms', '--peptidoforms', '--no-peptidoforms',
4042
]

modules/local/diann/assemble_empirical_library/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
4040
scan_window = params.scan_window_automatic ? '--individual-windows' : "--window $params.scan_window"
4141
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
4242
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
43+
aa_eq = params.aa_eq ? '--aa-eq' : ''
4344
diann_tims_sum = params.tims_sum ? "--quant-tims-sum" : ""
4445
diann_im_window = params.im_window ? "--im-window $params.im_window" : ""
4546
diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : ""
@@ -67,6 +68,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
6768
${scan_window} \\
6869
--gen-spec-lib \\
6970
${scoring_mode} \\
71+
${aa_eq} \\
7072
${diann_tims_sum} \\
7173
${diann_im_window} \\
7274
${diann_dda_flag} \\

modules/local/diann/final_quantification/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ process FINAL_QUANTIFICATION {
6161
quantums_params = params.quantums_params ? "--quant-params $params.quantums_params": ""
6262
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
6363
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
64+
aa_eq = params.aa_eq ? '--aa-eq' : ''
6465
diann_use_quant = params.use_quant ? "--use-quant" : ""
6566
diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : ""
6667
diann_export_quant = params.export_quant ? "--export-quant" : ""
@@ -97,6 +98,7 @@ process FINAL_QUANTIFICATION {
9798
${quantums_sel_runs} \\
9899
${quantums_params} \\
99100
${scoring_mode} \\
101+
${aa_eq} \\
100102
${diann_use_quant} \\
101103
${diann_dda_flag} \\
102104
${diann_export_quant} \\

modules/local/diann/individual_analysis/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ process INDIVIDUAL_ANALYSIS {
6969

7070
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
7171
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
72+
aa_eq = params.aa_eq ? '--aa-eq' : ''
7273
diann_tims_sum = params.tims_sum ? "--quant-tims-sum" : ""
7374
diann_im_window = params.im_window ? "--im-window $params.im_window" : ""
7475
diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : ""
@@ -105,6 +106,7 @@ process INDIVIDUAL_ANALYSIS {
105106
${min_fr_mz} \\
106107
${max_fr_mz} \\
107108
${scoring_mode} \\
109+
${aa_eq} \\
108110
${diann_tims_sum} \\
109111
${diann_im_window} \\
110112
${diann_dda_flag} \\

modules/local/diann/insilico_library_generation/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ process INSILICO_LIBRARY_GENERATION {
3535
met_excision = params.met_excision ? "--met-excision" : ""
3636
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
3737
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
38+
aa_eq = params.aa_eq ? '--aa-eq' : ''
3839
diann_dda_flag = is_dda ? "--dda" : ""
3940
diann_light_models = params.light_models ? "--light-models" : ""
4041
infin_dia_flag = params.enable_infin_dia ? "--infin-dia" : ""
@@ -60,6 +61,7 @@ process INSILICO_LIBRARY_GENERATION {
6061
--verbose $params.debug_level \\
6162
--gen-spec-lib \\
6263
${scoring_mode} \\
64+
${aa_eq} \\
6365
${diann_light_models} \\
6466
${infin_dia_flag} \\
6567
${pre_select_flag} \\

0 commit comments

Comments
 (0)