Skip to content

Commit a679dc3

Browse files
authored
Merge pull request #53 from bigbio/feat/diann-2.5.0
feat: add DIA-NN 2.5.0 support with model fine-tuning documentation
2 parents f9298e8 + 51a3000 commit a679dc3

File tree

13 files changed

+196
-9
lines changed

13 files changed

+196
-9
lines changed

AGENTS.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ This is **non-negotiable**. All code must pass formatting and style checks befor
2424

2525
- Built with Nextflow DSL2
2626
- DIA-NN for peptide/protein identification and quantification
27-
- Supports DIA-NN v1.8.1, v2.1.0, and v2.2.0 (latest)
27+
- Supports DIA-NN v1.8.1, v2.1.0, v2.2.0, v2.3.2, and v2.5.0 (latest)
2828
- QuantUMS quantification method (DIA-NN >= 1.9.2)
2929
- Parquet-native output with decoy reporting (DIA-NN >= 2.0)
3030
- MSstats-compatible output generation (via quantms-utils conversion, no MSstats analysis)
@@ -44,7 +44,7 @@ This is **non-negotiable**. All code must pass formatting and style checks befor
4444
- **nf-test**: Testing framework (config: `nf-test.config`)
4545
- **nf-core tools**: Pipeline standards and linting
4646
- **Containers**: Docker/Singularity/Apptainer/Podman (Conda deprecated)
47-
- **DIA-NN**: Primary search engine (versions 1.8.1 through 2.2.0)
47+
- **DIA-NN**: Primary search engine (versions 1.8.1 through 2.5.0)
4848

4949
### Key Configuration Files
5050

@@ -116,6 +116,9 @@ The pipeline executes the following steps:
116116
| Parquet output format | 2.0 | (automatic in 2.0+) |
117117
| Decoy reporting | 2.0 | `--report_decoys true` |
118118
| Native .raw on Linux | 2.1.0 | (automatic) |
119+
| InfinDIA | 2.3.0 | `--enable_infin_dia` |
120+
| DDA support | 2.3.2 | `--dda true` |
121+
| DL model fine-tuning & selection | 2.5.0 | `--extra_args` |
119122

120123
---
121124

@@ -194,6 +197,8 @@ These apply on top of test profiles to override the DIA-NN container version:
194197
| `diann_v1_8_1` | `biocontainers/diann:v1.8.1_cv1` | none |
195198
| `diann_v2_1_0` | `ghcr.io/bigbio/diann:2.1.0` | GHCR |
196199
| `diann_v2_2_0` | `ghcr.io/bigbio/diann:2.2.0` | GHCR |
200+
| `diann_v2_3_2` | `ghcr.io/bigbio/diann:2.3.2` | GHCR |
201+
| `diann_v2_5_0` | `ghcr.io/bigbio/diann:2.5.0` | GHCR |
197202

198203
### CI Workflows
199204

CITATION.cff

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
cff-version: 1.2.0
2+
title: "quantmsdiann"
3+
message: "If you use quantmsdiann, please cite the quantms paper."
4+
type: software
5+
license: MIT
6+
repository-code: https://github.com/bigbio/quantmsdiann
7+
url: https://quantmsdiann.quantms.org
8+
keywords:
9+
- proteomics
10+
- mass-spectrometry
11+
- dia
12+
- data-independent-acquisition
13+
- dia-nn
14+
- nextflow
15+
preferred-citation:
16+
type: article
17+
title: "quantms: a cloud-based pipeline for quantitative proteomics enables the reanalysis of public proteomics data"
18+
journal: "Nature Methods"
19+
year: 2024
20+
volume: "21"
21+
start: 1603
22+
end: 1607
23+
doi: "10.1038/s41592-024-02343-1"
24+
authors:
25+
- family-names: Dai
26+
given-names: Chengxin
27+
- family-names: Pfeuffer
28+
given-names: Julianus
29+
- family-names: Wang
30+
given-names: Hong
31+
- family-names: Zheng
32+
given-names: Ping
33+
- family-names: Käll
34+
given-names: Lukas
35+
- family-names: Sachsenberg
36+
given-names: Timo
37+
- family-names: Demichev
38+
given-names: Vadim
39+
- family-names: Bai
40+
given-names: Mingze
41+
- family-names: Kohlbacher
42+
given-names: Oliver
43+
- family-names: Perez-Riverol
44+
given-names: Yasset

conf/diann_versions/v2_5_0.config

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/*
2+
* DIA-NN 2.5.0 container override (private ghcr.io)
3+
* Major protein ID improvements (+70%), DL model selection flags,
4+
* new --aa-eq flag for amino acid equivalence.
5+
*/
6+
params.diann_version = '2.5.0'
7+
8+
process {
9+
withLabel: diann {
10+
container = 'ghcr.io/bigbio/diann:2.5.0'
11+
}
12+
}
13+
14+
// Container engine is selected via -profile (docker/singularity), not here

docs/parameters.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,27 @@ This document lists every pipeline parameter organised by category. Default valu
6262
| `--light_models` | boolean | `false` | Enable `--light-models` for 10x faster in-silico library generation. Requires DIA-NN >= 2.0. |
6363
| `--export_quant` | boolean | `false` | Enable `--export-quant` for fragment-level parquet data export. Requires DIA-NN >= 2.0. |
6464
| `--site_ms1_quant` | boolean | `false` | Enable `--site-ms1-quant` to use MS1 apex intensities for PTM site quantification. Requires DIA-NN >= 2.0. |
65+
| `--aa_eq` | boolean | `false` | Treat I&L, Q&E, N&D as equivalent amino acids during reannotation. Essential for entrapment FDR benchmarks. Maps to `--aa-eq`. |
66+
67+
### DIA-NN 2.5.0 flags (via `--extra_args`)
68+
69+
The following DIA-NN 2.5.0 flags are not exposed as pipeline parameters but can be passed via `--extra_args`. See [Fine-Tuning Deep Learning Models](usage.md#fine-tuning-deep-learning-models-dia-nn-20) for the complete workflow.
70+
71+
| DIA-NN flag | Description |
72+
| ------------------------ | ------------------------------------------------------------------------------------------------ |
73+
| `--tokens <file>` | Tokenizer dictionary mapping modified residues to neural network token IDs (0-255). |
74+
| `--rt-model <file>` | Fine-tuned retention time prediction model (`.pt` PyTorch file). |
75+
| `--fr-model <file>` | Fine-tuned fragment ion prediction model (`.pt` file). Quality-sensitive — verify vs base model. |
76+
| `--im-model <file>` | Fine-tuned ion mobility prediction model (`.pt` file). |
77+
| `--tune-lib <file>` | Spectral library for fine-tuning (`.parquet`). Requires `--tune-rt` and/or `--tune-im`. |
78+
| `--tune-rt` | Fine-tune the RT deep learning predictor. Requires `--tune-lib`. |
79+
| `--tune-im` | Fine-tune the IM deep learning predictor. Requires `--tune-lib`. |
80+
| `--tune-fr` | Fine-tune the fragmentation predictor. Requires `--tune-lib`. Use with caution. |
81+
| `--tune-lr <X>` | Fine-tuning learning rate (default: 0.0005). |
82+
| `--tune-restrict-layers` | Keep RNN layer weights fixed during fine-tuning (except cysteine embeddings). |
83+
| `--tune-level <N>` | Limit fine-tuning to a specific model distillation level (0, 1, or 2). |
84+
85+
> **Note:** `--parent` and `--aa-eq` are blocked from `--extra_args`. `--aa-eq` is managed as the pipeline parameter `--aa_eq`, while `--parent` is container-managed (it controls the DIA-NN model path) — overriding it would break model discovery.
6586
6687
## 6. Mass Accuracy & Calibration
6788

docs/usage.md

Lines changed: 88 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ The default DIA-NN version is 1.8.1. To use a different version:
120120
| 2.1.0 | `-profile diann_v2_1_0` | Native .raw support, reduced memory |
121121
| 2.2.0 | `-profile diann_v2_2_0` | Speed optimizations |
122122
| 2.3.2 | `-profile diann_v2_3_2` | DDA support, InfinDIA |
123+
| 2.5.0 | `-profile diann_v2_5_0` | +70% protein IDs, model fine-tuning |
123124

124125
Example: `nextflow run bigbio/quantmsdiann -profile test_dia,docker,diann_v2_2_0`
125126

@@ -318,12 +319,13 @@ process {
318319

319320
The pipeline supports multiple DIA-NN versions via built-in Nextflow profiles. Each profile sets `params.diann_version` and overrides the container image for all `diann`-labelled processes.
320321

321-
| Profile | DIA-NN Version | Container | Key features |
322-
| -------------- | -------------- | ------------------------------------------ | -------------------------------------------------------------- |
323-
| `diann_v1_8_1` | 1.8.1 | `docker.io/biocontainers/diann:v1.8.1_cv1` | Default. Public BioContainers image. TSV output. |
324-
| `diann_v2_1_0` | 2.1.0 | `ghcr.io/bigbio/diann:2.1.0` | Parquet output. Native .raw on Linux. QuantUMS (`--quantums`). |
325-
| `diann_v2_2_0` | 2.2.0 | `ghcr.io/bigbio/diann:2.2.0` | Speed optimizations (up to 1.6x on HPC). Parquet output. |
326-
| `diann_v2_3_2` | 2.3.2 | `ghcr.io/bigbio/diann:2.3.2` | DDA support (`--dda`), InfinDIA, up to 9 variable mods. |
322+
| Profile | DIA-NN Version | Container | Key features |
323+
| -------------- | -------------- | ------------------------------------------ | --------------------------------------------------------------- |
324+
| `diann_v1_8_1` | 1.8.1 | `docker.io/biocontainers/diann:v1.8.1_cv1` | Default. Public BioContainers image. TSV output. |
325+
| `diann_v2_1_0` | 2.1.0 | `ghcr.io/bigbio/diann:2.1.0` | Parquet output. Native .raw on Linux. QuantUMS (`--quantums`). |
326+
| `diann_v2_2_0` | 2.2.0 | `ghcr.io/bigbio/diann:2.2.0` | Speed optimizations (up to 1.6x on HPC). Parquet output. |
327+
| `diann_v2_3_2` | 2.3.2 | `ghcr.io/bigbio/diann:2.3.2` | DDA support (`--dda`), InfinDIA, up to 9 variable mods. |
328+
| `diann_v2_5_0` | 2.5.0 | `ghcr.io/bigbio/diann:2.5.0` | Up to 70% more protein IDs. DL model fine-tuning and selection. |
327329

328330
**Version-dependent features:** Some parameters are only available with newer DIA-NN versions. The pipeline handles version compatibility automatically:
329331

@@ -348,6 +350,86 @@ nextflow run bigbio/quantmsdiann \
348350
> [!NOTE]
349351
> DIA-NN 2.x images are hosted on `ghcr.io/bigbio` and may require authentication for private registries. The `diann_v2_1_0` and `diann_v2_2_0` profiles force Docker mode by default; for Singularity, override with your own config.
350352

353+
## Fine-Tuning Deep Learning Models (DIA-NN 2.0+)
354+
355+
DIA-NN uses deep learning models to predict retention time (RT), ion mobility (IM), and fragment ion intensities. For non-standard modifications, fine-tuning these models on real data can substantially improve detection.
356+
357+
**When to fine-tune:** Fine-tuning is beneficial for custom chemical labels (e.g., mTRAQ, dimethyl), exotic PTMs, or unmodified cysteines. Standard modifications (Phospho, Oxidation, Acetylation, Deamidation, diGlycine) do not require fine-tuning — DIA-NN's built-in models already handle them well.
358+
359+
### How fine-tuning works
360+
361+
DIA-NN's neural networks encode each amino acid and modification as a "token" — an integer ID (0-255) mapped in a dictionary file (`dict.txt`). The default dictionary ships with DIA-NN and covers common modifications. When you fine-tune, DIA-NN:
362+
363+
1. Reads a spectral library containing empirically observed peptides with the modifications of interest
364+
2. Learns how those modifications affect RT, IM, and fragmentation patterns
365+
3. Outputs new model files (`.pt` PyTorch format) and an expanded dictionary (`dict.txt`) that includes tokens for the new modifications
366+
367+
The fine-tuned models are then used in place of the defaults when generating predicted spectral libraries.
368+
369+
> [!NOTE]
370+
> **`--tune-lib` cannot be combined with `--gen-spec-lib` in a single DIA-NN invocation** ([confirmed in DIA-NN #1499](https://github.com/vdemichev/DiaNN/issues/1499)). Fine-tuning and library generation are separate DIA-NN commands. This means the workflow currently requires two pipeline runs.
371+
372+
### Current workflow (manual fine-tuning)
373+
374+
**Run 1 — Generate the tuning library:**
375+
376+
Run quantmsdiann normally. The empirical library produced by the ASSEMBLE_EMPIRICAL_LIBRARY step (after preliminary analysis) serves as the tuning library. This library contains empirically observed RT, IM, and fragment intensities for peptides bearing the modifications of interest.
377+
378+
```bash
379+
# First run: standard pipeline to produce empirical library
380+
nextflow run bigbio/quantmsdiann \
381+
-profile diann_v2_5_0,docker \
382+
--input sdrf.tsv --database db.fasta --outdir results_run1
383+
# Output: results_run1/library_generation/assemble_empirical_library/empirical_library.parquet
384+
```
385+
386+
**Fine-tune models (outside the pipeline):**
387+
388+
```bash
389+
# Fine-tune RT and IM models using the empirical library
390+
diann --tune-lib /abs/path/to/empirical_library.parquet --tune-rt --tune-im
391+
392+
# Optionally also fine-tune the fragmentation model (quality-sensitive — verify vs base model)
393+
diann --tune-lib /abs/path/to/empirical_library.parquet --tune-rt --tune-im --tune-fr
394+
```
395+
396+
DIA-NN will output (named after the input library):
397+
398+
- `empirical_library.dict.txt` — expanded tokenizer dictionary with new modification tokens
399+
- `empirical_library.rt.d0.pt` (+ `.d1.pt`, `.d2.pt`) — fine-tuned RT models (3 distillation levels)
400+
- `empirical_library.im.d0.pt` (+ `.d1.pt`, `.d2.pt`) — fine-tuned IM models
401+
- `empirical_library.fr.d0.pt` (+ `.d1.pt`, `.d2.pt`) — fine-tuned fragment models (if `--tune-fr`)
402+
403+
Additional tuning parameters: `--tune-lr` (learning rate, default 0.0005), `--tune-restrict-layers` (fix RNN weights), `--tune-level` (limit to a specific distillation level 0/1/2).
404+
405+
**Run 2 — Re-run the pipeline with fine-tuned models:**
406+
407+
```bash
408+
# Second run: use tuned models for in-silico library generation and all downstream steps
409+
nextflow run bigbio/quantmsdiann \
410+
-profile diann_v2_5_0,docker \
411+
--input sdrf.tsv --database db.fasta \
412+
--extra_args "--tokens /abs/path/to/empirical_library.dict.txt --rt-model /abs/path/to/empirical_library.rt.d0.pt --im-model /abs/path/to/empirical_library.im.d0.pt" \
413+
--outdir results_run2
414+
```
415+
416+
The `--tokens`, `--rt-model`, and `--im-model` flags are passed to all DIA-NN steps via `--extra_args`, so the in-silico library generation uses the fine-tuned models to produce better-predicted spectra for the non-standard modifications.
417+
418+
> [!IMPORTANT]
419+
> Use **absolute paths** for model files. The `--parent` flag is blocked by the pipeline (it controls the container's DIA-NN installation path).
420+
421+
### Future: integrated fine-tuning step
422+
423+
We are exploring adding an optional `FINE_TUNE_MODELS` step directly in the pipeline, which would eliminate the need for two separate runs. The integrated workflow would be:
424+
425+
```
426+
INSILICO_LIBRARY → PRELIMINARY_ANALYSIS → ASSEMBLE_EMPIRICAL_LIBRARY
427+
→ [FINE_TUNE_MODELS] → INSILICO_LIBRARY (with tuned models)
428+
→ INDIVIDUAL_ANALYSIS → FINAL_QUANTIFICATION
429+
```
430+
431+
This would be gated by a `--enable_fine_tuning` parameter. [@vdemichev](https://github.com/vdemichev): would this approach work correctly — using the empirical library from assembly as `--tune-lib`, then regenerating the in-silico library with the tuned models before proceeding to individual analysis? Or would you recommend a different integration point?
432+
351433
## Verbose Module Output
352434

353435
By default, only final result files are published. For debugging or detailed inspection, the `verbose_modules` profile publishes all intermediate files from every DIA-NN step:

lib/BlockedFlags.groovy

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ class BlockedFlags {
3232
// --var-mod/--fixed-mod/--monitor-mod/--channels/--lib-fixed-mod/--original-mods: injected from diann_config.cfg
3333
// --dda: auto-detected from SDRF or set via --dda param
3434
// --proteoforms/--peptidoforms/--no-peptidoforms: controlled by scoring_mode param
35+
// --parent: container-managed (DIA-NN model path), overriding breaks model discovery
36+
// --aa-eq: controlled by aa_eq param
3537
private static final List<String> COMMON = [
3638
'--temp', '--threads', '--verbose', '--lib', '--f', '--fasta',
37-
'--monitor-mod', '--var-mod', '--fixed-mod', '--dda',
39+
'--monitor-mod', '--var-mod', '--fixed-mod', '--dda', '--parent', '--aa-eq',
3840
'--channels', '--lib-fixed-mod', '--original-mods',
3941
'--proteoforms', '--peptidoforms', '--no-peptidoforms',
4042
]

modules/local/diann/assemble_empirical_library/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
4040
scan_window = params.scan_window_automatic ? '--individual-windows' : "--window $params.scan_window"
4141
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
4242
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
43+
aa_eq = params.aa_eq ? '--aa-eq' : ''
4344
diann_tims_sum = params.tims_sum ? "--quant-tims-sum" : ""
4445
diann_im_window = params.im_window ? "--im-window $params.im_window" : ""
4546
diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : ""
@@ -67,6 +68,7 @@ process ASSEMBLE_EMPIRICAL_LIBRARY {
6768
${scan_window} \\
6869
--gen-spec-lib \\
6970
${scoring_mode} \\
71+
${aa_eq} \\
7072
${diann_tims_sum} \\
7173
${diann_im_window} \\
7274
${diann_dda_flag} \\

modules/local/diann/final_quantification/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ process FINAL_QUANTIFICATION {
6161
quantums_params = params.quantums_params ? "--quant-params $params.quantums_params": ""
6262
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
6363
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
64+
aa_eq = params.aa_eq ? '--aa-eq' : ''
6465
diann_use_quant = params.use_quant ? "--use-quant" : ""
6566
diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : ""
6667
diann_export_quant = params.export_quant ? "--export-quant" : ""
@@ -97,6 +98,7 @@ process FINAL_QUANTIFICATION {
9798
${quantums_sel_runs} \\
9899
${quantums_params} \\
99100
${scoring_mode} \\
101+
${aa_eq} \\
100102
${diann_use_quant} \\
101103
${diann_dda_flag} \\
102104
${diann_export_quant} \\

modules/local/diann/individual_analysis/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ process INDIVIDUAL_ANALYSIS {
6969

7070
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
7171
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
72+
aa_eq = params.aa_eq ? '--aa-eq' : ''
7273
diann_tims_sum = params.tims_sum ? "--quant-tims-sum" : ""
7374
diann_im_window = params.im_window ? "--im-window $params.im_window" : ""
7475
diann_dda_flag = meta.acquisition_method == 'dda' ? "--dda" : ""
@@ -105,6 +106,7 @@ process INDIVIDUAL_ANALYSIS {
105106
${min_fr_mz} \\
106107
${max_fr_mz} \\
107108
${scoring_mode} \\
109+
${aa_eq} \\
108110
${diann_tims_sum} \\
109111
${diann_im_window} \\
110112
${diann_dda_flag} \\

modules/local/diann/insilico_library_generation/main.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ process INSILICO_LIBRARY_GENERATION {
3535
met_excision = params.met_excision ? "--met-excision" : ""
3636
scoring_mode = params.scoring_mode == 'proteoforms' ? '--proteoforms' :
3737
params.scoring_mode == 'peptidoforms' ? '--peptidoforms' : ''
38+
aa_eq = params.aa_eq ? '--aa-eq' : ''
3839
diann_dda_flag = is_dda ? "--dda" : ""
3940
diann_light_models = params.light_models ? "--light-models" : ""
4041
infin_dia_flag = params.enable_infin_dia ? "--infin-dia" : ""
@@ -60,6 +61,7 @@ process INSILICO_LIBRARY_GENERATION {
6061
--verbose $params.debug_level \\
6162
--gen-spec-lib \\
6263
${scoring_mode} \\
64+
${aa_eq} \\
6365
${diann_light_models} \\
6466
${infin_dia_flag} \\
6567
${pre_select_flag} \\

0 commit comments

Comments
 (0)