Skip to content

Commit 0331f61

Browse files
committed
WIP: SH tree
1 parent 2beb729 commit 0331f61

7 files changed

Lines changed: 84 additions & 19 deletions

File tree

phylogenetic/Snakefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ workdir: workflow.current_basedir
88
# Use default configuration values. Override with Snakemake's --configfile/--config options.
99
configfile: "defaults/config.yaml"
1010

11-
builds = ["north-america","global"]
11+
builds = ["north-america","global","sh"]
1212

1313
wildcard_constraints:
1414
build = "|".join(builds)

phylogenetic/defaults/color_orderings.tsv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,11 +298,17 @@ MuV_genotype A
298298
MuV_genotype B
299299
MuV_genotype C
300300
MuV_genotype D
301+
MuV_genotype D1
301302
MuV_genotype F
302303
MuV_genotype G
304+
MuV_genotype G1
305+
MuV_genotype G2
303306
MuV_genotype H
307+
MuV_genotype H1
308+
MuV_genotype H2
304309
MuV_genotype I
305310
MuV_genotype J
306311
MuV_genotype K
312+
MuV_genotype K/M
307313
MuV_genotype L
308314
MuV_genotype N

phylogenetic/defaults/config.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,30 @@ sequences_url: "https://data.nextstrain.org/files/workflows/mumps/sequences.fast
44
metadata_url: "https://data.nextstrain.org/files/workflows/mumps/metadata.tsv.zst"
55

66
strain_id_field: "accession"
7-
reference: "defaults/reference.gb"
7+
reference:
8+
north-america: "defaults/reference.gb"
9+
global: "defaults/reference.gb"
10+
sh: "defaults/sh/reference.gb"
811

912
filter:
10-
min_length: 8000
1113
group_by: country year month MuV_genotype division
1214
specific:
13-
north-america: --subsample-max-sequences 4000 --min-date 2006 --query "region=='North America' & (MuV_genotype=='G')"
14-
global: --subsample-max-sequences 4000 --min-date 1950
15+
north-america: --min-length 8000 --subsample-max-sequences 4000 --min-date 2006 --query "region=='North America' & (MuV_genotype=='G')"
16+
global: --min-length 8000 --subsample-max-sequences 4000 --min-date 1950
17+
sh: --exclude-all
1518

1619
refine:
17-
north-america: "--clock-filter-iqd 4"
18-
global: ""
20+
north-america: "--timetree --clock-filter-iqd 4 --coalescent opt --date-confidence --date-inference marginal"
21+
global: "--timetree --coalescent opt --date-confidence --date-inference marginal"
22+
sh: "--timetree --clock-rate 0.0001"
1923

2024
ancestral:
2125
inference: "joint"
2226

2327
traits:
2428
north-america: country division MuV_genotype
2529
global: region MuV_genotype
30+
sh: region
2631
sampling_bias_correction: 3
2732

2833
colors:

phylogenetic/rules/annotate_phylogeny.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ rule translate:
5959
input:
6060
tree = "results/{build}/tree.nwk",
6161
node_data = "results/{build}/nt_muts.json",
62-
reference = config['reference'],
62+
reference = lambda wildcard: config['reference'][wildcard.build],
6363
output:
6464
node_data = "results/{build}/aa_muts.json",
6565
log:

phylogenetic/rules/construct_phylogeny.smk

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,6 @@ rule refine:
5656
benchmark:
5757
"benchmarks/{build}/refine.txt",
5858
params:
59-
coalescent = "opt",
60-
date_inference = "marginal",
6159
clock_filter_iqd = lambda wildcard: config['refine'][wildcard.build],
6260
strain_id = config.get("strain_id_field", "strain"),
6361
shell:
@@ -69,9 +67,5 @@ rule refine:
6967
--metadata-id-columns {params.strain_id:q} \
7068
--output-tree {output.tree:q} \
7169
--output-node-data {output.node_data:q} \
72-
--timetree \
73-
--coalescent {params.coalescent:q} \
74-
--date-confidence \
75-
--date-inference {params.date_inference:q} \
7670
{params.clock_filter_iqd} 2>&1 | tee {log:q}
7771
"""

phylogenetic/rules/export.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ rule export:
5151
"""Exporting data files for for auspice"""
5252
input:
5353
tree = "results/{build}/tree.nwk",
54-
metadata = "data/metadata.tsv",
54+
metadata = "results/sh/metadata_merged.tsv",
5555
branch_lengths = "results/{build}/branch_lengths.json",
5656
traits = "results/{build}/traits.json",
5757
nt_muts = "results/{build}/nt_muts.json",

phylogenetic/rules/prepare_sequences.smk

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,31 @@ rule decompress:
5050
zstd -d -c {input.metadata} > {output.metadata}
5151
"""
5252

53+
rule merge_annotations:
54+
"""Merge identical sequence annotations"""
55+
input:
56+
metadata = "data/metadata.tsv",
57+
identical = "defaults/sh/metadata_identical.tsv",
58+
output:
59+
merged_metadata = "data/metadata_merged.tsv",
60+
params:
61+
id_column = "accession",
62+
shell:
63+
r"""
64+
augur merge --metadata \
65+
a={input.metadata:q} \
66+
b={input.identical:q} \
67+
--metadata-id-columns {params.id_column} \
68+
--output-metadata {output.merged_metadata}
69+
"""
70+
5371
rule filter:
5472
"""
5573
Filtering to
5674
- various criteria based on the auspice JSON target
5775
- from {params.min_date} onwards
5876
- excluding strains in {input.exclude}
5977
- including strains in {input.include}
60-
- minimum genome length of {params.min_length} (50% of Zika virus genome)
6178
"""
6279
input:
6380
sequences = "data/sequences.fasta",
@@ -72,7 +89,6 @@ rule filter:
7289
benchmark:
7390
"benchmarks/{build}/filtered.txt",
7491
params:
75-
min_length = config['filter']['min_length'],
7692
group_by = config['filter']['group_by'],
7793
filter_params = lambda wildcard: config['filter']['specific'][wildcard.build],
7894
strain_id = config.get("strain_id_field", "strain"),
@@ -86,19 +102,63 @@ rule filter:
86102
--include {input.include:q} \
87103
--output {output.sequences:q} \
88104
--output-metadata {output.metadata:q} \
89-
--min-length {params.min_length:q} \
90105
--group-by {params.group_by} \
91106
{params.filter_params} 2>&1 | tee {log:q}
92107
"""
93108

109+
ruleorder: filter_sh > filter
110+
111+
rule filter_sh:
112+
"""
113+
Filtering to
114+
- various criteria based on the auspice JSON target
115+
- from {params.min_date} onwards
116+
- excluding strains in {input.exclude}
117+
- including strains in {input.include}
118+
"""
119+
input:
120+
sequences = "data/sequences.fasta",
121+
metadata = "data/metadata.tsv",
122+
clade_membership = "defaults/sh/metadata_duplicate.txt",
123+
exclude = "defaults/sh/exclude.txt",
124+
include = "defaults/sh/include.txt"
125+
output:
126+
merged_metadata = "results/sh/metadata_merged.tsv",
127+
sequences = "results/sh/filtered.fasta",
128+
metadata = "results/sh/metadata.tsv",
129+
log:
130+
"logs/sh/filtered.txt",
131+
benchmark:
132+
"benchmarks/sh/filtered.txt",
133+
params:
134+
group_by = config['filter']['group_by'],
135+
filter_params = config['filter']['specific']['sh'],
136+
strain_id = config.get("strain_id_field", "strain"),
137+
shell:
138+
r"""
139+
augur merge \
140+
--metadata a={input.metadata:q} b={input.clade_membership:q} \
141+
--metadata-id-columns a={params.strain_id:q} b={params.strain_id:q} \
142+
--output-metadata {output.merged_metadata:q}
143+
144+
augur filter \
145+
--sequences {input.sequences:q} \
146+
--metadata {output.merged_metadata:q} \
147+
--metadata-id-columns {params.strain_id:q} \
148+
--include {input.include:q} \
149+
--output-sequences {output.sequences:q} \
150+
--output-metadata {output.metadata:q} \
151+
{params.filter_params} 2>&1 | tee {log:q}
152+
"""
153+
94154
rule align:
95155
"""
96156
Aligning sequences to {input.reference}
97157
- filling gaps with N
98158
"""
99159
input:
100160
sequences = "results/{build}/filtered.fasta",
101-
reference = config['reference'],
161+
reference = lambda wildcard: config['reference'][wildcard.build],
102162
output:
103163
alignment = "results/{build}/aligned.fasta",
104164
log:

0 commit comments

Comments
 (0)