@@ -50,14 +50,31 @@ rule decompress:
5050 zstd -d -c {input.metadata} > {output.metadata}
5151 """
5252
rule merge_annotations:
    """
    Merge identical sequence annotations into the main metadata table.

    Runs `augur merge` with {input.metadata} as table `a` and
    {input.identical} as table `b`, using {params.id_column} as the
    identifier column, and writes the result to {output.merged_metadata}.
    """
    input:
        metadata = "data/metadata.tsv",
        identical = "defaults/sh/metadata_identical.tsv",
    output:
        merged_metadata = "data/metadata_merged.tsv",
    params:
        id_column = "accession",
    shell:
        # Every substituted value is shell-quoted with `:q` so that paths or
        # column names containing spaces or metacharacters stay one argument.
        r"""
        augur merge \
            --metadata \
                a={input.metadata:q} \
                b={input.identical:q} \
            --metadata-id-columns {params.id_column:q} \
            --output-metadata {output.merged_metadata:q}
        """
70+
5371rule filter :
5472 """
5573 Filtering to
5674 - various criteria based on the auspice JSON target
5775 - from {params.min_date} onwards
5876 - excluding strains in {input.exclude}
5977 - including strains in {input.include}
60- - minimum genome length of {params.min_length} (50% of Zika virus genome)
6178 """
6279 input :
6380 sequences = "data/sequences.fasta" ,
@@ -72,7 +89,6 @@ rule filter:
7289 benchmark :
7390 "benchmarks/{build}/filtered.txt" ,
7491 params :
75- min_length = config ['filter' ]['min_length' ],
7692 group_by = config ['filter' ]['group_by' ],
7793 filter_params = lambda wildcard : config ['filter' ]['specific' ][wildcard .build ],
7894 strain_id = config .get ("strain_id_field" , "strain" ),
@@ -86,19 +102,63 @@ rule filter:
86102 --include {input.include:q} \
87103 --output {output.sequences:q} \
88104 --output-metadata {output.metadata:q} \
89- --min-length {params.min_length:q} \
90105 --group-by {params.group_by} \
91106 {params.filter_params} 2>&1 | tee {log:q}
92107 """
93108
# Prefer filter_sh over the generic filter rule whenever both rules could
# produce the same output file.
ruleorder: filter_sh > filter

rule filter_sh:
    """
    Merging extra metadata and filtering for the "sh" build
      - merging {input.clade_membership} into {input.metadata}
      - various criteria from config['filter']['specific']['sh']
      - excluding strains in {input.exclude}
      - including strains in {input.include}
    """
    input:
        sequences = "data/sequences.fasta",
        metadata = "data/metadata.tsv",
        clade_membership = "defaults/sh/metadata_duplicate.txt",
        exclude = "defaults/sh/exclude.txt",
        include = "defaults/sh/include.txt"
    output:
        merged_metadata = "results/sh/metadata_merged.tsv",
        sequences = "results/sh/filtered.fasta",
        metadata = "results/sh/metadata.tsv",
    log:
        "logs/sh/filtered.txt",
    benchmark:
        "benchmarks/sh/filtered.txt",
    params:
        # NOTE(review): group_by is declared but not referenced by the shell
        # command below — confirm whether `--group-by` was dropped on purpose.
        group_by = config['filter']['group_by'],
        filter_params = config['filter']['specific']['sh'],
        strain_id = config.get("strain_id_field", "strain"),
    shell:
        # Step 1 joins the two metadata tables on the strain id column.
        # Step 2 filters sequences against the merged table; only the output
        # of this second command is captured in the log file via `tee`.
        r"""
        augur merge \
            --metadata a={input.metadata:q} b={input.clade_membership:q} \
            --metadata-id-columns a={params.strain_id:q} b={params.strain_id:q} \
            --output-metadata {output.merged_metadata:q}

        augur filter \
            --sequences {input.sequences:q} \
            --metadata {output.merged_metadata:q} \
            --metadata-id-columns {params.strain_id:q} \
            --exclude {input.exclude:q} \
            --include {input.include:q} \
            --output-sequences {output.sequences:q} \
            --output-metadata {output.metadata:q} \
            {params.filter_params} 2>&1 | tee {log:q}
        """
153+
94154rule align :
95155 """
96156 Aligning sequences to {input.reference}
97157 - filling gaps with N
98158 """
99159 input :
100160 sequences = "results/{build}/filtered.fasta" ,
101- reference = config ['reference' ],
161+ reference = lambda wildcard : config ['reference' ][ wildcard . build ],
102162 output :
103163 alignment = "results/{build}/aligned.fasta" ,
104164 log :
0 commit comments