diff --git a/q2_annotate/kraken2/select.py b/q2_annotate/kraken2/select.py index 639064b2..29da4177 100644 --- a/q2_annotate/kraken2/select.py +++ b/q2_annotate/kraken2/select.py @@ -262,7 +262,8 @@ def _kraken_to_ncbi_tree(df): while stack and parent_node.length == 0: _, parent_node = stack.pop() - if parent_node.children: + # Make sure we are not labeling infra-clades as actual tips + if parent_node.length == 1: parent_node.children[0].is_actual_tip = True return tree diff --git a/q2_annotate/kraken2/tests/data/root-infraclade-report/0a2f080c-2774-4c64-a645-07660afc7eb4.report.txt b/q2_annotate/kraken2/tests/data/root-infraclade-report/0a2f080c-2774-4c64-a645-07660afc7eb4.report.txt new file mode 100644 index 00000000..680c96a1 --- /dev/null +++ b/q2_annotate/kraken2/tests/data/root-infraclade-report/0a2f080c-2774-4c64-a645-07660afc7eb4.report.txt @@ -0,0 +1,18 @@ +100.00 180 0 R 1 root +100.00 180 0 R1 131567 cellular organisms +100.00 180 0 R2 2 Bacteria +100.00 180 0 K 1783272 Bacillati +100.00 180 0 P 201174 Actinomycetota +100.00 180 0 C 84998 Coriobacteriia + 99.44 179 0 O 84999 Coriobacteriales + 99.44 179 0 F 84107 Coriobacteriaceae + 99.44 179 0 G 102106 Collinsella + 97.78 176 10 S 74426 Collinsella aerofaciens + 92.22 166 166 S1 411903 Collinsella aerofaciens ATCC 25986 + 1.67 3 0 G1 2637548 unclassified Collinsella + 1.67 3 3 S 3132705 Collinsella sp. 
i05-0019-G5 + 0.56 1 0 O 1643822 Eggerthellales + 0.56 1 0 F 1643826 Eggerthellaceae + 0.56 1 0 G 644652 Gordonibacter + 0.56 1 0 S 471189 Gordonibacter pamelaeae + 0.56 1 1 S1 657308 Gordonibacter pamelaeae 7-10-1-b diff --git a/q2_annotate/kraken2/tests/data/root-infraclade-report/1acb175d-ddb0-410f-a513-7055cb90e706.report.txt b/q2_annotate/kraken2/tests/data/root-infraclade-report/1acb175d-ddb0-410f-a513-7055cb90e706.report.txt new file mode 100644 index 00000000..863b7dd5 --- /dev/null +++ b/q2_annotate/kraken2/tests/data/root-infraclade-report/1acb175d-ddb0-410f-a513-7055cb90e706.report.txt @@ -0,0 +1,4 @@ + 50.00 1 1 U 0 unclassified + 50.00 1 0 R 1 root + 50.00 1 0 R1 131567 cellular organisms + 50.00 1 1 R2 2 Bacteria diff --git a/q2_annotate/kraken2/tests/data/root-infraclade-report/f608b31a-c05c-4498-9e7d-67c89e33cabf.report.txt b/q2_annotate/kraken2/tests/data/root-infraclade-report/f608b31a-c05c-4498-9e7d-67c89e33cabf.report.txt new file mode 100644 index 00000000..aeeadbff --- /dev/null +++ b/q2_annotate/kraken2/tests/data/root-infraclade-report/f608b31a-c05c-4498-9e7d-67c89e33cabf.report.txt @@ -0,0 +1,2 @@ +100.00 1 0 R 1 root +100.00 1 1 R1 131567 cellular organisms diff --git a/q2_annotate/kraken2/tests/test_selection.py b/q2_annotate/kraken2/tests/test_selection.py index 769ac067..34376027 100644 --- a/q2_annotate/kraken2/tests/test_selection.py +++ b/q2_annotate/kraken2/tests/test_selection.py @@ -7,24 +7,23 @@ # ---------------------------------------------------------------------------- import shutil import tempfile -import unittest import pandas as pd import pandas.testing -from pandas._testing import assert_frame_equal import skbio +from pandas._testing import assert_frame_equal +from q2_types.kraken2 import ( + Kraken2OutputDirectoryFormat, + Kraken2ReportDirectoryFormat, +) +from qiime2.plugin.testing import TestPluginBase + from q2_annotate.kraken2 import kraken2_to_features from q2_annotate.kraken2.select import ( - _kraken_to_ncbi_tree, 
_find_lcas, + _kraken_to_ncbi_tree, kraken2_to_mag_features, ) -from qiime2.plugin.testing import TestPluginBase - -from q2_types.kraken2 import ( - Kraken2ReportDirectoryFormat, - Kraken2OutputDirectoryFormat, -) class MockTempDir(tempfile.TemporaryDirectory): @@ -159,8 +158,7 @@ def test_kraken2_to_mag_features_incorrect_fraction_unclassified(self): hits = Kraken2OutputDirectoryFormat(self.get_data_path("outputs-mags"), "r") with self.assertRaisesRegex( ValueError, - "fraction for MAG '8894435a-c836-4c18-b475-8b38a9ab6c6b' " - "is not .* 99.01%", + "fraction for MAG '8894435a-c836-4c18-b475-8b38a9ab6c6b' is not .* 99.01%", ): kraken2_to_mag_features(reports, hits, 0.0) @@ -245,7 +243,7 @@ def test_kraken2_to_mag_features_unclassified_no_add_up(self): with self.assertRaisesRegex( ValueError, - "fraction for MAG '8894435a-c836-4c18-b475-8b38a9ab6c6b' " "is not 100.0", + "fraction for MAG '8894435a-c836-4c18-b475-8b38a9ab6c6b' is not 100.0", ): kraken2_to_mag_features(reports, hits, 0.1) @@ -380,7 +378,8 @@ def test_find_lcas_mode_lca_all_unclassified(self): # pandas.testing.assert_frame_equal(obs, exp) -class TestKrakenSelectEdgeCases(unittest.TestCase): +class TestKrakenSelectEdgeCases(TestPluginBase): + package = "q2_annotate.kraken2.tests" def make_dirfmt(self, string, coverage=False): """ @@ -464,7 +463,6 @@ def test_kraken_to_ncbi_tree_no_tricks(self): ) table, taxonomy = kraken2_to_features(dirfmt) - pandas.testing.assert_frame_equal(exp_table, table) pandas.testing.assert_frame_equal(exp_tax, taxonomy) @@ -608,3 +606,13 @@ def test_kraken_to_ncbi_tree_rankless_domain_inference(self): pandas.testing.assert_frame_equal(exp_table, table) pandas.testing.assert_frame_equal(exp_tax, taxonomy) + + def test_kraken2_to_features_root_infraclades(self): + """ + Tests that root infra-clades are not treated as actual tips, which + would lead to a mismatch between the feature table and the taxonomy. 
+ """ + reports = Kraken2ReportDirectoryFormat( + self.get_data_path("root-infraclade-report"), "r" + ) + kraken2_to_features(reports)