def _get_mag_ids_from_feature_data(mags: MAGSequencesDirFmt) -> set:
    """Return the MAG IDs found in a FeatureData[MAG] directory format.

    The IDs are the keys of ``mags.feature_dict()``.
    """
    return set(mags.feature_dict())


def _copy_annotation_files(
    source_annotations: OrthologAnnotationDirFmt,
    mag_ids: set,
    result: OrthologAnnotationDirFmt,
):
    """Copy the annotation files of the requested MAGs into *result*.

    Parameters
    ----------
    source_annotations
        Annotations to copy from; ``annotation_dict()`` maps ID -> file path.
    mag_ids
        IDs of the MAGs whose annotation files should be copied.
    result
        Directory format receiving the copies (file names are preserved).

    Raises
    ------
    ValueError
        If none of *mag_ids* has an annotation file in the source.

    Warns
    -----
    UserWarning
        If only some of *mag_ids* have an annotation file; those without
        one are skipped.
    """
    annotation_dict = source_annotations.annotation_dict()
    available_ids = set(annotation_dict)

    matched_ids = mag_ids & available_ids
    if not matched_ids:
        raise ValueError("No annotation files matched the destination MAG IDs.")

    missing = mag_ids - available_ids
    if missing:
        warnings.warn(
            f"{len(missing)} MAG(s) in the destination had no matching "
            f"annotation file in the source and will be skipped: "
            f"{', '.join(sorted(missing))}",
            UserWarning,
        )

    # Sorted only to make the copy order deterministic.
    for mag_id in sorted(matched_ids):
        src_path = Path(annotation_dict[mag_id])
        shutil.copy2(src_path, result.path / src_path.name)


def _invert_contig_map(contig_map_dict: dict) -> dict:
    """Turn ``{mag_uuid: [contig_id, ...]}`` into ``{contig_id: mag_uuid}``.

    A contig listed under several MAGs keeps the MAG that is listed last
    (a plain dict inversion does the same); a warning makes that visible
    instead of dropping the other assignments silently.
    """
    contig_to_mag = {}
    shared = set()
    for mag_uuid, contig_ids in contig_map_dict.items():
        for contig_id in contig_ids:
            if contig_to_mag.get(contig_id, mag_uuid) != mag_uuid:
                shared.add(contig_id)
            contig_to_mag[contig_id] = mag_uuid

    if shared:
        warnings.warn(
            f"{len(shared)} contig ID(s) are assigned to more than one MAG "
            "in the contig map; annotations on those contigs are attributed "
            "only to the last MAG listing them.",
            UserWarning,
        )
    return contig_to_mag


def _annotate_mags_from_contigs(
    ortholog_annotations: OrthologAnnotationDirFmt,
    contig_map: MAGtoContigsDirFmt,
) -> OrthologAnnotationDirFmt:
    """Aggregate contig-level eggNOG annotations into per-MAG files.

    Every annotation row is assigned to a MAG by stripping the trailing
    ``_<number>`` ORF suffix from its query ID and looking the remaining
    contig ID up in *contig_map*. One ``<mag_uuid>.emapper.annotations``
    file is written per MAG that received at least one row.

    Raises
    ------
    ValueError
        If there are no annotation files, or no row maps to any MAG.

    Warns
    -----
    UserWarning
        If some rows are on contigs absent from the contig map, or if a
        contig is assigned to more than one MAG.
    """
    # contig_map: {mag_uuid: [contig_id, ...]}
    contig_map_dict = contig_map.file.view(dict)
    contig_to_mag = _invert_contig_map(contig_map_dict)

    frames = []
    for fp in ortholog_annotations.annotation_dict().values():
        # Read every field verbatim: with pandas' defaults, values such as
        # "NA"/"null"/"nan" become NaN (and are written back as empty
        # fields) and numeric columns are re-formatted on output.
        df = pd.read_csv(
            fp, sep="\t", skiprows=4, dtype=str, keep_default_na=False
        )
        # Drop comment rows (the "##" footer) wherever they are present.
        first_col = df.columns[0]
        df = df[~df[first_col].astype(str).str.startswith("##")]
        frames.append(df)

    if not frames:
        # pd.concat([]) would fail with "No objects to concatenate".
        raise ValueError("No annotation files were found in the input.")

    all_annotations = pd.concat(frames, ignore_index=True)

    # Rebuild the eggNOG column header line (before the helper column is
    # added below).
    col_header = "\t".join(all_annotations.columns) + "\n"

    # Strip ORF suffix (contig_id_1 -> contig_id)
    query_col = all_annotations.columns[0]
    all_annotations["mag_uuid"] = (
        all_annotations[query_col]
        .str.replace(r"_\d+$", "", regex=True)
        .map(contig_to_mag)
    )

    matched = all_annotations.dropna(subset=["mag_uuid"])
    if matched.empty:
        raise ValueError("No annotation rows could be matched to any MAG.")

    unmatched = int(all_annotations["mag_uuid"].isna().sum())
    if unmatched > 0:
        total = len(all_annotations)
        pct = unmatched / total * 100
        warnings.warn(
            f"{unmatched} of {total} annotation row(s) ({pct:.1f}%) were on "
            "contigs not present in the contig map (e.g. unbinned contigs) "
            "and were skipped.",
            UserWarning,
        )

    result = OrthologAnnotationDirFmt()
    for mag_uuid, group in matched.groupby("mag_uuid"):
        out_fp = result.path / f"{mag_uuid}.emapper.annotations"
        n_contigs = len(contig_map_dict.get(mag_uuid, []))
        n_rows = len(group)
        # Four comment lines precede the column header, matching the
        # ``skiprows=4`` used when these files are read.
        with open(out_fp, "w", encoding="utf-8") as fh:
            fh.write("## Transferred using transfer_eggnog_annotations (q2-annotate)\n")
            fh.write("## Source: contig-level annotations\n")
            fh.write(f"## MAG: {mag_uuid} | contigs: {n_contigs} | rows: {n_rows}\n")
            fh.write("##\n")
            fh.write(col_header)
            group.drop(columns=["mag_uuid"]).to_csv(
                fh, sep="\t", index=False, header=False
            )

    # Verbose-only summary
    print(
        f"Aggregated {len(matched)} of {len(all_annotations)} annotation "
        f"row(s) into {matched['mag_uuid'].nunique()} MAG(s); "
        f"{unmatched} row(s) skipped."
    )

    return result


def transfer_eggnog_annotations(
    ortholog_annotations: OrthologAnnotationDirFmt,
    destination: Union[MAGSequencesDirFmt, MAGtoContigsDirFmt],
) -> OrthologAnnotationDirFmt:
    """Transfer or aggregate eggNOG annotations based on the destination type.

    * ``MAGSequencesDirFmt``: the annotation files whose IDs match the MAGs
      in *destination* are copied unchanged.
    * ``MAGtoContigsDirFmt``: contig-level annotation rows are regrouped
      into one annotation file per MAG.

    Raises
    ------
    TypeError
        If *destination* is neither of the two supported formats.
    ValueError
        If nothing in *ortholog_annotations* matches *destination*.
    """
    if isinstance(destination, MAGSequencesDirFmt):
        result = OrthologAnnotationDirFmt()
        mag_ids = _get_mag_ids_from_feature_data(destination)
        _copy_annotation_files(ortholog_annotations, mag_ids, result)
        return result

    if isinstance(destination, MAGtoContigsDirFmt):
        return _annotate_mags_from_contigs(ortholog_annotations, destination)

    # Fail early with a clear message rather than with an unrelated
    # AttributeError deep inside the aggregation code.
    raise TypeError(
        "destination must be a MAGSequencesDirFmt or a MAGtoContigsDirFmt, "
        f"got {type(destination).__name__}."
    )
b/q2_annotate/eggnog/tests/data/mag-sequences-for-transfer/1e9ffc02-0847-4f2c-b1e2-3965a4a78b15.fasta new file mode 100644 index 00000000..6b27daed --- /dev/null +++ b/q2_annotate/eggnog/tests/data/mag-sequences-for-transfer/1e9ffc02-0847-4f2c-b1e2-3965a4a78b15.fasta @@ -0,0 +1,2 @@ +>seq1 +ACGT diff --git a/q2_annotate/eggnog/tests/data/mag-sequences-for-transfer/62e07985-2556-435c-9e02-e7f94b8df07d.fasta b/q2_annotate/eggnog/tests/data/mag-sequences-for-transfer/62e07985-2556-435c-9e02-e7f94b8df07d.fasta new file mode 100644 index 00000000..bc751db1 --- /dev/null +++ b/q2_annotate/eggnog/tests/data/mag-sequences-for-transfer/62e07985-2556-435c-9e02-e7f94b8df07d.fasta @@ -0,0 +1,2 @@ +>seq2 +ACGT diff --git a/q2_annotate/eggnog/tests/data/mag-to-contigs-nomatch/mag-to-contigs.json b/q2_annotate/eggnog/tests/data/mag-to-contigs-nomatch/mag-to-contigs.json new file mode 100644 index 00000000..a1c749a0 --- /dev/null +++ b/q2_annotate/eggnog/tests/data/mag-to-contigs-nomatch/mag-to-contigs.json @@ -0,0 +1,5 @@ +{ + "11111111-1111-4111-8111-111111111111": [ + "nonexistent_contig" + ] +} \ No newline at end of file diff --git a/q2_annotate/eggnog/tests/data/mag-to-contigs-partial/mag-to-contigs.json b/q2_annotate/eggnog/tests/data/mag-to-contigs-partial/mag-to-contigs.json new file mode 100644 index 00000000..22cd6505 --- /dev/null +++ b/q2_annotate/eggnog/tests/data/mag-to-contigs-partial/mag-to-contigs.json @@ -0,0 +1,6 @@ +{ + "11111111-1111-4111-8111-111111111111": [ + "k141_100", + "k141_200" + ] +} \ No newline at end of file diff --git a/q2_annotate/eggnog/tests/data/mag-to-contigs/mag-to-contigs.json b/q2_annotate/eggnog/tests/data/mag-to-contigs/mag-to-contigs.json new file mode 100644 index 00000000..d9743c87 --- /dev/null +++ b/q2_annotate/eggnog/tests/data/mag-to-contigs/mag-to-contigs.json @@ -0,0 +1,9 @@ +{ + "11111111-1111-4111-8111-111111111111": [ + "k141_100", + "k141_200" + ], + "22222222-2222-4222-8222-222222222222": [ + "k141_300" + ] +} \ No 
class TestTransferAnnotations(TestPluginBase):
    """Tests for the FeatureData[MAG] (file copy) path of the transfer."""

    package = "q2_annotate.eggnog.tests"

    def setUp(self):
        super().setUp()
        self.annotations = OrthologAnnotationDirFmt(
            self.get_data_path("annotations/"), mode="r"
        )
        self.feature_data_mags = MAGSequencesDirFmt(
            self.get_data_path("mag-sequences-for-transfer/"), mode="r"
        )

    def test_transfer_to_feature_data(self):
        # Only the two MAGs in the destination are copied, byte for byte.
        result = transfer_eggnog_annotations(
            self.annotations, self.feature_data_mags
        )
        src = self.annotations.annotation_dict()
        self.assertEqual(
            set(result.annotation_dict().keys()),
            {
                "1e9ffc02-0847-4f2c-b1e2-3965a4a78b15",
                "62e07985-2556-435c-9e02-e7f94b8df07d",
            },
        )
        for uuid, path in result.annotation_dict().items():
            self.assertTrue(filecmp.cmp(src[uuid], path, shallow=False))

    def test_transfer_warns_on_partial_match(self):
        # Two MAGs with annotations plus one without: the unknown MAG is
        # named in the warning and skipped, the other two are transferred.
        known = sorted(self.feature_data_mags.feature_dict())
        unknown = "00000000-0000-4000-8000-000000000000"
        for uuid in (*known, unknown):
            Path(self.temp_dir.name, f"{uuid}.fasta").touch()
        partial_mags = MAGSequencesDirFmt(self.temp_dir.name, mode="r")

        with self.assertWarnsRegex(UserWarning, unknown):
            result = transfer_eggnog_annotations(self.annotations, partial_mags)
        self.assertEqual(set(result.annotation_dict().keys()), set(known))

    def test_transfer_raises_on_no_match(self):
        # A destination whose only MAG has no annotation file is an error.
        uuid = "00000000-0000-4000-8000-000000000000"
        Path(self.temp_dir.name, f"{uuid}.fasta").touch()
        unmatched_mags = MAGSequencesDirFmt(self.temp_dir.name, mode="r")
        with self.assertRaisesRegex(ValueError, "No annotation files matched"):
            transfer_eggnog_annotations(self.annotations, unmatched_mags)


class TestAnnotateMagsFromContigs(TestPluginBase):
    """Tests for the FeatureMap[MAGtoContigs] (aggregation) path."""

    package = "q2_annotate.eggnog.tests"

    MAG1 = "11111111-1111-4111-8111-111111111111"
    MAG2 = "22222222-2222-4222-8222-222222222222"

    def setUp(self):
        super().setUp()
        self.contig_annotations = OrthologAnnotationDirFmt(
            self.get_data_path("contig-annotations/"), mode="r"
        )
        self.contig_map = MAGtoContigsDirFmt(
            self.get_data_path("mag-to-contigs/"), mode="r"
        )
        self.contig_map_partial = MAGtoContigsDirFmt(
            self.get_data_path("mag-to-contigs-partial/"), mode="r"
        )
        self.contig_map_nomatch = MAGtoContigsDirFmt(
            self.get_data_path("mag-to-contigs-nomatch/"), mode="r"
        )

    def _query_ids(self, result, mag_uuid):
        """Return the first-column (query) values of a MAG's output file."""
        df = pd.read_csv(result.annotation_dict()[mag_uuid], sep="\t", skiprows=4)
        return df[df.columns[0]].tolist()

    def test_aggregate_groups_contigs_into_mags(self):
        result = transfer_eggnog_annotations(self.contig_annotations, self.contig_map)
        self.assertEqual(set(result.annotation_dict().keys()), {self.MAG1, self.MAG2})
        self.assertEqual(
            sorted(self._query_ids(result, self.MAG1)),
            ["k141_100_0", "k141_100_1", "k141_200_0"],
        )
        self.assertEqual(self._query_ids(result, self.MAG2), ["k141_300_0"])

    def test_aggregate_preserves_header_and_drops_footer(self):
        result = transfer_eggnog_annotations(self.contig_annotations, self.contig_map)
        lines = Path(result.annotation_dict()[self.MAG1]).read_text().splitlines()
        # Four comment lines come first, so the column header is line 5.
        self.assertIn("#query\tseed_ortholog\tevalue", lines[4])
        self.assertFalse(lines[-1].startswith("##"))

    def test_aggregate_warns_on_unmatched_rows(self):
        with self.assertWarns(UserWarning):
            result = transfer_eggnog_annotations(
                self.contig_annotations, self.contig_map_partial
            )
        self.assertEqual(set(result.annotation_dict().keys()), {self.MAG1})
        # The ORF on the contig missing from the map must not leak into MAG1.
        self.assertEqual(
            sorted(self._query_ids(result, self.MAG1)),
            ["k141_100_0", "k141_100_1", "k141_200_0"],
        )

    def test_aggregate_raises_when_nothing_matches(self):
        with self.assertRaisesRegex(ValueError, "No annotation rows could be"):
            transfer_eggnog_annotations(
                self.contig_annotations, self.contig_map_nomatch
            )