bokulich-lab · ZuzanaSebb · May 28, 2026 · Jun 4, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/q2_annotate/eggnog/__init__.py b/q2_annotate/eggnog/__init__.py
@@ -12,7 +12,12 @@
     _eggnog_hmmer_search,
     _eggnog_feature_table,
 )
-from .annotation import map_eggnog, _eggnog_annotate, extract_annotations
+from .annotation import (
+    map_eggnog,
+    _eggnog_annotate,
+    extract_annotations,
+    transfer_eggnog_annotations,
+)
 from .dbs import (
     fetch_eggnog_db,
     fetch_diamond_db,
@@ -39,4 +44,5 @@
     "search_orthologs_hmmer",
     "_eggnog_hmmer_search",
     "extract_annotations",
+    "transfer_eggnog_annotations",
 ]
diff --git a/q2_annotate/eggnog/annotation.py b/q2_annotate/eggnog/annotation.py
@@ -5,10 +5,16 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+import shutil
 import subprocess
+import warnings
+from pathlib import Path
+from typing import Union
 
 import pandas as pd
 
+from q2_types.feature_data_mag import MAGSequencesDirFmt
+from q2_types.feature_map import MAGtoContigsDirFmt
 from q2_types.genome_data import (
     OrthologAnnotationDirFmt,
     Orthologs,
@@ -168,7 +174,9 @@ def extract_annotations(
         annot_df = pd.read_csv(
             fp, sep="\t", skiprows=4, index_col=0
         )  # skip the first 4 rows as they contain comments
-        annot_df = annot_df.iloc[:-3, :]  # remove the last 3 comment rows
+        # strip trailing comment rows (footer) only if present
+        if annot_df.index[-3:].astype(str).str.startswith("##").all():
+            annot_df = annot_df.iloc[:-3, :]
         annot_df = _filter(annot_df, max_evalue, min_score)
         annot_df = _extract_generic(annot_df, col, func)
         annot_df.name = _id
@@ -177,3 +185,126 @@ def extract_annotations(
     result = pd.concat(annotations, axis=1).fillna(0).T
     result.index.name = "id"
     return result
+
+
+def _get_mag_ids_from_feature_data(mags: MAGSequencesDirFmt) -> set:
+    """Return the keys of ``mags.feature_dict()`` (the MAG IDs) as a set."""
+    features_by_id = mags.feature_dict()
+    return set(features_by_id.keys())
+
+
+def _copy_annotation_files(
+    source_annotations: OrthologAnnotationDirFmt,
+    mag_ids: set,
+    result: OrthologAnnotationDirFmt,
+):
+    """Copy the source annotation file of each requested MAG into ``result``.
+
+    Files are looked up by MAG ID in ``source_annotations.annotation_dict()``
+    and copied, keeping their file name, into ``result.path``.
+
+    Raises ``ValueError`` if none of ``mag_ids`` has an annotation file. When
+    only some IDs are missing, a ``UserWarning`` listing them is issued and
+    the remaining files are copied.
+    """
+    paths_by_id = source_annotations.annotation_dict()
+    available_ids = set(paths_by_id.keys())
+
+    to_copy = mag_ids & available_ids
+    if not to_copy:
+        raise ValueError("No annotation files matched the destination MAG IDs.")
+
+    absent = mag_ids - available_ids
+    if absent:
+        absent_listing = ", ".join(sorted(absent))
+        warnings.warn(
+            f"{len(absent)} MAG(s) in the destination had no matching "
+            f"annotation file in the source and will be skipped: "
+            f"{absent_listing}",
+            UserWarning,
+        )
+
+    for mag_id in to_copy:
+        source_fp = paths_by_id[mag_id]
+        target_fp = result.path / Path(source_fp).name
+        shutil.copy2(source_fp, str(target_fp))
+
+
+def _annotate_mags_from_contigs(
+    ortholog_annotations: OrthologAnnotationDirFmt,
+    contig_map: MAGtoContigsDirFmt,
+) -> OrthologAnnotationDirFmt:
+    """Aggregate contig-level eggNOG annotations -> MAG-level annotations.
+
+    Each annotation row is assigned to a MAG by stripping the trailing
+    ``_<digits>`` ORF suffix from its query ID and looking the remaining
+    contig ID up in the contig map. Rows whose contig is not in the map are
+    skipped (a ``UserWarning`` reports how many). The matched rows are written
+    to one ``<mag_uuid>.emapper.annotations`` file per MAG, consisting of four
+    ``##`` comment lines, the column header line and the data rows; no footer
+    is written.
+
+    Returns a new ``OrthologAnnotationDirFmt`` holding the per-MAG files.
+
+    Raises ``ValueError`` if the input has no annotation files or if no row
+    could be matched to any MAG.
+    """
+    # contig_map: {mag_uuid: [contig_id, ...]}
+    contig_map_dict = contig_map.file.view(dict)
+
+    # reverse map: contig_id -> mag_uuid
+    # NOTE: if a contig is listed under several MAGs, the last one wins.
+    contig_to_mag = {
+        contig_id: mag_uuid
+        for mag_uuid, contig_ids in contig_map_dict.items()
+        for contig_id in contig_ids
+    }
+
+    annotation_fps = ortholog_annotations.annotation_dict()
+    if not annotation_fps:
+        # fail with a clear message rather than pandas' "No objects to
+        # concatenate" further down
+        raise ValueError("No annotation files were found in the input.")
+
+    # Read all annotation files into a single DataFrame
+    frames = []
+    for fp in annotation_fps.values():
+        # Read every field as a plain string without NA inference, so values
+        # are written back exactly as they appear in the source file (number
+        # formatting is kept and literal fields such as "NA" are not blanked).
+        df = pd.read_csv(
+            fp, sep="\t", skiprows=4, dtype=str, keep_default_na=False
+        )
+        # drop comment rows (e.g. the eggNOG footer), if any are present
+        first_col = df.columns[0]
+        df = df[~df[first_col].astype(str).str.startswith("##")]
+        frames.append(df)
+
+    all_annotations = pd.concat(frames, ignore_index=True)
+
+    # Rebuild the eggNOG column header line.
+    col_header = "\t".join(all_annotations.columns) + "\n"
+
+    # Strip ORF suffix (contig_id_1 -> contig_id)
+    query_col = all_annotations.columns[0]
+    all_annotations["mag_uuid"] = (
+        all_annotations[query_col]
+        .str.replace(r"_\d+$", "", regex=True)
+        .map(contig_to_mag)
+    )
+
+    matched = all_annotations.dropna(subset=["mag_uuid"])
+    if matched.empty:
+        raise ValueError("No annotation rows could be matched to any MAG.")
+
+    unmatched = int(all_annotations["mag_uuid"].isna().sum())
+    if unmatched > 0:
+        total = len(all_annotations)
+        pct = unmatched / total * 100
+        warnings.warn(
+            f"{unmatched} of {total} annotation row(s) ({pct:.1f}%) were on "
+            "contigs not present in the contig map (e.g. unbinned contigs) "
+            "and were skipped.",
+            UserWarning,
+        )
+
+    result = OrthologAnnotationDirFmt()
+    for mag_uuid, group in matched.groupby("mag_uuid"):
+        out_fp = result.path / f"{mag_uuid}.emapper.annotations"
+        n_contigs = len(contig_map_dict.get(mag_uuid, []))
+        n_rows = len(group)
+        # explicit encoding: do not depend on the platform default
+        with open(out_fp, "w", encoding="utf-8") as fh:
+            fh.write("## Transferred using transfer_eggnog_annotations (q2-annotate)\n")
+            fh.write("## Source: contig-level annotations\n")
+            fh.write(f"## MAG: {mag_uuid} | contigs: {n_contigs} | rows: {n_rows}\n")
+            fh.write("##\n")
+            fh.write(col_header)
+            group.drop(columns=["mag_uuid"]).to_csv(
+                fh, sep="\t", index=False, header=False
+            )
+
+    # Verbose-only summary
+    print(
+        f"Aggregated {len(matched)} of {len(all_annotations)} annotation "
+        f"row(s) into {matched['mag_uuid'].nunique()} MAG(s); "
+        f"{unmatched} row(s) skipped."
+    )
+
+    return result
+
+
+def transfer_eggnog_annotations(
+    ortholog_annotations: OrthologAnnotationDirFmt,
+    destination: Union[MAGSequencesDirFmt, MAGtoContigsDirFmt],
+) -> OrthologAnnotationDirFmt:
+    """Transfer or aggregate annotations, dispatching on ``destination``.
+
+    For a ``MAGSequencesDirFmt`` destination, the annotation files of its MAG
+    IDs are copied into a new ``OrthologAnnotationDirFmt``. Any other
+    destination is handed to ``_annotate_mags_from_contigs`` as the contig
+    map.
+    """
+    if not isinstance(destination, MAGSequencesDirFmt):
+        return _annotate_mags_from_contigs(ortholog_annotations, destination)
+
+    transferred = OrthologAnnotationDirFmt()
+    _copy_annotation_files(
+        ortholog_annotations,
+        _get_mag_ids_from_feature_data(destination),
+        transferred,
+    )
+    return transferred
diff --git a/q2_annotate/eggnog/tests/data/contig-annotations/sample1.emapper.annotations b/q2_annotate/eggnog/tests/data/contig-annotations/sample1.emapper.annotations
@@ -0,0 +1,12 @@
+## Fri May 24 23:19:02 2024
+## emapper-2.1.12
+## emapper.py -m no_search --annotate_hits_table input --cpu 1
+##
+#query	seed_ortholog	evalue	score	eggNOG_OGs	max_annot_lvl	COG_category	Description	Preferred_name	GOs	EC	KEGG_ko	KEGG_Pathway	KEGG_Module	KEGG_Reaction	KEGG_rclass	BRITE	KEGG_TC	CAZy	BiGG_Reaction	PFAMs
+k141_100_0	ortholog1	0.0	100.0	COG0001@1|root	1|root	L	some description	geneA	-	1.1.1.1	ko:K00001	-	-	-	-	ko00000	-	-	-	PF00001
+k141_100_1	ortholog1	0.0	100.0	COG0001@1|root	1|root	F	some description	geneA	-	2.2.2.2	ko:K00002	-	-	-	-	ko00000	-	-	-	PF00001
+k141_200_0	ortholog1	0.0	100.0	COG0001@1|root	1|root	A	some description	geneA	-	3.3.3.3	ko:K00003	-	-	-	-	ko00000	-	-	-	PF00001
+k141_300_0	ortholog1	0.0	100.0	COG0001@1|root	1|root	L	some description	geneA	-	4.4.4.4	ko:K00004	-	-	-	-	ko00000	-	-	-	PF00001
+## 4 queries scanned
+## Total time (seconds): 1.0
+## Rate: 4.00 q/s
diff --git a/...e/eggnog/tests/data/mag-sequences-for-transfer/1e9ffc02-0847-4f2c-b1e2-3965a4a78b15.fasta b/...e/eggnog/tests/data/mag-sequences-for-transfer/1e9ffc02-0847-4f2c-b1e2-3965a4a78b15.fasta
@@ -0,0 +1,2 @@
+>seq1
+ACGT
diff --git a/...e/eggnog/tests/data/mag-sequences-for-transfer/62e07985-2556-435c-9e02-e7f94b8df07d.fasta b/...e/eggnog/tests/data/mag-sequences-for-transfer/62e07985-2556-435c-9e02-e7f94b8df07d.fasta
@@ -0,0 +1,2 @@
+>seq2
+ACGT
diff --git a/q2_annotate/eggnog/tests/data/mag-to-contigs-nomatch/mag-to-contigs.json b/q2_annotate/eggnog/tests/data/mag-to-contigs-nomatch/mag-to-contigs.json
@@ -0,0 +1,5 @@
+{
+  "11111111-1111-4111-8111-111111111111": [
+    "nonexistent_contig"
+  ]
+}
diff --git a/q2_annotate/eggnog/tests/data/mag-to-contigs-partial/mag-to-contigs.json b/q2_annotate/eggnog/tests/data/mag-to-contigs-partial/mag-to-contigs.json
@@ -0,0 +1,6 @@
+{
+  "11111111-1111-4111-8111-111111111111": [
+    "k141_100",
+    "k141_200"
+  ]
+}
diff --git a/q2_annotate/eggnog/tests/data/mag-to-contigs/mag-to-contigs.json b/q2_annotate/eggnog/tests/data/mag-to-contigs/mag-to-contigs.json
@@ -0,0 +1,9 @@
+{
+  "11111111-1111-4111-8111-111111111111": [
+    "k141_100",
+    "k141_200"
+  ],
+  "22222222-2222-4222-8222-222222222222": [
+    "k141_300"
+  ]
+}
diff --git a/q2_annotate/eggnog/tests/test_annotation.py b/q2_annotate/eggnog/tests/test_annotation.py
@@ -6,14 +6,22 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import filecmp
+from pathlib import Path
 
 import pandas as pd
 import pandas.testing as pdt
 import qiime2
 from qiime2.plugin.testing import TestPluginBase
 
 from q2_annotate.eggnog import _eggnog_annotate, extract_annotations
-from q2_annotate.eggnog.annotation import _extract_generic, _filter, extraction_methods
+from q2_annotate.eggnog.annotation import (
+    _extract_generic,
+    _filter,
+    extraction_methods,
+    transfer_eggnog_annotations,
+)
+from q2_types.feature_data_mag import MAGSequencesDirFmt
+from q2_types.feature_map import MAGtoContigsDirFmt
 from q2_types.genome_data import (
     OrthologAnnotationDirFmt,
     SeedOrthologDirFmt,
@@ -235,3 +243,92 @@ def test_filter(self):
     def test_filter_empty(self):
         with self.assertRaisesRegex(ValueError, " resulted in an empty table"):
             _filter(self.df, 0.1, 500.0)
+
+
+class TestTransferAnnotations(TestPluginBase):
+    """Tests for ``transfer_eggnog_annotations`` with a MAG destination."""
+
+    package = "q2_annotate.eggnog.tests"
+
+    def setUp(self):
+        super().setUp()
+        annotations_dir = self.get_data_path("annotations/")
+        self.annotations = OrthologAnnotationDirFmt(annotations_dir, mode="r")
+        mags_dir = self.get_data_path("mag-sequences-for-transfer/")
+        self.feature_data_mags = MAGSequencesDirFmt(mags_dir, mode="r")
+
+    def test_transfer_to_feature_data(self):
+        # the result must hold exactly the two destination MAG IDs, and each
+        # file must be a byte-for-byte copy of its source annotation file
+        expected_ids = {
+            "1e9ffc02-0847-4f2c-b1e2-3965a4a78b15",
+            "62e07985-2556-435c-9e02-e7f94b8df07d",
+        }
+        result = transfer_eggnog_annotations(
+            self.annotations, self.feature_data_mags
+        )
+        originals = self.annotations.annotation_dict()
+        self.assertEqual(set(result.annotation_dict().keys()), expected_ids)
+        for mag_id, copied_fp in result.annotation_dict().items():
+            self.assertTrue(
+                filecmp.cmp(originals[mag_id], copied_fp, shallow=False)
+            )
+
+    def test_transfer_raises_on_no_match(self):
+        # destination containing a single, empty FASTA named after a MAG ID
+        # that is expected to have no annotation file in the source
+        unknown_id = "00000000-0000-4000-8000-000000000000"
+        fasta_fp = Path(self.temp_dir.name) / f"{unknown_id}.fasta"
+        fasta_fp.touch()
+        empty_mags = MAGSequencesDirFmt(self.temp_dir.name, mode="r")
+        with self.assertRaisesRegex(ValueError, "No annotation files matched"):
+            transfer_eggnog_annotations(self.annotations, empty_mags)
+
+
+class TestAnnotateMagsFromContigs(TestPluginBase):
+    """Tests for ``transfer_eggnog_annotations`` with a contig-map destination."""
+
+    package = "q2_annotate.eggnog.tests"
+
+    MAG1 = "11111111-1111-4111-8111-111111111111"
+    MAG2 = "22222222-2222-4222-8222-222222222222"
+
+    def setUp(self):
+        super().setUp()
+
+        def load_map(dirname):
+            return MAGtoContigsDirFmt(self.get_data_path(dirname), mode="r")
+
+        self.contig_annotations = OrthologAnnotationDirFmt(
+            self.get_data_path("contig-annotations/"), mode="r"
+        )
+        self.contig_map = load_map("mag-to-contigs/")
+        self.contig_map_partial = load_map("mag-to-contigs-partial/")
+        self.contig_map_nomatch = load_map("mag-to-contigs-nomatch/")
+
+    def _query_ids(self, result, mag_uuid):
+        """Return the first-column values of the MAG's annotation file."""
+        annotation_fp = result.annotation_dict()[mag_uuid]
+        table = pd.read_csv(annotation_fp, sep="\t", skiprows=4)
+        return table.iloc[:, 0].tolist()
+
+    def test_aggregate_groups_contigs_into_mags(self):
+        result = transfer_eggnog_annotations(self.contig_annotations, self.contig_map)
+        self.assertEqual(set(result.annotation_dict().keys()), {self.MAG1, self.MAG2})
+        mag1_queries = sorted(self._query_ids(result, self.MAG1))
+        self.assertEqual(mag1_queries, ["k141_100_0", "k141_100_1", "k141_200_0"])
+        mag2_queries = self._query_ids(result, self.MAG2)
+        self.assertEqual(mag2_queries, ["k141_300_0"])
+
+    def test_aggregate_preserves_header_and_drops_footer(self):
+        result = transfer_eggnog_annotations(self.contig_annotations, self.contig_map)
+        out_fp = Path(result.annotation_dict()[self.MAG1])
+        lines = out_fp.read_text().splitlines()
+        # line 5 is the column header; the last line must be a data row
+        self.assertIn("#query\tseed_ortholog\tevalue", lines[4])
+        self.assertFalse(lines[-1].startswith("##"))
+
+    def test_aggregate_warns_on_unmatched_rows(self):
+        # the partial map omits contig k141_300, so its row cannot be matched
+        with self.assertWarns(UserWarning):
+            result = transfer_eggnog_annotations(
+                self.contig_annotations, self.contig_map_partial
+            )
+        produced_ids = set(result.annotation_dict().keys())
+        self.assertEqual(produced_ids, {self.MAG1})
+
+    def test_aggregate_raises_when_nothing_matches(self):
+        with self.assertRaisesRegex(ValueError, "No annotation rows could be"):
+            transfer_eggnog_annotations(
+                self.contig_annotations, self.contig_map_nomatch
+            )
diff --git a/q2_annotate/plugin_setup.py b/q2_annotate/plugin_setup.py
@@ -1817,6 +1817,35 @@
     citations=[],
 )
 
+# Register transfer_eggnog_annotations as a plugin method. It takes
+# GenomeData[NOG] annotations plus a destination that is either
+# FeatureData[MAG] or FeatureMap[MAGtoContigs], has no parameters, and
+# produces a single GenomeData[NOG] output ("transferred_annotations").
+plugin.methods.register_function(
+    function=q2_annotate.eggnog.transfer_eggnog_annotations,
+    inputs={
+        "ortholog_annotations": GenomeData[NOG],
+        "destination": FeatureData[MAG] | FeatureMap[MAGtoContigs],
+    },
+    parameters={},
+    outputs=[("transferred_annotations", GenomeData[NOG])],
+    input_descriptions={
+        "ortholog_annotations": "Ortholog annotations to transfer or aggregate.",
+        "destination": (
+            "FeatureData[MAG] to subset annotations, or "
+            "FeatureMap[MAGtoContigs] to aggregate contig-level annotations."
+        ),
+    },
+    parameter_descriptions={},
+    output_descriptions={
+        "transferred_annotations": "Transferred or aggregated annotations.",
+    },
+    name="Transfer or aggregate eggNOG annotations.",
+    description=(
+        "Transfers eggNOG ortholog annotations based on the destination "
+        "type. A FeatureData[MAG] copies annotations for matching MAGs "
+        "(e.g., after dereplication); a FeatureMap[MAGtoContigs] "
+        "aggregates contig-level annotations into per-MAG files."
+    ),
+    citations=[],
+)
+
 multiply_input_descriptions = {
     "table1": "First feature table.",
     "table2": "Second feature table with matching dimension.",