Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion q2_annotate/eggnog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
_eggnog_hmmer_search,
_eggnog_feature_table,
)
from .annotation import map_eggnog, _eggnog_annotate, extract_annotations
from .annotation import (
map_eggnog,
_eggnog_annotate,
extract_annotations,
transfer_eggnog_annotations,
)
from .dbs import (
fetch_eggnog_db,
fetch_diamond_db,
Expand All @@ -39,4 +44,5 @@
"search_orthologs_hmmer",
"_eggnog_hmmer_search",
"extract_annotations",
"transfer_eggnog_annotations",
]
133 changes: 132 additions & 1 deletion q2_annotate/eggnog/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import shutil
import subprocess
import warnings
from pathlib import Path
from typing import Union

import pandas as pd

from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.feature_map import MAGtoContigsDirFmt
from q2_types.genome_data import (
OrthologAnnotationDirFmt,
Orthologs,
Expand Down Expand Up @@ -168,7 +174,9 @@ def extract_annotations(
annot_df = pd.read_csv(
fp, sep="\t", skiprows=4, index_col=0
) # skip the first 4 rows as they contain comments
annot_df = annot_df.iloc[:-3, :] # remove the last 3 comment rows
# strip trailing comment rows (footer) only if present
if annot_df.index[-3:].astype(str).str.startswith("##").all():
annot_df = annot_df.iloc[:-3, :]
annot_df = _filter(annot_df, max_evalue, min_score)
annot_df = _extract_generic(annot_df, col, func)
annot_df.name = _id
Expand All @@ -177,3 +185,126 @@ def extract_annotations(
result = pd.concat(annotations, axis=1).fillna(0).T
result.index.name = "id"
return result


def _get_mag_ids_from_feature_data(mags: MAGSequencesDirFmt) -> set:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need this method? It literally does a single thing.

"""Extract MAG UUIDs from a FeatureData[MAG] artifact."""
return set(mags.feature_dict().keys())


def _copy_annotation_files(
source_annotations: OrthologAnnotationDirFmt,
mag_ids: set,
result: OrthologAnnotationDirFmt,

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should just make this method return the result, as nothing really happens to it before it is passed to this method. This also makes it a bit more explicit what this method actually does/returns.

):
"""Copy annotation files from source to result for the given MAG IDs."""
annotation_dict = source_annotations.annotation_dict()

matched_ids = mag_ids & set(annotation_dict.keys())
if not matched_ids:
raise ValueError("No annotation files matched the destination MAG IDs.")
Comment on lines +203 to +205

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this check should already happen before we call this method - it will remove the need to pass the mag_ids and provide a simpler interface. Actually, you could even move it to a separate method, as you have an additional check below - that way the validation can be taken care of by one testable method and the copying by another one, making each of them responsible for a different part of the pipeline.


missing = mag_ids - set(annotation_dict.keys())
if missing:
warnings.warn(
f"{len(missing)} MAG(s) in the destination had no matching "
f"annotation file in the source and will be skipped: "
f"{', '.join(sorted(missing))}",
UserWarning,
)

for mag_id in matched_ids:
src_path = annotation_dict[mag_id]
shutil.copy2(src_path, str(result.path / Path(src_path).name))


def _annotate_mags_from_contigs(
ortholog_annotations: OrthologAnnotationDirFmt,
contig_map: MAGtoContigsDirFmt,
) -> OrthologAnnotationDirFmt:
"""Aggregate contig-level eggNOG annotations -> MAG-level annotations."""
# contig_map: {mag_uuid: [contig_id, ...]}
contig_map_dict = contig_map.file.view(dict)

# reverse map: contig_id -> mag_uuid
contig_to_mag = {
contig_id: mag_uuid
for mag_uuid, contig_ids in contig_map_dict.items()
for contig_id in contig_ids
}

# Read all annotation files into a DataFrame

frames = []
for _id, fp in ortholog_annotations.annotation_dict().items():
df = pd.read_csv(fp, sep="\t", skiprows=4)
# drop trailing comment only if present
first_col = df.columns[0]
df = df[~df[first_col].astype(str).str.startswith("##")]
Comment on lines +240 to +243

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering whether you could achieve the same by simply reading in every file as OrthologFileFmt and viewing as a df?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could use the _annotations_to_dataframe function, but the result would need to be adjusted anyway.

frames.append(df)

all_annotations = pd.concat(frames, ignore_index=True)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to evaluate carefully whether this will work well when one has hundreds of samples with thousands of annotations. I'm a bit worried that the memory will blow up 😅


# Rebuild the eggNOG column header line.
col_header = "\t".join(all_annotations.columns) + "\n"

# Strip ORF suffix (contig_id_1 -> contig_id)
query_col = all_annotations.columns[0]
all_annotations["mag_uuid"] = (
all_annotations[query_col]
.str.replace(r"_\d+$", "", regex=True)
.map(contig_to_mag)
)

matched = all_annotations.dropna(subset=["mag_uuid"])
if matched.empty:
raise ValueError("No annotation rows could be matched to any MAG.")

unmatched = all_annotations["mag_uuid"].isna().sum()
if unmatched > 0:
total = len(all_annotations)
pct = unmatched / total * 100
warnings.warn(
f"{unmatched} of {total} annotation row(s) ({pct:.1f}%) were on "
"contigs not present in the contig map (e.g. unbinned contigs) "
"and were skipped.",
UserWarning,
)

result = OrthologAnnotationDirFmt()
for mag_uuid, group in matched.groupby("mag_uuid"):
out_fp = result.path / f"{mag_uuid}.emapper.annotations"
n_contigs = len(contig_map_dict.get(mag_uuid, []))
n_rows = len(group)
with open(out_fp, "w") as fh:
fh.write("## Transferred using transfer_eggnog_annotations (q2-annotate)\n")
fh.write("## Source: contig-level annotations\n")
fh.write(f"## MAG: {mag_uuid} | contigs: {n_contigs} | rows: {n_rows}\n")
fh.write("##\n")
fh.write(col_header)
group.drop(columns=["mag_uuid"]).to_csv(
fh, sep="\t", index=False, header=False
)

# Verbose-only summary
print(
f"Aggregated {len(matched)} of {len(all_annotations)} annotation "
f"row(s) into {matched['mag_uuid'].nunique()} MAG(s); "
f"{unmatched} row(s) skipped."
)

return result


def transfer_eggnog_annotations(
ortholog_annotations: OrthologAnnotationDirFmt,
destination: Union[MAGSequencesDirFmt, MAGtoContigsDirFmt],

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is confusing. The destination should be represented by the same kind of semantic type, either SampleData[MAGs] or FeatureData[MAG]. Now, you are mixing in the contig map. I think this should become an additional input required when SampleData[MAGs] were provided as source of the annotations or if FeatureData[MAG] was provided as the destination (whichever of those makes more sense for your pipeline).

) -> OrthologAnnotationDirFmt:
"""Transfer or aggregate eggNOG annotations based on the destination type."""
if isinstance(destination, MAGSequencesDirFmt):
result = OrthologAnnotationDirFmt()
mag_ids = _get_mag_ids_from_feature_data(destination)
_copy_annotation_files(ortholog_annotations, mag_ids, result)
return result
else:
return _annotate_mags_from_contigs(ortholog_annotations, destination)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
## Fri May 24 23:19:02 2024
## emapper-2.1.12
## emapper.py -m no_search --annotate_hits_table input --cpu 1
##
#query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs
k141_100_0 ortholog1 0.0 100.0 COG0001@1|root 1|root L some description geneA - 1.1.1.1 ko:K00001 - - - - ko00000 - - - PF00001
k141_100_1 ortholog1 0.0 100.0 COG0001@1|root 1|root F some description geneA - 2.2.2.2 ko:K00002 - - - - ko00000 - - - PF00001
k141_200_0 ortholog1 0.0 100.0 COG0001@1|root 1|root A some description geneA - 3.3.3.3 ko:K00003 - - - - ko00000 - - - PF00001
k141_300_0 ortholog1 0.0 100.0 COG0001@1|root 1|root L some description geneA - 4.4.4.4 ko:K00004 - - - - ko00000 - - - PF00001
## 4 queries scanned
## Total time (seconds): 1.0
## Rate: 4.00 q/s
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>seq1
ACGT
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>seq2
ACGT
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"11111111-1111-4111-8111-111111111111": [
"nonexistent_contig"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"11111111-1111-4111-8111-111111111111": [
"k141_100",
"k141_200"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"11111111-1111-4111-8111-111111111111": [
"k141_100",
"k141_200"
],
"22222222-2222-4222-8222-222222222222": [
"k141_300"
]
}
99 changes: 98 additions & 1 deletion q2_annotate/eggnog/tests/test_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,22 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import filecmp
from pathlib import Path

import pandas as pd
import pandas.testing as pdt
import qiime2
from qiime2.plugin.testing import TestPluginBase

from q2_annotate.eggnog import _eggnog_annotate, extract_annotations
from q2_annotate.eggnog.annotation import _extract_generic, _filter, extraction_methods
from q2_annotate.eggnog.annotation import (
_extract_generic,
_filter,
extraction_methods,
transfer_eggnog_annotations,
)
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.feature_map import MAGtoContigsDirFmt
from q2_types.genome_data import (
OrthologAnnotationDirFmt,
SeedOrthologDirFmt,
Expand Down Expand Up @@ -235,3 +243,92 @@ def test_filter(self):
def test_filter_empty(self):
with self.assertRaisesRegex(ValueError, " resulted in an empty table"):
_filter(self.df, 0.1, 500.0)


class TestTransferAnnotations(TestPluginBase):
    """Tests for transfer_eggnog_annotations with a MAG sequences destination."""

    package = "q2_annotate.eggnog.tests"

    def setUp(self):
        super().setUp()
        # Both fixtures are opened read-only from the test data directory.
        annotations_dir = self.get_data_path("annotations/")
        mags_dir = self.get_data_path("mag-sequences-for-transfer/")
        self.annotations = OrthologAnnotationDirFmt(annotations_dir, mode="r")
        self.feature_data_mags = MAGSequencesDirFmt(mags_dir, mode="r")

    def test_transfer_to_feature_data(self):
        # Only the two listed MAG IDs are expected in the output, and every
        # transferred file must be byte-identical to its source file.
        expected_ids = {
            "1e9ffc02-0847-4f2c-b1e2-3965a4a78b15",
            "62e07985-2556-435c-9e02-e7f94b8df07d",
        }
        source_files = self.annotations.annotation_dict()

        result = transfer_eggnog_annotations(
            self.annotations, self.feature_data_mags
        )
        transferred_files = result.annotation_dict()

        self.assertEqual(set(transferred_files.keys()), expected_ids)
        for mag_id, transferred_fp in transferred_files.items():
            identical = filecmp.cmp(
                source_files[mag_id], transferred_fp, shallow=False
            )
            self.assertTrue(identical)

    def test_transfer_raises_on_no_match(self):
        # A destination whose only MAG ID is absent from the annotations
        # must be rejected with a ValueError.
        uuid = "00000000-0000-4000-8000-000000000000"
        fasta_fp = Path(self.temp_dir.name, f"{uuid}.fasta")
        fasta_fp.touch()
        empty_mags = MAGSequencesDirFmt(self.temp_dir.name, mode="r")

        with self.assertRaisesRegex(ValueError, "No annotation files matched"):
            transfer_eggnog_annotations(self.annotations, empty_mags)


class TestAnnotateMagsFromContigs(TestPluginBase):
    """Tests for transfer_eggnog_annotations with a MAG-to-contigs map."""

    package = "q2_annotate.eggnog.tests"

    MAG1 = "11111111-1111-4111-8111-111111111111"
    MAG2 = "22222222-2222-4222-8222-222222222222"

    def _load_contig_map(self, dirname):
        # Open one of the contig-map fixture directories read-only.
        return MAGtoContigsDirFmt(self.get_data_path(dirname), mode="r")

    def setUp(self):
        super().setUp()
        self.contig_annotations = OrthologAnnotationDirFmt(
            self.get_data_path("contig-annotations/"), mode="r"
        )
        self.contig_map = self._load_contig_map("mag-to-contigs/")
        self.contig_map_partial = self._load_contig_map("mag-to-contigs-partial/")
        self.contig_map_nomatch = self._load_contig_map("mag-to-contigs-nomatch/")

    def _query_ids(self, result, mag_uuid):
        # Return the values of the first column of the MAG's annotation
        # file, skipping the four leading '##' lines.
        annotation_fp = result.annotation_dict()[mag_uuid]
        table = pd.read_csv(annotation_fp, sep="\t", skiprows=4)
        first_column = table.columns[0]
        return table[first_column].tolist()

    def test_aggregate_groups_contigs_into_mags(self):
        result = transfer_eggnog_annotations(self.contig_annotations, self.contig_map)

        produced_mags = set(result.annotation_dict().keys())
        self.assertEqual(produced_mags, {self.MAG1, self.MAG2})

        mag1_queries = sorted(self._query_ids(result, self.MAG1))
        self.assertEqual(mag1_queries, ["k141_100_0", "k141_100_1", "k141_200_0"])

        mag2_queries = self._query_ids(result, self.MAG2)
        self.assertEqual(mag2_queries, ["k141_300_0"])

    def test_aggregate_preserves_header_and_drops_footer(self):
        result = transfer_eggnog_annotations(self.contig_annotations, self.contig_map)

        mag1_fp = Path(result.annotation_dict()[self.MAG1])
        lines = mag1_fp.read_text().splitlines()

        # The column header follows the four '##' lines; the file must not
        # end with a '##' line.
        header_line = lines[4]
        last_line = lines[-1]
        self.assertIn("#query\tseed_ortholog\tevalue", header_line)
        self.assertFalse(last_line.startswith("##"))

    def test_aggregate_warns_on_unmatched_rows(self):
        # The partial map omits some annotated contigs: a warning is
        # expected and only MAG1 should be produced.
        with self.assertWarns(UserWarning):
            result = transfer_eggnog_annotations(
                self.contig_annotations, self.contig_map_partial
            )

        produced_mags = set(result.annotation_dict().keys())
        self.assertEqual(produced_mags, {self.MAG1})

    def test_aggregate_raises_when_nothing_matches(self):
        # The map references only a contig absent from the annotations.
        with self.assertRaisesRegex(ValueError, "No annotation rows could be"):
            transfer_eggnog_annotations(
                self.contig_annotations, self.contig_map_nomatch
            )
29 changes: 29 additions & 0 deletions q2_annotate/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,6 +1817,35 @@
citations=[],
)

# Register transfer_eggnog_annotations as a plugin method. It takes
# GenomeData[NOG] annotations plus a destination that is either
# FeatureData[MAG] or FeatureMap[MAGtoContigs], and outputs GenomeData[NOG].
plugin.methods.register_function(
    function=q2_annotate.eggnog.transfer_eggnog_annotations,
    name="Transfer or aggregate eggNOG annotations.",
    description=(
        "Transfers eggNOG ortholog annotations based on the destination "
        "type. A FeatureData[MAG] copies annotations for matching MAGs "
        "(e.g., after dereplication); a FeatureMap[MAGtoContigs] "
        "aggregates contig-level annotations into per-MAG files."
    ),
    # Inputs and their descriptions.
    inputs={
        "ortholog_annotations": GenomeData[NOG],
        "destination": FeatureData[MAG] | FeatureMap[MAGtoContigs],
    },
    input_descriptions={
        "ortholog_annotations": "Ortholog annotations to transfer or aggregate.",
        "destination": (
            "FeatureData[MAG] to subset annotations, or "
            "FeatureMap[MAGtoContigs] to aggregate contig-level annotations."
        ),
    },
    # The action takes no parameters.
    parameters={},
    parameter_descriptions={},
    # Single output artifact.
    outputs=[("transferred_annotations", GenomeData[NOG])],
    output_descriptions={
        "transferred_annotations": "Transferred or aggregated annotations.",
    },
    citations=[],
)

multiply_input_descriptions = {
"table1": "First feature table.",
"table2": "Second feature table with matching dimension.",
Expand Down
Loading