-
Notifications
You must be signed in to change notification settings - Fork 21
Annotation transfer #349
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Annotation transfer #349
Changes from all commits
516f2f4
0844bcf
a62d5d8
0f06c50
6e35410
fe16f49
934aa06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,10 +5,16 @@ | |
| # | ||
| # The full license is in the file LICENSE, distributed with this software. | ||
| # ---------------------------------------------------------------------------- | ||
| import shutil | ||
| import subprocess | ||
| import warnings | ||
| from pathlib import Path | ||
| from typing import Union | ||
|
|
||
| import pandas as pd | ||
|
|
||
| from q2_types.feature_data_mag import MAGSequencesDirFmt | ||
| from q2_types.feature_map import MAGtoContigsDirFmt | ||
| from q2_types.genome_data import ( | ||
| OrthologAnnotationDirFmt, | ||
| Orthologs, | ||
|
|
@@ -168,7 +174,9 @@ def extract_annotations( | |
| annot_df = pd.read_csv( | ||
| fp, sep="\t", skiprows=4, index_col=0 | ||
| ) # skip the first 4 rows as they contain comments | ||
| annot_df = annot_df.iloc[:-3, :] # remove the last 3 comment rows | ||
| # strip trailing comment rows (footer) only if present | ||
| if annot_df.index[-3:].astype(str).str.startswith("##").all(): | ||
| annot_df = annot_df.iloc[:-3, :] | ||
| annot_df = _filter(annot_df, max_evalue, min_score) | ||
| annot_df = _extract_generic(annot_df, col, func) | ||
| annot_df.name = _id | ||
|
|
@@ -177,3 +185,126 @@ def extract_annotations( | |
| result = pd.concat(annotations, axis=1).fillna(0).T | ||
| result.index.name = "id" | ||
| return result | ||
|
|
||
|
|
||
| def _get_mag_ids_from_feature_data(mags: MAGSequencesDirFmt) -> set: | ||
| """Extract MAG UUIDs from a FeatureData[MAG] artifact.""" | ||
| return set(mags.feature_dict().keys()) | ||
|
|
||
|
|
||
| def _copy_annotation_files( | ||
| source_annotations: OrthologAnnotationDirFmt, | ||
| mag_ids: set, | ||
| result: OrthologAnnotationDirFmt, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should just make this method return |
||
| ): | ||
| """Copy annotation files from source to result for the given MAG IDs.""" | ||
| annotation_dict = source_annotations.annotation_dict() | ||
|
|
||
| matched_ids = mag_ids & set(annotation_dict.keys()) | ||
| if not matched_ids: | ||
| raise ValueError("No annotation files matched the destination MAG IDs.") | ||
|
Comment on lines
+203
to
+205
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this check should happen already before we call this method - it will remove the need to pass the |
||
|
|
||
| missing = mag_ids - set(annotation_dict.keys()) | ||
| if missing: | ||
| warnings.warn( | ||
| f"{len(missing)} MAG(s) in the destination had no matching " | ||
| f"annotation file in the source and will be skipped: " | ||
| f"{', '.join(sorted(missing))}", | ||
| UserWarning, | ||
| ) | ||
|
|
||
| for mag_id in matched_ids: | ||
| src_path = annotation_dict[mag_id] | ||
| shutil.copy2(src_path, str(result.path / Path(src_path).name)) | ||
|
|
||
|
|
||
| def _annotate_mags_from_contigs( | ||
| ortholog_annotations: OrthologAnnotationDirFmt, | ||
| contig_map: MAGtoContigsDirFmt, | ||
| ) -> OrthologAnnotationDirFmt: | ||
| """Aggregate contig-level eggNOG annotations -> MAG-level annotations.""" | ||
| # contig_map: {mag_uuid: [contig_id, ...]} | ||
| contig_map_dict = contig_map.file.view(dict) | ||
|
|
||
| # reverse map: contig_id -> mag_uuid | ||
| contig_to_mag = { | ||
| contig_id: mag_uuid | ||
| for mag_uuid, contig_ids in contig_map_dict.items() | ||
| for contig_id in contig_ids | ||
| } | ||
|
|
||
| # Read all annotation files into a DataFrame | ||
|
|
||
| frames = [] | ||
| for _id, fp in ortholog_annotations.annotation_dict().items(): | ||
| df = pd.read_csv(fp, sep="\t", skiprows=4) | ||
| # drop trailing comment only if present | ||
| first_col = df.columns[0] | ||
| df = df[~df[first_col].astype(str).str.startswith("##")] | ||
|
Comment on lines
+240
to
+243
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering whether you could achieve the same by simply reading in every file as OrthologFileFmt and
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we could use |
||
| frames.append(df) | ||
|
|
||
| all_annotations = pd.concat(frames, ignore_index=True) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to evaluate carefully whether this will work well when one has hundreds of samples with thousands of annotations. I'm a bit worried that the memory will blow up 😅 |
||
|
|
||
| # Rebuild the eggNOG column header line. | ||
| col_header = "\t".join(all_annotations.columns) + "\n" | ||
|
|
||
| # Strip ORF suffix (contig_id_1 -> contig_id) | ||
| query_col = all_annotations.columns[0] | ||
| all_annotations["mag_uuid"] = ( | ||
| all_annotations[query_col] | ||
| .str.replace(r"_\d+$", "", regex=True) | ||
| .map(contig_to_mag) | ||
| ) | ||
|
|
||
| matched = all_annotations.dropna(subset=["mag_uuid"]) | ||
| if matched.empty: | ||
| raise ValueError("No annotation rows could be matched to any MAG.") | ||
|
|
||
| unmatched = all_annotations["mag_uuid"].isna().sum() | ||
| if unmatched > 0: | ||
| total = len(all_annotations) | ||
| pct = unmatched / total * 100 | ||
| warnings.warn( | ||
| f"{unmatched} of {total} annotation row(s) ({pct:.1f}%) were on " | ||
| "contigs not present in the contig map (e.g. unbinned contigs) " | ||
| "and were skipped.", | ||
| UserWarning, | ||
| ) | ||
|
|
||
| result = OrthologAnnotationDirFmt() | ||
| for mag_uuid, group in matched.groupby("mag_uuid"): | ||
| out_fp = result.path / f"{mag_uuid}.emapper.annotations" | ||
| n_contigs = len(contig_map_dict.get(mag_uuid, [])) | ||
| n_rows = len(group) | ||
| with open(out_fp, "w") as fh: | ||
| fh.write("## Transferred using transfer_eggnog_annotations (q2-annotate)\n") | ||
| fh.write("## Source: contig-level annotations\n") | ||
| fh.write(f"## MAG: {mag_uuid} | contigs: {n_contigs} | rows: {n_rows}\n") | ||
| fh.write("##\n") | ||
| fh.write(col_header) | ||
| group.drop(columns=["mag_uuid"]).to_csv( | ||
| fh, sep="\t", index=False, header=False | ||
| ) | ||
|
|
||
| # Verbose-only summary | ||
| print( | ||
| f"Aggregated {len(matched)} of {len(all_annotations)} annotation " | ||
| f"row(s) into {matched['mag_uuid'].nunique()} MAG(s); " | ||
| f"{unmatched} row(s) skipped." | ||
| ) | ||
|
|
||
| return result | ||
|
|
||
|
|
||
| def transfer_eggnog_annotations( | ||
| ortholog_annotations: OrthologAnnotationDirFmt, | ||
| destination: Union[MAGSequencesDirFmt, MAGtoContigsDirFmt], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is confusing. The destination should be represented by the same kind of semantic type, either SampleData[MAGs] or FeatureData[MAG]. Now, you are mixing in the contig map. I think this should become an additional input required when SampleData[MAGs] were provided as source of the annotations or if FeatureData[MAG] was provided as the destination (whichever of those makes more sense for your pipeline). |
||
| ) -> OrthologAnnotationDirFmt: | ||
| """Transfer or aggregate eggNOG annotations based on the destination type.""" | ||
| if isinstance(destination, MAGSequencesDirFmt): | ||
| result = OrthologAnnotationDirFmt() | ||
| mag_ids = _get_mag_ids_from_feature_data(destination) | ||
| _copy_annotation_files(ortholog_annotations, mag_ids, result) | ||
| return result | ||
| else: | ||
| return _annotate_mags_from_contigs(ortholog_annotations, destination) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| ## Fri May 24 23:19:02 2024 | ||
| ## emapper-2.1.12 | ||
| ## emapper.py -m no_search --annotate_hits_table input --cpu 1 | ||
| ## | ||
| #query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs | ||
| k141_100_0 ortholog1 0.0 100.0 COG0001@1|root 1|root L some description geneA - 1.1.1.1 ko:K00001 - - - - ko00000 - - - PF00001 | ||
| k141_100_1 ortholog1 0.0 100.0 COG0001@1|root 1|root F some description geneA - 2.2.2.2 ko:K00002 - - - - ko00000 - - - PF00001 | ||
| k141_200_0 ortholog1 0.0 100.0 COG0001@1|root 1|root A some description geneA - 3.3.3.3 ko:K00003 - - - - ko00000 - - - PF00001 | ||
| k141_300_0 ortholog1 0.0 100.0 COG0001@1|root 1|root L some description geneA - 4.4.4.4 ko:K00004 - - - - ko00000 - - - PF00001 | ||
| ## 4 queries scanned | ||
| ## Total time (seconds): 1.0 | ||
| ## Rate: 4.00 q/s |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| >seq1 | ||
| ACGT |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| >seq2 | ||
| ACGT |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| { | ||
| "11111111-1111-4111-8111-111111111111": [ | ||
| "nonexistent_contig" | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| { | ||
| "11111111-1111-4111-8111-111111111111": [ | ||
| "k141_100", | ||
| "k141_200" | ||
| ] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| { | ||
| "11111111-1111-4111-8111-111111111111": [ | ||
| "k141_100", | ||
| "k141_200" | ||
| ], | ||
| "22222222-2222-4222-8222-222222222222": [ | ||
| "k141_300" | ||
| ] | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we really need this method? It literally does a single thing.