-
Notifications
You must be signed in to change notification settings - Fork 13
Add command-line options for custom reference genomes #32
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1 @@ | ||
| __version__ = "3.2.1" | ||
| __version__ = "3.3.0" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| #!/usr/bin/env python | ||
| """ | ||
| Script for extracting Y chromosome data from a full reference genome. | ||
|
|
||
| Developed for Yleaf by Alaina Hardie, @trianglegrrl | ||
|
|
||
| License: GNU General Public License v3 or later | ||
| A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt. | ||
| """ | ||
|
|
||
| import argparse | ||
| import logging | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
# Script-wide logging: INFO and above, rendered as "LEVEL: message".
_LOG_FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
LOG = logging.getLogger(__name__)
|
|
||
|
|
||
# FASTA header prefixes that identify the Y chromosome record.
# ">chrY" and ">Y" are the original spellings and are kept as *prefix*
# matches so that suffixed names (e.g. ">chrY_PATERNAL") are still found.
# The accession prefixes cover references whose records are named by
# accession rather than by chromosome: CP086569 (T2T chrY, raised in review
# as ">CP086569.2") and NC_000024 (RefSeq chrY of GRCh37/GRCh38).
# NOTE(review): extend this tuple if other Y record nomenclatures turn up.
_Y_HEADER_PREFIXES = (">chrY", ">Y", ">CP086569", ">NC_000024")


def extract_y_chromosome(input_file: Path, output_file: Path) -> bool:
    """
    Extract Y chromosome data from a full reference genome.

    The input is scanned line by line for the first FASTA header starting
    with one of ``_Y_HEADER_PREFIXES``. That header and every following line
    up to (not including) the next header are copied to ``output_file``.

    The output file is only opened once a Y record has been found, so a
    failed extraction neither creates an empty file nor truncates an
    existing one.

    Args:
        input_file: Path to the full reference genome file
        output_file: Path where the Y chromosome data will be saved

    Returns:
        True if a Y chromosome record was found and written, False otherwise
        (no matching header, or input and output are the same file).

    Raises:
        OSError: if the input cannot be read or the output cannot be written.
    """
    input_path = Path(input_file)
    output_path = Path(output_file)

    LOG.info(f"Extracting Y chromosome from {input_file}")

    # Opening the output with "w" truncates it; refuse to do that to the
    # very file we are about to read from.
    if output_path.exists() and output_path.samefile(input_path):
        LOG.error("Input and output are the same file; refusing to overwrite the reference")
        return False

    with open(input_path) as fi:
        # Phase 1: skip ahead to the Y chromosome header.
        for line in fi:
            if line.startswith(_Y_HEADER_PREFIXES):
                break
        else:
            # Loop ended without a break: no header matched (or empty file).
            LOG.error("No Y chromosome found in the reference file!")
            return False

        LOG.info("Found Y chromosome")

        # Phase 2: copy the header and its sequence lines; the next ">"
        # starts a different record, so stop there.
        with open(output_path, "w") as fo:
            fo.write(line)
            for line in fi:
                if line.startswith(">"):
                    break
                fo.write(line)

    LOG.info(f"Y chromosome successfully extracted to {output_file}")
    return True
|
|
||
|
|
||
def main(argv=None) -> None:
    """
    Command-line entry point: extract the Y chromosome from a reference FASTA.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default), argparse reads them from ``sys.argv``, so existing
            ``main()`` callers behave as before.

    Exits with status 1 if the input is missing / not a regular file, or if
    no Y chromosome record could be extracted.
    """
    parser = argparse.ArgumentParser(description="Extract Y chromosome from a reference genome")
    parser.add_argument("-i", "--input", required=True, type=Path,
                        help="Input reference genome file (.fa, .fasta, or .fna)")
    parser.add_argument("-o", "--output", required=True, type=Path,
                        help="Output file for Y chromosome (.fa, .fasta, or .fna)")
    args = parser.parse_args(argv)

    input_file = args.input
    output_file = args.output

    # is_file() rather than exists(): a directory "exists" but would make
    # the later open() fail with an uncaught traceback.
    if not input_file.is_file():
        LOG.error(f"Input file {input_file} does not exist or is not a regular file")
        sys.exit(1)

    # Only a warning: an unusual extension does not prevent extraction.
    valid_extensions = ('.fa', '.fasta', '.fna')
    if input_file.suffix.lower() not in valid_extensions:
        LOG.warning(f"Input file extension {input_file.suffix} is not a standard FASTA extension (.fa, .fasta, or .fna)")

    # Make sure the destination directory exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Extract Y chromosome
    if not extract_y_chromosome(input_file, output_file):
        sys.exit(1)
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() | ||
Uh oh!
There was an error while loading. Please reload this page.