diff --git a/README.md b/README.md index f9027bd..b2bd80b 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,24 @@ _Note: In version 3.0 we switched to using YFull (v10.01) for the underlying tre Yleaf -bam file.bam -o bam_output --reference_genome hg19 -dh -p +### Using custom reference genomes + +You can specify custom reference genomes instead of using the default downloaded ones: + + Yleaf -bam file.bam -o bam_output -rg hg19 -fg /path/to/full_genome.fa -yr /path/to/chrY.fa + +Where: +- `-fg` or `--full_genome_reference` specifies the path to a custom full genome reference file +- `-yr` or `--y_chromosome_reference` specifies the path to a custom Y chromosome reference file + +Both references must be in FASTA format (`.fa`, `.fasta`, or `.fna`). + +### Extracting Y chromosome from a reference genome + +If you have a full genome reference but need to extract just the Y chromosome, use the included extraction tool: + + python -m yleaf.extract_y_chromosome -i /path/to/full_genome.fa -o /path/to/output_chrY.fa + ## Additional information For a more comprehensive manual please have a look at the [yleaf_manual](yleaf_manual.pdf). diff --git a/setup.py b/setup.py index d3cbbfb..5b7c07e 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ def get_version(): 'console_scripts': [ 'Yleaf=yleaf.Yleaf:main', 'predict_haplogroup=yleaf.predict_haplogroup:main', + 'extract_y_chromosome=yleaf.extract_y_chromosome:main', ], }, include_package_data=True, diff --git a/yleaf/Yleaf.py b/yleaf/Yleaf.py index 39b8659..9e5f1c2 100644 --- a/yleaf/Yleaf.py +++ b/yleaf/Yleaf.py @@ -369,8 +369,11 @@ def main(): LOG.info(f"Running Yleaf with command: {' '.join(sys.argv)}") + # Validate custom references if provided + validate_custom_references(args) + # make sure the reference genome is present before doing something else, if not present it is downloaded - check_reference(args.reference_genome) + check_reference(args) if args.fastq: main_fastq(args, out_folder) @@ -430,6 +433,10 @@ def get_arguments() -> argparse.Namespace: " will be used instead as reference or the location will be used to download the " "reference if those files are missing or empty.", choices=[yleaf_constants.HG19, yleaf_constants.HG38], required=True) + parser.add_argument("-fg", "--full_genome_reference", required=False, type=check_file, + help="Custom full genome reference fasta file (.fa, .fasta, or .fna)", metavar="PATH") + parser.add_argument("-yr", "--y_chromosome_reference", required=False, type=check_file, + help="Custom Y chromosome reference fasta file (.fa, .fasta, or .fna)", metavar="PATH") parser.add_argument("-o", "--output", required=True, help="Folder name containing outputs", metavar="STRING") parser.add_argument("-r", "--reads_treshold", @@ -545,7 +552,7 @@ def main_fastq( ): files = get_files_with_extension(args.fastq, '.fastq') files += get_files_with_extension(args.fastq, '.fastq.gz') - reference = get_reference_path(args.reference_genome, True) + reference = get_reference_path(args, True) bam_folder = out_folder / yleaf_constants.FASTQ_BAM_FILE_FOLDER try: os.mkdir(bam_folder) @@ -657,27 +664,44 @@ def get_files_with_extension( def check_reference( - requested_version: str, + args: argparse.Namespace, ): - reference_file = get_reference_path(requested_version, True) + # Skip reference check if custom full genome reference is provided + if args.full_genome_reference: + return + + # Use default reference path + mock_args = argparse.Namespace( + reference_genome=args.reference_genome, + full_genome_reference=None, + y_chromosome_reference=None + ) + reference_file = get_reference_path(mock_args, True) if os.path.getsize(reference_file) < 100: - LOG.info(f"No reference genome version was found. Downloading the {requested_version} reference genome. This " + LOG.info(f"No reference genome version was found. Downloading the {args.reference_genome} reference genome. This " f"should be a one time thing.") - download_reference.main(requested_version) + download_reference.main(args.reference_genome) LOG.info("Finished downloading the reference genome.") def get_reference_path( - requested_version: str, + args: argparse.Namespace, is_full: bool ) -> Union[Path, None]: + # First check if custom references were provided via command line + if is_full and args.full_genome_reference: + return args.full_genome_reference + elif not is_full and args.y_chromosome_reference: + return args.y_chromosome_reference + + # Fall back to default references if is_full: - if requested_version == yleaf_constants.HG19: + if args.reference_genome == yleaf_constants.HG19: reference_file = yleaf_constants.HG19_FULL_GENOME else: reference_file = yleaf_constants.HG38_FULL_GENOME else: - if requested_version == yleaf_constants.HG19: + if args.reference_genome == yleaf_constants.HG19: reference_file = yleaf_constants.HG19_Y_CHROMOSOME else: reference_file = yleaf_constants.HG38_Y_CHROMOSOME @@ -1052,7 +1076,7 @@ def find_private_mutations( ): # identify mutations not part of haplogroups that are annotated in dbsnp or differ from the reference genome LOG.debug("Starting with extracting private mutations...") - snp_reference_file = get_reference_path(args.reference_genome, False) + snp_reference_file = get_reference_path(args, False) snp_database_file = yleaf_constants.DATA_FOLDER / args.reference_genome / yleaf_constants.SNP_DATA_FILE # run mpileup @@ -1211,5 +1235,32 @@ def draw_haplogroups( haplogroup_tree_image.main(namespace) +def validate_custom_references( + args: argparse.Namespace +): + """Validate custom reference files if provided.""" + if args.full_genome_reference: + if not os.path.exists(args.full_genome_reference): + LOG.error(f"Custom full genome reference file {args.full_genome_reference} not found.") + raise ValueError(f"Custom full genome reference file {args.full_genome_reference} not found.") + + if args.full_genome_reference.suffix not in [".fa", ".fasta", ".fna"]: + LOG.error(f"Custom full genome reference file must be in FASTA format (.fa, .fasta, or .fna).") + raise ValueError(f"Custom full genome reference file must be in FASTA format (.fa, .fasta, or .fna).") + + LOG.info(f"Using custom full genome reference: {args.full_genome_reference}") + + if args.y_chromosome_reference: + if not os.path.exists(args.y_chromosome_reference): + LOG.error(f"Custom Y chromosome reference file {args.y_chromosome_reference} not found.") + raise ValueError(f"Custom Y chromosome reference file {args.y_chromosome_reference} not found.") + + if args.y_chromosome_reference.suffix not in [".fa", ".fasta", ".fna"]: + LOG.error(f"Custom Y chromosome reference file must be in FASTA format (.fa, .fasta, or .fna).") + raise ValueError(f"Custom Y chromosome reference file must be in FASTA format (.fa, .fasta, or .fna).") + + LOG.info(f"Using custom Y chromosome reference: {args.y_chromosome_reference}") + + if __name__ == "__main__": main() diff --git a/yleaf/__init__.py b/yleaf/__init__.py index 1da6a55..88c513e 100644 --- a/yleaf/__init__.py +++ b/yleaf/__init__.py @@ -1 +1 @@ -__version__ = "3.2.1" +__version__ = "3.3.0" diff --git a/yleaf/extract_y_chromosome.py b/yleaf/extract_y_chromosome.py new file mode 100644 index 0000000..71c4d46 --- /dev/null +++ b/yleaf/extract_y_chromosome.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +""" +Script for extracting Y chromosome data from a full reference genome. + +Developed for Yleaf by Alaina Hardie, @trianglegrrl + +License: GNU General Public License v3 or later +A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt. +""" + +import argparse +import logging +import sys +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') +LOG = logging.getLogger(__name__) + + +def extract_y_chromosome(input_file: Path, output_file: Path): + """ + Extract Y chromosome data from a full reference genome. + + Args: + input_file: Path to the full reference genome file + output_file: Path where the Y chromosome data will be saved + """ + LOG.info(f"Extracting Y chromosome from {input_file}") + + with open(output_file, "w") as fo: + with open(input_file) as fi: + record = False + y_chrom_found = False + + for line in fi: + if line.startswith(">chrY") or line.startswith(">Y"): + record = True + y_chrom_found = True + LOG.info("Found Y chromosome") + fo.write(line) + elif record: + if line.startswith(">"): + break + fo.write(line) + + if not y_chrom_found: + LOG.error("No Y chromosome found in the reference file!") + return False + + LOG.info(f"Y chromosome successfully extracted to {output_file}") + return True + + +def main(): + parser = argparse.ArgumentParser(description="Extract Y chromosome from a reference genome") + parser.add_argument("-i", "--input", required=True, type=str, + help="Input reference genome file (.fa, .fasta, or .fna)") + parser.add_argument("-o", "--output", required=True, type=str, + help="Output file for Y chromosome (.fa, .fasta, or .fna)") + args = parser.parse_args() + + input_file = Path(args.input) + output_file = Path(args.output) + + if not input_file.exists(): + LOG.error(f"Input file {input_file} does not exist") + sys.exit(1) + + valid_extensions = ['.fa', '.fasta', '.fna'] + if input_file.suffix.lower() not in valid_extensions: + LOG.warning(f"Input file extension {input_file.suffix} is not a standard FASTA extension (.fa, .fasta, or .fna)") + + output_file.parent.mkdir(parents=True, exist_ok=True) + + # Extract Y chromosome + if not extract_y_chromosome(input_file, output_file): + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file