Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,24 @@ _Note: In version 3.0 we switched to using YFull (v10.01) for the underlying tre

Yleaf -bam file.bam -o bam_output --reference_genome hg19 -dh -p

### Using custom reference genomes

You can specify custom reference genomes instead of using the default downloaded ones:

Yleaf -bam file.bam -o bam_output -rg hg19 -fg /path/to/full_genome.fa -yr /path/to/chrY.fa

Where:
- `-fg` or `--full_genome_reference` specifies the path to a custom full genome reference file
- `-yr` or `--y_chromosome_reference` specifies the path to a custom Y chromosome reference file

Both references must be in FASTA format (`.fa`, `.fasta`, or `.fna`).

### Extracting Y chromosome from a reference genome

If you have a full genome reference but need to extract just the Y chromosome, use the included extraction tool:

python -m yleaf.extract_y_chromosome -i /path/to/full_genome.fa -o /path/to/output_chrY.fa

## Additional information

For a more comprehensive manual please have a look at the [yleaf_manual](yleaf_manual.pdf).
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def get_version():
'console_scripts': [
'Yleaf=yleaf.Yleaf:main',
'predict_haplogroup=yleaf.predict_haplogroup:main',
'extract_y_chromosome=yleaf.extract_y_chromosome:main',
],
},
include_package_data=True,
Expand Down
71 changes: 61 additions & 10 deletions yleaf/Yleaf.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,11 @@ def main():

LOG.info(f"Running Yleaf with command: {' '.join(sys.argv)}")

# Validate custom references if provided
validate_custom_references(args)

# make sure the reference genome is present before doing something else, if not present it is downloaded
check_reference(args.reference_genome)
check_reference(args)

if args.fastq:
main_fastq(args, out_folder)
Expand Down Expand Up @@ -430,6 +433,10 @@ def get_arguments() -> argparse.Namespace:
" will be used instead as reference or the location will be used to download the "
"reference if those files are missing or empty.",
choices=[yleaf_constants.HG19, yleaf_constants.HG38], required=True)
parser.add_argument("-fg", "--full_genome_reference", required=False, type=check_file,
help="Custom full genome reference fasta file (.fa, .fasta, or .fna)", metavar="PATH")
parser.add_argument("-yr", "--y_chromosome_reference", required=False, type=check_file,
help="Custom Y chromosome reference fasta file (.fa, .fasta, or .fna)", metavar="PATH")
parser.add_argument("-o", "--output", required=True,
help="Folder name containing outputs", metavar="STRING")
parser.add_argument("-r", "--reads_treshold",
Expand Down Expand Up @@ -545,7 +552,7 @@ def main_fastq(
):
files = get_files_with_extension(args.fastq, '.fastq')
files += get_files_with_extension(args.fastq, '.fastq.gz')
reference = get_reference_path(args.reference_genome, True)
reference = get_reference_path(args, True)
bam_folder = out_folder / yleaf_constants.FASTQ_BAM_FILE_FOLDER
try:
os.mkdir(bam_folder)
Expand Down Expand Up @@ -657,27 +664,44 @@ def get_files_with_extension(


def check_reference(
requested_version: str,
args: argparse.Namespace,
):
reference_file = get_reference_path(requested_version, True)
# Skip reference check if custom full genome reference is provided
if args.full_genome_reference:
return

# Use default reference path
mock_args = argparse.Namespace(
reference_genome=args.reference_genome,
full_genome_reference=None,
y_chromosome_reference=None
)
reference_file = get_reference_path(mock_args, True)
if os.path.getsize(reference_file) < 100:
LOG.info(f"No reference genome version was found. Downloading the {requested_version} reference genome. This "
LOG.info(f"No reference genome version was found. Downloading the {args.reference_genome} reference genome. This "
f"should be a one time thing.")
download_reference.main(requested_version)
download_reference.main(args.reference_genome)
LOG.info("Finished downloading the reference genome.")


def get_reference_path(
requested_version: str,
args: argparse.Namespace,
is_full: bool
) -> Union[Path, None]:
# First check if custom references were provided via command line
if is_full and args.full_genome_reference:
return args.full_genome_reference
elif not is_full and args.y_chromosome_reference:
return args.y_chromosome_reference

# Fall back to default references
if is_full:
if requested_version == yleaf_constants.HG19:
if args.reference_genome == yleaf_constants.HG19:
reference_file = yleaf_constants.HG19_FULL_GENOME
else:
reference_file = yleaf_constants.HG38_FULL_GENOME
else:
if requested_version == yleaf_constants.HG19:
if args.reference_genome == yleaf_constants.HG19:
reference_file = yleaf_constants.HG19_Y_CHROMOSOME
else:
reference_file = yleaf_constants.HG38_Y_CHROMOSOME
Expand Down Expand Up @@ -1052,7 +1076,7 @@ def find_private_mutations(
):
# identify mutations not part of haplogroups that are annotated in dbsnp or differ from the reference genome
LOG.debug("Starting with extracting private mutations...")
snp_reference_file = get_reference_path(args.reference_genome, False)
snp_reference_file = get_reference_path(args, False)
snp_database_file = yleaf_constants.DATA_FOLDER / args.reference_genome / yleaf_constants.SNP_DATA_FILE

# run mpileup
Expand Down Expand Up @@ -1211,5 +1235,32 @@ def draw_haplogroups(
haplogroup_tree_image.main(namespace)


def validate_custom_references(
args: argparse.Namespace
):
"""Validate custom reference files if provided."""
if args.full_genome_reference:
if not os.path.exists(args.full_genome_reference):
LOG.error(f"Custom full genome reference file {args.full_genome_reference} not found.")
raise ValueError(f"Custom full genome reference file {args.full_genome_reference} not found.")

if args.full_genome_reference.suffix not in [".fa", ".fasta", ".fna"]:
LOG.error(f"Custom full genome reference file must be in FASTA format (.fa, .fasta, or .fna).")
raise ValueError(f"Custom full genome reference file must be in FASTA format (.fa, .fasta, or .fna).")

LOG.info(f"Using custom full genome reference: {args.full_genome_reference}")

if args.y_chromosome_reference:
if not os.path.exists(args.y_chromosome_reference):
LOG.error(f"Custom Y chromosome reference file {args.y_chromosome_reference} not found.")
raise ValueError(f"Custom Y chromosome reference file {args.y_chromosome_reference} not found.")

if args.y_chromosome_reference.suffix not in [".fa", ".fasta", ".fna"]:
LOG.error(f"Custom Y chromosome reference file must be in FASTA format (.fa, .fasta, or .fna).")
raise ValueError(f"Custom Y chromosome reference file must be in FASTA format (.fa, .fasta, or .fna).")

LOG.info(f"Using custom Y chromosome reference: {args.y_chromosome_reference}")


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion yleaf/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.2.1"
__version__ = "3.3.0"
81 changes: 81 additions & 0 deletions yleaf/extract_y_chromosome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python
"""
Script for extracting Y chromosome data from a full reference genome.

Developed for Yleaf by Alaina Hardie, @trianglegrrl

License: GNU General Public License v3 or later
A copy of GNU GPL v3 should have been included in this software package in LICENSE.txt.
"""

import argparse
import logging
import sys
from pathlib import Path

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
LOG = logging.getLogger(__name__)


def extract_y_chromosome(input_file: Path, output_file: Path):
"""
Extract Y chromosome data from a full reference genome.

Args:
input_file: Path to the full reference genome file
output_file: Path where the Y chromosome data will be saved
"""
LOG.info(f"Extracting Y chromosome from {input_file}")

with open(output_file, "w") as fo:
with open(input_file) as fi:
record = False
y_chrom_found = False

for line in fi:
if line.startswith(">chrY") or line.startswith(">Y"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the new T2T reference starts with >CP086569.2. Please check if there are other nomenclatures for Y chromosomal record IDs.

Copy link
Author

@trianglegrrl trianglegrrl Apr 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dionzand ... I checked a couple of T2T references (hg002v1.1.mat_Y_EBV_MT.fasta.gz, and chm13v2.0_maskedY_rCRS.fa.gz) and they use chrY for the Y. Is there a different version I should be checking? I'm not super familiar with T2T so I might be missing something, but this is how I got what I got:

╰─ zgrep "^>" hg002v1.1.mat_Y_EBV_MT.fasta.gz | cut -d ' ' -f 1 | sed 's/>//'
[... other chromosome labels]
chrY_PATERNAL


╰─ zgrep "^>" chm13v2.0_maskedY_rCRS.fa.gz | cut -d ' ' -f 1 | sed 's/>//'
[... other chromosome labels]
chrY

record = True
y_chrom_found = True
LOG.info("Found Y chromosome")
fo.write(line)
elif record:
if line.startswith(">"):
break
fo.write(line)

if not y_chrom_found:
LOG.error("No Y chromosome found in the reference file!")
return False

LOG.info(f"Y chromosome successfully extracted to {output_file}")
return True


def main():
parser = argparse.ArgumentParser(description="Extract Y chromosome from a reference genome")
parser.add_argument("-i", "--input", required=True, type=str,
help="Input reference genome file (.fa, .fasta, or .fna)")
parser.add_argument("-o", "--output", required=True, type=str,
help="Output file for Y chromosome (.fa, .fasta, or .fna)")
args = parser.parse_args()

input_file = Path(args.input)
output_file = Path(args.output)

if not input_file.exists():
LOG.error(f"Input file {input_file} does not exist")
sys.exit(1)

valid_extensions = ['.fa', '.fasta', '.fna']
if input_file.suffix.lower() not in valid_extensions:
LOG.warning(f"Input file extension {input_file.suffix} is not a standard FASTA extension (.fa, .fasta, or .fna)")

output_file.parent.mkdir(parents=True, exist_ok=True)

# Extract Y chromosome
if not extract_y_chromosome(input_file, output_file):
sys.exit(1)


if __name__ == "__main__":
main()