Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions hvantk/commands/make_table_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,11 +661,18 @@ def mktable_eqtl(
)
@click.option("--tissue", type=str, default=None, help="Restrict to this tissue")
@click.option(
"--gene-map-ht",
"--hgnc-ht",
type=str,
default=None,
help="Gene mapping HT (gene_id key, gene_name field) for "
"symbol → Ensembl ID conversion",
help="HGNC Hail Table (built by 'hvantk mktable hgnc-gene') for "
"gene symbol → Ensembl ID mapping. Required unless --no-gene-map.",
)
@click.option(
"--no-gene-map",
is_flag=True,
default=False,
help="Skip Ensembl mapping — key by raw gene symbol. "
"The table will NOT join with eQTL tables in cascade analysis.",
)
@click.option(
"--p-threshold",
Expand All @@ -681,7 +688,8 @@ def mktable_pqtl(
ref_genome: str,
source: str,
tissue: str,
gene_map_ht: str,
hgnc_ht: str,
no_gene_map: bool,
p_threshold: float,
):
"""Build a pQTL Hail Table (keyed by locus, alleles, gene_id)."""
Expand All @@ -692,7 +700,8 @@ def mktable_pqtl(
reference_genome=ref_genome,
source=source,
tissue=tissue,
gene_map_ht=gene_map_ht,
hgnc_ht=hgnc_ht,
no_gene_map=no_gene_map,
p_threshold=p_threshold,
overwrite=overwrite,
export_tsv=export_tsv,
Expand Down
77 changes: 54 additions & 23 deletions hvantk/tables/table_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -1764,7 +1764,8 @@ def create_pqtl_tb(
reference_genome: str = "GRCh38",
source: str = "gtex_fang",
tissue: Optional[str] = None,
gene_map_ht: Optional[str] = None,
hgnc_ht: Optional[str] = None,
no_gene_map: bool = False,
p_threshold: Optional[float] = None,
overwrite: bool = False,
export_tsv: bool = False,
Expand All @@ -1776,11 +1777,14 @@ def create_pqtl_tb(
(space-delimited gzip, TMT mass spectrometry, 5 tissues).
SE is derived as ``|BETA / STAT|`` (Fang files lack an SE column).

If *gene_map_ht* is provided, gene symbols are mapped to Ensembl
gene IDs via a Hail Table join. The mapping table should be keyed by
``gene_id`` and have a ``gene_name`` field (e.g. the Ensembl gene
table built by ``create_ensembl_gene_tb``). Without it, the original
``gene_name`` is used as ``gene_id``.
Gene symbols are mapped to Ensembl gene IDs via the HGNC table
and :class:`~hvantk.data.gene_mapper.GeneMapper`. This is
**required** because the cascade join uses ``(locus, alleles,
gene_id)`` with Ensembl IDs on the eQTL side; raw gene symbols
would produce zero matches.

Pass ``no_gene_map=True`` to opt out of mapping for non-cascade
use cases (the table will be keyed by raw gene symbol).

Parameters
----------
Expand All @@ -1794,13 +1798,21 @@ def create_pqtl_tb(
Data-source identifier. Only ``gtex_fang`` is currently supported.
tissue : str, optional
Restrict to this tissue.
gene_map_ht : str, optional
Path to gene-mapping Hail Table (``gene_id`` key, ``gene_name``
field).
hgnc_ht : str, optional
Path to HGNC Hail Table (built by ``create_hgnc_gene_tb``).
Required unless ``no_gene_map=True``.
no_gene_map : bool
Skip Ensembl mapping and key by raw gene symbol. The resulting
table will **not** join with eQTL tables in cascade analysis.
p_threshold : float, optional
P-value cutoff. ``None`` keeps all pairs (for coloc).
overwrite, export_tsv, fields
Standard builder parameters.

Raises
------
ValueError
If ``hgnc_ht`` is not provided and ``no_gene_map`` is ``False``.
"""
from hvantk.qtlcascade.constants import PQTL_SOURCES

Expand All @@ -1812,6 +1824,14 @@ def create_pqtl_tb(
"Only 'gtex_fang' (Fang et al. 2025) is currently supported."
)

if not hgnc_ht and not no_gene_map:
raise ValueError(
"Ensembl gene mapping is required for cascade-compatible pQTL "
"tables. Provide --hgnc-ht <path> (HGNC Hail Table built by "
"'hvantk mktable hgnc-gene'). If you intentionally want a "
"symbol-keyed table for non-cascade use, pass --no-gene-map."
)

def import_func():
return _import_pqtl_gtex_fang(input_path, tissue)

Expand All @@ -1822,25 +1842,36 @@ def transform(ht):
ht = ht.annotate(se=hl.abs(ht.beta / ht.stat))
ht = ht.drop("stat", "variant_id")

# Gene-symbol → Ensembl-ID mapping (prototype lesson #2:
# use Hail Table join, NOT hl.literal, to avoid IR poisoning).
if gene_map_ht:
logger.info("Mapping gene symbols → Ensembl IDs via %s", gene_map_ht)
gm = hl.read_table(gene_map_ht)
rev = (
gm.key_by()
.select("gene_id", "gene_name")
.key_by("gene_name")
.distinct()
# Gene-symbol → Ensembl-ID mapping via GeneMapper
# (prototype lesson #2: use Hail Table join, NOT hl.literal,
# to avoid IR poisoning).
if hgnc_ht:
from hvantk.data.gene_mapper import GeneMapper

logger.info(
"Mapping gene symbols → Ensembl IDs via GeneMapper (%s)",
hgnc_ht,
)
hgnc_table = hl.read_table(hgnc_ht)
mapper = GeneMapper(hgnc_table)
ht = mapper.annotate_table(
ht,
source_field="gene_symbol",
source_type="gene_symbol",
fields_to_add=["ensembl_gene_id"],
)
ht = ht.annotate(
gene_id=hl.or_else(rev[ht.gene_symbol].gene_id, ht.gene_symbol),
gene_id=hl.or_else(
ht.hgnc_ensembl_gene_id, ht.gene_symbol
),
)
ht = ht.drop("hgnc_ensembl_gene_id")
else:
# no_gene_map=True path
logger.warning(
"No gene_map_ht provided — using gene symbols as gene_id. "
"pQTL table will NOT join correctly with eQTL tables in "
"cascade analysis (eQTL uses Ensembl IDs)."
"--no-gene-map: using gene symbols as gene_id. "
"This table will NOT join with eQTL tables in cascade "
"analysis."
)
ht = ht.annotate(gene_id=ht.gene_symbol)

Expand Down
53 changes: 47 additions & 6 deletions hvantk/tests/test_cli_mktable_qtl.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,37 @@ def test_mktable_eqtl_cli_with_options():
)


def test_mktable_pqtl_cli_default():
def test_mktable_pqtl_cli_requires_hgnc_or_opt_out():
"""pQTL builder must fail when neither --hgnc-ht nor --no-gene-map is given."""
runner = CliRunner()
error = ValueError(
"Ensembl gene mapping is required for cascade-compatible pQTL tables."
)
with patch(
"hvantk.commands.make_table_cli._create_pqtl_tb", side_effect=error
):
result = runner.invoke(
mktable_group,
[
"pqtl",
"--raw-input",
"/data/fang_pqtl/",
"--output-ht",
"/out/pqtl.ht",
],
)
# Should fail because hgnc_ht is required (no_gene_map defaults to False)
assert result.exit_code != 0, result.output
assert "Ensembl gene mapping is required" in str(result.exception)


def test_mktable_pqtl_cli_no_gene_map():
"""--no-gene-map lets the user skip Ensembl mapping explicitly."""
runner = CliRunner()
mock_ht = MagicMock()
with patch("hvantk.commands.make_table_cli._create_pqtl_tb", return_value=mock_ht):
with patch(
"hvantk.commands.make_table_cli._create_pqtl_tb", return_value=mock_ht
) as mock_create:
result = runner.invoke(
mktable_group,
[
Expand All @@ -74,13 +101,26 @@ def test_mktable_pqtl_cli_default():
"/data/fang_pqtl/",
"--output-ht",
"/out/pqtl.ht",
"--no-gene-map",
],
)
assert result.exit_code == 0, result.output
assert "pQTL table created" in result.output
mock_create.assert_called_once_with(
input_path="/data/fang_pqtl/",
output_path="/out/pqtl.ht",
reference_genome="GRCh38",
source="gtex_fang",
tissue=None,
hgnc_ht=None,
no_gene_map=True,
p_threshold=None,
overwrite=False,
export_tsv=False,
)


def test_mktable_pqtl_cli_with_gene_map():
def test_mktable_pqtl_cli_with_hgnc_ht():
runner = CliRunner()
mock_ht = MagicMock()
with patch(
Expand All @@ -96,8 +136,8 @@ def test_mktable_pqtl_cli_with_gene_map():
"/out/pqtl_liver.ht",
"--tissue",
"Liver",
"--gene-map-ht",
"/data/ensembl_gene.ht",
"--hgnc-ht",
"/data/hgnc.ht",
"--overwrite",
],
)
Expand All @@ -108,7 +148,8 @@ def test_mktable_pqtl_cli_with_gene_map():
reference_genome="GRCh38",
source="gtex_fang",
tissue="Liver",
gene_map_ht="/data/ensembl_gene.ht",
hgnc_ht="/data/hgnc.ht",
no_gene_map=False,
p_threshold=None,
overwrite=True,
export_tsv=False,
Expand Down
Loading