From e2708cc841afdce87c76007876323e57836e8ff4 Mon Sep 17 00:00:00 2001 From: anderdc Date: Mon, 8 Jun 2026 16:42:02 -0500 Subject: [PATCH] fix(validator): serialize TAO source-balance check under axon_lock handle_swap_reserve called provider.get_balance outside axon_lock with a comment claiming the source-chain RPC is a separate connection. That holds for a BTC source (Esplora/Maestro HTTP) but not for a TAO source: the subtensor provider's get_balance runs on the shared axon_subtensor websocket that axon_lock exists to serialize. Every TAO->BTC reserve raced the lock-protected readers, causing recurring 'cannot call recv while another thread is already running recv' errors. Mark substrate-backed providers with uses_substrate and gate the balance check on it: serialize the TAO read under axon_lock, keep BTC's HTTP read lock-free so a slow Esplora call doesn't stall the forward loop. --- allways/chain_providers/base.py | 5 +++ allways/chain_providers/subtensor.py | 3 ++ allways/validator/axon_handlers.py | 11 +++-- tests/test_axon_handlers.py | 65 ++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 3 deletions(-) diff --git a/allways/chain_providers/base.py b/allways/chain_providers/base.py index af5292df..9c0fd12b 100644 --- a/allways/chain_providers/base.py +++ b/allways/chain_providers/base.py @@ -31,6 +31,11 @@ class ChainProvider(ABC): 3. Add {ENV_PREFIX}_* vars to .env """ + # True if this provider's RPCs hit the shared substrate websocket, so + # callers must serialise them under axon_lock. HTTP-backed providers leave + # this False and stay lock-free. + uses_substrate: bool = False + @abstractmethod def get_chain(self) -> ChainDefinition: ... diff --git a/allways/chain_providers/subtensor.py b/allways/chain_providers/subtensor.py index cdfa9989..38e37070 100644 --- a/allways/chain_providers/subtensor.py +++ b/allways/chain_providers/subtensor.py @@ -20,6 +20,9 @@ class SubtensorProvider(ChainProvider): clear error if they attempt to send. """ + # RPCs run on the shared substrate websocket — callers serialise via axon_lock. + uses_substrate = True + # Balances pallet index and transfer call indices on Subtensor _BALANCES_PALLET = 5 _TRANSFER_CALLS = {0: 'transfer_allow_death', 3: 'transfer_keep_alive', 7: 'transfer_all'} diff --git a/allways/validator/axon_handlers.py b/allways/validator/axon_handlers.py index 302a2bba..e064bbfc 100644 --- a/allways/validator/axon_handlers.py +++ b/allways/validator/axon_handlers.py @@ -325,9 +325,14 @@ async def handle_swap_reserve( reject_synapse(synapse, 'Invalid source address proof', ctx) return synapse - # Source-chain RPC — separate connection from substrate, so it doesn't - # need axon_lock and shouldn't block the substrate websocket. - balance = provider.get_balance(synapse.from_address) + # A TAO source reads balance over the shared substrate websocket, so it + # must serialise under axon_lock; a BTC source is HTTP and stays lock-free + # to avoid stalling the forward loop behind a slow Esplora call. + if provider.uses_substrate: + with validator.axon_lock: + balance = provider.get_balance(synapse.from_address) + else: + balance = provider.get_balance(synapse.from_address) if balance < synapse.from_amount: reject_synapse(synapse, 'Insufficient source balance', ctx) return synapse diff --git a/tests/test_axon_handlers.py b/tests/test_axon_handlers.py index e460da5c..df18de1e 100644 --- a/tests/test_axon_handlers.py +++ b/tests/test_axon_handlers.py @@ -889,6 +889,71 @@ def test_handle_swap_reserve_rejects_sentinel_rate(self): validator.axon_contract_client.vote_reserve.assert_not_called() +class TestSourceBalanceLock: + """The source-balance check must serialise on axon_lock for a substrate + source (TAO) but stay lock-free for an HTTP source (BTC) — otherwise the + TAO get_balance races the lock-protected readers and trips the substrate + `cannot call recv while another thread is already running recv` error.""" + + def test_provider_uses_substrate_flags(self): + """TAO provider hits the shared websocket; BTC is HTTP and lock-free.""" + from allways.chain_providers.base import ChainProvider + from allways.chain_providers.bitcoin import BitcoinProvider + from allways.chain_providers.subtensor import SubtensorProvider + + assert ChainProvider.uses_substrate is False + assert SubtensorProvider.uses_substrate is True + assert BitcoinProvider.uses_substrate is False + + def test_tao_source_balance_check_holds_axon_lock(self): + """A TAO-sourced reserve must acquire axon_lock around get_balance.""" + validator = make_reserve_validator() + lock = validator.axon_lock + + tao = MagicMock() + tao.uses_substrate = True + tao.verify_from_proof.return_value = True + # Record whether the lock is held at the moment get_balance runs. + held = {} + + def _get_balance(_addr): + held['locked'] = not lock.acquire(blocking=False) + if not held['locked']: + lock.release() + return 10**18 + + tao.get_balance.side_effect = _get_balance + validator.axon_chain_providers = {'tao': tao, 'btc': MagicMock()} + + commitment = make_commitment(from_chain='tao', to_chain='btc') + synapse = make_reserve_synapse(from_chain='tao', to_chain='btc', from_address='5user') + run_reserve_handler(validator, synapse, commitment=commitment) + + assert held.get('locked') is True + + def test_btc_source_balance_check_is_lock_free(self): + """A BTC-sourced reserve must NOT hold axon_lock during get_balance, so + a slow Esplora call can't stall the lock-protected forward loop.""" + validator = make_reserve_validator() + lock = validator.axon_lock + + btc = validator.axon_chain_providers['btc'] + btc.uses_substrate = False + held = {} + + def _get_balance(_addr): + held['locked'] = not lock.acquire(blocking=False) + if not held['locked']: + lock.release() + return 10**18 + + btc.get_balance.side_effect = _get_balance + + run_reserve_handler(validator, make_reserve_synapse()) + + assert held.get('locked') is False + + class TestMinerActivateExecutability: def _activate_synapse( self, hotkey: str = '5GrwvaEF5zXb26Fz9rcQpDWS57CtERHpNehXCPcNoHGKutQY'