Skip to content

perf: reduce instruction counts in len() #2

Merged
ryota2357 merged 3 commits into main from perf-len
Mar 12, 2026
Merged

perf: reduce instruction counts in len() #2

ryota2357 merged 3 commits into main from perf-len

Conversation

@ryota2357
Copy link
Copy Markdown
Owner

Reduced instruction counts for LeanString::len() (and LeanString::as_str() / as_bytes()) on both AArch64 and x86_64.

Assembly results

Direct LeanString::as_bytes is emitted as an alias of LeanString::as_str, so the as_str diff below covers both.

AArch64 (cargo asm --lib)

function before after
LeanString::len 13 instructions 10 instructions
LeanString::as_str / as_bytes 16 instructions 12 instructions

LeanString::len

Before:

ldrb w8, [x0, #14]
ldrh w9, [x0, #12]
orr  w8, w9, w8, lsl #16
ldr  w9, [x0, #8]
orr  x8, x9, x8, lsl #32
ldrb w9, [x0, #15]
sub  x10, x9, #192
mov  w11, #16
cmp  x10, #16
csel x10, x10, x11, lo
cmp  w9, #208
csel x0, x10, x8, lo
ret

After:

ldrb w8, [x0, #15]
sub  x9, x8, #192
mov  w10, #16
cmp  x9, #16
csel x9, x9, x10, lo
ldr  x10, [x0, #8]
and  x10, x10, #0xffffffffffffff
cmp  w8, #208
csel x0, x9, x10, lo
ret

LeanString::as_str() / as_bytes()

Before:

ldrb w8, [x0, #14]
ldrh w9, [x0, #12]
orr  w8, w9, w8, lsl #16
ldr  w9, [x0, #8]
orr  x8, x9, x8, lsl #32
ldrb w9, [x0, #15]
sub  x10, x9, #192
mov  w11, #16
cmp  x10, #16
csel x10, x10, x11, lo
cmp  w9, #208
csel x1, x10, x8, lo
ldr  x8, [x0]
cmp  w9, #207
csel x0, x8, x0, hi
ret

After:

ldrb w8, [x0, #15]
sub  x9, x8, #192
mov  w10, #16
cmp  x9, #16
csel x9, x9, x10, lo
ldp  x11, x10, [x0]
cmp  w8, #207
csel x0, x11, x0, hi
and  x10, x10, #0xffffffffffffff
cmp  w8, #208
csel x1, x9, x10, lo
ret

x86_64 (cargo asm --lib --target x86_64-unknown-linux-gnu)

function before after
LeanString::len 15 instructions 10 instructions
LeanString::as_str / as_bytes 17 instructions 12 instructions

LeanString::len

Before:

movzx eax, word ptr [rdi + 12]
movzx ecx, byte ptr [rdi + 14]
shl   ecx, 16
or    ecx, eax
shl   rcx, 32
mov   edx, dword ptr [rdi + 8]
or    rdx, rcx
movzx ecx, byte ptr [rdi + 15]
lea   rsi, [rcx - 192]
cmp   rsi, 16
mov   eax, 16
cmovb rax, rsi
cmp   rcx, 208
cmovae rax, rdx
ret

After:

movzx ecx, byte ptr [rdi + 15]
lea   rax, [rcx - 192]
cmp   rax, 16
mov   edx, 16
cmovb rdx, rax
movabs rax, 72057594037927935
and   rax, qword ptr [rdi + 8]
cmp   rcx, 208
cmovb rax, rdx
ret

LeanString::as_str() / as_bytes()

Before:

mov   rax, rdi
movzx ecx, byte ptr [rdi + 15]
lea   rsi, [rcx - 192]
cmp   rsi, 16
mov   edx, 16
cmovb rdx, rsi
cmp   rcx, 208
jb    .LBB2_2
movzx ecx, word ptr [rax + 12]
movzx esi, byte ptr [rax + 14]
shl   esi, 16
or    esi, ecx
shl   rsi, 32
mov   edx, dword ptr [rax + 8]
or    rdx, rsi
mov   rax, qword ptr [rax]
.LBB2_2:
ret

After:

mov   rax, rdi
movzx ecx, byte ptr [rdi + 15]
lea   rsi, [rcx - 192]
cmp   rsi, 16
mov   edx, 16
cmovb rdx, rsi
cmp   rcx, 208
jb    .LBB2_2
movabs rdx, 72057594037927935
and   rdx, qword ptr [rax + 8]
mov   rax, qword ptr [rax]
.LBB2_2:
ret

= aarch64 assembly (Apple M1Pro, rustc 1.93.0)

== `len()`: 13 → 11 instructions

Before:
    ldrb w8, [x0, #14]
    ldrh w9, [x0, #12]
    orr  w8, w9, w8, lsl #16
    ldr  w9, [x0, #8]
    orr  x8, x9, x8, lsl #32
    ldrb w9, [x0, #15]
    sub  x10, x9, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    cmp  w9, #208
    csel x0, x10, x8, lo
    ret

After:
    ldr  x8, [x0, #8]
    and  x9, x8, #0xffffffffffffff
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x0, x10, x9, lo
    ret

== `as_str()` / `as_bytes()`: 16 → 13 instructions

Before:
    ldrb w8, [x0, #14]
    ldrh w9, [x0, #12]
    orr  w8, w9, w8, lsl #16
    ldr  w9, [x0, #8]
    orr  x8, x9, x8, lsl #32
    ldrb w9, [x0, #15]
    sub  x10, x9, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    cmp  w9, #208
    csel x1, x10, x8, lo
    ldr  x8, [x0]
    cmp  w9, #207
    csel x0, x8, x0, hi
    ret

After:
    ldp  x9, x8, [x0]
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    and  x11, x8, #0xffffffffffffff
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x1, x10, x11, lo
    cmp  x8, #12
    csel x0, x9, x0, hi
    ret

= x86_64 assembly (Apple M1Pro, --target=x86_64-unknown-linux-gnu, rustc 1.93.0)

== `len()`: 15 → 13 instructions

Before:
    movzx  eax, word ptr [rdi + 12]
    movzx  ecx, byte ptr [rdi + 14]
    shl    ecx, 16
    or     ecx, eax
    shl    rcx, 32
    mov    edx, dword ptr [rdi + 8]
    or     rdx, rcx
    movzx  ecx, byte ptr [rdi + 15]
    lea    rsi, [rcx - 192]
    cmp    rsi, 16
    mov    eax, 16
    cmovb  rax, rsi
    cmp    rcx, 208
    cmovae rax, rdx
    ret

After:
    mov    rcx, qword ptr [rdi + 8]
    movabs rdx, 72057594037927935
    and    rdx, rcx
    mov    rsi, rcx
    shr    rsi, 56
    add    rsi, -192
    cmp    rsi, 16
    mov    eax, 16
    cmovb  rax, rsi
    shr    rcx, 60
    cmp    ecx, 13
    cmovae rax, rdx
    ret

== `as_str()` / `as_bytes()`: 17 → 19 instructions

Before:
    mov   rax, rdi
    movzx ecx, byte ptr [rdi + 15]
    lea   rsi, [rcx - 192]
    cmp   rsi, 16
    mov   edx, 16
    cmovb rdx, rsi
    cmp   rcx, 208
    jb    .LBB11_2
    movzx ecx, word ptr [rax + 12]
    movzx esi, byte ptr [rax + 14]
    shl   esi, 16
    or    esi, ecx
    shl   rsi, 32
    mov   edx, dword ptr [rax + 8]
    or    rdx, rsi
    mov   rax, qword ptr [rax]
.LBB11_2:
    ret

After:
    mov    rax, rdi
    mov    rcx, qword ptr [rdi + 8]
    movabs rsi, 72057594037927935
    and    rsi, rcx
    mov    rdi, rcx
    shr    rdi, 56
    add    rdi, -192
    cmp    rdi, 16
    mov    edx, 16
    cmovb  rdx, rdi
    movabs rdi, -3458764513820540929
    inc    rdi
    cmp    rcx, rdi
    cmovae rdx, rsi
    shr    rcx, 60
    cmp    ecx, 13
    jb     .LBB11_2
    mov    rax, qword ptr [rax]
.LBB11_2:
    ret
= aarch64 assembly (Apple M1Pro, rustc 1.93.0)

== `len()`: 11 → 10 instructions

Before:
    ldr  x8, [x0, #8]
    and  x9, x8, #0xffffffffffffff
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x0, x10, x9, lo
    ret

After:
    ldrb w8, [x0, #15]
    sub  x9, x8, #192
    mov  w10, #16
    cmp  x9, #16
    csel x9, x9, x10, lo
    ldr  x10, [x0, #8]
    and  x10, x10, #0xffffffffffffff
    cmp  w8, #208
    csel x0, x9, x10, lo
    ret

== `as_str()` / `as_bytes()`: 13 → 12 instructions

Before:
    ldp  x9, x8, [x0]
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    and  x11, x8, #0xffffffffffffff
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x1, x10, x11, lo
    cmp  x8, #12
    csel x0, x9, x0, hi
    ret

After:
    ldrb w8, [x0, #15]
    sub  x9, x8, #192
    mov  w10, #16
    cmp  x9, #16
    csel x9, x9, x10, lo
    ldp  x11, x10, [x0]
    cmp  w8, #207
    csel x0, x11, x0, hi
    and  x10, x10, #0xffffffffffffff
    cmp  w8, #208
    csel x1, x9, x10, lo
    ret

= x86_64 assembly (Apple M1Pro, --target=x86_64-unknown-linux-gnu, rustc 1.93.0)

== `len()`: 13 → 10 instructions

Before:
    mov    rcx, qword ptr [rdi + 8]
    movabs rdx, 72057594037927935
    and    rdx, rcx
    mov    rsi, rcx
    shr    rsi, 56
    add    rsi, -192
    cmp    rsi, 16
    mov    eax, 16
    cmovb  rax, rsi
    shr    rcx, 60
    cmp    ecx, 13
    cmovae rax, rdx
    ret

After:
    movzx  ecx, byte ptr [rdi + 15]
    lea    rax, [rcx - 192]
    cmp    rax, 16
    mov    edx, 16
    cmovb  rdx, rax
    movabs rax, 72057594037927935
    and    rax, qword ptr [rdi + 8]
    cmp    rcx, 208
    cmovb  rax, rdx
    ret

== `as_str()` / `as_bytes()`: 19 → 12 instructions

Before:
    mov    rax, rdi
    mov    rcx, qword ptr [rdi + 8]
    movabs rsi, 72057594037927935
    and    rsi, rcx
    mov    rdi, rcx
    shr    rdi, 56
    add    rdi, -192
    cmp    rdi, 16
    mov    edx, 16
    cmovb  rdx, rdi
    movabs rdi, -3458764513820540929
    inc    rdi
    cmp    rcx, rdi
    cmovae rdx, rsi
    shr    rcx, 60
    cmp    ecx, 13
    jb     .LBB2_2
    mov    rax, qword ptr [rax]
.LBB2_2:
    ret

After:
    mov    rax, rdi
    movzx  ecx, byte ptr [rdi + 15]
    lea    rsi, [rcx - 192]
    cmp    rsi, 16
    mov    edx, 16
    cmovb  rdx, rsi
    cmp    rcx, 208
    jb     .LBB2_2
    movabs rdx, 72057594037927935
    and    rdx, qword ptr [rax + 8]
    mov    rax, qword ptr [rax]
.LBB2_2:
    ret
Copy link
Copy Markdown
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR refactors the 64-bit Repr::len() implementation to compute the stored length by loading the “tail” word as a usize and masking off the marker byte, avoiding the previous byte-array reconstruction.

Changes:

  • Replaced tail-length extraction in Repr::len() (64-bit) with a usize load + from_le + bitmask.
  • Kept the inline-length fast-path logic unchanged.

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
@ryota2357 ryota2357 merged commit 9c5acb9 into main Mar 12, 2026
10 checks passed
@ryota2357 ryota2357 deleted the perf-len branch March 12, 2026 06:49
@ryota2357 ryota2357 changed the title from "perf: reduce in len() instruction counts" to "perf: reduce instruction counts in len()" Mar 12, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants