Skip to content

perf: reduce instruction counts in len() #2

Merged
ryota2357 merged 3 commits into main from perf-len
Mar 12, 2026
Merged

perf: reduce instruction counts in len() #2

ryota2357 merged 3 commits into main from perf-len

Conversation

@ryota2357
Copy link
Copy Markdown
Owner

Reduced instruction counts for LeanString::len() (and LeanString::as_str() / as_bytes()) on both AArch64 and x86_64.

Assembly results

Direct LeanString::as_bytes is emitted as an alias of LeanString::as_str, so the as_str diff below covers both.

AArch64 (cargo asm --lib)

function before after
LeanString::len 13 instructions 10 instructions
LeanString::as_str / as_bytes 16 instructions 12 instructions

LeanString::len

Before:

ldrb w8, [x0, #14]
ldrh w9, [x0, #12]
orr  w8, w9, w8, lsl #16
ldr  w9, [x0, #8]
orr  x8, x9, x8, lsl #32
ldrb w9, [x0, #15]
sub  x10, x9, #192
mov  w11, #16
cmp  x10, #16
csel x10, x10, x11, lo
cmp  w9, #208
csel x0, x10, x8, lo
ret

After:

ldrb w8, [x0, #15]
sub  x9, x8, #192
mov  w10, #16
cmp  x9, #16
csel x9, x9, x10, lo
ldr  x10, [x0, #8]
and  x10, x10, #0xffffffffffffff
cmp  w8, #208
csel x0, x9, x10, lo
ret

LeanString::as_str() / as_bytes()

Before:

ldrb w8, [x0, #14]
ldrh w9, [x0, #12]
orr  w8, w9, w8, lsl #16
ldr  w9, [x0, #8]
orr  x8, x9, x8, lsl #32
ldrb w9, [x0, #15]
sub  x10, x9, #192
mov  w11, #16
cmp  x10, #16
csel x10, x10, x11, lo
cmp  w9, #208
csel x1, x10, x8, lo
ldr  x8, [x0]
cmp  w9, #207
csel x0, x8, x0, hi
ret

After:

ldrb w8, [x0, #15]
sub  x9, x8, #192
mov  w10, #16
cmp  x9, #16
csel x9, x9, x10, lo
ldp  x11, x10, [x0]
cmp  w8, #207
csel x0, x11, x0, hi
and  x10, x10, #0xffffffffffffff
cmp  w8, #208
csel x1, x9, x10, lo
ret

x86_64 (cargo asm --lib --target x86_64-unknown-linux-gnu)

function before after
LeanString::len 15 instructions 10 instructions
LeanString::as_str / as_bytes 17 instructions 12 instructions

LeanString::len

Before:

movzx eax, word ptr [rdi + 12]
movzx ecx, byte ptr [rdi + 14]
shl   ecx, 16
or    ecx, eax
shl   rcx, 32
mov   edx, dword ptr [rdi + 8]
or    rdx, rcx
movzx ecx, byte ptr [rdi + 15]
lea   rsi, [rcx - 192]
cmp   rsi, 16
mov   eax, 16
cmovb rax, rsi
cmp   rcx, 208
cmovae rax, rdx
ret

After:

movzx ecx, byte ptr [rdi + 15]
lea   rax, [rcx - 192]
cmp   rax, 16
mov   edx, 16
cmovb rdx, rax
movabs rax, 72057594037927935
and   rax, qword ptr [rdi + 8]
cmp   rcx, 208
cmovb rax, rdx
ret

LeanString::as_str() / as_bytes()

Before:

mov   rax, rdi
movzx ecx, byte ptr [rdi + 15]
lea   rsi, [rcx - 192]
cmp   rsi, 16
mov   edx, 16
cmovb rdx, rsi
cmp   rcx, 208
jb    .LBB2_2
movzx ecx, word ptr [rax + 12]
movzx esi, byte ptr [rax + 14]
shl   esi, 16
or    esi, ecx
shl   rsi, 32
mov   edx, dword ptr [rax + 8]
or    rdx, rsi
mov   rax, qword ptr [rax]
.LBB2_2:
ret

After:

mov   rax, rdi
movzx ecx, byte ptr [rdi + 15]
lea   rsi, [rcx - 192]
cmp   rsi, 16
mov   edx, 16
cmovb rdx, rsi
cmp   rcx, 208
jb    .LBB2_2
movabs rdx, 72057594037927935
and   rdx, qword ptr [rax + 8]
mov   rax, qword ptr [rax]
.LBB2_2:
ret

= aarch64 assembly (Apple M1Pro, rustc 1.93.0)

== `len()`: 13 → 11 instructions

Before:
    ldrb w8, [x0, #14]
    ldrh w9, [x0, #12]
    orr  w8, w9, w8, lsl #16
    ldr  w9, [x0, #8]
    orr  x8, x9, x8, lsl #32
    ldrb w9, [x0, #15]
    sub  x10, x9, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    cmp  w9, #208
    csel x0, x10, x8, lo
    ret

After:
    ldr  x8, [x0, #8]
    and  x9, x8, #0xffffffffffffff
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x0, x10, x9, lo
    ret

== `as_str()` / `as_bytes()`: 16 → 13 instructions

Before:
    ldrb w8, [x0, #14]
    ldrh w9, [x0, #12]
    orr  w8, w9, w8, lsl #16
    ldr  w9, [x0, #8]
    orr  x8, x9, x8, lsl #32
    ldrb w9, [x0, #15]
    sub  x10, x9, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    cmp  w9, #208
    csel x1, x10, x8, lo
    ldr  x8, [x0]
    cmp  w9, #207
    csel x0, x8, x0, hi
    ret

After:
    ldp  x9, x8, [x0]
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    and  x11, x8, #0xffffffffffffff
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x1, x10, x11, lo
    cmp  x8, #12
    csel x0, x9, x0, hi
    ret

= x86_64 assembly (Apple M1Pro, --target=x86_64-unknown-linux-gnu, rustc 1.93.0)

== `len()`: 15 → 13 instructions

Before:
    movzx  eax, word ptr [rdi + 12]
    movzx  ecx, byte ptr [rdi + 14]
    shl    ecx, 16
    or     ecx, eax
    shl    rcx, 32
    mov    edx, dword ptr [rdi + 8]
    or     rdx, rcx
    movzx  ecx, byte ptr [rdi + 15]
    lea    rsi, [rcx - 192]
    cmp    rsi, 16
    mov    eax, 16
    cmovb  rax, rsi
    cmp    rcx, 208
    cmovae rax, rdx
    ret

After:
    mov    rcx, qword ptr [rdi + 8]
    movabs rdx, 72057594037927935
    and    rdx, rcx
    mov    rsi, rcx
    shr    rsi, 56
    add    rsi, -192
    cmp    rsi, 16
    mov    eax, 16
    cmovb  rax, rsi
    shr    rcx, 60
    cmp    ecx, 13
    cmovae rax, rdx
    ret

== `as_str()` / `as_bytes()`: 17 → 19 instructions

Before:
    mov   rax, rdi
    movzx ecx, byte ptr [rdi + 15]
    lea   rsi, [rcx - 192]
    cmp   rsi, 16
    mov   edx, 16
    cmovb rdx, rsi
    cmp   rcx, 208
    jb    .LBB11_2
    movzx ecx, word ptr [rax + 12]
    movzx esi, byte ptr [rax + 14]
    shl   esi, 16
    or    esi, ecx
    shl   rsi, 32
    mov   edx, dword ptr [rax + 8]
    or    rdx, rsi
    mov   rax, qword ptr [rax]
.LBB11_2:
    ret

After:
    mov    rax, rdi
    mov    rcx, qword ptr [rdi + 8]
    movabs rsi, 72057594037927935
    and    rsi, rcx
    mov    rdi, rcx
    shr    rdi, 56
    add    rdi, -192
    cmp    rdi, 16
    mov    edx, 16
    cmovb  rdx, rdi
    movabs rdi, -3458764513820540929
    inc    rdi
    cmp    rcx, rdi
    cmovae rdx, rsi
    shr    rcx, 60
    cmp    ecx, 13
    jb     .LBB11_2
    mov    rax, qword ptr [rax]
.LBB11_2:
    ret
= aarch64 assembly (Apple M1Pro, rustc 1.93.0)

== `len()`: 11 → 10 instructions

Before:
    ldr  x8, [x0, #8]
    and  x9, x8, #0xffffffffffffff
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x0, x10, x9, lo
    ret

After:
    ldrb w8, [x0, #15]
    sub  x9, x8, #192
    mov  w10, #16
    cmp  x9, #16
    csel x9, x9, x10, lo
    ldr  x10, [x0, #8]
    and  x10, x10, #0xffffffffffffff
    cmp  w8, #208
    csel x0, x9, x10, lo
    ret

== `as_str()` / `as_bytes()`: 13 → 12 instructions

Before:
    ldp  x9, x8, [x0]
    lsr  x10, x8, #56
    sub  x10, x10, #192
    mov  w11, #16
    cmp  x10, #16
    csel x10, x10, x11, lo
    and  x11, x8, #0xffffffffffffff
    lsr  x8, x8, #60
    cmp  x8, #13
    csel x1, x10, x11, lo
    cmp  x8, #12
    csel x0, x9, x0, hi
    ret

After:
    ldrb w8, [x0, #15]
    sub  x9, x8, #192
    mov  w10, #16
    cmp  x9, #16
    csel x9, x9, x10, lo
    ldp  x11, x10, [x0]
    cmp  w8, #207
    csel x0, x11, x0, hi
    and  x10, x10, #0xffffffffffffff
    cmp  w8, #208
    csel x1, x9, x10, lo
    ret

= x86_64 assembly (Apple M1Pro, --target=x86_64-unknown-linux-gnu, rustc 1.93.0)

== `len()`: 13 → 10 instructions

Before:
    mov    rcx, qword ptr [rdi + 8]
    movabs rdx, 72057594037927935
    and    rdx, rcx
    mov    rsi, rcx
    shr    rsi, 56
    add    rsi, -192
    cmp    rsi, 16
    mov    eax, 16
    cmovb  rax, rsi
    shr    rcx, 60
    cmp    ecx, 13
    cmovae rax, rdx
    ret

After:
    movzx  ecx, byte ptr [rdi + 15]
    lea    rax, [rcx - 192]
    cmp    rax, 16
    mov    edx, 16
    cmovb  rdx, rax
    movabs rax, 72057594037927935
    and    rax, qword ptr [rdi + 8]
    cmp    rcx, 208
    cmovb  rax, rdx
    ret

== `as_str()` / `as_bytes()`: 19 → 12 instructions

Before:
    mov    rax, rdi
    mov    rcx, qword ptr [rdi + 8]
    movabs rsi, 72057594037927935
    and    rsi, rcx
    mov    rdi, rcx
    shr    rdi, 56
    add    rdi, -192
    cmp    rdi, 16
    mov    edx, 16
    cmovb  rdx, rdi
    movabs rdi, -3458764513820540929
    inc    rdi
    cmp    rcx, rdi
    cmovae rdx, rsi
    shr    rcx, 60
    cmp    ecx, 13
    jb     .LBB2_2
    mov    rax, qword ptr [rax]
.LBB2_2:
    ret

After:
    mov    rax, rdi
    movzx  ecx, byte ptr [rdi + 15]
    lea    rsi, [rcx - 192]
    cmp    rsi, 16
    mov    edx, 16
    cmovb  rdx, rsi
    cmp    rcx, 208
    jb     .LBB2_2
    movabs rdx, 72057594037927935
    and    rdx, qword ptr [rax + 8]
    mov    rax, qword ptr [rax]
.LBB2_2:
    ret
Copy link
Copy Markdown
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR refactors the 64-bit Repr::len() implementation to compute the stored length by loading the “tail” word as a usize and masking off the marker byte, avoiding the previous byte-array reconstruction.

Changes:

  • Replaced tail-length extraction in Repr::len() (64-bit) with a usize load + from_le + bitmask.
  • Kept the inline-length fast-path logic unchanged.

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
@ryota2357 ryota2357 merged commit 9c5acb9 into main Mar 12, 2026
10 checks passed
@ryota2357 ryota2357 deleted the perf-len branch March 12, 2026 06:49
@ryota2357 ryota2357 changed the title from "perf: reduce in len() instruction counts" to "perf: reduce instruction counts in len()" Mar 12, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants