Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 153 additions & 0 deletions src/jmh/java/com/amazon/ion/TextHotPathBenchmark.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
package com.amazon.ion;

import com.amazon.ion.system.IonReaderBuilder;
import com.amazon.ion.system.IonSystemBuilder;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmarks targeting the text reader hot paths:
 * <ul>
 *   <li>{@code IonReaderTextRawTokensX.load_double_quoted_string},
 *       {@code load_symbol_identifier} and {@code skip_over_blob} ASCII fast paths</li>
 *   <li>{@code IonReaderTextUserX.isIonVersionMarker} (regex -> character-by-character)</li>
 * </ul>
 *
 * <p>Results are comparable between the baseline and CR HEAD
 * by running {@code ./gradlew :jmh -PjmhIncludes=TextHotPathBenchmark}.
 *
 * <p>Every fixture is generated once per trial from fixed seeds or counters, so each run
 * parses byte-identical input.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Fork(1)
@Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS)
@State(Scope.Benchmark)
public class TextHotPathBenchmark {

    /** Number of double-quoted strings in the string fixture. */
    private static final int DOUBLE_QUOTED_STRING_COUNT = 2000;

    /** Shortest generated string body, in characters. */
    private static final int MIN_STRING_LENGTH = 20;

    /** Exclusive bound on the random extra length added to {@link #MIN_STRING_LENGTH}. */
    private static final int STRING_LENGTH_SPREAD = 60;

    /** Size of the character range starting at {@code '!'}; yields {@code '!'} through {@code '}'}. */
    private static final int PRINTABLE_CHAR_RANGE = 93;

    /** Number of symbol identifiers in the symbol fixture. */
    private static final int SYMBOL_COUNT = 5000;

    /** Size of the raw (pre-base64) blob payload, in bytes. */
    private static final int BLOB_SIZE_BYTES = 32 * 1024;

    /** Number of base64 characters emitted per line of the blob fixture. */
    private static final int BASE64_LINE_LENGTH = 76;

    /** Number of {@code $ion_1_0} markers (each followed by one symbol) in the IVM fixture. */
    private static final int IVM_COUNT = 500;

    /** Text with many double-quoted ASCII strings; exercises load_double_quoted_string fast path. */
    private byte[] doubleQuotedTextBytes;

    /** Text with many ASCII symbol identifiers; exercises load_symbol_identifier fast path. */
    private byte[] symbolIdentifierTextBytes;

    /** Text with a large base64 blob; exercises skip_over_blob inline whitespace fast path. */
    private byte[] blobTextBytes;

    /** Text beginning with $ion_1_0 IVM many times over; exercises isIonVersionMarker. */
    private byte[] ivmTextBytes;

    /** Builds all fixtures once per trial so that fixture generation is never part of a measurement. */
    @Setup(Level.Trial)
    public void setup() {
        doubleQuotedTextBytes = buildDoubleQuotedText();
        symbolIdentifierTextBytes = buildSymbolIdentifierText();
        blobTextBytes = buildBlobText();
        ivmTextBytes = buildIvmText();
    }

    // ============ Benchmarks ============

    /**
     * Read lots of double-quoted ASCII strings.
     *
     * @param bh sink that keeps each materialized string alive so the read is not optimized away
     * @throws IOException if closing the reader fails
     */
    @Benchmark
    public void readDoubleQuotedStrings(Blackhole bh) throws IOException {
        try (IonReader reader = IonReaderBuilder.standard().build(doubleQuotedTextBytes)) {
            while (reader.next() != null) {
                bh.consume(reader.stringValue());
            }
        }
    }

    /**
     * Read many ASCII symbol identifiers.
     *
     * @param bh sink that keeps each symbol's text alive so the read is not optimized away
     * @throws IOException if closing the reader fails
     */
    @Benchmark
    public void readSymbolIdentifiers(Blackhole bh) throws IOException {
        try (IonReader reader = IonReaderBuilder.standard().build(symbolIdentifierTextBytes)) {
            while (reader.next() != null) {
                bh.consume(reader.stringValue());
            }
        }
    }

    /**
     * Read a big blob; exercises skip_over_blob inline whitespace path.
     *
     * @param bh sink that keeps the decoded bytes alive so the read is not optimized away
     * @throws IOException if closing the reader fails
     */
    @Benchmark
    public void readBlob(Blackhole bh) throws IOException {
        try (IonReader reader = IonReaderBuilder.standard().build(blobTextBytes)) {
            while (reader.next() != null) {
                bh.consume(reader.newBytes());
            }
        }
    }

    /**
     * Read text starting with repeated IVMs; exercises isIonVersionMarker.
     *
     * @param bh sink that consumes each value's type; the values themselves are not materialized
     * @throws IOException if closing the reader fails
     */
    @Benchmark
    public void readIvmHeavyText(Blackhole bh) throws IOException {
        try (IonReader reader = IonReaderBuilder.standard().build(ivmTextBytes)) {
            while (reader.next() != null) {
                bh.consume(reader.getType());
            }
        }
    }

    // ============ Fixture builders ============

    /**
     * Builds space-separated double-quoted strings of pseudo-random ASCII characters.
     * The seed is fixed so the fixture is identical on every run.
     *
     * @return the UTF-8 encoded Ion text
     */
    private static byte[] buildDoubleQuotedText() {
        StringBuilder text = new StringBuilder();
        Random random = new Random(42);
        for (int i = 0; i < DOUBLE_QUOTED_STRING_COUNT; i++) {
            text.append('"');
            int length = MIN_STRING_LENGTH + random.nextInt(STRING_LENGTH_SPREAD);
            for (int j = 0; j < length; j++) {
                char c = (char) ('!' + random.nextInt(PRINTABLE_CHAR_RANGE));
                // A quote would terminate the string and a backslash would start an escape,
                // so both are replaced with a plain letter.
                if (c == '"' || c == '\\') {
                    c = 'a';
                }
                text.append(c);
            }
            text.append("\" ");
        }
        return text.toString().getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Builds space-separated identifiers of the form {@code symbol_identifier_<i>}.
     *
     * @return the UTF-8 encoded Ion text
     */
    private static byte[] buildSymbolIdentifierText() {
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < SYMBOL_COUNT; i++) {
            text.append("symbol_identifier_").append(i).append(' ');
        }
        return text.toString().getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Builds a single blob ({@code {{ ... }}}) whose base64 payload is split into
     * newline-terminated lines, so the reader has to skip whitespace inside the blob.
     * The seed is fixed so the fixture is identical on every run.
     *
     * @return the UTF-8 encoded Ion text
     */
    private static byte[] buildBlobText() {
        Random random = new Random(7);
        byte[] raw = new byte[BLOB_SIZE_BYTES];
        random.nextBytes(raw);
        String base64 = java.util.Base64.getEncoder().encodeToString(raw);

        // Capacity covers the payload, one newline per line, and the surrounding delimiters.
        int lineCount = (base64.length() + BASE64_LINE_LENGTH - 1) / BASE64_LINE_LENGTH;
        StringBuilder text = new StringBuilder(base64.length() + lineCount + 8);

        text.append("{{ ");
        for (int start = 0; start < base64.length(); start += BASE64_LINE_LENGTH) {
            int end = Math.min(start + BASE64_LINE_LENGTH, base64.length());
            text.append(base64, start, end).append('\n');
        }
        text.append(" }}");
        return text.toString().getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Builds repeated {@code $ion_1_0 value_<i>} pairs, i.e. an Ion version marker
     * followed by one symbol, separated by spaces.
     *
     * @return the UTF-8 encoded Ion text
     */
    private static byte[] buildIvmText() {
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < IVM_COUNT; i++) {
            text.append("$ion_1_0 value_").append(i).append(' ');
        }
        return text.toString().getBytes(StandardCharsets.UTF_8);
    }
}
140 changes: 124 additions & 16 deletions src/main/java/com/amazon/ion/impl/IonReaderTextRawTokensX.java
Original file line number Diff line number Diff line change
Expand Up @@ -1841,10 +1841,19 @@ private final int loadRadixValue(StringBuilder sb, int c2, Radix radix)

private final int skip_over_symbol_identifier(SavePoint sp) throws IOException
{
int c = read_char();

while(IonTokenConstsX.isValidSymbolCharacter(c)) {
c = read_char();
// Read directly from stream for symbol identifier characters.
// Valid symbol chars (a-z, A-Z, 0-9, $, _) are all ASCII and never newlines.
int c;
for (;;) {
c = _stream.read();
if (c >= 0 && c <= 0x7F && IonTokenConstsX.isValidSymbolCharacter(c)) {
continue;
}
break;
}
// The terminating character may be a newline; update line count.
if (c == '\r' || c == '\n') {
c = line_count(c);
}

if (sp != null) {
Expand All @@ -1855,11 +1864,19 @@ private final int skip_over_symbol_identifier(SavePoint sp) throws IOException

protected void load_symbol_identifier(StringBuilder sb) throws IOException
{
int c = read_char();
while(IonTokenConstsX.isValidSymbolCharacter(c)) {
sb.append((char)c);
c = read_char();
// Read directly from stream for symbol identifier characters.
// Valid symbol chars (a-z, A-Z, 0-9, $, _) are all ASCII and never newlines.
int c;
for (;;) {
c = _stream.read();
if (c >= 0 && c <= 0x7F && IonTokenConstsX.isValidSymbolCharacter(c)) {
sb.append((char)c);
continue;
}
break;
}
// No need to call line_count here — unread_char puts the terminator back
// and the next read_char() call will handle line counting.
unread_char(c);
}

Expand Down Expand Up @@ -1941,7 +1958,29 @@ protected int load_single_quoted_string(StringBuilder sb, boolean is_clob)
boolean expectLowSurrogate = false;

for (;;) {
c = read_string_char(ProhibitedCharacters.NONE);
// Fast path: read directly from stream for common ASCII characters.
// ASCII printable chars (0x20-0x7E) excluding '\'' (0x27) and '\' (0x5C)
// don't need prohibited char checks, UTF-8 decoding, or surrogate handling.
c = _stream.read();
if (c > 0x27 && c != '\\' && c < 0x7F) {
// Common case: printable ASCII char (not '\'' or '\')
sb.append((char)c);
continue;
}
// Handle space, '!', '"', '#', '$', '%', '&' (0x20-0x26) which are also safe ASCII
if (c >= 0x20 && c < 0x27) {
sb.append((char)c);
continue;
}

// Slow path: handle special characters
// First, apply prohibited character check for control chars
// ProhibitedCharacters.NONE means no characters are prohibited
// Handle newlines and backslash for line counting
if (c == '\r' || c == '\n' || c == '\\') {
c = line_count(c);
}

switch (c) {
case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
Expand Down Expand Up @@ -2025,9 +2064,36 @@ protected int load_double_quoted_string(StringBuilder sb, boolean is_clob)
boolean expectLowSurrogate = false;

for (;;) {
c = read_string_char(ProhibitedCharacters.SHORT_CHAR);
// Fast path: read directly from stream for common ASCII characters.
// ASCII printable chars (0x20-0x7E) excluding '"' (0x22) and '\' (0x5C)
// don't need prohibited char checks, UTF-8 decoding, or surrogate handling.
c = _stream.read();
if (c > 0x22 && c != '\\' && c < 0x7F) {
// Common case: printable ASCII char (not '"' or '\')
// No need for surrogate checks since ASCII chars can't be surrogates
sb.append((char)c);
continue;
}
// Handle space and '!' (0x20-0x21) which are also safe ASCII
if (c == 0x20 || c == 0x21) {
sb.append((char)c);
continue;
}

// Slow path: handle special characters via switch
// Prohibited control chars (SHORT_CHAR): 0x00-0x1F except tab/vtab/ff/cr/lf
if (c >= 0 && c <= 0x1F
&& c != 0x09 && c != 0x0A && c != 0x0B && c != 0x0C && c != 0x0D) {
error("invalid character [" + printCodePointAsString(c) + "]");
}

switch (c) {
case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
case '\r':
case '\n':
c = line_count(c);
// line_count transforms raw \r/\n into CHAR_SEQ_NEWLINE_SEQUENCE_*
bad_token(c);
case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3:
continue;
Expand All @@ -2043,6 +2109,13 @@ protected int load_double_quoted_string(StringBuilder sb, boolean is_clob)
case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
bad_token(c);
case '\\':
c = line_count(c);
// line_count handles \<newline> → CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_*
if (c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
|| c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
|| c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3) {
continue;
}
c = read_char_escaped(c, is_clob);
break;
default:
Expand Down Expand Up @@ -2399,11 +2472,29 @@ private final int read_ut8_sequence(int c) throws IOException

private void skip_over_blob(SavePoint sp) throws IOException
{
int c = skip_over_blob_whitespace();
int c;
for (;;) {
if (c == UnifiedInputStreamX.EOF) break;
if (c == '}') break;
c = skip_over_blob_whitespace();
c = _stream.read();
// Fast path: base64 chars (> ' ' and not '}') are the common case
if (c > ' ' && c != '}') {
continue;
}
switch (c) {
case ' ':
case '\t':
continue;
case '\r':
case '\n':
line_count(c);
continue;
case '}':
case UnifiedInputStreamX.EOF:
break;
default:
// Other whitespace (form feed, etc.) — skip
continue;
}
Comment on lines +2479 to +2496
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be a switch statement on c, which generates a jump table instead of multiple branch checks. Have you checked the performance difference between the two approaches?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Converted to a switch.

break;
}
if (sp != null) {
// we don't care about these last 2 closing curly braces
Expand Down Expand Up @@ -2541,7 +2632,17 @@ private final int read_base64_byte_helper() throws IOException
// will be 1 byte returned immediately and 0-2 bytes
// put on the _binhex_stack to return later

int c = skip_over_blob_whitespace();
// Inline whitespace skipping: JIT doesn't inline skip_over_blob_whitespace()
// effectively here because it's called from multiple contexts. Inlining gives
// +54% blob read vs +29% without (measured on JDK 21).
int c;
for (;;) {
c = _stream.read();
if (c > ' ') break;
if (c == ' ' || c == '\t') continue;
if (c == '\r' || c == '\n') { line_count(c); continue; }
break;
}
if (c == UnifiedInputStreamX.EOF || c == '}') {
// we'll figure out which is which by checking the stream for eof
return UnifiedInputStreamX.EOF;
Expand Down Expand Up @@ -2587,7 +2688,14 @@ private final int read_base64_getchar_helper(int c) throws IOException {
return read_base64_getchar_helper2(c);
}
private final int read_base64_getchar_helper() throws IOException {
int c = skip_over_blob_whitespace();
int c;
for (;;) {
c = _stream.read();
if (c > ' ') break;
if (c == ' ' || c == '\t') continue;
if (c == '\r' || c == '\n') { line_count(c); continue; }
break;
}
if (c == UnifiedInputStreamX.EOF || c == '}') {
error("invalid base64 image - too short");
}
Expand Down
Loading