amazon-ion · ganschen · Apr 28, 2026 · May 26, 2026 · popematt · May 21, 2026
diff --git a/src/jmh/java/com/amazon/ion/TextHotPathBenchmark.java b/src/jmh/java/com/amazon/ion/TextHotPathBenchmark.java
@@ -0,0 +1,164 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion;
+
+import com.amazon.ion.system.IonReaderBuilder;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Base64;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Benchmarks targeting the text reader hot paths:
+ * <ul>
+ *   <li>{@code IonReaderTextRawTokensX.load_double_quoted_string}, {@code load_symbol_identifier}
+ *       and {@code skip_over_blob} ASCII fast paths</li>
+ *   <li>{@code IonReaderTextUserX.isIonVersionMarker} (regex replaced by per-character matching)</li>
+ * </ul>
+ *
+ * <p>To compare a baseline against the change under review, run
+ * {@code ./gradlew :jmh -PjmhIncludes=TextHotPathBenchmark} on each revision.
+ *
+ * <p>Every fixture is built deterministically (fixed seeds), so each run parses identical input.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Fork(1)
+@Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS)
+@State(Scope.Benchmark)
+public class TextHotPathBenchmark {
+
+    /** Text with many double-quoted ASCII strings; exercises load_double_quoted_string fast path. */
+    private byte[] doubleQuotedTextBytes;
+
+    /** Text with many ASCII symbol identifiers; exercises load_symbol_identifier fast path. */
+    private byte[] symbolIdentifierTextBytes;
+
+    /** Text with a large base64 blob; exercises skip_over_blob inline whitespace fast path. */
+    private byte[] blobTextBytes;
+
+    /** Text that repeats the $ion_1_0 IVM before every value; exercises isIonVersionMarker. */
+    private byte[] ivmTextBytes;
+
+    /** Builds every fixture once per trial (JMH runs Level.Trial setup outside the measured iterations). */
+    @Setup(Level.Trial)
+    public void setup() {
+        doubleQuotedTextBytes = buildDoubleQuotedText();
+        symbolIdentifierTextBytes = buildSymbolIdentifierText();
+        blobTextBytes = buildBlobText();
+        ivmTextBytes = buildIvmText();
+    }
+
+    // ============ Benchmarks ============
+
+    /** Read lots of double-quoted ASCII strings. */
+    @Benchmark
+    public void readDoubleQuotedStrings(Blackhole bh) throws IOException {
+        try (IonReader r = IonReaderBuilder.standard().build(doubleQuotedTextBytes)) {
+            while (r.next() != null) {
+                bh.consume(r.stringValue());
+            }
+        }
+    }
+
+    /** Read many ASCII symbol identifiers. */
+    @Benchmark
+    public void readSymbolIdentifiers(Blackhole bh) throws IOException {
+        try (IonReader r = IonReaderBuilder.standard().build(symbolIdentifierTextBytes)) {
+            while (r.next() != null) {
+                bh.consume(r.stringValue());
+            }
+        }
+    }
+
+    /** Read a big blob; exercises skip_over_blob inline whitespace path. */
+    @Benchmark
+    public void readBlob(Blackhole bh) throws IOException {
+        try (IonReader r = IonReaderBuilder.standard().build(blobTextBytes)) {
+            while (r.next() != null) {
+                bh.consume(r.newBytes());
+            }
+        }
+    }
+
+    /** Read text in which every value is preceded by an IVM; exercises isIonVersionMarker. */
+    @Benchmark
+    public void readIvmHeavyText(Blackhole bh) throws IOException {
+        try (IonReader r = IonReaderBuilder.standard().build(ivmTextBytes)) {
+            while (r.next() != null) {
+                bh.consume(r.getType());
+            }
+        }
+    }
+
+    // ============ Fixture builders ============
+
+    /** Builds 2000 space-separated strings, each 20-79 random printable ASCII chars ('!'..'}'). */
+    private static byte[] buildDoubleQuotedText() {
+        StringBuilder sb = new StringBuilder();
+        Random r = new Random(42);
+        for (int i = 0; i < 2000; i++) {
+            sb.append('"');
+            int len = 20 + r.nextInt(60);
+            for (int j = 0; j < len; j++) {
+                char c = (char) ('!' + r.nextInt(93));
+                // Never emit a character that would end the string or start an escape.
+                if (c == '"' || c == '\\') {
+                    c = 'a';
+                }
+                sb.append(c);
+            }
+            sb.append("\" ");
+        }
+        return sb.toString().getBytes(StandardCharsets.UTF_8);
+    }
+
+    /** Builds 5000 space-separated identifier symbols of the form {@code symbol_identifier_N}. */
+    private static byte[] buildSymbolIdentifierText() {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < 5000; i++) {
+            sb.append("symbol_identifier_").append(i).append(' ');
+        }
+        return sb.toString().getBytes(StandardCharsets.UTF_8);
+    }
+
+    /** Builds one blob from 32 KiB of random bytes, base64-encoded with a newline after every 76 chars. */
+    private static byte[] buildBlobText() {
+        Random r = new Random(7);
+        byte[] raw = new byte[32 * 1024];
+        r.nextBytes(raw);
+        String b64 = Base64.getEncoder().encodeToString(raw);
+        // Reserve room for the inserted newlines and the blob delimiters, not just the payload.
+        StringBuilder sb = new StringBuilder(b64.length() + b64.length() / 76 + 8);
+        sb.append("{{ ");
+        for (int i = 0; i < b64.length(); i += 76) {
+            sb.append(b64, i, Math.min(i + 76, b64.length())).append('\n');
+        }
+        sb.append(" }}");
+        return sb.toString().getBytes(StandardCharsets.UTF_8);
+    }
+
+    /** Builds 500 repetitions of the {@code $ion_1_0} IVM, each followed by a distinct symbol. */
+    private static byte[] buildIvmText() {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < 500; i++) {
+            sb.append("$ion_1_0 value_").append(i).append(' ');
+        }
+        return sb.toString().getBytes(StandardCharsets.UTF_8);
+    }
+}
diff --git a/src/main/java/com/amazon/ion/impl/IonReaderTextRawTokensX.java b/src/main/java/com/amazon/ion/impl/IonReaderTextRawTokensX.java
@@ -1841,10 +1841,19 @@ private final int loadRadixValue(StringBuilder sb, int c2, Radix radix)
 
     private final int skip_over_symbol_identifier(SavePoint sp) throws IOException
     {
-        int c = read_char();
-
-        while(IonTokenConstsX.isValidSymbolCharacter(c)) {
-            c = read_char();
+        // Read directly from stream for symbol identifier characters.
+        // Valid symbol chars (a-z, A-Z, 0-9, $, _) are all ASCII and never newlines.
+        int c;
+        for (;;) {
+            c = _stream.read();
+            if (c >= 0 && c <= 0x7F && IonTokenConstsX.isValidSymbolCharacter(c)) {
+                continue;
+            }
+            break;
+        }
+        // The terminating character may be a newline; update line count.
+        if (c == '\r' || c == '\n') {
+            c = line_count(c);
         }
 
         if (sp != null) {
@@ -1855,11 +1864,19 @@ private final int skip_over_symbol_identifier(SavePoint sp) throws IOException
 
     protected void load_symbol_identifier(StringBuilder sb) throws IOException
     {
-        int c = read_char();
-        while(IonTokenConstsX.isValidSymbolCharacter(c)) {
-            sb.append((char)c);
-            c = read_char();
+        // Pull bytes straight from the underlying stream instead of going through
+        // read_char(): the accepted characters (a-z, A-Z, 0-9, $, _) are plain ASCII,
+        // so none of them can be a newline that needs counting.
+        int c = _stream.read();
+        while (c >= 0 && c <= 0x7F && IonTokenConstsX.isValidSymbolCharacter(c)) {
+            sb.append((char)c);
+            // Fetch the next candidate; the loop condition decides whether it
+            // still belongs to the identifier.
+            c = _stream.read();
         }
+        // The first rejected value is handed back untouched, so a newline
+        // terminator is counted by whichever read_char() call consumes it next.
+        // Whatever stopped the loop (including end of input) is pushed back the same way.
         unread_char(c);
     }
 
@@ -1941,7 +1958,29 @@ protected int load_single_quoted_string(StringBuilder sb, boolean is_clob)
         boolean expectLowSurrogate = false;
 
         for (;;) {
-            c = read_string_char(ProhibitedCharacters.NONE);
+            // Fast path: read directly from stream for common ASCII characters.
+            // ASCII printable chars (0x20-0x7E) excluding '\'' (0x27) and '\' (0x5C)
+            // don't need prohibited char checks, UTF-8 decoding, or surrogate handling.
+            c = _stream.read();
+            if (c > 0x27 && c != '\\' && c < 0x7F) {
+                // NOTE(review): this continue bypasses whatever later code reads expectLowSurrogate - confirm intended.
+                sb.append((char)c);
+                continue;
+            }
+            // Same treatment for 0x20-0x26 (space ! " # $ % &), the printable chars below '\''.
+            if (c >= 0x20 && c < 0x27) {
+                sb.append((char)c);
+                continue;
+            }
+
+            // Slow path: c is '\'', '\\', or outside 0x20-0x7E (control chars, DEL, non-ASCII, EOF).
+            // The replaced read_string_char call was given ProhibitedCharacters.NONE,
+            // so no prohibited-character check is reproduced here; only the line
+            // counting for '\r', '\n' and '\\' (backslash-newline pairs) is kept.
+            if (c == '\r' || c == '\n' || c == '\\') {
+                c = line_count(c);
+            }
+
             switch (c) {
             case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
             case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
@@ -2025,9 +2064,36 @@ protected int load_double_quoted_string(StringBuilder sb, boolean is_clob)
         boolean expectLowSurrogate = false;
 
         for (;;) {
-            c = read_string_char(ProhibitedCharacters.SHORT_CHAR);
+            // Fast path: read directly from stream for common ASCII characters.
+            // ASCII printable chars (0x20-0x7E) excluding '"' (0x22) and '\' (0x5C)
+            // don't need prohibited char checks, UTF-8 decoding, or surrogate handling.
+            c = _stream.read();
+            if (c > 0x22 && c != '\\' && c < 0x7F) {
+                // Common case: printable ASCII char (not '"' or '\'); never itself a surrogate.
+                sb.append((char)c);
+                continue;
+            }
+            // Handle space and '!' (0x20-0x21) which are also safe ASCII
+            if (c == 0x20 || c == 0x21) {
+                sb.append((char)c);
+                continue;
+            }
+
+            // Slow path: handle special characters via switch
+            // Reject control chars 0x00-0x1F except tab/vtab/ff; cr/lf are rejected in the switch below.
+            if (c >= 0 && c <= 0x1F
+                && c != 0x09 && c != 0x0A && c != 0x0B && c != 0x0C && c != 0x0D) {
+                error("invalid character [" + printCodePointAsString(c) + "]");
+            }
+
             switch (c) {
+            case '\r':
+            case '\n':
+                // Raw newline inside a short string: count the line, then reject it. Kept ahead of the
+                // escaped-newline labels so they still reach continue; bad_token is expected to throw.
+                c = line_count(c);
+                bad_token(c);
             case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1:
             case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2:
             case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3:
                 continue;
@@ -2043,6 +2109,13 @@ protected int load_double_quoted_string(StringBuilder sb, boolean is_clob)
             case CharacterSequence.CHAR_SEQ_NEWLINE_SEQUENCE_3:
                 bad_token(c);
             case '\\':
+                c = line_count(c);
+                // line_count handles \<newline> → CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_*
+                if (c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1
+                    || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2
+                    || c == CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_3) {
+                    continue;
+                }
                 c = read_char_escaped(c, is_clob);
                 break;
             default:
@@ -2399,11 +2472,29 @@ private final int read_ut8_sequence(int c) throws IOException
 
     private void skip_over_blob(SavePoint sp) throws IOException
     {
-        int c = skip_over_blob_whitespace();
+        int c;
         for (;;) {
-            if (c == UnifiedInputStreamX.EOF) break;
-            if (c == '}') break;
-            c = skip_over_blob_whitespace();
+            c = _stream.read();
+            // Fast path: any value above ' ' other than '}' is skipped unchecked (blob payload is the common case)
+            if (c > ' ' && c != '}') {
+                continue;
+            }
+            switch (c) {
+            case ' ':
+            case '\t':
+                continue;
+            case '\r':
+            case '\n':
+                line_count(c);  // called for its line bookkeeping; the returned value is not needed
+                continue;
+            case '}':
+            case UnifiedInputStreamX.EOF:
+                break;  // leaves only the switch; the break after the switch ends the loop
+            default:
+                // Any other value at or below ' ' (form feed, other control chars) is skipped too
+                continue;
+            }
+            break;  // reached only for '}' or EOF
         }
         if (sp != null) {
             // we don't care about these last 2 closing curly braces
@@ -2541,7 +2632,17 @@ private final int read_base64_byte_helper() throws IOException
         // will be 1 byte returned immediately and 0-2 bytes
         // put on the _binhex_stack to return later
 
-        int c = skip_over_blob_whitespace();
+        // Inline whitespace skipping: JIT doesn't inline skip_over_blob_whitespace()
+        // effectively here because it's called from multiple contexts. Inlining gives
+        // +54% blob read vs +29% without (measured on JDK 21).
+        int c;
+        for (;;) {
+            c = _stream.read();
+            if (c > ' ') break;
+            if (c == ' ' || c == '\t') continue;
+            if (c == '\r' || c == '\n') { line_count(c); continue; }
+            break;
+        }
         if (c == UnifiedInputStreamX.EOF || c == '}') {
             // we'll figure how which is which by check the stream for eof
             return UnifiedInputStreamX.EOF;
@@ -2587,7 +2688,14 @@ private final int read_base64_getchar_helper(int c) throws IOException {
         return read_base64_getchar_helper2(c);
     }
     private final int read_base64_getchar_helper() throws IOException {
-        int c = skip_over_blob_whitespace();
+        int c;
+        for (;;) {
+            c = _stream.read();
+            if (c > ' ') break;
+            if (c == ' ' || c == '\t') continue;
+            if (c == '\r' || c == '\n') { line_count(c); continue; }
+            break;
+        }
         if (c == UnifiedInputStreamX.EOF || c == '}') {
             error("invalid base64 image - too short");
         }