Skip to content

Commit 863a794

Browse files
committed
fix: handle multi-byte characters
Ticket: DX-2800 This commit handles characters of arbitrary byte width. SWC reports byte offsets, while JavaScript strings are indexed by character (UTF-16 code unit) offsets, so the two drift apart whenever the source contains multi-byte characters. #1109 contained an initial fix, but adding more tests showed that it was incomplete. With this commit the additional tests pass.
1 parent c05e5fa commit 863a794

2 files changed

Lines changed: 158 additions & 18 deletions

File tree

packages/openapi-generator/src/comments.ts

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,27 @@ import { parse as parseComment, Block } from 'comment-parser';
22
import { Schema } from './ir';
33

44
/**
5-
* Compute the difference between byte length and character length for a string.
6-
* This accounts for multibyte UTF-8 characters.
5+
* Convert a UTF-8 byte offset to a JavaScript string character offset.
6+
* SWC (written in Rust) uses byte offsets, but JavaScript strings use
7+
* UTF-16 code unit offsets. This function handles the conversion by
8+
* iterating through the string and accumulating byte lengths.
9+
*
10+
* @param str The source string
11+
* @param byteOffset The byte offset to convert
12+
* @returns The corresponding character offset
713
*/
8-
function computeByteLengthDiff(str: string): number {
9-
return Buffer.byteLength(str, 'utf8') - str.length;
14+
function byteOffsetToCharOffset(str: string, byteOffset: number): number {
  // Result is a UTF-16 code unit index, i.e. a value that can be passed
  // directly to String.prototype.slice / substring on `str`.
  let charOffset = 0;
  let byteCount = 0;

  // for...of walks the string one Unicode code point at a time, so a
  // surrogate pair is seen as a single `char` whose length is 2.
  for (const char of str) {
    const charBytes = Buffer.byteLength(char, 'utf8');

    // Stop before the first code point that would end past the requested
    // byte offset. An offset that lands inside a multi-byte character is
    // therefore rounded down to the start of that character, and a zero or
    // negative offset yields 0.
    if (byteCount + charBytes > byteOffset) break;

    byteCount += charBytes;
    // Advance by the number of UTF-16 code units, not by one: astral
    // characters (e.g. emoji) occupy two code units in a JS string.
    // Counting them as one would reintroduce index drift after the first
    // such character.
    charOffset += char.length;
  }

  // If byteOffset is at or past the end of the string, this is str.length.
  return charOffset;
}
1127

1228
export function leadingComment(
@@ -18,20 +34,13 @@ export function leadingComment(
1834
// SWC uses byte offsets, but JavaScript strings use character offsets.
1935
// When there are multibyte UTF-8 characters, we need to adjust.
2036
// Convert each SWC byte offset into a JavaScript string offset before slicing.
21-
const prefixLength = Math.min(start - srcSpanStart, src.length);
22-
const prefix = src.slice(0, prefixLength);
23-
const byteDiff = computeByteLengthDiff(prefix);
24-
25-
// Adjust the slice offsets by the byte difference
26-
const adjustedStart = start - srcSpanStart - byteDiff;
27-
const adjustedEnd =
28-
end -
29-
srcSpanStart -
30-
computeByteLengthDiff(src.slice(0, Math.min(end - srcSpanStart, src.length)));
31-
32-
let commentString = src
33-
.slice(Math.max(0, adjustedStart), Math.max(0, adjustedEnd))
34-
.trim();
37+
const startByteOffset = start - srcSpanStart;
38+
const endByteOffset = end - srcSpanStart;
39+
40+
const startCharOffset = byteOffsetToCharOffset(src, startByteOffset);
41+
const endCharOffset = byteOffsetToCharOffset(src, endByteOffset);
42+
43+
let commentString = src.slice(startCharOffset, endCharOffset).trim();
3544

3645
if (commentString.includes(' * ') && !/\/\*\*([\s\S]*?)\*\//.test(commentString)) {
3746
// The comment block seems to be JSDoc but was sliced incorrectly

packages/openapi-generator/test/openapi/comments.test.ts

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2112,6 +2112,137 @@ testCase('route with multibyte chars', ROUTE_WITH_MULTIBYTE_CHARS, {
21122112
},
21132113
});
21142114

2115+
const ROUTE_WITH_CJK_CHARS = `
2116+
import * as t from 'io-ts';
2117+
import * as h from '@api-ts/io-ts-http';
2118+
2119+
export const Body = t.type({
2120+
/**
2121+
* 日本語の名前フィールド (Japanese name field)
2122+
* @example 山田太郎
2123+
*/
2124+
japaneseName: t.string,
2125+
/**
2126+
* 中文名字字段 (Chinese name field)
2127+
* @example 张三
2128+
*/
2129+
chineseName: t.string,
2130+
/**
2131+
* 한국어 이름 필드 (Korean name field)
2132+
* @example 김철수
2133+
*/
2134+
koreanName: t.string,
2135+
});
2136+
2137+
/**
2138+
* Route testing CJK characters (日本語, 中文, 한국어)
2139+
*
2140+
* @operationId api.v1.cjkChars
2141+
* @tag Test Routes
2142+
*/
2143+
export const route = h.httpRoute({
2144+
path: '/cjk-chars',
2145+
method: 'POST',
2146+
request: h.httpRequest({
2147+
body: Body,
2148+
}),
2149+
response: {
2150+
200: {
2151+
result: t.string
2152+
}
2153+
},
2154+
});
2155+
`;
2156+
2157+
testCase('route with CJK characters', ROUTE_WITH_CJK_CHARS, {
2158+
openapi: '3.0.3',
2159+
info: {
2160+
title: 'Test',
2161+
version: '1.0.0',
2162+
},
2163+
paths: {
2164+
'/cjk-chars': {
2165+
post: {
2166+
summary: 'Route testing CJK characters (日本語, 中文, 한국어)',
2167+
operationId: 'api.v1.cjkChars',
2168+
tags: ['Test Routes'],
2169+
parameters: [],
2170+
requestBody: {
2171+
content: {
2172+
'application/json': {
2173+
schema: {
2174+
properties: {
2175+
japaneseName: {
2176+
type: 'string',
2177+
description: '日本語の名前フィールド (Japanese name field)',
2178+
example: '山田太郎',
2179+
},
2180+
chineseName: {
2181+
type: 'string',
2182+
description: '中文名字字段 (Chinese name field)',
2183+
example: '张三',
2184+
},
2185+
koreanName: {
2186+
type: 'string',
2187+
description: '한국어 이름 필드 (Korean name field)',
2188+
example: '김철수',
2189+
},
2190+
},
2191+
required: ['japaneseName', 'chineseName', 'koreanName'],
2192+
type: 'object',
2193+
},
2194+
},
2195+
},
2196+
},
2197+
responses: {
2198+
200: {
2199+
description: 'OK',
2200+
content: {
2201+
'application/json': {
2202+
schema: {
2203+
type: 'object',
2204+
properties: {
2205+
result: {
2206+
type: 'string',
2207+
},
2208+
},
2209+
required: ['result'],
2210+
},
2211+
},
2212+
},
2213+
},
2214+
},
2215+
},
2216+
},
2217+
},
2218+
components: {
2219+
schemas: {
2220+
Body: {
2221+
title: 'Body',
2222+
type: 'object',
2223+
properties: {
2224+
japaneseName: {
2225+
type: 'string',
2226+
description: '日本語の名前フィールド (Japanese name field)',
2227+
example: '山田太郎',
2228+
},
2229+
chineseName: {
2230+
type: 'string',
2231+
description: '中文名字字段 (Chinese name field)',
2232+
example: '张三',
2233+
},
2234+
koreanName: {
2235+
type: 'string',
2236+
description: '한국어 이름 필드 (Korean name field)',
2237+
example: '김철수',
2238+
},
2239+
},
2240+
required: ['japaneseName', 'chineseName', 'koreanName'],
2241+
},
2242+
},
2243+
},
2244+
});
2245+
21152246
const ROUTE_WITH_MARKDOWN_LIST = `
21162247
import * as t from 'io-ts';
21172248
import * as h from '@api-ts/io-ts-http';

0 commit comments

Comments
 (0)