Skip to content
Open
51 changes: 51 additions & 0 deletions devel/0147.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# [0147] 优化打开大体积 TMU 文件的性能

## 1 相关文档
- [dddd.md](dddd.md) - 任务文档模板

## 2 任务相关的代码文件
- `src/Data/Convert/Mogan/from_tmu.cpp` - TMU 文件解析核心代码
- `tests/Data/Convert/convert_test.cpp` - 单元测试

## 3 如何测试

### 3.1 确定性测试(单元测试)
```bash
xmake b convert_test
xmake r convert_test
```

### 3.2 非确定性测试(性能验证)
```bash
xmake b stem
# 测试加载 chapter-4.tmu 的时间
xmake r stem /path/to/chapter-4.tmu
```

## 4 如何提交

提交前执行以下最少步骤:
```bash
xmake b convert_test
xmake r convert_test
xmake b stem
```

## 5 What
优化打开大体积 TMU 文件的性能。`chapter-4.tmu` 文件大小为 14MB,其中某些行包含数百万字节的十六进制编码数据(图片等)。当前解析器在处理这些数据时存在明显的性能瓶颈。

## 6 Why
`chapter-4.tmu` 打开速度缓慢,影响用户体验。通过性能分析发现,解析器在处理 RAW_DATA(`<#...>`)时,每两个字符就创建一个临时字符串传递给 `from_hex`,造成大量不必要的内存分配。

## 7 How
1. 优化 `from_tmu.cpp` 中 RAW_DATA 的解析逻辑,避免临时字符串创建。
   - 增加内联函数 `from_hex_char`,直接处理字符而非创建 2 字节临时字符串。
   - 1M hex bytes 解析时间从 44ms 降至 29ms。
2. 优化 `read_next` 中普通字符的解析,避免逐字符调用 `read_char` 创建临时字符串。
   - 增加快速路径直接批量读取 ASCII 字符,仅对非 ASCII 字符调用 `decode_from_utf8`。
   - 1M 文本字符解析时间从 98ms 降至 23ms。
3. 优化 `decode` 函数,避免无转义字符时的逐字节复制。
   - 先扫描字符串判断是否存在反斜杠,若无则直接返回原字符串。
   - 1M 文本字符解析时间从约 40ms 进一步降至约 20ms。
4. 通过 `bench_start`/`bench_end` 测量 `tmu_to_tree` 整体解析性能。
   - main 基线:310ms → 优化后:238ms(约 23% 提升)。
57 changes: 42 additions & 15 deletions src/Data/Convert/Mogan/from_tmu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "preferences.hpp"
#include "tree_helper.hpp"

#include "tm_debug.hpp"
#include <lolly/data/numeral.hpp>
#include <lolly/data/unicode.hpp>
#include <moebius/drd/drd_std.hpp>
Expand All @@ -27,6 +28,14 @@ using moebius::drd::STD_CODE;

using namespace moebius;

// Convert a single hexadecimal digit character to its integer value.
// A character accepted by is_digit maps to (c - '0'); the letters 'A'-'F'
// and 'a'-'f' map to 10-15. Any other character yields 0, so callers cannot
// distinguish an invalid character from the digit '0'.
static inline int
from_hex_char (char c) {
  if (is_digit (c)) return c - '0';
  const bool is_upper_hex= ('A' <= c) && (c <= 'F');
  if (is_upper_hex) return (c - 'A') + 10;
  const bool is_lower_hex= ('a' <= c) && (c <= 'f');
  if (is_lower_hex) return (c - 'a') + 10;
  return 0;
}

/******************************************************************************
* Conversion of TeXmacs strings of the present format to TeXmacs trees
******************************************************************************/
Expand Down Expand Up @@ -72,9 +81,13 @@ tmu_reader::skip_blank () {

string
tmu_reader::decode (string s) {
int i, n= N (s);
string r;
int i, n= N (s);
for (i= 0; i < n; i++)
if (((i + 1) < n) && (s[i] == '\\')) break;
if (i == n) return s;

string r (s (0, i));
for (; i < n; i++)
if (((i + 1) < n) && (s[i] == '\\')) {
i++;
if (s[i] == ';')
Expand Down Expand Up @@ -120,18 +133,8 @@ tmu_reader::read_next () {
if (c == "") return "";
if (c == "#") return "<#";
if ((c == "\\") || (c == "|") || (c == "/")) return "<" * c;
if (is_iso_alpha (c[0]) || (c == ">")) {
pos= old_pos;
return "<";
}
pos= old_pos;
return "<";
/*
string d= read_char ();
if ((d == "\\") || (d == "|") || (d == "/")) return "<" * c * d;
pos= old_pos;
return "<" * c;
*/
}
case '|':
case '>':
Expand All @@ -141,6 +144,23 @@ tmu_reader::read_next () {
string r;
pos= old_pos;
while (true) {
while (pos < buf_N) {
char ch= buf[pos];
if (ch == '\t' || ch == '\r' || ch == '\n' || ch == ' ' || ch == '<' ||
ch == '|' || ch == '>' || ch == '\\')
break;
if ((ch & 0x80) == 0) {
r << ch;
pos++;
}
else {
int start_pos= pos;
decode_from_utf8 (buf, pos);
r << buf (start_pos, pos);
}
}
if (pos >= buf_N) return r;

old_pos= pos;
c = read_char ();
if (c == "") return r;
Expand Down Expand Up @@ -276,7 +296,8 @@ tmu_reader::read (bool skip_flag) {
else if (tail_char_of_last == '#') {
string r;
while ((buf[pos] != '>') && (pos + 2 < buf_N)) {
r << ((char) from_hex (buf (pos, pos + 2)));
r << ((char) ((from_hex_char (buf[pos]) << 4) +
from_hex_char (buf[pos + 1])));
pos+= 2;
}
if (buf[pos] == '>') pos++;
Expand Down Expand Up @@ -334,14 +355,20 @@ tmu_reader::read (bool skip_flag) {

// Parse the TMU text `s` into a tree by running a tmu_reader over it
// (read is called with skip_flag = true). The parse is bracketed by
// bench_start/bench_end under the label "tmu_to_tree", so the result has to
// be stored in a local first: returning directly from read() would leave
// bench_end unreachable.
tree
tmu_to_tree (string s) {
  bench_start ("tmu_to_tree");
  tmu_reader tmr (s);
  tree       t= tmr.read (true);
  bench_end ("tmu_to_tree");
  return t;
}

// Parse the TMU text `s` into a tree with a tmu_reader constructed from both
// `s` and `version` (read is called with skip_flag = true). The parse is
// bracketed by bench_start/bench_end under the label "tmu_to_tree", so the
// result has to be stored in a local first: returning directly from read()
// would leave bench_end unreachable.
tree
tmu_to_tree (string s, string version) {
  bench_start ("tmu_to_tree");
  tmu_reader tmr (s, version);
  tree       t= tmr.read (true);
  bench_end ("tmu_to_tree");
  return t;
}

/******************************************************************************
Expand Down
125 changes: 125 additions & 0 deletions tests/Data/Convert/convert_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

#include "base.hpp"
#include "convert.hpp"
#include "file.hpp"
#include "tm_ostream.hpp"
#include "tree_helper.hpp"

using namespace moebius;
Expand All @@ -25,6 +27,14 @@ class TestConverter : public QObject {
private slots:
void test_search_metadata_data ();
void test_search_metadata ();
void test_tmu_raw_data ();
void test_tmu_plain_text ();
void test_tmu_escape_sequences ();
void test_tmu_document ();
void test_tmu_compound ();
void test_tmu_nested_compound ();
void test_tmu_raw_data_performance ();
void test_tmu_text_performance ();
};

void
Expand Down Expand Up @@ -72,5 +82,120 @@ TestConverter::test_search_metadata () {
qcompare (search_metadata (input_tree, "invalid"), invalid);
}

void
TestConverter::test_tmu_raw_data () {
string s= "<#41424344>";
tree t= tmu_to_tree (s);
QVERIFY (is_func (t, RAW_DATA));
QCOMPARE (N (t), 1);
qcompare (as_string (t[0]), string ("ABCD"));
}

void
TestConverter::test_tmu_plain_text () {
tree t= tmu_to_tree ("hello");
QVERIFY (is_func (t, DOCUMENT));
QCOMPARE (N (t), 1);
QVERIFY (t[0] == tree ("hello"));

tree t2= tmu_to_tree ("hello world");
QVERIFY (is_func (t2, DOCUMENT));
QCOMPARE (N (t2), 1);
QVERIFY (t2[0] == tree ("hello world"));
}

void
TestConverter::test_tmu_escape_sequences () {
tree t1= tmu_to_tree ("a\\;b");
QVERIFY (is_func (t1, DOCUMENT));
QVERIFY (t1[0] == tree ("ab"));

tree t2= tmu_to_tree ("a\\\\b");
QVERIFY (is_func (t2, DOCUMENT));
QVERIFY (t2[0] == tree ("a\\b"));

tree t3= tmu_to_tree ("a\\|b");
QVERIFY (is_func (t3, DOCUMENT));
QVERIFY (t3[0] == tree ("a|b"));

tree t4= tmu_to_tree ("a\\>b");
QVERIFY (is_func (t4, DOCUMENT));
QVERIFY (t4[0] == tree ("a>b"));

QVERIFY (tmu_to_tree ("a\\<b")[0] == tree ("a<b"));
QVERIFY (tmu_to_tree ("\\;")[0] == tree (""));
QVERIFY (tmu_to_tree ("no escape")[0] == tree ("no escape"));
}

void
TestConverter::test_tmu_document () {
string s= "line1\n\nline2\n\nline3";
tree t= tmu_to_tree (s);
QVERIFY (is_func (t, DOCUMENT));
QCOMPARE (N (t), 3);
QVERIFY (t[0] == tree ("line1"));
QVERIFY (t[1] == tree ("line2"));
QVERIFY (t[2] == tree ("line3"));
}

void
TestConverter::test_tmu_compound () {
tree t= tmu_to_tree ("<bold|text>");
QVERIFY (is_func (t, DOCUMENT));
QCOMPARE (N (t), 1);
QVERIFY (is_compound (t[0]));
QVERIFY (t[0][0] == tree ("text"));
}

void
TestConverter::test_tmu_nested_compound () {
tree t= tmu_to_tree ("<with|color|red|hello>");
QVERIFY (is_func (t, DOCUMENT));
QCOMPARE (N (t), 1);
QVERIFY (is_compound (t[0]));
QCOMPARE (N (t[0]), 3);
QVERIFY (t[0][0] == tree ("color"));
QVERIFY (t[0][1] == tree ("red"));
QVERIFY (t[0][2] == tree ("hello"));
}

void
TestConverter::test_tmu_raw_data_performance () {
string hex_data;
for (int i= 0; i < 1000000; i++) {
hex_data << '4' << '1';
}
string s= "<TMU|<tuple|1.1.0|2025.1.5>>\n<#";
s << hex_data;
s << ">";

QElapsedTimer timer;
timer.start ();
tree t = tmu_document_to_tree (s);
qint64 elapsed= timer.elapsed ();

cout << "Performance: parsed 1M hex bytes in " << (int) elapsed << " ms\n";
QVERIFY (!is_compound (t, "error"));
}

void
TestConverter::test_tmu_text_performance () {
string text;
for (int i= 0; i < 1000000; i++) {
text << 'a';
}
string s= "<TMU|<tuple|1.1.0|2025.1.5>>\n<text|";
s << text;
s << "|>";

QElapsedTimer timer;
timer.start ();
tree t = tmu_document_to_tree (s);
qint64 elapsed= timer.elapsed ();

cout << "Performance: parsed 1M text chars in " << (int) elapsed << " ms\n";
QVERIFY (!is_compound (t, "error"));
}

QTEST_MAIN (TestConverter)
#include "convert_test.moc"
Loading