diff --git a/devel/0147.md b/devel/0147.md new file mode 100644 index 0000000000..39a483c8d8 --- /dev/null +++ b/devel/0147.md @@ -0,0 +1,51 @@ +# [0147] 优化打开大体积 TMU 文件的性能 + +## 1 相关文档 +- [dddd.md](dddd.md) - 任务文档模板 + +## 2 任务相关的代码文件 +- `src/Data/Convert/Mogan/from_tmu.cpp` - TMU 文件解析核心代码 +- `tests/Data/Convert/convert_test.cpp` - 单元测试 + +## 3 如何测试 + +### 3.1 确定性测试（单元测试） +```bash +xmake b convert_test +xmake r convert_test +``` + +### 3.2 非确定性测试（性能验证） +```bash +xmake b stem +# 测试加载 chapter-4.tmu 的时间（以下路径仅为示例，请替换为本机上该文件的实际路径） +xmake r stem /home/da/DevTeam/chapter-4.tmu +``` + +## 4 如何提交 + +提交前执行以下最少步骤： +```bash +xmake b convert_test +xmake r convert_test +xmake b stem +``` + +## 5 What +优化打开大体积 TMU 文件的性能。`chapter-4.tmu` 文件大小为 14MB，其中某些行包含数百万字节的十六进制编码数据（图片等）。当前解析器在处理这些数据时存在明显的性能瓶颈。 + +## 6 Why +`chapter-4.tmu` 打开速度缓慢，影响用户体验。通过性能分析发现，解析器在处理 RAW_DATA（`<#...>`）时，每两个字符就创建一个临时字符串传递给 `from_hex`，造成大量不必要的内存分配。 + +## 7 How +1. 优化 `from_tmu.cpp` 中 RAW_DATA 的解析逻辑，避免临时字符串创建。 + - 增加内联函数 `from_hex_char`，直接处理字符而非创建 2 字节临时字符串。 + - 1M hex bytes 解析时间从 44ms 降至 29ms。 +2. 优化 `read_next` 中普通字符的解析，避免逐字符调用 `read_char` 创建临时字符串。 + - 增加快速路径直接批量读取 ASCII 字符，仅对非 ASCII 字符调用 `decode_from_utf8`。 + - 1M 文本字符解析时间从 98ms 降至 23ms。 +3. 优化 `decode` 函数，避免无转义字符时的逐字节复制。 + - 先扫描字符串判断是否存在反斜杠，若无则直接返回原字符串。 + - 1M 文本字符解析时间从约 40ms 进一步降至约 20ms。 +4. 
通过 `bench_start`/`bench_end` 测量 `tmu_to_tree` 整体解析性能。 + - main 基线:310ms → 优化后:238ms(约 23% 提升)。 \ No newline at end of file diff --git a/src/Data/Convert/Mogan/from_tmu.cpp b/src/Data/Convert/Mogan/from_tmu.cpp index 530a85bf34..e40acea3d1 100644 --- a/src/Data/Convert/Mogan/from_tmu.cpp +++ b/src/Data/Convert/Mogan/from_tmu.cpp @@ -15,6 +15,7 @@ #include "preferences.hpp" #include "tree_helper.hpp" +#include "tm_debug.hpp" #include #include #include @@ -27,6 +28,14 @@ using moebius::drd::STD_CODE; using namespace moebius; +static inline int +from_hex_char (char c) { + if (is_digit (c)) return c - '0'; + if ((c >= 'A') && (c <= 'F')) return c + 10 - 'A'; + if ((c >= 'a') && (c <= 'f')) return c + 10 - 'a'; + return 0; +} + /****************************************************************************** * Conversion of TeXmacs strings of the present format to TeXmacs trees ******************************************************************************/ @@ -72,9 +81,13 @@ tmu_reader::skip_blank () { string tmu_reader::decode (string s) { - int i, n= N (s); - string r; + int i, n= N (s); for (i= 0; i < n; i++) + if (((i + 1) < n) && (s[i] == '\\')) break; + if (i == n) return s; + + string r (s (0, i)); + for (; i < n; i++) if (((i + 1) < n) && (s[i] == '\\')) { i++; if (s[i] == ';') @@ -120,18 +133,8 @@ tmu_reader::read_next () { if (c == "") return ""; if (c == "#") return "<#"; if ((c == "\\") || (c == "|") || (c == "/")) return "<" * c; - if (is_iso_alpha (c[0]) || (c == ">")) { - pos= old_pos; - return "<"; - } pos= old_pos; return "<"; - /* - string d= read_char (); - if ((d == "\\") || (d == "|") || (d == "/")) return "<" * c * d; - pos= old_pos; - return "<" * c; - */ } case '|': case '>': @@ -141,6 +144,23 @@ tmu_reader::read_next () { string r; pos= old_pos; while (true) { + while (pos < buf_N) { + char ch= buf[pos]; + if (ch == '\t' || ch == '\r' || ch == '\n' || ch == ' ' || ch == '<' || + ch == '|' || ch == '>' || ch == '\\') + break; + if ((ch & 0x80) == 0) 
{ + r << ch; + pos++; + } + else { + int start_pos= pos; + decode_from_utf8 (buf, pos); + r << buf (start_pos, pos); + } + } + if (pos >= buf_N) return r; + old_pos= pos; c = read_char (); if (c == "") return r; @@ -276,7 +296,8 @@ tmu_reader::read (bool skip_flag) { else if (tail_char_of_last == '#') { string r; while ((buf[pos] != '>') && (pos + 2 < buf_N)) { - r << ((char) from_hex (buf (pos, pos + 2))); + r << ((char) ((from_hex_char (buf[pos]) << 4) + + from_hex_char (buf[pos + 1]))); pos+= 2; } if (buf[pos] == '>') pos++; @@ -334,14 +355,20 @@ tmu_reader::read (bool skip_flag) { tree tmu_to_tree (string s) { + bench_start ("tmu_to_tree"); tmu_reader tmr (s); - return tmr.read (true); + tree t= tmr.read (true); + bench_end ("tmu_to_tree"); + return t; } tree tmu_to_tree (string s, string version) { + bench_start ("tmu_to_tree"); tmu_reader tmr (s, version); - return tmr.read (true); + tree t= tmr.read (true); + bench_end ("tmu_to_tree"); + return t; } /****************************************************************************** diff --git a/tests/Data/Convert/convert_test.cpp b/tests/Data/Convert/convert_test.cpp index 56b9e7ef3e..ad83e087de 100644 --- a/tests/Data/Convert/convert_test.cpp +++ b/tests/Data/Convert/convert_test.cpp @@ -12,6 +12,8 @@ #include "base.hpp" #include "convert.hpp" +#include "file.hpp" +#include "tm_ostream.hpp" #include "tree_helper.hpp" using namespace moebius; @@ -25,6 +27,14 @@ class TestConverter : public QObject { private slots: void test_search_metadata_data (); void test_search_metadata (); + void test_tmu_raw_data (); + void test_tmu_plain_text (); + void test_tmu_escape_sequences (); + void test_tmu_document (); + void test_tmu_compound (); + void test_tmu_nested_compound (); + void test_tmu_raw_data_performance (); + void test_tmu_text_performance (); }; void @@ -72,5 +82,120 @@ TestConverter::test_search_metadata () { qcompare (search_metadata (input_tree, "invalid"), invalid); } +void +TestConverter::test_tmu_raw_data 
() { + string s= "<#41424344>"; + tree t= tmu_to_tree (s); + QVERIFY (is_func (t, RAW_DATA)); + QCOMPARE (N (t), 1); + qcompare (as_string (t[0]), string ("ABCD")); +} + +void +TestConverter::test_tmu_plain_text () { + tree t= tmu_to_tree ("hello"); + QVERIFY (is_func (t, DOCUMENT)); + QCOMPARE (N (t), 1); + QVERIFY (t[0] == tree ("hello")); + + tree t2= tmu_to_tree ("hello world"); + QVERIFY (is_func (t2, DOCUMENT)); + QCOMPARE (N (t2), 1); + QVERIFY (t2[0] == tree ("hello world")); +} + +void +TestConverter::test_tmu_escape_sequences () { + tree t1= tmu_to_tree ("a\\;b"); + QVERIFY (is_func (t1, DOCUMENT)); + QVERIFY (t1[0] == tree ("ab")); + + tree t2= tmu_to_tree ("a\\\\b"); + QVERIFY (is_func (t2, DOCUMENT)); + QVERIFY (t2[0] == tree ("a\\b")); + + tree t3= tmu_to_tree ("a\\|b"); + QVERIFY (is_func (t3, DOCUMENT)); + QVERIFY (t3[0] == tree ("a|b")); + + tree t4= tmu_to_tree ("a\\>b"); + QVERIFY (is_func (t4, DOCUMENT)); + QVERIFY (t4[0] == tree ("a>b")); + + QVERIFY (tmu_to_tree ("a\\"); + QVERIFY (is_func (t, DOCUMENT)); + QCOMPARE (N (t), 1); + QVERIFY (is_compound (t[0])); + QVERIFY (t[0][0] == tree ("text")); +} + +void +TestConverter::test_tmu_nested_compound () { + tree t= tmu_to_tree (""); + QVERIFY (is_func (t, DOCUMENT)); + QCOMPARE (N (t), 1); + QVERIFY (is_compound (t[0])); + QCOMPARE (N (t[0]), 3); + QVERIFY (t[0][0] == tree ("color")); + QVERIFY (t[0][1] == tree ("red")); + QVERIFY (t[0][2] == tree ("hello")); +} + +void +TestConverter::test_tmu_raw_data_performance () { + string hex_data; + for (int i= 0; i < 1000000; i++) { + hex_data << '4' << '1'; + } + string s= ">\n<#"; + s << hex_data; + s << ">"; + + QElapsedTimer timer; + timer.start (); + tree t = tmu_document_to_tree (s); + qint64 elapsed= timer.elapsed (); + + cout << "Performance: parsed 1M hex bytes in " << (int) elapsed << " ms\n"; + QVERIFY (!is_compound (t, "error")); +} + +void +TestConverter::test_tmu_text_performance () { + string text; + for (int i= 0; i < 1000000; i++) { + 
text << 'a'; + } + string s= ">\n"; + + QElapsedTimer timer; + timer.start (); + tree t = tmu_document_to_tree (s); + qint64 elapsed= timer.elapsed (); + + cout << "Performance: parsed 1M text chars in " << (int) elapsed << " ms\n"; + QVERIFY (!is_compound (t, "error")); +} + QTEST_MAIN (TestConverter) #include "convert_test.moc"