From 27307c7349cdfb286f1871dd71d14d7cca3f9e58 Mon Sep 17 00:00:00 2001 From: "Christopher J. Hazard, PhD" <143410553+howsohazard@users.noreply.github.com> Date: Fri, 5 Jun 2026 13:30:38 -0400 Subject: [PATCH 1/2] 25592: Fixes bugs involving string commonality and edit distance --- .../EvaluableNodeTreeManipulation.cpp | 2 +- .../EvaluableNodeTreeManipulation.h | 31 +++++++++++++------ .../interpreter/OpcodesAdvancedMath.cpp | 2 +- .../OpcodesCodeComparisonAndEvolution.cpp | 11 +++---- .../OpcodesEntityComparisonAndEvolution.cpp | 4 +-- 5 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp b/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp index 64ed78f6..244345b0 100644 --- a/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp +++ b/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.cpp @@ -924,7 +924,7 @@ std::pair EvaluableNodeTreeManipulation::CommonalityBet { auto n1sid = n1->GetStringIDReference(); auto n2sid = n2->GetStringIDReference(); - double commonality = CommonalityBetweenStrings(n1sid, n2sid); + double commonality = RelativeCommonalityBetweenStrings(n1sid, n2sid); double commonality_including_type = std::min(0.125 + 0.875 * commonality, 1.0); return std::make_pair(n1, commonality_including_type); } diff --git a/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.h b/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.h index a780c445..5e2bab6f 100644 --- a/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.h +++ b/src/Amalgam/evaluablenode/EvaluableNodeTreeManipulation.h @@ -431,7 +431,7 @@ class EvaluableNodeTreeManipulation } //returns the commonality between two strings that are different - static inline double CommonalityBetweenStrings(StringInternPool::StringID sid1, StringInternPool::StringID sid2) + static inline double RelativeCommonalityBetweenStrings(StringInternPool::StringID sid1, StringInternPool::StringID sid2) { if(sid1 == sid2) 
return 1.0; @@ -454,9 +454,9 @@ class EvaluableNodeTreeManipulation return 0.75 * edit_score + 0.25 * length_ratio; } - //returns the EditDistance between the sequences a and b using the specified sequence_commonality_buffer + //returns the commonality between the sequences a and b using the specified sequence_commonality_buffer template - static size_t EditDistance(std::vector &a, std::vector &b, + static size_t CommonalityBetweenVectors(std::vector &a, std::vector &b, FlatMatrix &sequence_commonality_buffer) { //if either string is empty, return the other @@ -473,16 +473,27 @@ class EvaluableNodeTreeManipulation return (a == b ? 1 : 0); }); - //edit distance is the longest sequence's size minus the commonality - return std::max(a_size, b_size) - sequence_commonality_buffer.At(a_size, b_size); + return sequence_commonality_buffer.At(a_size, b_size); + } + + //computes the commonality between the two utf-8 strings + inline static size_t CommonalityBetweenStrings(const std::string &a, const std::string &b) + { + StringManipulation::ExplodeUTF8Characters(a, aCharsBuffer); + StringManipulation::ExplodeUTF8Characters(b, bCharsBuffer); + return CommonalityBetweenVectors(aCharsBuffer, bCharsBuffer, sequenceCommonalityBuffer); } - //returns the EditDistance between the sequences a and b + //returns the edit distance between the sequences a and b using the specified sequence_commonality_buffer template - inline static size_t EditDistance(std::vector &a, std::vector &b) + static size_t EditDistance(std::vector &a, std::vector &b, + FlatMatrix &sequence_commonality_buffer) { - FlatMatrix sequence_commonality; - return EditDistance(a, b, sequence_commonality); + size_t commonality = CommonalityBetweenVectors(a, b, sequence_commonality_buffer); + + //edit distance is the number of unmatched elements in both sequences: (a.size() - commonality) + (b.size() - commonality), + //which simplifies to a.size() + b.size() - 2 * commonality + return a.size() + b.size() - 2 * commonality; } //computes the edit distance 
(Levenshtein distance) between the two utf-8 strings @@ -490,7 +501,7 @@ class EvaluableNodeTreeManipulation { StringManipulation::ExplodeUTF8Characters(a, aCharsBuffer); StringManipulation::ExplodeUTF8Characters(b, bCharsBuffer); - return EvaluableNodeTreeManipulation::EditDistance(aCharsBuffer, bCharsBuffer, sequenceCommonalityBuffer); + return EditDistance(aCharsBuffer, bCharsBuffer, sequenceCommonalityBuffer); } //computes the edit distance (Levenshtein distance) between the two utf-8 strings diff --git a/src/Amalgam/interpreter/OpcodesAdvancedMath.cpp b/src/Amalgam/interpreter/OpcodesAdvancedMath.cpp index 38f4efaf..065514ac 100644 --- a/src/Amalgam/interpreter/OpcodesAdvancedMath.cpp +++ b/src/Amalgam/interpreter/OpcodesAdvancedMath.cpp @@ -1601,7 +1601,7 @@ static OpcodeInitializer _ENT_GENERALIZED_DISTANCE(ENT_GENERALIZED_DISTANCE, &In 1 [1] [{difference_type "continuous" data_type "code" nominal_strings .false types_must_match .false}] -))&", R"(3.697640774259515)"}, +))&", R"(3.9642281506573376)"}, {R"&((generalized_distance ;vector1 { diff --git a/src/Amalgam/interpreter/OpcodesCodeComparisonAndEvolution.cpp b/src/Amalgam/interpreter/OpcodesCodeComparisonAndEvolution.cpp index 43b84262..17503610 100644 --- a/src/Amalgam/interpreter/OpcodesCodeComparisonAndEvolution.cpp +++ b/src/Amalgam/interpreter/OpcodesCodeComparisonAndEvolution.cpp @@ -260,7 +260,7 @@ static OpcodeInitializer _ENT_COMMONALITY(ENT_COMMONALITY, &Interpreter::Interpr OpcodeDetails d; d.parameters = R"(* node1 * node2 [assoc params])"; d.returns = R"(number)"; - d.description = R"(Evaluates to the total count of all of the nodes referenced within `node1` and `node2` that are equivalent. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "use_string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. 
If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster.)"; + d.description = R"(Evaluates to the total count of all of the nodes referenced within `node1` and `node2` that are equivalent. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. 
If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster.)"; d.examples = MakeAmalgamExamples({ {R"&((commonality (lambda @@ -388,11 +388,8 @@ EvaluableNodeReference Interpreter::InterpretNode_ENT_COMMONALITY(EvaluableNode //calculate edit distance based commonality if string edit distance if(string_edit_distance) { - size_t s1_len = 0; - size_t s2_len = 0; - auto edit_distance = EvaluableNodeTreeManipulation::EditDistance( - EvaluableNode::ToString(ocn[0]), EvaluableNode::ToString(ocn[1]), s1_len, s2_len); - auto commonality = static_cast(std::max(s1_len, s2_len) - edit_distance); + double commonality = static_cast(EvaluableNodeTreeManipulation::CommonalityBetweenStrings( + EvaluableNode::ToString(ocn[0]), EvaluableNode::ToString(ocn[1]))); return AllocReturn(commonality, immediate_result); } @@ -416,7 +413,7 @@ static OpcodeInitializer _ENT_EDIT_DISTANCE(ENT_EDIT_DISTANCE, &Interpreter::Int OpcodeDetails d; d.parameters = R"(* node1 * node2 [assoc params])"; d.returns = R"(number)"; - d.description = R"(Evaluates to the number of nodes that are different between `node1` and `node2`. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "use_string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. 
If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster.)"; + d.description = R"(Evaluates to the number of nodes that are different between `node1` and `node2`. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. 
If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster.)"; d.examples = MakeAmalgamExamples({ {R"&((edit_distance (lambda diff --git a/src/Amalgam/interpreter/OpcodesEntityComparisonAndEvolution.cpp b/src/Amalgam/interpreter/OpcodesEntityComparisonAndEvolution.cpp index 47b36564..7b7dc806 100644 --- a/src/Amalgam/interpreter/OpcodesEntityComparisonAndEvolution.cpp +++ b/src/Amalgam/interpreter/OpcodesEntityComparisonAndEvolution.cpp @@ -331,7 +331,7 @@ static OpcodeInitializer _ENT_COMMONALITY_ENTITIES(ENT_COMMONALITY_ENTITIES, &In {nominal_strings .false types_must_match .false} ) ] -))&", R"([64 64.74178574543642])", "", R"((destroy_entities "MergeEntity1" "MergeEntity2" )"} +))&", R"([64 64.58517088326876])", "", R"((destroy_entities "MergeEntity1" "MergeEntity2" )"} }); d.requiresEntity = true; d.valueNewness = OpcodeDetails::OpcodeReturnNewnessType::NEW; @@ -446,7 +446,7 @@ static OpcodeInitializer _ENT_EDIT_DISTANCE_ENTITIES(ENT_EDIT_DISTANCE_ENTITIES, ) ] ) -))&", R"([11 9.516428509127167])", "", R"((destroy_entities "MergeEntity1" "MergeEntity2" )"}, +))&", R"([11 9.829658233462482])", "", R"((destroy_entities "MergeEntity1" "MergeEntity2" )"}, }); d.requiresEntity = true; From c94e9c20199fc7faff6c25fd95294b9127b49099 Mon Sep 17 00:00:00 2001 From: howso-automation Date: Fri, 5 Jun 2026 17:43:04 +0000 Subject: [PATCH 2/2] Automated docs rebuild --- docs/advanced_math.md | 2 +- docs/code_comparison_and_evolution.md | 4 ++-- docs/entity_comparison_and_evolution.md | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/advanced_math.md b/docs/advanced_math.md index 952e699f..94253185 100644 --- a/docs/advanced_math.md +++ b/docs/advanced_math.md @@ -1635,7 +1635,7 @@ Example: ``` Output: ```amalgam -3.697640774259515 +3.9642281506573376 ``` Example: ```amalgam diff --git 
a/docs/code_comparison_and_evolution.md b/docs/code_comparison_and_evolution.md index 100deed2..c41a5d5d 100644 --- a/docs/code_comparison_and_evolution.md +++ b/docs/code_comparison_and_evolution.md @@ -171,7 +171,7 @@ Output: #### Parameters `* node1 * node2 [assoc params]` #### Description -Evaluates to the total count of all of the nodes referenced within `node1` and `node2` that are equivalent. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "use_string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster. +Evaluates to the total count of all of the nodes referenced within `node1` and `node2` that are equivalent. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. 
If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster. #### Details - Permissions required: none - Allows concurrency: false @@ -375,7 +375,7 @@ Output: #### Parameters `* node1 * node2 [assoc params]` #### Description -Evaluates to the number of nodes that are different between `node1` and `node2`. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "use_string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster. 
+Evaluates to the number of nodes that are different between `node1` and `node2`. The assoc `params` can contain the keys "string_edit_distance", "types_must_match", "nominal_numbers", "nominal_strings", and "recursive_matching". If the key "string_edit_distance" is true (default is false), it will assume `node1` and `node2` as string literals and compute via string edit distance. If the key "types_must_match" is true (the default), it will only consider nodes common if the types match. If the key "nominal_numbers" is true (the default is false), then it will assume that all numbers will match only if identical; if false, it will compare similarity of values. The key "nominal_strings" defaults to true, but works similar to "nominal_numbers" except on strings using string edit distance. If the key "recursive_matching" is true or null, then it will attempt to recursively match any part of the data structure of `node1` to `node2`. If the key "recursive_matching" is false, then it will only attempt to merge the two at the same level, which yield better results if the data structures are common, and additionally will be much faster. #### Details - Permissions required: none - Allows concurrency: false diff --git a/docs/entity_comparison_and_evolution.md b/docs/entity_comparison_and_evolution.md index 1088996c..001d548a 100644 --- a/docs/entity_comparison_and_evolution.md +++ b/docs/entity_comparison_and_evolution.md @@ -246,7 +246,7 @@ Example: ``` Output: ```amalgam -[64 64.74178574543642] +[64 64.58517088326876] ``` [Amalgam Opcodes](./opcodes.md) @@ -337,7 +337,7 @@ Example: ``` Output: ```amalgam -[11 9.516428509127167] +[11 9.829658233462482] ``` [Amalgam Opcodes](./opcodes.md)