diff --git a/README.md b/README.md index 5bac55cd..ce71793c 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@

+The website can be accessed using this link. This program uses the fast flow-based method developed by Michael T. Gastner, Vivien Seguy, and Pratyush More. For more information, you may refer to the following [paper](https://www.pnas.org/content/115/10/E2156): diff --git a/include/string_to_decimal_converter.hpp b/include/string_to_decimal_converter.hpp index 8d4a741f..fc6ff2b1 100644 --- a/include/string_to_decimal_converter.hpp +++ b/include/string_to_decimal_converter.hpp @@ -4,6 +4,20 @@ #include #include +/** + * @brief A utility class for converting string representations of numbers to decimal format. + * + * This class handles various number formats including: + * - Regular decimal numbers (e.g. "123.456", "123,456") + * - Scientific notation (e.g. "1.23e-4", "1.23E4") + * - Special value "NA" + * + * For scientific notation: + * - Both 'e' and 'E' are supported as exponent markers + * - The mantissa can use either '.' or ',' as decimal separator + * - The exponent must be an integer and can be negative + * - The mantissa must be a valid decimal number + */ class StringToDecimalConverter { private: diff --git a/requirements.txt b/requirements.txt index 8ef9808b..b2b3a188 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ pip wheel conan==2.16.1 -cmake==3.30.0 \ No newline at end of file +cmake==3.30.0 diff --git a/src/misc/string_to_decimal_converter.cpp b/src/misc/string_to_decimal_converter.cpp index 29e6b322..2967af43 100644 --- a/src/misc/string_to_decimal_converter.cpp +++ b/src/misc/string_to_decimal_converter.cpp @@ -7,7 +7,7 @@ const std::string StringToDecimalConverter::NA_ = "NA"; bool StringToDecimalConverter::is_valid_char(char ch) { - return (std::isdigit(ch)) || ch == point_ || ch == comma_ || ch == minus_; + return (std::isdigit(ch)) || ch == point_ || ch == comma_ || ch == minus_ || ch == 'e' || ch == 'E'; } std::string StringToDecimalConverter::remove_char(std::string str, char ch) @@ -87,22 +87,66 @@ bool StringToDecimalConverter::is_str_valid_characters(const std::string &str) return true; } - // Only 0 to 9, '.', '-', and ',' are allowed + // Only 0 to 9, '.', '-', ',', 'e', and 'E' are allowed for (const auto &c : str) { if (!is_valid_char(c)) { return false; } } - // '-' can only be used once - if (count_char(str, minus_) > 1) { - return false; + // '-' can only be used once in the mantissa and once in the exponent + size_t first_e = str.find_first_of("eE"); + if (first_e != std::string::npos) { + // Check mantissa part + std::string mantissa = str.substr(0, first_e); + if (count_char(mantissa, minus_) > 1) { + return false; + } + if (count_char(mantissa, minus_) == 1 && mantissa[0] != minus_) { + return false; + } + // Check exponent part + std::string exponent = str.substr(first_e + 1); + if (count_char(exponent, minus_) > 1) { + return false; + } + if (count_char(exponent, minus_) == 1 && exponent[0] != minus_) { + return false; + } + } else { + // No scientific notation, check as before + if (count_char(str, minus_) > 1) { + return false; + } + if (count_char(str, minus_) == 1 && str[0] != minus_) { + return false; + } } - // '-' can only be used at the beginning - if (count_char(str, minus_) == 1 and str[0] != minus_) { + // Check for valid scientific notation format + size_t e_count = count_char(str, 'e') + count_char(str, 'E'); + if (e_count > 1) { return false; } + if (e_count == 1) { + size_t e_pos = str.find_first_of("eE"); + // Must have digits before and after 'e'/'E' + if (e_pos == 0 || e_pos == str.length() - 1) { + return false; + } + // Must have at least one digit after 'e'/'E' + bool has_digit_after = false; + for (size_t i = e_pos + 1; i < str.length(); i++) { + if (std::isdigit(str[i])) { + has_digit_after = true; + break; + } + } + if (!has_digit_after) { + return false; + } + } + return true; } @@ -111,19 +155,37 @@ bool StringToDecimalConverter::is_str_correct_format(const std::string &str) assert(is_str_valid_characters(str)); assert(is_str_NA(str) == false); - // if the number of commas and points both are more than 1, then this format - // does not belong to any known convention + // Handle scientific notation separately + size_t e_pos = str.find_first_of("eE"); + if (e_pos != std::string::npos) { + // Check mantissa part + std::string mantissa = str.substr(0, e_pos); + if (has_multiple_commas_and_points(mantissa)) { + return false; + } + if (has_invalid_comma_point_sequence(mantissa)) { + return false; + } + if (has_separator_at_the_end(mantissa)) { + return false; + } + // Exponent part should only contain digits and optional minus sign + std::string exponent = str.substr(e_pos + 1); + for (char c : exponent) { + if (!std::isdigit(c) && c != minus_) { + return false; + } + } + return true; + } + + // Original format validation for non-scientific notation if (has_multiple_commas_and_points(str)) { return false; } - - // Check for commas before and after a point, or points before and after a - // comma if (has_invalid_comma_point_sequence(str)) { return false; } - - // Check for separators at the end of the string if (has_separator_at_the_end(str)) { return false; } diff --git a/tests/unit/test_string_to_decimal_converter.cpp b/tests/unit/test_string_to_decimal_converter.cpp index 0e33be60..6c4bec0d 100644 --- a/tests/unit/test_string_to_decimal_converter.cpp +++ b/tests/unit/test_string_to_decimal_converter.cpp @@ -78,4 +78,87 @@ BOOST_AUTO_TEST_CASE(TestCorrectFormat_PointAtEnd) BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format("123456789.")); } +BOOST_AUTO_TEST_CASE(TestValidCharacters_ScientificNotation) +{ + // Basic scientific notation + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("1.23e4")); + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("1.23E4")); + + // Negative exponents + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("1.23e-4")); + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("1.23E-4")); + + // With commas as thousand separators + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("1,234.56e4")); + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("1.234,56E4")); + + // Negative numbers + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("-1.23e4")); + BOOST_CHECK(StringToDecimalConverter::is_str_valid_characters("-1.23E-4")); + + // Invalid scientific notation + BOOST_CHECK(!StringToDecimalConverter::is_str_valid_characters( + "1.23e")); // Missing exponent + BOOST_CHECK(!StringToDecimalConverter::is_str_valid_characters( + "e4")); // Missing mantissa + BOOST_CHECK(!StringToDecimalConverter::is_str_valid_characters( + "1.23ee4")); // Multiple e's + BOOST_CHECK(!StringToDecimalConverter::is_str_valid_characters( + "1.23e4.5")); // Non-integer exponent + BOOST_CHECK(!StringToDecimalConverter::is_str_valid_characters( + "1.23e-")); // Incomplete negative exponent +} + +BOOST_AUTO_TEST_CASE(TestCorrectFormat_ScientificNotation) +{ + // Valid formats + BOOST_CHECK(StringToDecimalConverter::is_str_correct_format("1.23e4")); + BOOST_CHECK(StringToDecimalConverter::is_str_correct_format("1.23E4")); + BOOST_CHECK(StringToDecimalConverter::is_str_correct_format("1,234.56e4")); + BOOST_CHECK(StringToDecimalConverter::is_str_correct_format("1.234,56E4")); + BOOST_CHECK(StringToDecimalConverter::is_str_correct_format("-1.23e4")); + BOOST_CHECK(StringToDecimalConverter::is_str_correct_format("-1.23E-4")); + + // Invalid formats + BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format( + "1.23e4.5")); // Non-integer exponent + BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format( + "1.23e4,")); // Comma at end + BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format( + "1.23e4.")); // Point at end + BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format( + "1.23,456.789e4")); // Multiple separators + BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format( + "1.23e4e5")); // Multiple e's + BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format( + "1.23e")); // Missing exponent + BOOST_CHECK(!StringToDecimalConverter::is_str_correct_format( + "e4")); // Missing mantissa +} + +BOOST_AUTO_TEST_CASE(TestParseStr_ScientificNotation) +{ + // Test parsing with point as decimal separator + BOOST_CHECK_EQUAL( + StringToDecimalConverter::parse_str("1.23e4", true), + "1.23e4"); + BOOST_CHECK_EQUAL( + StringToDecimalConverter::parse_str("1,234.56e4", true), + "1234.56e4"); + BOOST_CHECK_EQUAL( + StringToDecimalConverter::parse_str("-1.23e-4", true), + "-1.23e-4"); + + // Test parsing with comma as decimal separator + BOOST_CHECK_EQUAL( + StringToDecimalConverter::parse_str("1,23e4", false), + "1.23e4"); + BOOST_CHECK_EQUAL( + StringToDecimalConverter::parse_str("1.234,56E4", false), + "1234.56E4"); + BOOST_CHECK_EQUAL( + StringToDecimalConverter::parse_str("-1,23e-4", false), + "-1.23e-4"); +} + BOOST_AUTO_TEST_SUITE_END()