diff --git a/.idea/wana_kana_rust.iml b/.idea/wana_kana_rust.iml index d6ebd48..38e2683 100644 --- a/.idea/wana_kana_rust.iml +++ b/.idea/wana_kana_rust.iml @@ -2,7 +2,12 @@ - + + + + + + diff --git a/README.md b/README.md index 2146355..c6a04cd 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,11 @@ use wana_kana::ConvertJapanese; assert_eq!("ワナカナ".to_romaji(), "wanakana"); assert_eq!("WanaKana".to_hiragana(), "わなかな"); assert_eq!("WANAKANA".to_kana(), "ワナカナ"); + +// half-width katakana is supported +assert_eq!("ワナカナ".to_romaji(), "wanakana"); +assert_eq!("ワナカナ".to_hiragana(), "わなかな"); +assert_eq!("ワナカナ".to_katakana(), "ワナカナ"); ``` ## Tests diff --git a/src/constants.rs b/src/constants.rs index e805e84..d9e5400 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -95,4 +95,7 @@ pub const HIRAGANA_END: u32 = 0x3096; pub const KATAKANA_START: u32 = 0x30A1; pub const KATAKANA_END: u32 = 0x30FC; pub const PROLONGED_SOUND_MARK: u32 = 0x30FC; +pub const HALFWIDTH_PROLONGED_SOUND_MARK: u32 = 0xFF70; pub const KANA_SLASH_DOT: u32 = 0x30FB; +pub const HALFWIDTH_KATAKANA_START: u32 = 0xFF61; +pub const HALFWIDTH_KATAKANA_END: u32 = 0xFF9F; diff --git a/src/halfwidth_to_hiragana_node_tree.rs b/src/halfwidth_to_hiragana_node_tree.rs new file mode 100644 index 0000000..5982630 --- /dev/null +++ b/src/halfwidth_to_hiragana_node_tree.rs @@ -0,0 +1,659 @@ +#[derive(Debug, Clone)] +pub(crate) struct Node { + pub transitions: Option>, + pub output: &'static str, +} + +impl Node { + pub(crate) fn get(&self, chars: &[char]) -> (&'static str, usize) { + let mut i = 0; + let mut curr_node = self; + for char in chars.iter() { + if let Some(trans_node) = curr_node.find_transition_node(*char) { + curr_node = trans_node; + } else { + break; + } + i += 1; + } + (curr_node.output, i) + } + + pub(crate) fn find_transition_node(&self, char: char) -> Option<&Node> { + if let Some(t) = &self.transitions { + t.binary_search_by_key(&char, |t| t.0) + .ok() + .map(|index| &t[index].1) + } else { + None + } + } + + fn sort(&mut self) { + if let Some(transitions) = &mut self.transitions { + transitions.sort_by_key(|el| el.0); + for el in transitions { + el.1.sort(); + } + } + } +} + +lazy_static! { + pub(crate) static ref HALFWIDTH_KATAKANA_TO_HIRAGANA_NODE_TREE: Node = { + let transitions = Some(vec![ + ( + '\u{3000}', + Node { + transitions: None, + output: " ", + }, + ), + ( + 'ァ', + Node { + transitions: None, + output: "ぁ", + }, + ), + ( + 'ア', + Node { + transitions: None, + output: "あ", + }, + ), + ( + 'ィ', + Node { + transitions: None, + output: "ぃ", + }, + ), + ( + 'イ', + Node { + transitions: None, + output: "い", + }, + ), + ( + 'ゥ', + Node { + transitions: None, + output: "ぅ", + }, + ), + ( + 'ウ', + Node { + output: "う", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ゔ", + }, + )]), + }, + ), + ( + 'ェ', + Node { + transitions: None, + output: "ぇ", + }, + ), + ( + 'エ', + Node { + transitions: None, + output: "え", + }, + ), + ( + 'ォ', + Node { + transitions: None, + output: "ぉ", + }, + ), + ( + 'オ', + Node { + transitions: None, + output: "お", + }, + ), + ( + 'カ', + Node { + output: "か", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "が", + }, + )]), + }, + ), + ( + 'キ', + Node { + output: "き", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ぎ", + }, + )]), + }, + ), + ( + 'ク', + Node { + output: "く", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ぐ", + }, + )]), + }, + ), + ( + 'ケ', + Node { + output: "け", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "げ", + }, + )]), + }, + ), + ( + 'コ', + Node { + output: "こ", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ご", + }, + )]), + }, + ), + ( + 'サ', + Node { + output: "さ", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ざ", + }, + )]), + }, + ), + ( + 'シ', + Node { + output: "し", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "じ", + }, + )]), + }, + ), + ( + 'ス', + Node { + output: "す", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ず", + }, + )]), + }, + ), + ( + 'セ', + Node { + output: "せ", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ぜ", + }, + )]), + }, + ), + ( + 'ソ', + Node { + output: "そ", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ぞ", + }, + )]), + }, + ), + ( + 'タ', + Node { + output: "た", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "だ", + }, + )]), + }, + ), + ( + 'チ', + Node { + output: "ち", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ぢ", + }, + )]), + }, + ), + ( + 'ッ', + Node { + transitions: None, + output: "っ", + }, + ), + ( + 'ツ', + Node { + output: "つ", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "づ", + }, + )]), + }, + ), + ( + 'テ', + Node { + output: "て", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "で", + }, + )]), + }, + ), + ( + 'ト', + Node { + output: "と", + transitions: Some(vec![( + '゙', + Node { + transitions: None, + output: "ど", + }, + )]), + }, + ), + ( + 'ナ', + Node { + transitions: None, + output: "な", + }, + ), + ( + 'ニ', + Node { + transitions: None, + output: "に", + }, + ), + ( + 'ヌ', + Node { + transitions: None, + output: "ぬ", + }, + ), + ( + 'ネ', + Node { + transitions: None, + output: "ね", + }, + ), + ( + 'ノ', + Node { + transitions: None, + output: "の", + }, + ), + ( + 'ハ', + Node { + output: "は", + transitions: Some(vec![ + ( + '゙', + Node { + transitions: None, + output: "ば", + }, + ), + ( + '゚', + Node { + transitions: None, + output: "ぱ", + }, + ), + ]), + }, + ), + ( + 'ヒ', + Node { + output: "ひ", + transitions: Some(vec![ + ( + '゙', + Node { + transitions: None, + output: "び", + }, + ), + ( + '゚', + Node { + transitions: None, + output: "ぴ", + }, + ), + ]), + }, + ), + ( + 'フ', + Node { + output: "ふ", + transitions: Some(vec![ + ( + '゙', + Node { + transitions: None, + output: "ぶ", + }, + ), + ( + '゚', + Node { + transitions: None, + output: "ぷ", + }, + ), + ]), + }, + ), + ( + 'ヘ', + Node { + output: "へ", + transitions: Some(vec![ + ( + '゙', + Node { + transitions: None, + output: "べ", + }, + ), + ( + '゚', + Node { + transitions: None, + output: "ぺ", + }, + ), + ]), + }, + ), + ( + 'ホ', + Node { + output: "ほ", + transitions: Some(vec![ + ( + '゙', + Node { + transitions: None, + output: "ぼ", + }, + ), + ( + '゚', + Node { + transitions: None, + output: "ぽ", + }, + ), + ]), + }, + ), + ( + 'マ', + Node { + transitions: None, + output: "ま", + }, + ), + ( + 'ミ', + Node { + transitions: None, + output: "み", + }, + ), + ( + 'ム', + Node { + transitions: None, + output: "む", + }, + ), + ( + 'メ', + Node { + transitions: None, + output: "め", + }, + ), + ( + 'モ', + Node { + transitions: None, + output: "も", + }, + ), + ( + 'ャ', + Node { + transitions: None, + output: "ゃ", + }, + ), + ( + 'ヤ', + Node { + transitions: None, + output: "や", + }, + ), + ( + 'ュ', + Node { + transitions: None, + output: "ゅ", + }, + ), + ( + 'ユ', + Node { + transitions: None, + output: "ゆ", + }, + ), + ( + 'ョ', + Node { + transitions: None, + output: "ょ", + }, + ), + ( + 'ヨ', + Node { + transitions: None, + output: "よ", + }, + ), + ( + 'ラ', + Node { + transitions: None, + output: "ら", + }, + ), + ( + 'リ', + Node { + transitions: None, + output: "り", + }, + ), + ( + 'ル', + Node { + transitions: None, + output: "る", + }, + ), + ( + 'レ', + Node { + transitions: None, + output: "れ", + }, + ), + ( + 'ロ', + Node { + transitions: None, + output: "ろ", + }, + ), + ( + 'ワ', + Node { + transitions: None, + output: "わ", + }, + ), + ( + 'ヲ', + Node { + transitions: None, + output: "を", + }, + ), + ( + 'ン', + Node { + transitions: None, + output: "ん", + }, + ), + ( + 'ー', + Node { + transitions: None, + output: "ー", + }, + ), + ( + '。', + Node { + transitions: None, + output: "。", + }, + ), + ( + '、', + Node { + transitions: None, + output: "、", + }, + ), + ( + '「', + Node { + transitions: None, + output: "「", + }, + ), + ( + '」', + Node { + transitions: None, + output: "」", + }, + ), + ( + '・', + Node { + transitions: None, + output: "・", + }, + ), + ]); + + let mut node = Node { + transitions, + output: "", + }; + node.sort(); + node + }; +} diff --git a/src/is_katakana.rs b/src/is_katakana.rs index cd34e27..5d2ef16 100644 --- a/src/is_katakana.rs +++ b/src/is_katakana.rs @@ -1,3 +1,4 @@ +use crate::utils::is_char_halfwidth_katakana::is_char_halfwidth_katakana; use crate::utils::is_char_katakana::*; /// Test if all chars of `input` are [Katakana](https://en.wikipedia.org/wiki/Katakana) @@ -8,6 +9,14 @@ pub fn is_katakana(input: &str) -> bool { input.chars().all(is_char_katakana) } +/// Test if `input` contains any [Half-width Katakana](https://en.wikipedia.org/wiki/Half-width_kana) +pub fn is_mixed_halfwidth_katakana(input: &str) -> bool { + if input.is_empty() { + return false; + } + input.chars().any(is_char_halfwidth_katakana) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/lib.rs b/src/lib.rs index d40f7fe..39b2929 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -66,6 +66,7 @@ pub(crate) mod is_katakana; pub(crate) mod is_mixed; pub(crate) mod is_romaji; +pub(crate) mod halfwidth_to_hiragana_node_tree; pub(crate) mod to_hiragana; pub(crate) mod to_kana; pub(crate) mod to_kana_node_tree; @@ -87,6 +88,7 @@ mod options; pub use crate::options::Options; pub mod traits; + pub use traits::{ConvertJapanese, IsJapaneseChar, IsJapaneseStr}; #[cfg(test)] diff --git a/src/to_hiragana.rs b/src/to_hiragana.rs index 09045b3..312c446 100644 --- a/src/to_hiragana.rs +++ b/src/to_hiragana.rs @@ -153,6 +153,26 @@ mod tests { } } + mod halfwidth_katakana { + use super::*; + #[test] + fn converts_to_hiragana() { + assert_eq!(to_hiragana("アイウエオカキクケコ"), "あいうえおかきくけこ"); + assert_eq!(to_hiragana("サシスセソタチツテト"), "さしすせそたちつてと"); + assert_eq!(to_hiragana("ナニヌネノハヒフヘホ"), "なにぬねのはひふへほ"); + assert_eq!(to_hiragana("マミムメモヤユヨ"), "まみむめもやゆよ"); + assert_eq!(to_hiragana("ラリルレロワヲン"), "らりるれろわをん"); + assert_eq!(to_hiragana("ァィゥェォャュョッー・「」"), "ぁぃぅぇぉゃゅょっ・「」"); + assert_eq!(to_hiragana("ガギグゲゴ"), "がぎぐげご"); + assert_eq!(to_hiragana("ザジズゼゾ"), "ざじずぜぞ"); + assert_eq!(to_hiragana("ダヂヅデド"), "だぢづでど"); + assert_eq!(to_hiragana("バビブベボ"), "ばびぶべぼ"); + assert_eq!(to_hiragana("パピプペポ"), "ぱぴぷぺぽ"); + assert_eq!(to_hiragana("ヴ"), "ゔ"); + assert_eq!(to_hiragana("スーパー"), to_hiragana("スーパー")); + } + } + #[test] fn mixed_input() { assert_eq!( diff --git a/src/to_kana.rs b/src/to_kana.rs index a20e695..2ad6c72 100644 --- a/src/to_kana.rs +++ b/src/to_kana.rs @@ -115,8 +115,8 @@ mod tests { #[test] fn non_romaji_will_be_passed_through() { assert_eq!( - to_kana("ワニカニ AiUeO 鰐蟹 12345 @#$%"), - "ワニカニ アいウえオ 鰐蟹 12345 @#$%" + to_kana("ワニカニ AiUeO アイウエオ 鰐蟹 12345 @#$%"), + "ワニカニ アいウえオ アイウエオ 鰐蟹 12345 @#$%" ); } diff --git a/src/to_katakana.rs b/src/to_katakana.rs index b1fbeb3..c8c78ec 100644 --- a/src/to_katakana.rs +++ b/src/to_katakana.rs @@ -1,6 +1,8 @@ +use crate::is_katakana::is_mixed_halfwidth_katakana; use crate::is_mixed::*; use crate::is_romaji::*; use crate::options::Options; +use crate::utils::halfwidth_katakana_to_hiragana::halfwidth_katakana_to_hiragana; use crate::utils::hiragana_to_katakana::*; use crate::utils::romaji_to_hiragana::*; @@ -11,6 +13,13 @@ pub fn to_katakana(input: &str) -> String { /// Convert input to [Katakana](https://en.wikipedia.org/wiki/Katakana) pub fn to_katakana_with_opt(input: &str, options: Options) -> String { let config = options; + + let input = if is_mixed_halfwidth_katakana(input) { + &halfwidth_katakana_to_hiragana(input) + } else { + input + }; + if config.pass_romaji { hiragana_to_katakana(input) } else if is_romaji(input) || is_mixed(input) { @@ -118,4 +127,23 @@ mod tests { ); } } + + mod hankaku_katakana { + use super::*; + #[test] + fn converts_to_hiragana() { + assert_eq!(to_katakana("アイウエオカキクケコ"), "アイウエオカキクケコ"); + assert_eq!(to_katakana("サシスセソタチツテト"), "サシスセソタチツテト"); + assert_eq!(to_katakana("ナニヌネノハヒフヘホ"), "ナニヌネノハヒフヘホ"); + assert_eq!(to_katakana("マミムメモヤユヨ"), "マミムメモヤユヨ"); + assert_eq!(to_katakana("ラリルレロワヲン"), "ラリルレロワヲン"); + assert_eq!(to_katakana("ァィゥェォャュョッー・「」"), "ァィゥェォャュョッー・「」"); + assert_eq!(to_katakana("ガギグゲゴ"), "ガギグゲゴ"); + assert_eq!(to_katakana("ザジズゼゾ"), "ザジズゼゾ"); + assert_eq!(to_katakana("ダヂヅデド"), "ダヂヅデド"); + assert_eq!(to_katakana("バビブベボ"), "バビブベボ"); + assert_eq!(to_katakana("パピプペポ"), "パピプペポ"); + assert_eq!(to_katakana("ヴ"), "ヴ"); + } + } } diff --git a/src/to_romaji.rs b/src/to_romaji.rs index ae8a7f8..8941c2d 100644 --- a/src/to_romaji.rs +++ b/src/to_romaji.rs @@ -228,6 +228,27 @@ mod tests { } } + mod halfwidth_katakana { + use super::*; + #[test] + fn converts_to_hiragana() { + assert_eq!(to_romaji("アイウエオカキクケコ"), "aiueokakikukeko"); + assert_eq!(to_romaji("サシスセソタチツテト"), "sashisusesotachitsuteto"); + assert_eq!(to_romaji("ナニヌネノハヒフヘホ"), "naninunenohahifuheho"); + assert_eq!(to_romaji("マミムメモヤユヨ"), "mamimumemoyayuyo"); + assert_eq!(to_romaji("ラリルレロワヲン"), "rarirurerowawon"); + assert_eq!(to_romaji("ァィゥェォャュョ"), "aiueoyayuyo"); + assert_eq!(to_romaji("ッ"), ""); + assert_eq!(to_romaji("グッド"), "guddo"); + assert_eq!(to_romaji("ガギグゲゴ"), "gagigugego"); + assert_eq!(to_romaji("ザジズゼゾ"), "zajizuzezo"); + assert_eq!(to_romaji("ダヂヅデド"), "dajizudedo"); + assert_eq!(to_romaji("バビブベボ"), "babibubebo"); + assert_eq!(to_romaji("パピプペポ"), "papipupepo"); + assert_eq!(to_romaji("ヴ"), "vo"); + } + } + #[test] fn check_panic_issue_13() { assert_eq!(to_romaji("ウーッー"), "uu"); diff --git a/src/utils/halfwidth_katakana_to_hiragana.rs b/src/utils/halfwidth_katakana_to_hiragana.rs new file mode 100644 index 0000000..08dd55c --- /dev/null +++ b/src/utils/halfwidth_katakana_to_hiragana.rs @@ -0,0 +1,47 @@ +//! Convert [Half-width Katakana](https://en.wikipedia.org/wiki/Half-width_kana). to [Hiragana](https://en.wikipedia.org/wiki/Hiragana) +//! +//! Passes through any non-half-width katakana chars +//! +//! # Examples +//! +//! halfwidth_katakana_to_hiragana('カタカナ') +//! +//! // => "かたかな" +//! +//! halfwidth_katakana_to_hiragana('カタカナ is a type of kana') +//! +//! // => "カタカナ is a type of kana" + +use crate::halfwidth_to_hiragana_node_tree::HALFWIDTH_KATAKANA_TO_HIRAGANA_NODE_TREE; + +pub fn halfwidth_katakana_to_hiragana(orig: &str) -> String { + let chars = orig.chars().collect::>(); + let mut output = String::with_capacity(orig.len()); + let len = chars.len(); + // Position in the string that is being evaluated + let mut curr_pos = 0; + + while curr_pos != len { + let result = HALFWIDTH_KATAKANA_TO_HIRAGANA_NODE_TREE.get(&chars[curr_pos..]); + // nothing found, pass through + if result.1 == 0 { + output.push(chars[curr_pos]); + curr_pos += 1; + } else { + output.push_str(result.0); + curr_pos += result.1; + } + } + + output +} + +#[test] +fn test_halfwidth_katakana_to_hiragana() { + assert_eq!(halfwidth_katakana_to_hiragana("カタカナ"), "かたかな"); + assert_eq!(halfwidth_katakana_to_hiragana("カタカナ"), "カタカナ"); + assert_eq!( + halfwidth_katakana_to_hiragana("カタカナ カタカナ is a type of kana"), + "カタカナ かたかな is a type of kana" + ); +} diff --git a/src/utils/hiragana_to_katakana.rs b/src/utils/hiragana_to_katakana.rs index a829d75..bc9c017 100644 --- a/src/utils/hiragana_to_katakana.rs +++ b/src/utils/hiragana_to_katakana.rs @@ -1,7 +1,7 @@ use crate::constants::{HIRAGANA_START, KATAKANA_START}; use crate::utils::is_char_hiragana::*; -use crate::utils::is_char_long_dash::*; use crate::utils::is_char_slash_dot::*; +use crate::utils::is_prolonged_sound::*; /// Convert [Hiragana](https://en.wikipedia.org/wiki/Hiragana) to [Katakana](https://en.wikipedia.org/wiki/Katakana) /// @@ -22,7 +22,7 @@ pub fn hiragana_to_katakana(input: &str) -> String { let mut kata = vec![]; for char in input.chars() { // Short circuit to avoid incorrect codeshift for 'ー' and '・' - if is_char_long_dash(char) || is_char_slash_dot(char) { + if is_prolonged_sound(char) || is_char_slash_dot(char) { kata.push(char); } else if is_char_hiragana(char) { // Shift charcode. diff --git a/src/utils/is_char_halfwidth_katakana.rs b/src/utils/is_char_halfwidth_katakana.rs new file mode 100644 index 0000000..ca31448 --- /dev/null +++ b/src/utils/is_char_halfwidth_katakana.rs @@ -0,0 +1,22 @@ +use crate::constants::{HALFWIDTH_KATAKANA_END, HALFWIDTH_KATAKANA_START}; +use crate::utils::is_char_in_range::*; + +/// Tests a character. Returns true if the character is [Half-width Katakana](https://en.wikipedia.org/wiki/Half-width_kana). + +pub fn is_char_halfwidth_katakana(char: char) -> bool { + is_char_in_range(char, HALFWIDTH_KATAKANA_START, HALFWIDTH_KATAKANA_END) +} + +#[test] +fn is_char_halfwidth_katakana_test() { + assert!(is_char_halfwidth_katakana('。')); + assert!(is_char_halfwidth_katakana('ヲ')); + assert!(is_char_halfwidth_katakana('゚')); + assert!(is_char_halfwidth_katakana('「')); + assert!(is_char_halfwidth_katakana('」')); + + assert!(!is_char_halfwidth_katakana('ナ')); + assert!(!is_char_halfwidth_katakana('は')); + assert!(!is_char_halfwidth_katakana('n')); + assert!(!is_char_halfwidth_katakana('!')); +} diff --git a/src/utils/is_char_hiragana.rs b/src/utils/is_char_hiragana.rs index 47bcf15..d5f6a09 100644 --- a/src/utils/is_char_hiragana.rs +++ b/src/utils/is_char_hiragana.rs @@ -1,10 +1,10 @@ use crate::constants::{HIRAGANA_END, HIRAGANA_START}; use crate::utils::is_char_in_range::*; -use crate::utils::is_char_long_dash::is_char_long_dash; +use crate::utils::is_prolonged_sound::is_prolonged_sound; /// Tests a character. Returns true if the character is [Hiragana](https://en.wikipedia.org/wiki/Hiragana). pub fn is_char_hiragana(char: char) -> bool { - if is_char_long_dash(char) { + if is_prolonged_sound(char) { return true; }; is_char_in_range(char, HIRAGANA_START, HIRAGANA_END) diff --git a/src/utils/is_char_long_dash.rs b/src/utils/is_char_long_dash.rs deleted file mode 100644 index 9e9e7de..0000000 --- a/src/utils/is_char_long_dash.rs +++ /dev/null @@ -1,14 +0,0 @@ -use crate::constants::PROLONGED_SOUND_MARK; - -/// Returns true if char is 'ー' -pub fn is_char_long_dash(char: char) -> bool { - char as u32 == PROLONGED_SOUND_MARK -} - -#[test] -fn is_char_long_dash_test() { - assert!(is_char_long_dash('ー')); - assert!(!is_char_long_dash('-')); - assert!(!is_char_long_dash('f')); - assert!(!is_char_long_dash('ふ')); -} diff --git a/src/utils/is_prolonged_sound.rs b/src/utils/is_prolonged_sound.rs new file mode 100644 index 0000000..8d29930 --- /dev/null +++ b/src/utils/is_prolonged_sound.rs @@ -0,0 +1,16 @@ +use crate::constants::{HALFWIDTH_PROLONGED_SOUND_MARK, PROLONGED_SOUND_MARK}; + +/// Returns true if char is the prolonged sound mark (fullwidth 'ー' or halfwidth 'ー'). +pub fn is_prolonged_sound(char: char) -> bool { + let code = char as u32; + code == PROLONGED_SOUND_MARK || code == HALFWIDTH_PROLONGED_SOUND_MARK +} + +#[test] +fn is_prolonged_sound_test() { + assert!(is_prolonged_sound('ー')); + assert!(is_prolonged_sound('ー')); + assert!(!is_prolonged_sound('-')); + assert!(!is_prolonged_sound('f')); + assert!(!is_prolonged_sound('ふ')); +} diff --git a/src/utils/katakana_to_hiragana.rs b/src/utils/katakana_to_hiragana.rs index 7bd570c..f3cc2ed 100644 --- a/src/utils/katakana_to_hiragana.rs +++ b/src/utils/katakana_to_hiragana.rs @@ -15,16 +15,18 @@ use fnv::FnvHashMap; use crate::constants::{HIRAGANA_START, KATAKANA_START}; +use crate::halfwidth_to_hiragana_node_tree::HALFWIDTH_KATAKANA_TO_HIRAGANA_NODE_TREE; use crate::to_romaji::TO_ROMAJI_NODE_TREE; +use crate::utils::is_char_halfwidth_katakana::is_char_halfwidth_katakana; use crate::utils::is_char_katakana::*; -use crate::utils::is_char_long_dash::*; use crate::utils::is_char_slash_dot::*; +use crate::utils::is_prolonged_sound::*; pub fn is_char_initial_long_dash(char: char, index: usize) -> bool { - is_char_long_dash(char) && index == 0 + is_prolonged_sound(char) && index == 0 } pub fn is_char_inner_long_dash(char: char, index: usize) -> bool { - is_char_long_dash(char) && index != 0 + is_prolonged_sound(char) && index != 0 } pub fn is_kana_as_symbol(char: char) -> bool { 'ヶ' == char || 'ヵ' == char @@ -45,30 +47,37 @@ pub fn katakana_to_hiragana(input: &str) -> String { } pub(crate) fn katakana_to_hiragana_with_opt(input: &str, is_destination_romaji: bool) -> String { - let mut hira = Vec::with_capacity(input.chars().count()); + let chars = input.chars().collect::>(); + let mut hira = Vec::with_capacity(chars.len()); let mut previous_kana: Option = None; - for (index, input_char) in input.chars().enumerate() { + let mut count: usize = 0; + + while count < chars.len() { + let input_char = chars[count]; // Short circuit to avoid incorrect codeshift for 'ー' and '・' if is_char_slash_dot(input_char) - || is_char_initial_long_dash(input_char, index) + || is_char_initial_long_dash(input_char, count) || is_kana_as_symbol(input_char) { hira.push(input_char); // Transform long vowels: 'オー' to 'おう' } else if let (Some(previous_kana), true) = - (previous_kana, is_char_inner_long_dash(input_char, index)) + (previous_kana, is_char_inner_long_dash(input_char, count)) { // Transform previous_kana back to romaji, and slice off the vowel let Some(node) = TO_ROMAJI_NODE_TREE.find_transition_node(previous_kana) else { hira.push(input_char); + count += 1; continue; }; let romaji_opt = node.output.chars().last(); // However, ensure 'オー' => 'おお' => 'oo' if this is a transform on the way to romaji - if let Some(prev_char) = input.chars().nth(index - 1) { - if is_char_katakana(prev_char) && romaji_opt == Some('o') && is_destination_romaji { + if let Some(prev_char) = chars.get(count - 1) { + if is_char_katakana(*prev_char) && romaji_opt == Some('o') && is_destination_romaji + { hira.push('お'); + count += 1; continue; } } @@ -76,7 +85,7 @@ pub(crate) fn katakana_to_hiragana_with_opt(input: &str, is_destination_romaji: if let Some(hit) = romaji_opt.and_then(|romaji| LONG_VOWELS.get(&romaji)) { hira.push(*hit); } - } else if !is_char_long_dash(input_char) && is_char_katakana(input_char) { + } else if !is_prolonged_sound(input_char) && is_char_katakana(input_char) { let hira_char = match input_char { // rare special cases 'ヷ' => 'わ', // wa with a voiced mark @@ -93,11 +102,19 @@ pub(crate) fn katakana_to_hiragana_with_opt(input: &str, is_destination_romaji: hira.push(hira_char); previous_kana = Some(hira_char); + } else if is_char_halfwidth_katakana(input_char) { + let result = HALFWIDTH_KATAKANA_TO_HIRAGANA_NODE_TREE.get(&chars[count..]); + hira.extend(result.0.chars()); + // Track the last produced kana so a following 'ー' can trigger + // the long-vowel transformation (e.g. 'スー' => 'すう'). + previous_kana = result.0.chars().last(); + count += result.1 - 1; } else { // Pass non katakana chars through hira.push(input_char); previous_kana = None; } + count += 1; } hira.into_iter().collect() } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index a3326ef..722fbbd 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,9 +1,11 @@ #[macro_use] pub(crate) mod hashmap_macro; pub mod get_chunk; +pub mod halfwidth_katakana_to_hiragana; pub mod hiragana_to_katakana; pub mod is_char_consonant; pub mod is_char_english_punctuation; +pub mod is_char_halfwidth_katakana; pub mod is_char_hiragana; pub mod is_char_in_range; pub mod is_char_japanese; @@ -13,12 +15,12 @@ pub mod is_char_kana; pub mod is_char_kanji; pub mod is_char_katakana; pub mod is_char_latin_number; -pub mod is_char_long_dash; pub mod is_char_punctuation; pub mod is_char_romaji; pub mod is_char_slash_dot; pub mod is_char_upper_case; pub mod is_char_vowel; +pub mod is_prolonged_sound; pub mod katakana_to_hiragana; pub mod romaji_to_hiragana; @@ -35,12 +37,12 @@ pub use is_char_kana::*; pub use is_char_kanji::*; pub use is_char_katakana::*; pub use is_char_latin_number::*; -pub use is_char_long_dash::*; pub use is_char_punctuation::*; pub use is_char_romaji::*; pub use is_char_slash_dot::*; pub use is_char_upper_case::*; pub use is_char_vowel::*; +pub use is_prolonged_sound::*; pub use katakana_to_hiragana::*; pub use romaji_to_hiragana::*;