From e84841b1f52c8f925fda9c04f041f992d276bdbd Mon Sep 17 00:00:00 2001 From: David Gisser Date: Sun, 21 Dec 2025 20:22:42 -0800 Subject: [PATCH] fix censor behavior for japanese diacritics --- src/censor.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/censor.rs b/src/censor.rs index 2294047..abf65bb 100644 --- a/src/censor.rs +++ b/src/censor.rs @@ -164,10 +164,12 @@ impl> Censor { fn filter_char(c: &char) -> bool { use finl_unicode::categories::{CharacterCategories, MinorCategory}; let category = c.get_minor_category(); + // Preserve Japanese dakuten/handakuten so kana aren't turned into their unvoiced forms. + let preserve_japanese = matches!(*c, '\u{3099}' | '\u{309A}'); let nok = matches!( category, MinorCategory::Cn | MinorCategory::Co | MinorCategory::Mn - ); + ) && !preserve_japanese; !(nok || BANNED.deref().deref().contains(*c)) } @@ -1275,6 +1277,15 @@ mod tests { ); } + #[test] + #[serial] + fn japanese_diacritics_preserved() { + assert_eq!("パピプペポ", "パピプペポ".censor()); + assert_eq!("バビブベボ", "バビブベボ".censor()); + assert_eq!("ぱぴぷぺぽ", "ぱぴぷぺぽ".censor()); + assert_eq!("ばびぶべぼ", "ばびぶべぼ".censor()); + } + #[test] #[serial] fn bandwidth() {