fix(tesseract/regex): support unicode characters

The changes are mostly in the first part of regexify_text.
2024-02-10 11:17:56 +07:00 · 2024-02-10 11:17:56 +07:00 · c72d2cf16b
commit c72d2cf16b
parent 5ae36d7f2a
5 changed files with 26 additions and 8 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -935,7 +935,7 @@ dependencies = [
 "httpdate",
 "itoa",
 "pin-project-lite",
- "socket2 0.4.10",
+ "socket2 0.5.5",
 "tokio",
 "tower-service",
 "tracing",
@ -2226,6 +2226,7 @@ dependencies = [
 "swordfish-common",
 "tokio",
 "toml",
+ "unicode-segmentation",
 ]

 [[package]]
@ -2762,6 +2763,12 @@ dependencies = [
 "tinyvec",
 ]

+[[package]]
+name = "unicode-segmentation"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
+
 [[package]]
 name = "untrusted"
 version = "0.9.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -22,4 +22,5 @@ rpath = false
 [profile.release-debug]
 inherits = "release"
 debug = true
+incremental = true
 strip = false
--- a/swordfish/Cargo.toml
+++ b/swordfish/Cargo.toml
@ -14,6 +14,7 @@ serde = "1.0.193"
 serenity = { version = "0.12.0", features = ["builder"] }
 tokio = { version = "1.35.1", features = ["full"] }
 toml = "0.8.8"
+unicode-segmentation = "1.11.0"

 [dependencies.swordfish-common]
 path = "../swordfish-common"
--- a/swordfish/src/katana.rs
+++ b/swordfish/src/katana.rs
@ -21,7 +21,7 @@ const CARD_NAME_HEIGHT: u32 = 70 - CARD_NAME_Y_OFFSET;
 const CARD_SERIES_X_OFFSET: u32 = 22;
 const CARD_SERIES_Y_OFFSET: u32 = 278;
 const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET;
-const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET;
+const CARD_SERIES_HEIGHT: u32 = 328 - CARD_SERIES_Y_OFFSET;

 fn save_image_if_trace(img: &DynamicImage, path: &str) {
    let log_lvl = CONFIG.get().unwrap().log.level.as_str();
--- a/swordfish/src/tesseract/utils.rs
+++ b/swordfish/src/tesseract/utils.rs
@ -1,9 +1,10 @@
 use swordfish_common::trace;
+use unicode_segmentation::UnicodeSegmentation;

 const ALLOWED_CHARS: [char; 14] = [
    ' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
 ];
-const REGEX_CHARS: [char; 4] = ['[', ']', ')', '('];
+const REGEX_STRINGS: [&str; 4] = ["[", "]", ")", "("];

 fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
    match text.find(from) {
@ -49,6 +50,10 @@ pub fn fix_tesseract_string(text: &mut String) {
    // This is usually the corner of the card
    trace!("Text: {}", text);
    replace_string(text, "A\n", "");
+    // Workaround for "\nqugnd" -> "\nGrand" (the pattern includes the leading newline)
+    // e.g. the "Grand" in "Grandfather"
+    trace!("Text: {}", text);
+    replace_string(text, "\nqugnd", "\nGrand");
    // Workaround for '“NO'
    // This is usually the left bottom corner of the card
    trace!("Text: {}", text);
@ -251,14 +256,18 @@ pub fn regexify_text(text: &String) -> String {
        }
        regex.push_str("(?=.*");
        let processed_word = word.to_lowercase();
+        let p_word_unicode = processed_word.graphemes(true).collect::<Vec<&str>>();
        trace!("Processed word: {}", processed_word);
-        if partial_match && processed_word.len() > 4 {
+        if partial_match && p_word_unicode.len() > 4 {
            // Remove first two and last two characters for "partial match"
-            if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c))
-                && !processed_word[word.len() - 2..word.len()]
-                    .contains(|c: char| REGEX_CHARS.contains(&c))
+            if !p_word_unicode[0..3]
+                .iter()
+                .any(|c: &&str| REGEX_STRINGS.contains(&c))
+                && !p_word_unicode[word.len() - 2..word.len()]
+                    .iter()
+                    .any(|c: &&str| REGEX_STRINGS.contains(&c))
            {
-                regex.push_str(&processed_word[2..word.len() - 2]);
+                regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str());
            } else {
                regex.push_str(&processed_word.as_str());
            }