From c72d2cf16ba16cd503189ab4fc3b6d13930adf95 Mon Sep 17 00:00:00 2001
From: tretrauit <tretrauit@gmail.com>
Date: Sat, 10 Feb 2024 11:17:56 +0700
Subject: [PATCH] fix(tesseract/regex): support unicode characters

The fix is mostly in the first part of regexify_text, which now slices words
by grapheme cluster instead of by byte.
---
 Cargo.lock                       |  9 ++++++++-
 Cargo.toml                       |  1 +
 swordfish/Cargo.toml             |  1 +
 swordfish/src/katana.rs          |  2 +-
 swordfish/src/tesseract/utils.rs | 21 +++++++++++++++------
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 73f8ec3..509fc4c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -935,7 +935,7 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "socket2 0.4.10",
+ "socket2 0.5.5",
  "tokio",
  "tower-service",
  "tracing",
@@ -2226,6 +2226,7 @@ dependencies = [
  "swordfish-common",
  "tokio",
  "toml",
+ "unicode-segmentation",
 ]
 
 [[package]]
@@ -2762,6 +2763,12 @@ dependencies = [
  "tinyvec",
 ]
 
+[[package]]
+name = "unicode-segmentation"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
+
 [[package]]
 name = "untrusted"
 version = "0.9.0"
diff --git a/Cargo.toml b/Cargo.toml
index fabea93..4238063 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,4 +22,5 @@ rpath = false
 [profile.release-debug]
 inherits = "release"
 debug = true
+incremental = true
 strip = false
diff --git a/swordfish/Cargo.toml b/swordfish/Cargo.toml
index 87904d3..b34d0fe 100644
--- a/swordfish/Cargo.toml
+++ b/swordfish/Cargo.toml
@@ -14,6 +14,7 @@ serde = "1.0.193"
 serenity = { version = "0.12.0", features = ["builder"] }
 tokio = { version = "1.35.1", features = ["full"] }
 toml = "0.8.8"
+unicode-segmentation = "1.11.0"
 
 [dependencies.swordfish-common]
 path = "../swordfish-common"
diff --git a/swordfish/src/katana.rs b/swordfish/src/katana.rs
index 6cf7af6..41b00f7 100644
--- a/swordfish/src/katana.rs
+++ b/swordfish/src/katana.rs
@@ -21,7 +21,7 @@ const CARD_NAME_HEIGHT: u32 = 70 - CARD_NAME_Y_OFFSET;
 const CARD_SERIES_X_OFFSET: u32 = 22;
 const CARD_SERIES_Y_OFFSET: u32 = 278;
 const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET;
-const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET;
+const CARD_SERIES_HEIGHT: u32 = 328 - CARD_SERIES_Y_OFFSET;
 
 fn save_image_if_trace(img: &DynamicImage, path: &str) {
     let log_lvl = CONFIG.get().unwrap().log.level.as_str();
diff --git a/swordfish/src/tesseract/utils.rs b/swordfish/src/tesseract/utils.rs
index 5c030fc..10798a5 100644
--- a/swordfish/src/tesseract/utils.rs
+++ b/swordfish/src/tesseract/utils.rs
@@ -1,9 +1,10 @@
 use swordfish_common::trace;
+use unicode_segmentation::UnicodeSegmentation;
 
 const ALLOWED_CHARS: [char; 14] = [
     ' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
 ];
-const REGEX_CHARS: [char; 4] = ['[', ']', ')', '('];
+const REGEX_STRINGS: [&str; 4] = ["[", "]", ")", "("];
 
 fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
     match text.find(from) {
@@ -49,6 +50,10 @@ pub fn fix_tesseract_string(text: &mut String) {
     // This is usually the corner of the card
     trace!("Text: {}", text);
     replace_string(text, "A\n", "");
+    // Workaround for '\nqugnd' -> '\nGrand' (misread at the start of a line)
+    // As in Grandfather
+    trace!("Text: {}", text);
+    replace_string(text, "\nqugnd", "\nGrand");
     // Workaround for '“NO'
     // This is usually the left bottom corner of the card
     trace!("Text: {}", text);
@@ -251,14 +256,18 @@ pub fn regexify_text(text: &String) -> String {
         }
         regex.push_str("(?=.*");
         let processed_word = word.to_lowercase();
+        let p_word_unicode = processed_word.graphemes(true).collect::<Vec<&str>>();
         trace!("Processed word: {}", processed_word);
-        if partial_match && processed_word.len() > 4 {
+        if partial_match && p_word_unicode.len() > 4 {
             // Remove first two and last two characters for "partial match"
-            if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c))
-                && !processed_word[word.len() - 2..word.len()]
-                    .contains(|c: char| REGEX_CHARS.contains(&c))
+            if !p_word_unicode[0..3]
+                .iter()
+                .any(|c: &&str| REGEX_STRINGS.contains(&c))
+                && !p_word_unicode[word.len() - 2..word.len()]
+                    .iter()
+                    .any(|c: &&str| REGEX_STRINGS.contains(&c))
             {
-                regex.push_str(&processed_word[2..word.len() - 2]);
+                regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str());
             } else {
                 regex.push_str(&processed_word.as_str());
             }