From c72d2cf16ba16cd503189ab4fc3b6d13930adf95 Mon Sep 17 00:00:00 2001 From: tretrauit Date: Sat, 10 Feb 2024 11:17:56 +0700 Subject: [PATCH] fix(tesseract/regex): support unicode characters Mostly from the first part of regexify_text --- Cargo.lock | 9 ++++++++- Cargo.toml | 1 + swordfish/Cargo.toml | 1 + swordfish/src/katana.rs | 2 +- swordfish/src/tesseract/utils.rs | 21 +++++++++++++++------ 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73f8ec3..509fc4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -935,7 +935,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.4.10", + "socket2 0.5.5", "tokio", "tower-service", "tracing", @@ -2226,6 +2226,7 @@ dependencies = [ "swordfish-common", "tokio", "toml", + "unicode-segmentation", ] [[package]] @@ -2762,6 +2763,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + [[package]] name = "untrusted" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index fabea93..4238063 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,4 +22,5 @@ rpath = false [profile.release-debug] inherits = "release" debug = true +incremental = true strip = false diff --git a/swordfish/Cargo.toml b/swordfish/Cargo.toml index 87904d3..b34d0fe 100644 --- a/swordfish/Cargo.toml +++ b/swordfish/Cargo.toml @@ -14,6 +14,7 @@ serde = "1.0.193" serenity = { version = "0.12.0", features = ["builder"] } tokio = { version = "1.35.1", features = ["full"] } toml = "0.8.8" +unicode-segmentation = "1.11.0" [dependencies.swordfish-common] path = "../swordfish-common" diff --git a/swordfish/src/katana.rs b/swordfish/src/katana.rs index 6cf7af6..41b00f7 100644 --- a/swordfish/src/katana.rs +++ b/swordfish/src/katana.rs @@ -21,7 +21,7 @@ const CARD_NAME_HEIGHT: u32 = 70 - CARD_NAME_Y_OFFSET; const 
CARD_SERIES_X_OFFSET: u32 = 22; const CARD_SERIES_Y_OFFSET: u32 = 278; const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET; -const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET; +const CARD_SERIES_HEIGHT: u32 = 328 - CARD_SERIES_Y_OFFSET; fn save_image_if_trace(img: &DynamicImage, path: &str) { let log_lvl = CONFIG.get().unwrap().log.level.as_str(); diff --git a/swordfish/src/tesseract/utils.rs b/swordfish/src/tesseract/utils.rs index 5c030fc..10798a5 100644 --- a/swordfish/src/tesseract/utils.rs +++ b/swordfish/src/tesseract/utils.rs @@ -1,9 +1,10 @@ use swordfish_common::trace; +use unicode_segmentation::UnicodeSegmentation; const ALLOWED_CHARS: [char; 14] = [ ' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é', ]; -const REGEX_CHARS: [char; 4] = ['[', ']', ')', '(']; +const REGEX_STRINGS: [&str; 4] = ["[", "]", ")", "("]; fn replace_string(text: &mut String, from: &str, to: &str) -> bool { match text.find(from) { @@ -49,6 +50,10 @@ pub fn fix_tesseract_string(text: &mut String) { // This is usually the corner of the card trace!("Text: {}", text); replace_string(text, "A\n", ""); + // Workaround for ' qugnd' -> ' Grand' + // As in Grandfather + trace!("Text: {}", text); + replace_string(text, "\nqugnd", "\nGrand"); // Workaround for '“NO' // This is usually the left bottom corner of the card trace!("Text: {}", text); @@ -251,14 +256,18 @@ pub fn regexify_text(text: &String) -> String { } regex.push_str("(?=.*"); let processed_word = word.to_lowercase(); + let p_word_unicode = processed_word.graphemes(true).collect::<Vec<&str>>(); trace!("Processed word: {}", processed_word); - if partial_match && processed_word.len() > 4 { + if partial_match && p_word_unicode.len() > 4 { // Remove first two and last two characters for "partial match" - if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c)) - && !processed_word[word.len() - 2..word.len()] - .contains(|c: char| REGEX_CHARS.contains(&c)) + if !p_word_unicode[0..3] + .iter() + 
.any(|c: &&str| REGEX_STRINGS.contains(&c)) + && !p_word_unicode[word.len() - 2..word.len()] + .iter() + .any(|c: &&str| REGEX_STRINGS.contains(&c)) { - regex.push_str(&processed_word[2..word.len() - 2]); + regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str()); } else { regex.push_str(&processed_word.as_str()); }