fix(tesseract/regex): support unicode characters

The problem mostly came from the first part of regexify_text.
This commit is contained in:
tretrauit 2024-02-10 11:17:56 +07:00
parent 5ae36d7f2a
commit c72d2cf16b
5 changed files with 26 additions and 8 deletions

9
Cargo.lock generated
View File

@ -935,7 +935,7 @@ dependencies = [
"httpdate",
"itoa",
"pin-project-lite",
"socket2 0.4.10",
"socket2 0.5.5",
"tokio",
"tower-service",
"tracing",
@ -2226,6 +2226,7 @@ dependencies = [
"swordfish-common",
"tokio",
"toml",
"unicode-segmentation",
]
[[package]]
@ -2762,6 +2763,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-segmentation"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
[[package]]
name = "untrusted"
version = "0.9.0"

View File

@ -22,4 +22,5 @@ rpath = false
[profile.release-debug]
inherits = "release"
debug = true
incremental = true
strip = false

View File

@ -14,6 +14,7 @@ serde = "1.0.193"
serenity = { version = "0.12.0", features = ["builder"] }
tokio = { version = "1.35.1", features = ["full"] }
toml = "0.8.8"
unicode-segmentation = "1.11.0"
[dependencies.swordfish-common]
path = "../swordfish-common"

View File

@ -21,7 +21,7 @@ const CARD_NAME_HEIGHT: u32 = 70 - CARD_NAME_Y_OFFSET;
const CARD_SERIES_X_OFFSET: u32 = 22;
const CARD_SERIES_Y_OFFSET: u32 = 278;
const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET;
const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET;
const CARD_SERIES_HEIGHT: u32 = 328 - CARD_SERIES_Y_OFFSET;
fn save_image_if_trace(img: &DynamicImage, path: &str) {
let log_lvl = CONFIG.get().unwrap().log.level.as_str();

View File

@ -1,9 +1,10 @@
use swordfish_common::trace;
use unicode_segmentation::UnicodeSegmentation;
const ALLOWED_CHARS: [char; 14] = [
' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
];
const REGEX_CHARS: [char; 4] = ['[', ']', ')', '('];
const REGEX_STRINGS: [&str; 4] = ["[", "]", ")", "("];
fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
match text.find(from) {
@ -49,6 +50,10 @@ pub fn fix_tesseract_string(text: &mut String) {
// This is usually the corner of the card
trace!("Text: {}", text);
replace_string(text, "A\n", "");
// Workaround for 'qugnd' at the start of a line -> 'Grand'
// (as in "Grandfather")
trace!("Text: {}", text);
replace_string(text, "\nqugnd", "\nGrand");
// Workaround for '“NO'
// This is usually the left bottom corner of the card
trace!("Text: {}", text);
@ -251,14 +256,18 @@ pub fn regexify_text(text: &String) -> String {
}
regex.push_str("(?=.*");
let processed_word = word.to_lowercase();
let p_word_unicode = processed_word.graphemes(true).collect::<Vec<&str>>();
trace!("Processed word: {}", processed_word);
if partial_match && processed_word.len() > 4 {
if partial_match && p_word_unicode.len() > 4 {
// Remove first two and last two characters for "partial match"
if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c))
&& !processed_word[word.len() - 2..word.len()]
.contains(|c: char| REGEX_CHARS.contains(&c))
if !p_word_unicode[0..3]
.iter()
.any(|c: &&str| REGEX_STRINGS.contains(&c))
&& !p_word_unicode[word.len() - 2..word.len()]
.iter()
.any(|c: &&str| REGEX_STRINGS.contains(&c))
{
regex.push_str(&processed_word[2..word.len() - 2]);
regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str());
} else {
regex.push_str(&processed_word.as_str());
}