fix(tesseract/regex): support unicode characters
Mostly from the first part of regexify_text
This commit is contained in:
parent
5ae36d7f2a
commit
c72d2cf16b
9
Cargo.lock
generated
9
Cargo.lock
generated
@ -935,7 +935,7 @@ dependencies = [
|
||||
"httpdate",
|
||||
"itoa",
|
||||
"pin-project-lite",
|
||||
"socket2 0.4.10",
|
||||
"socket2 0.5.5",
|
||||
"tokio",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@ -2226,6 +2226,7 @@ dependencies = [
|
||||
"swordfish-common",
|
||||
"tokio",
|
||||
"toml",
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2762,6 +2763,12 @@ dependencies = [
|
||||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.9.0"
|
||||
|
@ -22,4 +22,5 @@ rpath = false
|
||||
[profile.release-debug]
|
||||
inherits = "release"
|
||||
debug = true
|
||||
incremental = true
|
||||
strip = false
|
||||
|
@ -14,6 +14,7 @@ serde = "1.0.193"
|
||||
serenity = { version = "0.12.0", features = ["builder"] }
|
||||
tokio = { version = "1.35.1", features = ["full"] }
|
||||
toml = "0.8.8"
|
||||
unicode-segmentation = "1.11.0"
|
||||
|
||||
[dependencies.swordfish-common]
|
||||
path = "../swordfish-common"
|
||||
|
@ -21,7 +21,7 @@ const CARD_NAME_HEIGHT: u32 = 70 - CARD_NAME_Y_OFFSET;
|
||||
const CARD_SERIES_X_OFFSET: u32 = 22;
|
||||
const CARD_SERIES_Y_OFFSET: u32 = 278;
|
||||
const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET;
|
||||
const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET;
|
||||
const CARD_SERIES_HEIGHT: u32 = 328 - CARD_SERIES_Y_OFFSET;
|
||||
|
||||
fn save_image_if_trace(img: &DynamicImage, path: &str) {
|
||||
let log_lvl = CONFIG.get().unwrap().log.level.as_str();
|
||||
|
@ -1,9 +1,10 @@
|
||||
use swordfish_common::trace;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
const ALLOWED_CHARS: [char; 14] = [
|
||||
' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
|
||||
];
|
||||
const REGEX_CHARS: [char; 4] = ['[', ']', ')', '('];
|
||||
const REGEX_STRINGS: [&str; 4] = ["[", "]", ")", "("];
|
||||
|
||||
fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
|
||||
match text.find(from) {
|
||||
@ -49,6 +50,10 @@ pub fn fix_tesseract_string(text: &mut String) {
|
||||
// This is usually the corner of the card
|
||||
trace!("Text: {}", text);
|
||||
replace_string(text, "A\n", "");
|
||||
// Workaround for '\nqugnd' -> '\nGrand' (OCR misread at the start of a line)
|
||||
// As in Grandfather
|
||||
trace!("Text: {}", text);
|
||||
replace_string(text, "\nqugnd", "\nGrand");
|
||||
// Workaround for '“NO'
|
||||
// This is usually the left bottom corner of the card
|
||||
trace!("Text: {}", text);
|
||||
@ -251,14 +256,18 @@ pub fn regexify_text(text: &String) -> String {
|
||||
}
|
||||
regex.push_str("(?=.*");
|
||||
let processed_word = word.to_lowercase();
|
||||
let p_word_unicode = processed_word.graphemes(true).collect::<Vec<&str>>();
|
||||
trace!("Processed word: {}", processed_word);
|
||||
if partial_match && processed_word.len() > 4 {
|
||||
if partial_match && p_word_unicode.len() > 4 {
|
||||
// Remove first two and last two characters for "partial match"
|
||||
if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c))
|
||||
&& !processed_word[word.len() - 2..word.len()]
|
||||
.contains(|c: char| REGEX_CHARS.contains(&c))
|
||||
if !p_word_unicode[0..3]
|
||||
.iter()
|
||||
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
||||
&& !p_word_unicode[word.len() - 2..word.len()]
|
||||
.iter()
|
||||
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
||||
{
|
||||
regex.push_str(&processed_word[2..word.len() - 2]);
|
||||
regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str());
|
||||
} else {
|
||||
regex.push_str(&processed_word.as_str());
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user