fix(tesseract/regex): support unicode characters
Mostly affects the first part of regexify_text.
This commit is contained in:
parent
5ae36d7f2a
commit
c72d2cf16b
9
Cargo.lock
generated
9
Cargo.lock
generated
@ -935,7 +935,7 @@ dependencies = [
|
|||||||
"httpdate",
|
"httpdate",
|
||||||
"itoa",
|
"itoa",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"socket2 0.4.10",
|
"socket2 0.5.5",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tower-service",
|
"tower-service",
|
||||||
"tracing",
|
"tracing",
|
||||||
@ -2226,6 +2226,7 @@ dependencies = [
|
|||||||
"swordfish-common",
|
"swordfish-common",
|
||||||
"tokio",
|
"tokio",
|
||||||
"toml",
|
"toml",
|
||||||
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2762,6 +2763,12 @@ dependencies = [
|
|||||||
"tinyvec",
|
"tinyvec",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-segmentation"
|
||||||
|
version = "1.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "untrusted"
|
name = "untrusted"
|
||||||
version = "0.9.0"
|
version = "0.9.0"
|
||||||
|
@ -22,4 +22,5 @@ rpath = false
|
|||||||
[profile.release-debug]
|
[profile.release-debug]
|
||||||
inherits = "release"
|
inherits = "release"
|
||||||
debug = true
|
debug = true
|
||||||
|
incremental = true
|
||||||
strip = false
|
strip = false
|
||||||
|
@ -14,6 +14,7 @@ serde = "1.0.193"
|
|||||||
serenity = { version = "0.12.0", features = ["builder"] }
|
serenity = { version = "0.12.0", features = ["builder"] }
|
||||||
tokio = { version = "1.35.1", features = ["full"] }
|
tokio = { version = "1.35.1", features = ["full"] }
|
||||||
toml = "0.8.8"
|
toml = "0.8.8"
|
||||||
|
unicode-segmentation = "1.11.0"
|
||||||
|
|
||||||
[dependencies.swordfish-common]
|
[dependencies.swordfish-common]
|
||||||
path = "../swordfish-common"
|
path = "../swordfish-common"
|
||||||
|
@ -21,7 +21,7 @@ const CARD_NAME_HEIGHT: u32 = 70 - CARD_NAME_Y_OFFSET;
|
|||||||
const CARD_SERIES_X_OFFSET: u32 = 22;
|
const CARD_SERIES_X_OFFSET: u32 = 22;
|
||||||
const CARD_SERIES_Y_OFFSET: u32 = 278;
|
const CARD_SERIES_Y_OFFSET: u32 = 278;
|
||||||
const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET;
|
const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET;
|
||||||
const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET;
|
const CARD_SERIES_HEIGHT: u32 = 328 - CARD_SERIES_Y_OFFSET;
|
||||||
|
|
||||||
fn save_image_if_trace(img: &DynamicImage, path: &str) {
|
fn save_image_if_trace(img: &DynamicImage, path: &str) {
|
||||||
let log_lvl = CONFIG.get().unwrap().log.level.as_str();
|
let log_lvl = CONFIG.get().unwrap().log.level.as_str();
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
use swordfish_common::trace;
|
use swordfish_common::trace;
|
||||||
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
const ALLOWED_CHARS: [char; 14] = [
|
const ALLOWED_CHARS: [char; 14] = [
|
||||||
' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
|
' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
|
||||||
];
|
];
|
||||||
const REGEX_CHARS: [char; 4] = ['[', ']', ')', '('];
|
const REGEX_STRINGS: [&str; 4] = ["[", "]", ")", "("];
|
||||||
|
|
||||||
fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
|
fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
|
||||||
match text.find(from) {
|
match text.find(from) {
|
||||||
@ -49,6 +50,10 @@ pub fn fix_tesseract_string(text: &mut String) {
|
|||||||
// This is usually the corner of the card
|
// This is usually the corner of the card
|
||||||
trace!("Text: {}", text);
|
trace!("Text: {}", text);
|
||||||
replace_string(text, "A\n", "");
|
replace_string(text, "A\n", "");
|
||||||
|
// Workaround for '\nqugnd' -> '\nGrand' (misread at the start of a line)
|
||||||
|
// As in Grandfather
|
||||||
|
trace!("Text: {}", text);
|
||||||
|
replace_string(text, "\nqugnd", "\nGrand");
|
||||||
// Workaround for '“NO'
|
// Workaround for '“NO'
|
||||||
// This is usually the left bottom corner of the card
|
// This is usually the left bottom corner of the card
|
||||||
trace!("Text: {}", text);
|
trace!("Text: {}", text);
|
||||||
@ -251,14 +256,18 @@ pub fn regexify_text(text: &String) -> String {
|
|||||||
}
|
}
|
||||||
regex.push_str("(?=.*");
|
regex.push_str("(?=.*");
|
||||||
let processed_word = word.to_lowercase();
|
let processed_word = word.to_lowercase();
|
||||||
|
let p_word_unicode = processed_word.graphemes(true).collect::<Vec<&str>>();
|
||||||
trace!("Processed word: {}", processed_word);
|
trace!("Processed word: {}", processed_word);
|
||||||
if partial_match && processed_word.len() > 4 {
|
if partial_match && p_word_unicode.len() > 4 {
|
||||||
// Remove first two and last two characters for "partial match"
|
// Remove first two and last two characters for "partial match"
|
||||||
if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c))
|
if !p_word_unicode[0..3]
|
||||||
&& !processed_word[word.len() - 2..word.len()]
|
.iter()
|
||||||
.contains(|c: char| REGEX_CHARS.contains(&c))
|
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
||||||
|
&& !p_word_unicode[word.len() - 2..word.len()]
|
||||||
|
.iter()
|
||||||
|
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
||||||
{
|
{
|
||||||
regex.push_str(&processed_word[2..word.len() - 2]);
|
regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str());
|
||||||
} else {
|
} else {
|
||||||
regex.push_str(&processed_word.as_str());
|
regex.push_str(&processed_word.as_str());
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user