chore(katana): move regex building to swordfish

Welp, I can add my dirty workaround now
This commit is contained in:
tretrauit 2024-01-10 12:31:29 +07:00
parent 5accadf277
commit 952467d4b1
2 changed files with 41 additions and 40 deletions

View File

@ -41,43 +41,13 @@ pub async fn query_character(name: &String, series: &String) -> Option<Character
}
pub async fn query_character_regex(name: &String, series: &String) -> Option<Character> {
let mut name_regex = String::new();
let mut ascii_name = String::new();
for c in name.chars() {
if c.is_ascii_alphanumeric() {
ascii_name.push(c);
} else {
ascii_name.push(' ');
}
}
ascii_name.split_whitespace().for_each(|word| {
name_regex.push_str("(?=.*\\b");
name_regex.push_str(word.to_lowercase().as_str());
name_regex.push_str("\\b)");
});
name_regex.push_str(".+");
let mut series_regex = String::new();
let mut ascii_series = String::new();
for c in series.chars() {
if c.is_ascii_alphanumeric() {
ascii_series.push(c);
} else {
ascii_series.push(' ');
}
}
ascii_series.split_whitespace().for_each(|word| {
series_regex.push_str("(?=.*\\b");
series_regex.push_str(word.to_lowercase().as_str());
series_regex.push_str("\\b)");
});
series_regex.push_str(".+");
KATANA
.get()
.unwrap()
.find_one(
mongodb::bson::doc! {
"name": {"$regex": name_regex, "$options" : "i"},
"series": {"$regex": series_regex, "$options" : "i"}
"name": {"$regex": name, "$options" : "i"},
"series": {"$regex": series, "$options" : "i"}
},
None,
)

View File

@ -13,7 +13,7 @@ use swordfish_common::{error, trace, warn};
use tokio::task;
use tokio::time::Instant;
const ALLOWED_CHARS: [char; 10] = [' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\''];
// Punctuation/symbol characters whitelisted by this module; the code that
// consults the list is outside this view (presumably the OCR text cleanup,
// TODO confirm).
// NOTE(review): '\'' is listed twice (8th and 10th entries). The second one is
// redundant as written; it may have been meant to be a different character
// (e.g. a typographic quote) - confirm and either replace or drop it.
const ALLOWED_CHARS: [char; 11] = [' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@'];
// Horizontal offset of the card-name region (presumably in pixels from the
// left edge of the card image - TODO confirm).
const CARD_NAME_X_OFFSET: u32 = 22;
// Vertical offset of the card-name region (presumably in pixels from the top
// edge of the card image - TODO confirm).
const CARD_NAME_Y_OFFSET: u32 = 28;
// Width of the card-name region: 202 minus the horizontal offset above.
const CARD_NAME_WIDTH: u32 = 202 - CARD_NAME_X_OFFSET;
@ -71,9 +71,7 @@ fn fix_tesseract_string(text: &mut String) {
// This is usually the left bottom corner of the card
trace!("Text: {}", text);
if text.ends_with(r##"“NO"##) {
for _ in 0..3 {
text.pop();
}
text.drain(text.len() - 4..text.len());
}
// Workaround for "\n." (and others in the future)
let text_clone = text.clone();
@ -101,8 +99,7 @@ fn fix_tesseract_string(text: &mut String) {
trace!("Prev prev char: {}", prev_prev_char);
if prev_prev_char == 'o' {
rm_prev = -1;
text.remove(i - 2);
text.remove(i - 2);
text.drain(i - 3..i - 1);
text.insert_str(i - 2, "yo!")
}
}
@ -167,6 +164,30 @@ fn fix_tesseract_string(text: &mut String) {
trace!("Text (final): {}", text);
}
/// Builds an "every word, in any order" regex pattern from `text`.
///
/// The text is first normalised: ASCII letters and digits are kept, every
/// other character (punctuation, whitespace, non-ASCII) becomes a word
/// separator. Each resulting word is lowercased and wrapped in a lookahead of
/// the form `(?=.*\bword\b)`, and the pattern is terminated with `.+`, so a
/// candidate string matches only if it is non-empty and contains every word
/// as a whole word, regardless of order.
///
/// Workaround: the characters `0` and `O` are both emitted as the class
/// `[0O]` (which becomes `[0o]` after lowercasing), because "0" is sometimes
/// used in place of "O" in names. A lowercase `o` in the input is kept as a
/// plain letter and therefore does NOT match `0`.
///
/// If `text` contains no ASCII alphanumeric characters the result is just
/// `.+`, which matches any non-empty string.
///
/// The generated pattern is logged at trace level before being returned.
///
/// Takes `&str` rather than `&String`; existing `&String` call sites keep
/// working through deref coercion.
fn regexify_text(text: &str) -> String {
    // Normalised copy of the input: one byte per kept character, a space per
    // dropped one, and a 4-byte class for each 0/O (the buffer grows for those).
    let mut ascii_text = String::with_capacity(text.len());
    for c in text.chars() {
        match c {
            // Here comes the workaround...
            // The character "0" is sometimes used in place of "O" in names
            '0' | 'O' => ascii_text.push_str("[0O]"),
            c if c.is_ascii_alphanumeric() => ascii_text.push(c),
            // Anything else only serves to separate words.
            _ => ascii_text.push(' '),
        }
    }

    let mut regex = String::new();
    for word in ascii_text.split_whitespace() {
        // One lookahead per word: the word must appear somewhere in the
        // candidate, delimited by word boundaries.
        regex.push_str("(?=.*\\b");
        regex.push_str(&word.to_lowercase());
        regex.push_str("\\b)");
    }
    // Lookaheads consume nothing, so require at least one real character.
    regex.push_str(".+");
    trace!("Regex: {}", regex);
    regex
}
fn save_image_if_trace(img: &DynamicImage, path: &str) {
let log_lvl = CONFIG.get().unwrap().log.level.as_str();
if log_lvl == "trace" {
@ -282,7 +303,12 @@ pub async fn analyze_card_libtesseract(card: image::DynamicImage, count: u32) ->
Some(c) => {
character = c;
}
None => match db::query_character_regex(&character.name, &character.series).await {
None => match db::query_character_regex(
&regexify_text(&character.name),
&regexify_text(&character.series),
)
.await
{
Some(c) => {
character = c;
}
@ -349,7 +375,12 @@ pub async fn analyze_card_subprocess(card: image::DynamicImage, count: u32) -> D
Some(c) => {
character = c;
}
None => match db::query_character_regex(&character.name, &character.series).await {
None => match db::query_character_regex(
&regexify_text(&character.name),
&regexify_text(&character.series),
)
.await
{
Some(c) => {
character = c;
}