diff --git a/swordfish/src/debug.rs b/swordfish/src/debug.rs index 246d7eb..880ca70 100644 --- a/swordfish/src/debug.rs +++ b/swordfish/src/debug.rs @@ -1,5 +1,6 @@ use crate::helper; use crate::katana; +use crate::tesseract; use crate::utils; use crate::CONFIG; use serenity::framework::standard::CommandResult; @@ -299,6 +300,23 @@ pub async fn dbg_message(ctx: &Context, msg: &Message) -> CommandResult { Ok(()) } +pub async fn dbg_regexify_text(ctx: &Context, msg: &Message) -> CommandResult { + let content = msg.content.split_whitespace().collect::<Vec<&str>>()[2..].join(" "); + helper::info_message( + ctx, + msg, + format!( + "```\n\ + {}\n\ + ```", + tesseract::utils::regexify_text(&content) + ), + Some("Regexified Text".to_string()), + ) + .await; + Ok(()) +} + pub async fn dbg_embed(ctx: &Context, msg: &Message) -> CommandResult { let target_msg = match dbg_get_message("embed", ctx, msg).await { Ok(msg) => msg, diff --git a/swordfish/src/katana.rs b/swordfish/src/katana.rs index 189de65..6cf7af6 100644 --- a/swordfish/src/katana.rs +++ b/swordfish/src/katana.rs @@ -1,4 +1,5 @@ use crate::helper; +use crate::tesseract::utils::{fix_tesseract_string, regexify_text}; use crate::tesseract::{libtesseract, subprocess}; use crate::CONFIG; use image::imageops::colorops::contrast_in_place; @@ -13,10 +14,6 @@ use swordfish_common::{error, trace, warn}; use tokio::task; use tokio::time::Instant; -const ALLOWED_CHARS: [char; 14] = [ - ' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é', -]; -const REGEX_CHARS: [char; 4] = ['[', ']', ')', '(']; const CARD_NAME_X_OFFSET: u32 = 22; const CARD_NAME_Y_OFFSET: u32 = 28; const CARD_NAME_WIDTH: u32 = 202 - CARD_NAME_X_OFFSET; @@ -26,275 +23,6 @@ const CARD_SERIES_Y_OFFSET: u32 = 278; const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET; const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET; -fn replace_string(text: &mut String, from: &str, to: &str) -> bool { - match text.find(from) { - Some(i) => { - 
text.replace_range(i..i + from.len(), to); - true - } - None => false, - } -} - -fn fix_tesseract_string(text: &mut String) { - // Remove the \n - trace!("Text: {}", text); - if text.ends_with("\n") { - text.pop(); - } - // Workaround for a bug the text - // e.g. "We Never Learn\nN" -> "We Never Learn" - trace!("Text: {}", text); - if text.ends_with("\nN") { - text.truncate(text.len() - 2); - } - // Replace first (to prevent "byte index 13 is not a char boundary; it is inside '—' (bytes 11..14)") - while replace_string(text, "—", "-") { - trace!("Replacing '—' with '-'"); - } - // Workaround for a bug the text - trace!("Text: {}", text); - if text.starts_with("- ") || text.starts_with("-.") { - text.drain(0..2); - } - // Remove the first character if it is not alphanumeric - if !text.starts_with(|c: char| c.is_ascii_alphanumeric()) { - text.remove(0); - } - // Workaround IR -> Ik - // Maybe it only occurs if Ik is in the start of the string? - // e.g. "IReda" -> "Ikeda" - trace!("Text: {}", text); - replace_string(text, "IR", "Ik"); - // Workaround for "A\n" - // This is usually the corner of the card - trace!("Text: {}", text); - replace_string(text, "A\n", ""); - // Workaround for '“NO' - // This is usually the left bottom corner of the card - trace!("Text: {}", text); - if text.ends_with(r##"“NO"##) { - text.drain(text.len() - 4..text.len()); - } - // Workaround for "\n." (and others in the future) - let text_clone = text.clone(); - let mut clone_chars = text_clone.chars(); - for (i, c) in clone_chars.clone().enumerate() { - if c != '\n' { - continue; - } - let prev_char = match clone_chars.nth(i - 1) { - Some(c) => c, - None => continue, - }; - let mut rm_prev: i8 = 0; - trace!("Prev char: {}", prev_char); - if ['-'].contains(&prev_char) { - rm_prev = 1; - text.remove(i - 1); - } - // Fix for "Asobi ni Iku lo Asobi ni Oide" -> "Asobi ni Iku yo! 
Asobi ni Oide" - if prev_char == 'l' { - let prev_prev_char = match clone_chars.nth(i - 2) { - Some(c) => c, - None => continue, - }; - trace!("Prev prev char: {}", prev_prev_char); - if prev_prev_char == 'o' { - rm_prev = -1; - text.drain(i - 3..i - 1); - text.insert_str(i - 2, "yo!") - } - } - let next_char = match clone_chars.nth(i + 1) { - Some(c) => c, - None => break, - }; - trace!("Next char: {}", next_char); - if ['.'].contains(&next_char) { - text.remove((i as i8 + 1 - rm_prev) as usize); - } - } - // Replace "\n" with " " - trace!("Text: {}", text); - while replace_string(text, "\n", " ") { - trace!("Replacing '\\n' with ' '"); - } - // Remove all non-alphanumeric characters - trace!("Text: {}", text); - text.retain(|c| ALLOWED_CHARS.contains(&c) || c.is_ascii_alphanumeric()); - // Fix "mn" -> "III" - trace!("Text: {}", text); - if text.ends_with("mn") { - text.pop(); - text.pop(); - text.push_str("III"); - } - // Fix "1ll" -> "III" - trace!("Text: {}", text); - replace_string(text, "1ll", "III"); - // Fix "lll" -> "!!!" 
- trace!("Text: {}", text); - replace_string(text, "lll", "!!!"); - // Fix "Il" -> "II" in the end of the string - trace!("Text: {}", text); - if text.ends_with("Il") { - text.pop(); - text.pop(); - text.push_str("II"); - } - // Replace multiple spaces with one space - trace!("Text: {}", text); - while replace_string(text, "  ", " ") { - trace!("Removing multiple spaces"); - } - // Remove the last character if it is a dash - if text.ends_with("-") { - text.pop(); - } - // Workaround if the first character is a space - trace!("Text: {}", text); - while text.starts_with(|c: char| c.is_whitespace()) { - trace!("Removing leading space"); - text.remove(0); - } - // Workaround if the last character is a space - trace!("Text: {}", text); - while text.ends_with(|c: char| c.is_whitespace()) { - trace!("Removing ending space"); - text.pop(); - } - trace!("Text (final): {}", text); -} - -fn regexify_text(text: &String) -> String { - let partial_match: bool; - let short_text = text.len() < 6; - if text.len() > 23 { - partial_match = true; - } else { - partial_match = false; - } - let mut regex = String::new(); - let mut ascii_text = String::new(); - let mut prev_chars: Vec<char> = Vec::new(); - for c in text.chars() { - // Here comes the workaround... 
- // The character "0" is sometimes used in place of "O" in names - if ['0', 'O'].contains(&c) { - ascii_text.push_str("[0O]"); - } else if ['u', 'v'].contains(&c) { - ascii_text.push_str("[uv]"); - } else if ['t'].contains(&c) { - ascii_text.push_str("[ti]"); - } else if ['I', 'l', '!', '1'].contains(&c) { - ascii_text.push_str("[Il!1i]"); - } else if ['.'].contains(&c) { - if prev_chars.len() > 3 { - let prev_char = prev_chars[prev_chars.len() - 1]; - let prev_prev_char = prev_chars[prev_chars.len() - 2]; - if prev_char.is_numeric() && prev_prev_char.is_whitespace() { - continue; - } - } - ascii_text.push(' '); - } else if ['R'].contains(&c) { - ascii_text.push_str("[Rk]"); - } else if ['m'].contains(&c) { - ascii_text.push_str("(m|ra)"); - } else if ['a'].contains(&c) { - ascii_text.push_str("[ao]") - } else if c.is_ascii_alphanumeric() { - ascii_text.push(c); - } else { - ascii_text.push(' '); - } - prev_chars.push(c); - } - if ascii_text.ends_with(|c: char| c.is_ascii_digit()) { - ascii_text.pop(); - } - // Filter for short string. 
- if short_text && !ascii_text.contains(|c: char| c.is_whitespace()) { - regex.push_str("^"); - let mut request_quantifier: bool = false; - let mut regex_any: bool = false; - let mut regex_any_from: usize = 0; - for (i, char) in ascii_text.chars().enumerate() { - trace!("Char: {}", char); - if char == '[' { - regex_any = true; - regex_any_from = i; - if i == 0 { - request_quantifier = true; - } - continue; - } else if i == ascii_text.len() - 1 { - regex.push_str(".*"); - } - if regex_any { - if char == ']' { - regex_any = false; - regex.push_str(&ascii_text[regex_any_from..i + 1]); - if request_quantifier { - regex.push_str(".*"); - } - } - continue; - } - regex.push(char); - if i == 0 { - regex.push_str(".*"); - } - } - regex.push_str("$"); - trace!("Regex (short string): {}", regex); - return regex; - } - let split = ascii_text.split_whitespace(); - let len = split.clone().count(); - trace!("Partial match: {}", partial_match); - for (i, word) in split.enumerate() { - if word.len() < 2 { - if i > 0 && i < len - 1 { - continue; - } - if ["x", "X"].contains(&word) { - continue; - } - } - regex.push_str("(?=.*"); - let processed_word = word.to_lowercase(); - trace!("Processed word: {}", processed_word); - if partial_match && processed_word.len() > 4 { - // Remove first two and last two characters for "partial match" - if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c)) - && !processed_word[word.len() - 2..word.len()] - .contains(|c: char| REGEX_CHARS.contains(&c)) - { - regex.push_str(&processed_word[2..word.len() - 2]); - } else { - regex.push_str(&processed_word.as_str()); - } - } else { - // Do not push word boundary if the word contains special characters like "!" 
- trace!("Current processed word: {}", processed_word); - if processed_word - .chars() - .all(|c| c.is_ascii_alphanumeric()) - { - regex.push_str(format!("\\b{}\\b", &processed_word.as_str()).as_str()); - } else { - regex.push_str(format!("{}", &processed_word.as_str()).as_str()); - } - } - regex.push_str(")"); - } - regex.push_str(".+"); - trace!("Regex: {}", regex); - regex -} - fn save_image_if_trace(img: &DynamicImage, path: &str) { let log_lvl = CONFIG.get().unwrap().log.level.as_str(); if log_lvl == "trace" { diff --git a/swordfish/src/main.rs b/swordfish/src/main.rs index 7f332c7..293413a 100644 --- a/swordfish/src/main.rs +++ b/swordfish/src/main.rs @@ -346,6 +346,8 @@ async fn debug(ctx: &Context, msg: &Message) -> CommandResult { "kda" => debug::dbg_kdropanalyze(ctx, msg).await?, "embed" => debug::dbg_embed(ctx, msg).await?, "message" => debug::dbg_message(ctx, msg).await?, + "regexify-text" => debug::dbg_regexify_text(ctx, msg).await?, + "regextxt" => debug::dbg_regexify_text(ctx, msg).await?, "parse-qingque-atopwl" => debug::dbg_parse_qingque_atopwl(ctx, msg).await?, "parse-katana-kc_ow" => debug::dbg_parse_katana_kc_ow(ctx, msg).await?, "parse-katana-klu_lookup" => debug::dbg_parse_katana_klu_lookup(ctx, msg).await?, diff --git a/swordfish/src/tesseract/mod.rs b/swordfish/src/tesseract/mod.rs index 948005f..e951f3f 100644 --- a/swordfish/src/tesseract/mod.rs +++ b/swordfish/src/tesseract/mod.rs @@ -1,2 +1,3 @@ pub mod libtesseract; pub mod subprocess; +pub mod utils; diff --git a/swordfish/src/tesseract/utils.rs b/swordfish/src/tesseract/utils.rs new file mode 100644 index 0000000..cf66809 --- /dev/null +++ b/swordfish/src/tesseract/utils.rs @@ -0,0 +1,272 @@ +use swordfish_common::trace; + +const ALLOWED_CHARS: [char; 14] = [ + ' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é', +]; +const REGEX_CHARS: [char; 4] = ['[', ']', ')', '(']; + +fn replace_string(text: &mut String, from: &str, to: &str) -> bool { + match text.find(from) { 
+ Some(i) => { + text.replace_range(i..i + from.len(), to); + true + } + None => false, + } +} + +pub fn fix_tesseract_string(text: &mut String) { + // Remove the \n + trace!("Text: {}", text); + if text.ends_with("\n") { + text.pop(); + } + // Workaround for a bug the text + // e.g. "We Never Learn\nN" -> "We Never Learn" + trace!("Text: {}", text); + if text.ends_with("\nN") { + text.truncate(text.len() - 2); + } + // Replace first (to prevent "byte index 13 is not a char boundary; it is inside '—' (bytes 11..14)") + while replace_string(text, "—", "-") { + trace!("Replacing '—' with '-'"); + } + // Workaround for a bug the text + trace!("Text: {}", text); + if text.starts_with("- ") || text.starts_with("-.") { + text.drain(0..2); + } + // Remove the first character if it is not alphanumeric + if !text.starts_with(|c: char| c.is_ascii_alphanumeric()) { + text.remove(0); + } + // Workaround IR -> Ik + // Maybe it only occurs if Ik is in the start of the string? + // e.g. "IReda" -> "Ikeda" + trace!("Text: {}", text); + replace_string(text, "IR", "Ik"); + // Workaround for "A\n" + // This is usually the corner of the card + trace!("Text: {}", text); + replace_string(text, "A\n", ""); + // Workaround for '“NO' + // This is usually the left bottom corner of the card + trace!("Text: {}", text); + if text.ends_with(r##"“NO"##) { + text.drain(text.len() - 4..text.len()); + } + // Workaround for "\n." (and others in the future) + let text_clone = text.clone(); + let mut clone_chars = text_clone.chars(); + for (i, c) in clone_chars.clone().enumerate() { + if c != '\n' { + continue; + } + let prev_char = match clone_chars.nth(i - 1) { + Some(c) => c, + None => continue, + }; + let mut rm_prev: i8 = 0; + trace!("Prev char: {}", prev_char); + if ['-'].contains(&prev_char) { + rm_prev = 1; + text.remove(i - 1); + } + // Fix for "Asobi ni Iku lo Asobi ni Oide" -> "Asobi ni Iku yo! 
Asobi ni Oide" + if prev_char == 'l' { + let prev_prev_char = match clone_chars.nth(i - 2) { + Some(c) => c, + None => continue, + }; + trace!("Prev prev char: {}", prev_prev_char); + if prev_prev_char == 'o' { + rm_prev = -1; + text.drain(i - 3..i - 1); + text.insert_str(i - 2, "yo!") + } + } + let next_char = match clone_chars.nth(i + 1) { + Some(c) => c, + None => break, + }; + trace!("Next char: {}", next_char); + if ['.'].contains(&next_char) { + text.remove((i as i8 + 1 - rm_prev) as usize); + } + } + // Replace "\n" with " " + trace!("Text: {}", text); + while replace_string(text, "\n", " ") { + trace!("Replacing '\\n' with ' '"); + } + // Remove all non-alphanumeric characters + trace!("Text: {}", text); + text.retain(|c| ALLOWED_CHARS.contains(&c) || c.is_ascii_alphanumeric()); + // Fix "mn" -> "III" + trace!("Text: {}", text); + if text.ends_with("mn") { + text.pop(); + text.pop(); + text.push_str("III"); + } + // Fix "1ll" -> "III" + trace!("Text: {}", text); + replace_string(text, "1ll", "III"); + // Fix "lll" -> "!!!" 
+ trace!("Text: {}", text); + replace_string(text, "lll", "!!!"); + // Fix "Il" -> "II" in the end of the string + trace!("Text: {}", text); + if text.ends_with("Il") { + text.pop(); + text.pop(); + text.push_str("II"); + } + // Replace multiple spaces with one space + trace!("Text: {}", text); + while replace_string(text, "  ", " ") { + trace!("Removing multiple spaces"); + } + // Remove the last character if it is a dash + if text.ends_with("-") { + text.pop(); + } + // Workaround if the first character is a space + trace!("Text: {}", text); + while text.starts_with(|c: char| c.is_whitespace()) { + trace!("Removing leading space"); + text.remove(0); + } + // Workaround if the last character is a space + trace!("Text: {}", text); + while text.ends_with(|c: char| c.is_whitespace()) { + trace!("Removing ending space"); + text.pop(); + } + trace!("Text (final): {}", text); +} + +pub fn regexify_text(text: &String) -> String { + let partial_match: bool; + let short_text = text.len() < 6; + if text.len() > 23 { + partial_match = true; + } else { + partial_match = false; + } + let mut regex = String::new(); + let mut ascii_text = String::new(); + let mut prev_chars: Vec<char> = Vec::new(); + for c in text.chars() { + // Here comes the workaround... 
+ // The character "0" is sometimes used in place of "O" in names + if ['0', 'O'].contains(&c) { + ascii_text.push_str("[0O]"); + } else if ['u', 'v'].contains(&c) { + ascii_text.push_str("[uv]"); + } else if ['t'].contains(&c) { + ascii_text.push_str("[ti]"); + } else if ['I', 'l', '!', '1'].contains(&c) { + ascii_text.push_str("[Il!1i]"); + } else if ['.'].contains(&c) { + if prev_chars.len() > 3 { + let prev_char = prev_chars[prev_chars.len() - 1]; + let prev_prev_char = prev_chars[prev_chars.len() - 2]; + if prev_char.is_numeric() && prev_prev_char.is_whitespace() { + continue; + } + } + ascii_text.push(' '); + } else if ['R'].contains(&c) { + ascii_text.push_str("[Rk]"); + } else if ['m'].contains(&c) { + ascii_text.push_str("(m|ra)"); + } else if ['a'].contains(&c) { + ascii_text.push_str("[ao]") + } else if c.is_ascii_alphanumeric() { + ascii_text.push(c); + } else { + ascii_text.push(' '); + } + prev_chars.push(c); + } + if ascii_text.ends_with(|c: char| c.is_ascii_digit()) { + ascii_text.pop(); + } + // Filter for short string. 
+ if short_text && !ascii_text.contains(|c: char| c.is_whitespace()) { + regex.push_str("^"); + let mut request_quantifier: bool = false; + let mut regex_any: bool = false; + let mut regex_any_from: usize = 0; + for (i, char) in ascii_text.chars().enumerate() { + trace!("Char: {}", char); + if char == '[' { + regex_any = true; + regex_any_from = i; + if i == 0 { + request_quantifier = true; + } + continue; + } else if i == ascii_text.len() - 1 { + regex.push_str(".*"); + } + if regex_any { + if char == ']' { + regex_any = false; + regex.push_str(&ascii_text[regex_any_from..i + 1]); + if request_quantifier { + regex.push_str(".*"); + } + } + continue; + } + regex.push(char); + if i == 0 { + regex.push_str(".*"); + } + } + regex.push_str("$"); + trace!("Regex (short string): {}", regex); + return regex; + } + let split = ascii_text.split_whitespace(); + let len = split.clone().count(); + trace!("Partial match: {}", partial_match); + for (i, word) in split.enumerate() { + if word.len() < 2 { + if i > 0 && i < len - 1 { + continue; + } + if ["x", "X"].contains(&word) { + continue; + } + } + regex.push_str("(?=.*"); + let processed_word = word.to_lowercase(); + trace!("Processed word: {}", processed_word); + if partial_match && processed_word.len() > 4 { + // Remove first two and last two characters for "partial match" + if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c)) + && !processed_word[word.len() - 2..word.len()] + .contains(|c: char| REGEX_CHARS.contains(&c)) + { + regex.push_str(&processed_word[2..word.len() - 2]); + } else { + regex.push_str(&processed_word.as_str()); + } + } else { + // Do not push word boundary if the word contains special characters like "!" 
+ trace!("Current processed word: {}", processed_word); + if processed_word.chars().all(|c| c.is_ascii_alphanumeric()) { + regex.push_str(format!("\\b{}\\b", &processed_word.as_str()).as_str()); + } else { + regex.push_str(format!("{}", &processed_word.as_str()).as_str()); + } + } + regex.push_str(")"); + } + regex.push_str(".+"); + trace!("Regex: {}", regex); + regex +}