chore: add regexify-text debug command

2024-01-24 00:51:26 +07:00 · 2024-01-24 00:51:26 +07:00 · a317d4f28b
commit a317d4f28b
parent 9609e1b217
5 changed files with 294 additions and 273 deletions
--- a/swordfish/src/debug.rs
+++ b/swordfish/src/debug.rs
@ -1,5 +1,6 @@
 use crate::helper;
 use crate::katana;
 use crate::tesseract;
 use crate::utils;
 use crate::CONFIG;
 use serenity::framework::standard::CommandResult;
@ -299,6 +300,23 @@ pub async fn dbg_message(ctx: &Context, msg: &Message) -> CommandResult {
    Ok(())
 }
 /// Debug command: feeds the message text to `tesseract::utils::regexify_text` and replies
 /// with the resulting pattern inside a code block titled "Regexified Text".
 ///
 /// The first two whitespace-separated tokens of the message are dropped and the rest is
 /// re-joined with single spaces (presumably the command and sub-command — confirm against
 /// the dispatcher).
 pub async fn dbg_regexify_text(ctx: &Context, msg: &Message) -> CommandResult {
    // `skip(2)` rather than slicing `[2..]`: a message with fewer than two tokens now yields
    // an empty string instead of panicking on an out-of-range slice.
    let content = msg
        .content
        .split_whitespace()
        .skip(2)
        .collect::<Vec<&str>>()
        .join(" ");
    helper::info_message(
        ctx,
        msg,
        format!(
            "```\n\
    {}\n\
    ```",
            tesseract::utils::regexify_text(&content)
        ),
        Some("Regexified Text".to_string()),
    )
    .await;
    Ok(())
 }
 pub async fn dbg_embed(ctx: &Context, msg: &Message) -> CommandResult {
    let target_msg = match dbg_get_message("embed", ctx, msg).await {
        Ok(msg) => msg,
--- a/swordfish/src/katana.rs
+++ b/swordfish/src/katana.rs
@ -1,4 +1,5 @@
 use crate::helper;
 use crate::tesseract::utils::{fix_tesseract_string, regexify_text};
 use crate::tesseract::{libtesseract, subprocess};
 use crate::CONFIG;
 use image::imageops::colorops::contrast_in_place;
@ -13,10 +14,6 @@ use swordfish_common::{error, trace, warn};
 use tokio::task;
 use tokio::time::Instant;
 // Non-alphanumeric characters that survive the `retain` filter in `fix_tesseract_string`.
 // NOTE(review): '\'' appears twice; possibly one entry was meant to be a different quote
 // character — confirm before changing (the array length is part of the type).
 const ALLOWED_CHARS: [char; 14] = [
    ' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
 ];
 // Bracket and parenthesis characters; `regexify_text` checks for them before trimming a
 // word for partial matching.
 const REGEX_CHARS: [char; 4] = ['[', ']', ')', '('];
 const CARD_NAME_X_OFFSET: u32 = 22;
 const CARD_NAME_Y_OFFSET: u32 = 28;
 const CARD_NAME_WIDTH: u32 = 202 - CARD_NAME_X_OFFSET;
@ -26,275 +23,6 @@ const CARD_SERIES_Y_OFFSET: u32 = 278;
 const CARD_SERIES_WIDTH: u32 = 206 - CARD_SERIES_X_OFFSET;
 const CARD_SERIES_HEIGHT: u32 = 330 - CARD_SERIES_Y_OFFSET;
 /// Replaces the first occurrence of `from` in `text` with `to`, in place.
 ///
 /// Returns `true` when a replacement happened and `false` when `from` was not found,
 /// which lets callers loop until no occurrence is left.
 fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
    if let Some(start) = text.find(from) {
        let end = start + from.len();
        text.replace_range(start..end, to);
        return true;
    }
    false
 }
 /// Cleans up raw Tesseract OCR output in place using a fixed sequence of heuristics.
 ///
 /// In order: trim a trailing newline and known corner artifacts, turn em dashes into
 /// hyphens, drop a leading non-alphanumeric character, patch characters around line
 /// breaks, flatten newlines to spaces, discard everything outside `ALLOWED_CHARS` and
 /// ASCII alphanumerics, repair a few known misreads (`mn`, `1ll`, `lll`, `Il`), collapse
 /// repeated spaces, and finally trim a trailing dash and surrounding whitespace.
 ///
 /// An empty string is accepted and left empty.
 fn fix_tesseract_string(text: &mut String) {
    // Drop a single trailing newline.
    trace!("Text: {}", text);
    if text.ends_with("\n") {
        text.pop();
    }
    // Workaround for a stray trailing "\nN",
    // e.g. "We Never Learn\nN" -> "We Never Learn"
    trace!("Text: {}", text);
    if text.ends_with("\nN") {
        text.truncate(text.len() - 2);
    }
    // Replace em dashes first: the byte-indexed edits further down would otherwise hit
    // "byte index 13 is not a char boundary; it is inside '—' (bytes 11..14)".
    while replace_string(text, "—", "-") {
        trace!("Replacing '—' with '-'");
    }
    // Strip a leading "- " / "-." artifact.
    trace!("Text: {}", text);
    if text.starts_with("- ") || text.starts_with("-.") {
        text.drain(0..2);
    }
    // Remove the first character if it is not alphanumeric. The emptiness check matters:
    // `String::remove(0)` panics on an empty string.
    if !text.is_empty() && !text.starts_with(|c: char| c.is_ascii_alphanumeric()) {
        text.remove(0);
    }
    // Workaround IR -> Ik
    // Maybe it only occurs if Ik is in the start of the string?
    // e.g. "IReda" -> "Ikeda"
    trace!("Text: {}", text);
    replace_string(text, "IR", "Ik");
    // Workaround for "A\n"
    // This is usually the corner of the card
    trace!("Text: {}", text);
    replace_string(text, "A\n", "");
    // Workaround for '“NO'
    // This is usually the left bottom corner of the card.
    // The suffix is 5 bytes long ('“' alone takes 3), so cut by its real byte length;
    // removing a fixed 4 bytes started inside '“' and panicked.
    trace!("Text: {}", text);
    let corner_artifact = r##"“NO"##;
    if text.ends_with(corner_artifact) {
        text.truncate(text.len() - corner_artifact.len());
    }
    // Workaround for "\n." (and others in the future)
    // NOTE(review): `i` is a character index into the snapshot but is used as a byte index
    // into `text`, and `text` is not re-synchronised after an edit. Also, `nth` advances
    // `clone_chars`, so after the first lookup the "previous"/"next" characters are read
    // relative to the iterator's current position rather than to `i`. Both are kept as they
    // were so the output stays unchanged; revisit with real OCR samples at hand.
    let text_clone = text.clone();
    let mut clone_chars = text_clone.chars();
    for (i, c) in clone_chars.clone().enumerate() {
        if c != '\n' {
            continue;
        }
        // A newline at index 0 has nothing before it. `i - 1` used to underflow here: a
        // panic with overflow checks on, otherwise `nth(usize::MAX)` exhausted the iterator
        // so no later newline was handled either. `break` keeps that non-panicking outcome.
        let prev_index = match i.checked_sub(1) {
            Some(index) => index,
            None => break,
        };
        let prev_char = match clone_chars.nth(prev_index) {
            Some(c) => c,
            None => continue,
        };
        let mut rm_prev: isize = 0;
        trace!("Prev char: {}", prev_char);
        if ['-'].contains(&prev_char) {
            rm_prev = 1;
            text.remove(prev_index);
        }
        // Fix for "Asobi ni Iku lo Asobi ni Oide" -> "Asobi ni Iku yo! Asobi ni Oide"
        if prev_char == 'l' {
            // Same underflow guard as above, this time for a newline at index 1.
            let prev_prev_index = match i.checked_sub(2) {
                Some(index) => index,
                None => break,
            };
            let prev_prev_char = match clone_chars.nth(prev_prev_index) {
                Some(c) => c,
                None => continue,
            };
            trace!("Prev prev char: {}", prev_prev_char);
            if prev_prev_char == 'o' {
                // The rewrite needs three positions before the newline; skip it when
                // they do not exist instead of underflowing the range start.
                if let Some(drain_from) = i.checked_sub(3) {
                    rm_prev = -1;
                    text.drain(drain_from..i - 1);
                    text.insert_str(i - 2, "yo!");
                }
            }
        }
        let next_char = match clone_chars.nth(i + 1) {
            Some(c) => c,
            None => break,
        };
        trace!("Next char: {}", next_char);
        if ['.'].contains(&next_char) {
            // `isize` arithmetic: the former `i8` cast overflowed for indices above 126.
            text.remove((i as isize + 1 - rm_prev) as usize);
        }
    }
    // Replace "\n" with " "
    trace!("Text: {}", text);
    while replace_string(text, "\n", " ") {
        trace!("Replacing '\\n' with ' '");
    }
    // Keep only ASCII alphanumerics and the characters listed in ALLOWED_CHARS
    trace!("Text: {}", text);
    text.retain(|c| ALLOWED_CHARS.contains(&c) || c.is_ascii_alphanumeric());
    // Fix "mn" -> "III" at the end of the string
    trace!("Text: {}", text);
    if text.ends_with("mn") {
        text.pop();
        text.pop();
        text.push_str("III");
    }
    // Fix "1ll" -> "III" (first occurrence only)
    trace!("Text: {}", text);
    replace_string(text, "1ll", "III");
    // Fix "lll" -> "!!!" (first occurrence only)
    trace!("Text: {}", text);
    replace_string(text, "lll", "!!!");
    // Fix "Il" -> "II" in the end of the string
    trace!("Text: {}", text);
    if text.ends_with("Il") {
        text.pop();
        text.pop();
        text.push_str("II");
    }
    // Replace multiple spaces with one space
    trace!("Text: {}", text);
    while replace_string(text, "  ", " ") {
        trace!("Removing multiple spaces");
    }
    // Remove the last character if it is a dash
    if text.ends_with("-") {
        text.pop();
    }
    // Strip leading whitespace
    trace!("Text: {}", text);
    while text.starts_with(|c: char| c.is_whitespace()) {
        trace!("Removing leading space");
        text.remove(0);
    }
    // Strip trailing whitespace
    trace!("Text: {}", text);
    while text.ends_with(|c: char| c.is_whitespace()) {
        trace!("Removing ending space");
        text.pop();
    }
    trace!("Text (final): {}", text);
 }
 /// Builds a deliberately loose regular-expression pattern from OCR'd `text`.
 ///
 /// Characters listed in the table below are widened into classes/alternations
 /// (`0`/`O` -> `[0O]`, `m` -> `(m|ra)`, ...); anything that is not ASCII alphanumeric
 /// becomes a space. A trailing ASCII digit of the widened text is dropped.
 ///
 /// * Inputs shorter than 6 bytes whose widened form contains no whitespace yield an
 ///   anchored `^...$` pattern interleaved with `.*` wildcards.
 /// * Otherwise every kept word becomes a `(?=.*word)` lookahead and the pattern ends
 ///   with `.+`. For inputs longer than 23 bytes, widened words longer than 4 bytes lose
 ///   two characters on each side unless a bracket/parenthesis sits near either edge.
 fn regexify_text(text: &String) -> String {
    let short_text = text.len() < 6;
    let partial_match = text.len() > 23;
    let mut regex = String::new();
    let mut ascii_text = String::new();
    let mut seen: Vec<char> = Vec::new();
    for c in text.chars() {
        // Here comes the workaround...
        // e.g. the character "0" is sometimes used in place of "O" in names
        match c {
            '0' | 'O' => ascii_text.push_str("[0O]"),
            'u' | 'v' => ascii_text.push_str("[uv]"),
            't' => ascii_text.push_str("[ti]"),
            'I' | 'l' | '!' | '1' => ascii_text.push_str("[Il!1i]"),
            '.' => {
                // A dot right after "<whitespace><number>" is dropped entirely
                // (and not recorded in `seen`); any other dot becomes a space.
                if seen.len() > 3 {
                    if let [.., before_last, last] = seen.as_slice() {
                        if last.is_numeric() && before_last.is_whitespace() {
                            continue;
                        }
                    }
                }
                ascii_text.push(' ');
            }
            'R' => ascii_text.push_str("[Rk]"),
            'm' => ascii_text.push_str("(m|ra)"),
            'a' => ascii_text.push_str("[ao]"),
            _ if c.is_ascii_alphanumeric() => ascii_text.push(c),
            _ => ascii_text.push(' '),
        }
        seen.push(c);
    }
    let ends_in_digit = ascii_text
        .chars()
        .next_back()
        .map_or(false, |d| d.is_ascii_digit());
    if ends_in_digit {
        ascii_text.pop();
    }
    // Filter for short string.
    if short_text && !ascii_text.chars().any(char::is_whitespace) {
        regex.push('^');
        // Start index of the character class currently being copied, if any.
        let mut open_class_at: Option<usize> = None;
        // Set once the text starts with a class; from then on every class is followed by ".*".
        let mut wildcard_after_class = false;
        // `ascii_text` holds only ASCII, so character positions double as byte offsets.
        for (pos, ch) in ascii_text.chars().enumerate() {
            trace!("Char: {}", ch);
            if ch == '[' {
                open_class_at = Some(pos);
                if pos == 0 {
                    wildcard_after_class = true;
                }
                continue;
            }
            if pos + 1 == ascii_text.len() {
                regex.push_str(".*");
            }
            match open_class_at {
                Some(start) => {
                    if ch == ']' {
                        open_class_at = None;
                        regex.push_str(&ascii_text[start..=pos]);
                        if wildcard_after_class {
                            regex.push_str(".*");
                        }
                    }
                }
                None => {
                    regex.push(ch);
                    if pos == 0 {
                        regex.push_str(".*");
                    }
                }
            }
        }
        regex.push('$');
        trace!("Regex (short string): {}", regex);
        return regex;
    }
    let words: Vec<&str> = ascii_text.split_whitespace().collect();
    let word_count = words.len();
    trace!("Partial match: {}", partial_match);
    for (i, word) in words.iter().copied().enumerate() {
        if word.len() < 2 {
            // Single-character words are skipped in the interior, and "x"/"X" everywhere.
            let interior = i > 0 && i + 1 < word_count;
            if interior || matches!(word, "x" | "X") {
                continue;
            }
        }
        regex.push_str("(?=.*");
        let processed_word = word.to_lowercase();
        trace!("Processed word: {}", processed_word);
        if partial_match && processed_word.len() > 4 {
            // Remove first two and last two characters for "partial match",
            // unless a regex bracket sits in the first three or last two characters.
            let has_regex_char = |s: &str| s.chars().any(|c| REGEX_CHARS.contains(&c));
            let tail_start = word.len() - 2;
            let trimmable = !has_regex_char(&processed_word[..3])
                && !has_regex_char(&processed_word[tail_start..word.len()]);
            if trimmable {
                regex.push_str(&processed_word[2..tail_start]);
            } else {
                regex.push_str(&processed_word);
            }
        } else {
            // Do not push word boundary if the word contains special characters like "!"
            trace!("Current processed word: {}", processed_word);
            let plain = processed_word.chars().all(|c| c.is_ascii_alphanumeric());
            if plain {
                regex.push_str("\\b");
            }
            regex.push_str(&processed_word);
            if plain {
                regex.push_str("\\b");
            }
        }
        regex.push(')');
    }
    regex.push_str(".+");
    trace!("Regex: {}", regex);
    regex
 }
 fn save_image_if_trace(img: &DynamicImage, path: &str) {
    let log_lvl = CONFIG.get().unwrap().log.level.as_str();
    if log_lvl == "trace" {
--- a/swordfish/src/main.rs
+++ b/swordfish/src/main.rs
@ -346,6 +346,8 @@ async fn debug(ctx: &Context, msg: &Message) -> CommandResult {
        "kda" => debug::dbg_kdropanalyze(ctx, msg).await?,
        "embed" => debug::dbg_embed(ctx, msg).await?,
        "message" => debug::dbg_message(ctx, msg).await?,
        "regexify-text" => debug::dbg_regexify_text(ctx, msg).await?,
        "regextxt" => debug::dbg_regexify_text(ctx, msg).await?,
        "parse-qingque-atopwl" => debug::dbg_parse_qingque_atopwl(ctx, msg).await?,
        "parse-katana-kc_ow" => debug::dbg_parse_katana_kc_ow(ctx, msg).await?,
        "parse-katana-klu_lookup" => debug::dbg_parse_katana_klu_lookup(ctx, msg).await?,
--- a/swordfish/src/tesseract/mod.rs
+++ b/swordfish/src/tesseract/mod.rs
@ -1,2 +1,3 @@
 pub mod libtesseract;
 pub mod subprocess;
 pub mod utils;
--- a/swordfish/src/tesseract/utils.rs
+++ b/swordfish/src/tesseract/utils.rs
@ -0,0 +1,272 @@
 use swordfish_common::trace;
 // Non-alphanumeric characters that survive the `retain` filter in `fix_tesseract_string`.
 // NOTE(review): '\'' appears twice; possibly one entry was meant to be a different quote
 // character — confirm before changing (the array length is part of the type).
 const ALLOWED_CHARS: [char; 14] = [
    ' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@', '&', '_', 'é',
 ];
 // Bracket and parenthesis characters; `regexify_text` checks for them before trimming a
 // word for partial matching.
 const REGEX_CHARS: [char; 4] = ['[', ']', ')', '('];
 /// Replaces the first occurrence of `from` in `text` with `to`, in place.
 ///
 /// Returns `true` when a replacement happened and `false` when `from` was not found,
 /// which lets callers loop until no occurrence is left.
 fn replace_string(text: &mut String, from: &str, to: &str) -> bool {
    if let Some(start) = text.find(from) {
        let end = start + from.len();
        text.replace_range(start..end, to);
        return true;
    }
    false
 }
 /// Cleans up raw Tesseract OCR output in place using a fixed sequence of heuristics.
 ///
 /// In order: trim a trailing newline and known corner artifacts, turn em dashes into
 /// hyphens, drop a leading non-alphanumeric character, patch characters around line
 /// breaks, flatten newlines to spaces, discard everything outside `ALLOWED_CHARS` and
 /// ASCII alphanumerics, repair a few known misreads (`mn`, `1ll`, `lll`, `Il`), collapse
 /// repeated spaces, and finally trim a trailing dash and surrounding whitespace.
 ///
 /// An empty string is accepted and left empty.
 pub fn fix_tesseract_string(text: &mut String) {
    // Drop a single trailing newline.
    trace!("Text: {}", text);
    if text.ends_with("\n") {
        text.pop();
    }
    // Workaround for a stray trailing "\nN",
    // e.g. "We Never Learn\nN" -> "We Never Learn"
    trace!("Text: {}", text);
    if text.ends_with("\nN") {
        text.truncate(text.len() - 2);
    }
    // Replace em dashes first: the byte-indexed edits further down would otherwise hit
    // "byte index 13 is not a char boundary; it is inside '—' (bytes 11..14)".
    while replace_string(text, "—", "-") {
        trace!("Replacing '—' with '-'");
    }
    // Strip a leading "- " / "-." artifact.
    trace!("Text: {}", text);
    if text.starts_with("- ") || text.starts_with("-.") {
        text.drain(0..2);
    }
    // Remove the first character if it is not alphanumeric. The emptiness check matters:
    // `String::remove(0)` panics on an empty string.
    if !text.is_empty() && !text.starts_with(|c: char| c.is_ascii_alphanumeric()) {
        text.remove(0);
    }
    // Workaround IR -> Ik
    // Maybe it only occurs if Ik is in the start of the string?
    // e.g. "IReda" -> "Ikeda"
    trace!("Text: {}", text);
    replace_string(text, "IR", "Ik");
    // Workaround for "A\n"
    // This is usually the corner of the card
    trace!("Text: {}", text);
    replace_string(text, "A\n", "");
    // Workaround for '“NO'
    // This is usually the left bottom corner of the card.
    // The suffix is 5 bytes long ('“' alone takes 3), so cut by its real byte length;
    // removing a fixed 4 bytes started inside '“' and panicked.
    trace!("Text: {}", text);
    let corner_artifact = r##"“NO"##;
    if text.ends_with(corner_artifact) {
        text.truncate(text.len() - corner_artifact.len());
    }
    // Workaround for "\n." (and others in the future)
    // NOTE(review): `i` is a character index into the snapshot but is used as a byte index
    // into `text`, and `text` is not re-synchronised after an edit. Also, `nth` advances
    // `clone_chars`, so after the first lookup the "previous"/"next" characters are read
    // relative to the iterator's current position rather than to `i`. Both are kept as they
    // were so the output stays unchanged; revisit with real OCR samples at hand.
    let text_clone = text.clone();
    let mut clone_chars = text_clone.chars();
    for (i, c) in clone_chars.clone().enumerate() {
        if c != '\n' {
            continue;
        }
        // A newline at index 0 has nothing before it. `i - 1` used to underflow here: a
        // panic with overflow checks on, otherwise `nth(usize::MAX)` exhausted the iterator
        // so no later newline was handled either. `break` keeps that non-panicking outcome.
        let prev_index = match i.checked_sub(1) {
            Some(index) => index,
            None => break,
        };
        let prev_char = match clone_chars.nth(prev_index) {
            Some(c) => c,
            None => continue,
        };
        let mut rm_prev: isize = 0;
        trace!("Prev char: {}", prev_char);
        if ['-'].contains(&prev_char) {
            rm_prev = 1;
            text.remove(prev_index);
        }
        // Fix for "Asobi ni Iku lo Asobi ni Oide" -> "Asobi ni Iku yo! Asobi ni Oide"
        if prev_char == 'l' {
            // Same underflow guard as above, this time for a newline at index 1.
            let prev_prev_index = match i.checked_sub(2) {
                Some(index) => index,
                None => break,
            };
            let prev_prev_char = match clone_chars.nth(prev_prev_index) {
                Some(c) => c,
                None => continue,
            };
            trace!("Prev prev char: {}", prev_prev_char);
            if prev_prev_char == 'o' {
                // The rewrite needs three positions before the newline; skip it when
                // they do not exist instead of underflowing the range start.
                if let Some(drain_from) = i.checked_sub(3) {
                    rm_prev = -1;
                    text.drain(drain_from..i - 1);
                    text.insert_str(i - 2, "yo!");
                }
            }
        }
        let next_char = match clone_chars.nth(i + 1) {
            Some(c) => c,
            None => break,
        };
        trace!("Next char: {}", next_char);
        if ['.'].contains(&next_char) {
            // `isize` arithmetic: the former `i8` cast overflowed for indices above 126.
            text.remove((i as isize + 1 - rm_prev) as usize);
        }
    }
    // Replace "\n" with " "
    trace!("Text: {}", text);
    while replace_string(text, "\n", " ") {
        trace!("Replacing '\\n' with ' '");
    }
    // Keep only ASCII alphanumerics and the characters listed in ALLOWED_CHARS
    trace!("Text: {}", text);
    text.retain(|c| ALLOWED_CHARS.contains(&c) || c.is_ascii_alphanumeric());
    // Fix "mn" -> "III" at the end of the string
    trace!("Text: {}", text);
    if text.ends_with("mn") {
        text.pop();
        text.pop();
        text.push_str("III");
    }
    // Fix "1ll" -> "III" (first occurrence only)
    trace!("Text: {}", text);
    replace_string(text, "1ll", "III");
    // Fix "lll" -> "!!!" (first occurrence only)
    trace!("Text: {}", text);
    replace_string(text, "lll", "!!!");
    // Fix "Il" -> "II" in the end of the string
    trace!("Text: {}", text);
    if text.ends_with("Il") {
        text.pop();
        text.pop();
        text.push_str("II");
    }
    // Replace multiple spaces with one space
    trace!("Text: {}", text);
    while replace_string(text, "  ", " ") {
        trace!("Removing multiple spaces");
    }
    // Remove the last character if it is a dash
    if text.ends_with("-") {
        text.pop();
    }
    // Strip leading whitespace
    trace!("Text: {}", text);
    while text.starts_with(|c: char| c.is_whitespace()) {
        trace!("Removing leading space");
        text.remove(0);
    }
    // Strip trailing whitespace
    trace!("Text: {}", text);
    while text.ends_with(|c: char| c.is_whitespace()) {
        trace!("Removing ending space");
        text.pop();
    }
    trace!("Text (final): {}", text);
 }
 /// Builds a deliberately loose regular-expression pattern from OCR'd `text`.
 ///
 /// Characters listed in the table below are widened into classes/alternations
 /// (`0`/`O` -> `[0O]`, `m` -> `(m|ra)`, ...); anything that is not ASCII alphanumeric
 /// becomes a space. A trailing ASCII digit of the widened text is dropped.
 ///
 /// * Inputs shorter than 6 bytes whose widened form contains no whitespace yield an
 ///   anchored `^...$` pattern interleaved with `.*` wildcards.
 /// * Otherwise every kept word becomes a `(?=.*word)` lookahead and the pattern ends
 ///   with `.+`. For inputs longer than 23 bytes, widened words longer than 4 bytes lose
 ///   two characters on each side unless a bracket/parenthesis sits near either edge.
 pub fn regexify_text(text: &String) -> String {
    let short_text = text.len() < 6;
    let partial_match = text.len() > 23;
    let mut regex = String::new();
    let mut ascii_text = String::new();
    let mut seen: Vec<char> = Vec::new();
    for c in text.chars() {
        // Here comes the workaround...
        // e.g. the character "0" is sometimes used in place of "O" in names
        match c {
            '0' | 'O' => ascii_text.push_str("[0O]"),
            'u' | 'v' => ascii_text.push_str("[uv]"),
            't' => ascii_text.push_str("[ti]"),
            'I' | 'l' | '!' | '1' => ascii_text.push_str("[Il!1i]"),
            '.' => {
                // A dot right after "<whitespace><number>" is dropped entirely
                // (and not recorded in `seen`); any other dot becomes a space.
                if seen.len() > 3 {
                    if let [.., before_last, last] = seen.as_slice() {
                        if last.is_numeric() && before_last.is_whitespace() {
                            continue;
                        }
                    }
                }
                ascii_text.push(' ');
            }
            'R' => ascii_text.push_str("[Rk]"),
            'm' => ascii_text.push_str("(m|ra)"),
            'a' => ascii_text.push_str("[ao]"),
            _ if c.is_ascii_alphanumeric() => ascii_text.push(c),
            _ => ascii_text.push(' '),
        }
        seen.push(c);
    }
    let ends_in_digit = ascii_text
        .chars()
        .next_back()
        .map_or(false, |d| d.is_ascii_digit());
    if ends_in_digit {
        ascii_text.pop();
    }
    // Filter for short string.
    if short_text && !ascii_text.chars().any(char::is_whitespace) {
        regex.push('^');
        // Start index of the character class currently being copied, if any.
        let mut open_class_at: Option<usize> = None;
        // Set once the text starts with a class; from then on every class is followed by ".*".
        let mut wildcard_after_class = false;
        // `ascii_text` holds only ASCII, so character positions double as byte offsets.
        for (pos, ch) in ascii_text.chars().enumerate() {
            trace!("Char: {}", ch);
            if ch == '[' {
                open_class_at = Some(pos);
                if pos == 0 {
                    wildcard_after_class = true;
                }
                continue;
            }
            if pos + 1 == ascii_text.len() {
                regex.push_str(".*");
            }
            match open_class_at {
                Some(start) => {
                    if ch == ']' {
                        open_class_at = None;
                        regex.push_str(&ascii_text[start..=pos]);
                        if wildcard_after_class {
                            regex.push_str(".*");
                        }
                    }
                }
                None => {
                    regex.push(ch);
                    if pos == 0 {
                        regex.push_str(".*");
                    }
                }
            }
        }
        regex.push('$');
        trace!("Regex (short string): {}", regex);
        return regex;
    }
    let words: Vec<&str> = ascii_text.split_whitespace().collect();
    let word_count = words.len();
    trace!("Partial match: {}", partial_match);
    for (i, word) in words.iter().copied().enumerate() {
        if word.len() < 2 {
            // Single-character words are skipped in the interior, and "x"/"X" everywhere.
            let interior = i > 0 && i + 1 < word_count;
            if interior || matches!(word, "x" | "X") {
                continue;
            }
        }
        regex.push_str("(?=.*");
        let processed_word = word.to_lowercase();
        trace!("Processed word: {}", processed_word);
        if partial_match && processed_word.len() > 4 {
            // Remove first two and last two characters for "partial match",
            // unless a regex bracket sits in the first three or last two characters.
            let has_regex_char = |s: &str| s.chars().any(|c| REGEX_CHARS.contains(&c));
            let tail_start = word.len() - 2;
            let trimmable = !has_regex_char(&processed_word[..3])
                && !has_regex_char(&processed_word[tail_start..word.len()]);
            if trimmable {
                regex.push_str(&processed_word[2..tail_start]);
            } else {
                regex.push_str(&processed_word);
            }
        } else {
            // Do not push word boundary if the word contains special characters like "!"
            trace!("Current processed word: {}", processed_word);
            let plain = processed_word.chars().all(|c| c.is_ascii_alphanumeric());
            if plain {
                regex.push_str("\\b");
            }
            regex.push_str(&processed_word);
            if plain {
                regex.push_str("\\b");
            }
        }
        regex.push(')');
    }
    regex.push_str(".+");
    trace!("Regex: {}", regex);
    regex
 }