From 0ada9f6e464ae141b5bcb98ae4fba75fc1554363 Mon Sep 17 00:00:00 2001 From: tretrauit Date: Fri, 5 Jan 2024 20:18:54 +0700 Subject: [PATCH] feat: use rusty-tesseract as another backend Also use OnceLock & LazyLock --- Cargo.lock | 34 ++++- swordfish-common/Cargo.toml | 1 + swordfish-common/src/lib.rs | 2 + swordfish-common/src/tesseract.rs | 18 --- .../src/tesseract/libtesseract.rs | 60 +++++++++ swordfish-common/src/tesseract/mod.rs | 2 + swordfish-common/src/tesseract/subprocess.rs | 36 ++++++ swordfish/Cargo.toml | 1 - swordfish/src/config.rs | 16 ++- swordfish/src/katana.rs | 121 +++++++++++++----- swordfish/src/main.rs | 53 ++++---- 11 files changed, 259 insertions(+), 85 deletions(-) delete mode 100644 swordfish-common/src/tesseract.rs create mode 100644 swordfish-common/src/tesseract/libtesseract.rs create mode 100644 swordfish-common/src/tesseract/mod.rs create mode 100644 swordfish-common/src/tesseract/subprocess.rs diff --git a/Cargo.lock b/Cargo.lock index 681153c..b8fe845 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1442,6 +1442,19 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rusty-tesseract" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e49fec5324d880080a07a9a1c83c9a3aab3c9128c26273ec56a8443cc9d3a334" +dependencies = [ + "image", + "subprocess", + "substring", + "tempfile", + "thiserror", +] + [[package]] name = "ryu" version = "1.0.16" @@ -1676,13 +1689,31 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "substring" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ee6433ecef213b2e72f587ef64a2f5943e7cd16fbd82dbe8bc07486c534c86" +dependencies = [ + "autocfg", +] + [[package]] name = "swordfish" version = "0.1.0" dependencies = [ "dotenvy", "image", - "once_cell", "regex", "serde", "serenity", @@ -1699,6 +1730,7 @@ dependencies = [ "humantime", "leptess", "log", + "rusty-tesseract", "tracing", "tracing-subscriber", ] diff --git a/swordfish-common/Cargo.toml b/swordfish-common/Cargo.toml index 471bf70..ea85c73 100644 --- a/swordfish-common/Cargo.toml +++ b/swordfish-common/Cargo.toml @@ -10,5 +10,6 @@ fern = "0.6.2" humantime = "2.1.0" leptess = "0.14.0" log = "0.4.20" +rusty-tesseract = "1.1.9" tracing = "0.1.40" tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } diff --git a/swordfish-common/src/lib.rs b/swordfish-common/src/lib.rs index f1f5d4e..4a8cafb 100644 --- a/swordfish-common/src/lib.rs +++ b/swordfish-common/src/lib.rs @@ -1,3 +1,4 @@ +#![feature(lazy_cell)] pub use log; pub use tracing::{debug, error, info, trace, warn}; use tracing_subscriber::{self, fmt, EnvFilter}; @@ -10,6 +11,7 @@ pub fn setup_logger(level: &str) -> Result<(), fern::InitError> { .with_level(true) .with_target(true) .with_thread_ids(false) + .with_line_number(true) .with_thread_names(false); let filter = EnvFilter::builder() .from_env() diff --git a/swordfish-common/src/tesseract.rs b/swordfish-common/src/tesseract.rs deleted file mode 100644 index dac0b4a..0000000 --- a/swordfish-common/src/tesseract.rs +++ /dev/null @@ -1,18 +0,0 @@ -pub use leptess::{LepTess, Variable}; - -pub fn init_tesseract(numeric_only: bool) -> Result { - let mut lep_tess = match LepTess::new(None, "eng") { - Ok(lep_tess) => lep_tess, - Err(why) => return Err(format!("Failed to initialize Tesseract: {:?}", why)), - }; - lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap(); - // Use LSTM only. - lep_tess.set_variable(Variable::TesseditOcrEngineMode, "1").unwrap(); - if numeric_only { - match lep_tess.set_variable(Variable::TesseditCharWhitelist, "0123456789") { - Ok(_) => (), - Err(why) => return Err(format!("Failed to set whitelist: {:?}", why)), - }; - } - Ok(lep_tess) -} diff --git a/swordfish-common/src/tesseract/libtesseract.rs b/swordfish-common/src/tesseract/libtesseract.rs new file mode 100644 index 0000000..f1aa3e0 --- /dev/null +++ b/swordfish-common/src/tesseract/libtesseract.rs @@ -0,0 +1,60 @@ +pub use leptess::{LepTess, Variable}; +use std::{sync::{ + Arc, Mutex, LazyLock +}, thread}; + +static TESSERACT: LazyLock>> = LazyLock::new(|| { + let mut lep_tess = match LepTess::new(None, "eng") { + Ok(lep_tess) => lep_tess, + Err(why) => panic!("{}", format!("Failed to initialize Tesseract: {:?}", why)), + }; + // lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap(); + // Use LSTM only. + lep_tess.set_variable(Variable::TesseditOcrEngineMode, "2").unwrap(); + Arc::new(Mutex::new(lep_tess)) +}); + +static mut TESSERACT_VEC: Vec>> = Vec::new(); + +pub fn get_tesseract(numeric_only: bool) -> Arc> { + TESSERACT.clone() +} + +pub unsafe fn get_tesseract_from_vec(numeric_only: bool) -> Arc> { + let lep_tess: Arc>; + if TESSERACT_VEC.len() == 0 { + for _ in 0..3 { + let num_only = numeric_only.clone(); + thread::spawn(move || { + let ocr = init_tesseract(num_only).unwrap(); + TESSERACT_VEC.push(Arc::new(Mutex::new(ocr))); + }); + } + lep_tess = Arc::new(Mutex::new(init_tesseract(numeric_only).unwrap())); + } + else { + lep_tess = TESSERACT_VEC.pop().unwrap(); + thread::spawn(move || unsafe { + let ocr = init_tesseract(numeric_only).unwrap(); + TESSERACT_VEC.push(Arc::new(Mutex::new(ocr))); + }); + } + lep_tess +} + +pub fn init_tesseract(numeric_only: bool) -> Result { + let mut lep_tess = match LepTess::new(None, "eng") { + Ok(lep_tess) => lep_tess, + Err(why) => return Err(format!("Failed to initialize Tesseract: {:?}", why)), + }; + lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap(); + // Use LSTM only. + lep_tess.set_variable(Variable::TesseditOcrEngineMode, "1").unwrap(); + if numeric_only { + match lep_tess.set_variable(Variable::TesseditCharWhitelist, "0123456789") { + Ok(_) => (), + Err(why) => return Err(format!("Failed to set whitelist: {:?}", why)), + }; + } + Ok(lep_tess) +} diff --git a/swordfish-common/src/tesseract/mod.rs b/swordfish-common/src/tesseract/mod.rs new file mode 100644 index 0000000..dd50bb3 --- /dev/null +++ b/swordfish-common/src/tesseract/mod.rs @@ -0,0 +1,2 @@ +pub mod subprocess; +pub mod libtesseract; \ No newline at end of file diff --git a/swordfish-common/src/tesseract/subprocess.rs b/swordfish-common/src/tesseract/subprocess.rs new file mode 100644 index 0000000..3ec8555 --- /dev/null +++ b/swordfish-common/src/tesseract/subprocess.rs @@ -0,0 +1,36 @@ +pub use rusty_tesseract; +pub use rusty_tesseract::{Args, Image}; +use std::{collections::HashMap, sync::LazyLock}; + +static TESSERACT_ARGS: LazyLock = LazyLock::new(|| Args { + lang: "eng".to_string(), + config_variables: HashMap::new(), + psm: Some(6), + dpi: None, + oem: Some(1), +}); + +static TESSERACT_NUMERIC_ARGS: LazyLock = LazyLock::new(|| Args { + lang: "eng".to_string(), + config_variables: HashMap::from([( + "tessedit_char_whitelist".into(), + "0123456789".into(), + )]), + psm: Some(6), + dpi: None, + oem: Some(1), +}); + +pub fn image_to_string(image: &Image) -> Result { + match rusty_tesseract::image_to_string(image, &TESSERACT_ARGS) { + Ok(text) => Ok(text), + Err(why) => Err(format!("Failed to OCR image: {:?}", why)), + } +} + +pub fn image_to_numeric_string(image: &Image) -> Result { + match rusty_tesseract::image_to_string(image, &TESSERACT_NUMERIC_ARGS) { + Ok(text) => Ok(text), + Err(why) => Err(format!("Failed to OCR image: {:?}", why)), + } +} diff --git a/swordfish/Cargo.toml b/swordfish/Cargo.toml index 0ab689f..3b23b80 100644 --- a/swordfish/Cargo.toml +++ b/swordfish/Cargo.toml @@ -8,7 +8,6 @@ edition = "2021" [dependencies] dotenvy = "0.15.7" image = "0.24.7" -once_cell = "1.19.0" regex = "1.10.2" serde = "1.0.193" serenity = { version = "0.12.0", features = ["builder"] } diff --git a/swordfish/src/config.rs b/swordfish/src/config.rs index 6f7ba64..c6b4039 100644 --- a/swordfish/src/config.rs +++ b/swordfish/src/config.rs @@ -1,21 +1,26 @@ use serde::{Deserialize, Serialize}; use std::fs; -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct FileLog { pub enabled: bool, pub path: String, } - -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Log { pub level: String, pub file: FileLog, } -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct Tesseract { + pub backend: String, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Config { pub log: Log, + pub tesseract: Tesseract, } impl Config { @@ -28,6 +33,9 @@ impl Config { path: "swordfish.log".to_string(), }, }, + tesseract: Tesseract { + backend: "libtesseract".to_string(), + }, } } pub fn save(&self, path: &str) { diff --git a/swordfish/src/katana.rs b/swordfish/src/katana.rs index 10b59c6..778229d 100644 --- a/swordfish/src/katana.rs +++ b/swordfish/src/katana.rs @@ -1,17 +1,18 @@ use image::imageops::colorops::contrast_in_place; use image::io::Reader as ImageReader; -use image::ImageFormat; -use once_cell::sync::Lazy; +use image::{DynamicImage, ImageFormat}; use regex::Regex; use serenity::model::channel::Message; use std::io::Cursor; -use std::{env, thread}; +use std::sync::LazyLock; +use std::thread; use swordfish_common::structs::Card; -use swordfish_common::tesseract; +use swordfish_common::tesseract::{libtesseract, subprocess}; use swordfish_common::{trace, warn}; +use crate::CONFIG; -static TEXT_NUM_REGEX: Lazy = Lazy::new(|| Regex::new(r"[A-Za-z0-9]").unwrap()); -static ALLOWED_CHARS_REGEX: Lazy = Lazy::new(|| Regex::new(r"['-: ]").unwrap()); +static TEXT_NUM_REGEX: LazyLock = LazyLock::new(|| Regex::new(r"[A-Za-z0-9]").unwrap()); +static ALLOWED_CHARS_REGEX: LazyLock = LazyLock::new(|| Regex::new(r"[!'-: ]").unwrap()); fn replace_string(text: &mut String, from: &str, to: &str) -> bool { match text.find(from) { @@ -65,23 +66,33 @@ fn fix_tesseract_string(text: &mut String) { Some(c) => c, None => continue, }; + let mut rm_prev: i8 = 0; + trace!("Prev char: {}", prev_char); + if ['-'].contains(&prev_char) { + rm_prev = 1; + text.remove(i - 1); + } + // Fix for "Asobi ni Iku lo Asobi ni Oide" -> "Asobi ni Iku yo! Asobi ni Oide" + if prev_char == 'l' { + let prev_prev_char = match text.chars().nth(i - 2) { + Some(c) => c, + None => continue, + }; + trace!("Prev prev char: {}", prev_prev_char); + if prev_prev_char == 'o' { + rm_prev = -1; + text.remove(i - 2); + text.remove(i - 2); + text.insert_str(i - 2, "yo!") + } + } let next_char = match text.chars().nth(i + 1) { Some(c) => c, None => break, }; - let mut rm_prev: bool = false; - trace!("Prev char: {}", prev_char); - if ['-'].contains(&prev_char) { - rm_prev = true; - text.remove(i - 1); - } trace!("Next char: {}", next_char); if ['.'].contains(&next_char) { - if rm_prev { - text.remove(i); - } else { - text.remove(i + 1); - } + text.remove((i as i8 + 1 - rm_prev) as usize); } } // Replace "\n" with " " @@ -117,10 +128,7 @@ fn fix_tesseract_string(text: &mut String) { } fn save_image_if_trace(img: &image::DynamicImage, path: &str) { - let log_lvl = match env::var("LOG_LEVEL") { - Ok(log_lvl) => log_lvl, - Err(_) => return, - }; + let log_lvl = CONFIG.get().unwrap().log.level.as_str(); if log_lvl == "trace" { match img.save(path) { Ok(_) => { @@ -133,13 +141,14 @@ fn save_image_if_trace(img: &image::DynamicImage, path: &str) { } } -pub fn analyze_card(card: image::DynamicImage, count: u32) -> Card { +pub fn analyze_card_libtesseract(card: image::DynamicImage, count: u32) -> Card { trace!("Spawning threads for analyzing card..."); // Read the name and the series let card_clone = card.clone(); let name_thread = thread::spawn(move || { - let mut leptess = tesseract::init_tesseract(false).expect("Failed to initialize Tesseract"); - // let binding = tesseract::init_tesseract_quick(false); + let mut leptess = + libtesseract::init_tesseract(false).expect("Failed to initialize Tesseract"); + // let binding = tesseract::get_tesseract_from_vec(false); // let mut leptess = binding.lock().unwrap(); let name_img = card_clone.crop_imm(22, 26, 204 - 22, 70 - 26); let mut buffer: Cursor> = Cursor::new(Vec::new()); @@ -157,8 +166,9 @@ pub fn analyze_card(card: image::DynamicImage, count: u32) -> Card { }); let card_clone = card.clone(); let series_thread = thread::spawn(move || { - let mut leptess = tesseract::init_tesseract(false).expect("Failed to initialize Tesseract"); - // let binding = tesseract::init_tesseract_quick(false); + let mut leptess = + libtesseract::init_tesseract(false).expect("Failed to initialize Tesseract"); + // let binding = tesseract::get_tesseract_from_vec(false); // let mut leptess = binding.lock().unwrap(); let series_img = card_clone.crop_imm(22, 276, 204 - 22, 330 - 276); let mut buffer: Cursor> = Cursor::new(Vec::new()); @@ -191,6 +201,58 @@ pub fn analyze_card(card: image::DynamicImage, count: u32) -> Card { }; } +pub fn analyze_card_subprocess(card: image::DynamicImage, count: u32) -> Card { + trace!("Spawning threads for analyzing card..."); + // Read the name and the series + let card_clone = card.clone(); + let name_thread = thread::spawn(move || { + let name_img = card_clone.crop_imm(22, 26, 204 - 22, 70 - 26); + let img = subprocess::Image::from_dynamic_image(&name_img).unwrap(); + save_image_if_trace( + &name_img, + format!("debug/4-subprocess-{}-name.png", count).as_str(), + ); + let mut name_str = subprocess::image_to_string(&img).unwrap(); + fix_tesseract_string(&mut name_str); + name_str + }); + let card_clone = card.clone(); + let series_thread = thread::spawn(move || { + let series_img = card_clone.crop_imm(22, 276, 204 - 22, 330 - 276); + let img = subprocess::Image::from_dynamic_image(&series_img).unwrap(); + save_image_if_trace( + &series_img, + format!("debug/4-subprocess-{}-series.png", count).as_str(), + ); + let mut series_str = subprocess::image_to_string(&img).unwrap(); + fix_tesseract_string(&mut series_str); + series_str + }); + let name = name_thread.join().unwrap(); + trace!("Name: {}", name); + let series = series_thread.join().unwrap(); + trace!("Series: {}", series); + // TODO: Read the print number + // TODO: Read the wishlist number (from our database) + return Card { + wishlist: None, + name, + series, + print: 0, + }; +} + +fn execute_analyze_drop(image: DynamicImage, count: u32) -> Card { + let config = CONFIG.get().unwrap(); + match config.tesseract.backend.as_str() { + "libtesseract" => analyze_card_libtesseract(image, count), + "subprocess" => analyze_card_subprocess(image, count), + _ => { + panic!("Invalid Tesseract backend: {}", config.tesseract.backend); + } + } +} + pub async fn analyze_drop_message(message: &Message) -> Result, String> { if message.attachments.len() < 1 { return Err("No attachments found".to_string()); @@ -228,11 +290,10 @@ pub async fn analyze_drop_message(message: &Message) -> Result, String trace!("Cropping card {} ({}, {}, {}, {})", i, x, y, width, height); let card_img = img.crop_imm(x, y, width, height); save_image_if_trace(&card_img, &format!("debug/3-cropped-{}.png", i)); - let job = move || { + jobs.push(move || { trace!("Analyzing card {}", i); - Ok((i, analyze_card(card_img, i))) - }; - jobs.push(job); + Ok((i, execute_analyze_drop(card_img, i))) + }); } let mut tasks: Vec>> = Vec::new(); for job in jobs { diff --git a/swordfish/src/main.rs b/swordfish/src/main.rs index 2654b69..9c3d509 100644 --- a/swordfish/src/main.rs +++ b/swordfish/src/main.rs @@ -1,5 +1,5 @@ +#![feature(lazy_cell)] use dotenvy::dotenv; -use once_cell::sync::Lazy; use serenity::async_trait; use serenity::framework::standard::macros::{command, group}; use serenity::framework::standard::{CommandResult, Configuration, StandardFramework}; @@ -10,6 +10,7 @@ use serenity::model::{ use serenity::prelude::*; use std::env; use std::path::Path; +use std::sync::OnceLock; use std::time::Instant; use swordfish_common::*; @@ -21,7 +22,7 @@ mod katana; mod template; const GITHUB_URL: &str = "https://github.com/teppyboy/swordfish"; -static mut LOG_LEVEL: Lazy = Lazy::new(|| "unknown".to_string()); +static CONFIG: OnceLock = OnceLock::new(); #[group] #[commands(ping, kdropanalyze, info)] @@ -77,22 +78,11 @@ async fn main() { config = config::Config::new(); config.save("./config.toml"); } - let level_str = config.log.level; + let level_str = config.log.level.clone(); let log_level = env::var("LOG_LEVEL").unwrap_or(level_str); - unsafe { - // 1st way to kys - LOG_LEVEL = Lazy::new(|| { - let config: Config; - if Path::new("./config.toml").exists() { - config = config::Config::load("./config.toml"); - } else { - config = config::Config::new(); - config.save("./config.toml"); - } - let level_str = config.log.level; - env::var("LOG_LEVEL").unwrap_or(level_str) - }); - } + CONFIG + .set(config) + .expect("Failed to register config to static"); setup_logger(&log_level).expect("Failed to setup logger"); info!("Swordfish v{} - {}", env!("CARGO_PKG_VERSION"), GITHUB_URL); info!("Log level: {}", log_level); @@ -229,19 +219,20 @@ async fn kdropanalyze(ctx: &Context, msg: &Message) -> CommandResult { #[command] async fn info(ctx: &Context, msg: &Message) -> CommandResult { - unsafe { - let reply_str = format!( - "Swordfish v{} - {}\n\ - Log level: `{}`\n\ - Build type: `{}`\n\n\ - Like my work? Consider donating to my [Ko-fi](https://ko-fi.com/tretrauit) or [Patreon](https://patreon.com/tretrauit)!\n\ - ", - env!("CARGO_PKG_VERSION"), - GITHUB_URL, - LOG_LEVEL.as_str(), - env!("BUILD_PROFILE"), - ); - helper::info_message(ctx, msg, reply_str, Some("Information".to_string())).await; - } + let reply_str = format!( + "Swordfish v{} - {}\n\ + Log level: `{}`\n\ + Build type: `{}`\n\n\ + Like my work? Consider supporting me at my [Ko-fi](https://ko-fi.com/tretrauit) or [Patreon](https://patreon.com/tretrauit)!\n\n\ + *Debug information*\n\ + Tesseract backend: `{}`\n\ + ", + env!("CARGO_PKG_VERSION"), + GITHUB_URL, + CONFIG.get().unwrap().log.level.clone().as_str(), + env!("BUILD_PROFILE"), + CONFIG.get().unwrap().tesseract.backend.clone().as_str(), + ); + helper::info_message(ctx, msg, reply_str, Some("Information".to_string())).await; Ok(()) }