diff --git a/swordfish/src/katana.rs b/swordfish/src/katana.rs
index 887b60f..e09b709 100644
--- a/swordfish/src/katana.rs
+++ b/swordfish/src/katana.rs
@@ -13,7 +13,6 @@ use swordfish_common::{trace, warn};
 use tokio::task;
 
 static TEXT_NUM_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[A-Za-z0-9]").unwrap());
-static ALLOWED_CHARS_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\-!.': ]").unwrap());
 const CARD_NAME_X_OFFSET: u32 = 22;
 const CARD_NAME_Y_OFFSET: u32 = 28;
 const CARD_NAME_WIDTH: u32 = 202 - CARD_NAME_X_OFFSET;
@@ -123,9 +122,7 @@ fn fix_tesseract_string(text: &mut String) {
     }
     // Remove all non-alphanumeric characters
     trace!("Text: {}", text);
-    text.retain(|c| {
-        TEXT_NUM_REGEX.is_match(&c.to_string()) || ALLOWED_CHARS_REGEX.is_match(&c.to_string())
-    });
+    text.retain(|c| TEXT_NUM_REGEX.is_match(&c.to_string()) || c.is_ascii_alphanumeric());
     // Fix "mn" -> "III"
     trace!("Text: {}", text);
     if text.ends_with("mn") {
diff --git a/swordfish/src/tesseract/libtesseract.rs b/swordfish/src/tesseract/libtesseract.rs
index 58a2438..593ac86 100644
--- a/swordfish/src/tesseract/libtesseract.rs
+++ b/swordfish/src/tesseract/libtesseract.rs
@@ -1,49 +1,56 @@
 pub use leptess::{LepTess, Variable};
-use std::{
-    sync::{Arc, LazyLock, Mutex},
-    thread,
-};
-
-static TESSERACT: LazyLock<Arc<Mutex<LepTess>>> = LazyLock::new(|| {
-    let mut lep_tess = match LepTess::new(None, "eng") {
-        Ok(lep_tess) => lep_tess,
-        Err(why) => panic!("{}", format!("Failed to initialize Tesseract: {:?}", why)),
-    };
-    // lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap();
-    // Use LSTM only.
-    lep_tess
-        .set_variable(Variable::TesseditOcrEngineMode, "2")
-        .unwrap();
-    Arc::new(Mutex::new(lep_tess))
-});
+use std::sync::{Arc, LazyLock, Mutex};
+use tokio::task;
 
 static mut TESSERACT_VEC: Vec<Arc<Mutex<LepTess>>> = Vec::new();
+static mut TESSERACT_NUMERIC_VEC: Vec<Arc<Mutex<LepTess>>> = Vec::new();
 
-///
-/// Get a Tesseract instance.
-///
-/// Deprecated because it provides no performance benefit, if you really need
-/// then use get_tesseract_from_vec.
-///
-pub fn get_tesseract(numeric_only: bool) -> Arc<Mutex<LepTess>> {
-    TESSERACT.clone()
-}
-
-pub unsafe fn get_tesseract_from_vec(numeric_only: bool) -> Arc<Mutex<LepTess>> {
+/// Returns a general-purpose Tesseract instance and queues background replacements.
+///
+/// # Safety
+///
+/// Reads and mutates the unsynchronized `static mut` pool `TESSERACT_VEC`, including
+/// from the spawned refill tasks; callers must not invoke this concurrently.
+pub unsafe fn get_tesseract() -> Arc<Mutex<LepTess>> {
     let lep_tess: Arc<Mutex<LepTess>>;
     if TESSERACT_VEC.len() == 0 {
         for _ in 0..3 {
-            let num_only = numeric_only.clone();
-            thread::spawn(move || {
-                let ocr = init_tesseract(num_only).unwrap();
+            task::spawn(async move {
+                let ocr = init_tesseract(false).unwrap();
                 TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
             });
         }
-        lep_tess = Arc::new(Mutex::new(init_tesseract(numeric_only).unwrap()));
+        lep_tess = Arc::new(Mutex::new(init_tesseract(false).unwrap()));
     } else {
         lep_tess = TESSERACT_VEC.pop().unwrap();
-        thread::spawn(move || unsafe {
-            let ocr = init_tesseract(numeric_only).unwrap();
+        task::spawn(async move {
+            let ocr = init_tesseract(false).unwrap();
+            TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
+        });
+    }
+    lep_tess
+}
+
+/// Returns a digits-only Tesseract instance and queues background replacements.
+///
+/// # Safety
+///
+/// Reads and mutates the unsynchronized `static mut` pool `TESSERACT_NUMERIC_VEC`, including
+/// from the spawned refill tasks; callers must not invoke this concurrently.
+pub unsafe fn get_tesseract_numeric() -> Arc<Mutex<LepTess>> {
+    let lep_tess: Arc<Mutex<LepTess>>;
+    if TESSERACT_NUMERIC_VEC.len() == 0 {
+        for _ in 0..3 {
+            task::spawn(async move {
+                let ocr = init_tesseract(true).unwrap();
+                TESSERACT_NUMERIC_VEC.push(Arc::new(Mutex::new(ocr)));
+            });
+        }
+        lep_tess = Arc::new(Mutex::new(init_tesseract(true).unwrap()));
+    } else {
+        lep_tess = TESSERACT_NUMERIC_VEC.pop().unwrap();
+        task::spawn(async move {
+            let ocr = init_tesseract(true).unwrap();
-            TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
+            TESSERACT_NUMERIC_VEC.push(Arc::new(Mutex::new(ocr)));
         });
     }
@@ -62,6 +69,8 @@ pub fn init_tesseract(numeric_only: bool) -> Result {
     lep_tess
         .set_variable(Variable::TesseditOcrEngineMode, "1")
         .unwrap();
+    // Set 70 as DPI
+    lep_tess.set_variable(Variable::UserDefinedDpi, "70").unwrap();
     if numeric_only {
         match lep_tess.set_variable(Variable::TesseditCharWhitelist, "0123456789") {
             Ok(_) => (),
diff --git a/swordfish/src/tesseract/subprocess.rs b/swordfish/src/tesseract/subprocess.rs
index ed864db..6ad3e61 100644
--- a/swordfish/src/tesseract/subprocess.rs
+++ b/swordfish/src/tesseract/subprocess.rs
@@ -6,7 +6,7 @@ static TESSERACT_ARGS: LazyLock = LazyLock::new(|| Args {
     lang: "eng".to_string(),
     config_variables: HashMap::new(),
     psm: Some(6),
-    dpi: None,
+    dpi: Some(70),
     oem: Some(1),
 });
 
@@ -14,7 +14,7 @@ static TESSERACT_NUMERIC_ARGS: LazyLock = LazyLock::new(|| Args {
     lang: "eng".to_string(),
     config_variables: HashMap::from([("tessedit_char_whitelist".into(), "0123456789".into())]),
     psm: Some(6),
-    dpi: None,
+    dpi: Some(70),
     oem: Some(1),
 });
 