feat(tesseract): set DPI to 70
I copied Nori's code.
This commit is contained in:
parent
6d735e76cc
commit
fa9ecaf59c
@ -13,7 +13,6 @@ use swordfish_common::{trace, warn};
|
|||||||
use tokio::task;
|
use tokio::task;
|
||||||
|
|
||||||
static TEXT_NUM_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[A-Za-z0-9]").unwrap());
|
static TEXT_NUM_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[A-Za-z0-9]").unwrap());
|
||||||
static ALLOWED_CHARS_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\-!.': ]").unwrap());
|
|
||||||
const CARD_NAME_X_OFFSET: u32 = 22;
|
const CARD_NAME_X_OFFSET: u32 = 22;
|
||||||
const CARD_NAME_Y_OFFSET: u32 = 28;
|
const CARD_NAME_Y_OFFSET: u32 = 28;
|
||||||
const CARD_NAME_WIDTH: u32 = 202 - CARD_NAME_X_OFFSET;
|
const CARD_NAME_WIDTH: u32 = 202 - CARD_NAME_X_OFFSET;
|
||||||
@ -123,9 +122,7 @@ fn fix_tesseract_string(text: &mut String) {
|
|||||||
}
|
}
|
||||||
// Remove all non-alphanumeric characters
|
// Remove all non-alphanumeric characters
|
||||||
trace!("Text: {}", text);
|
trace!("Text: {}", text);
|
||||||
text.retain(|c| {
|
text.retain(|c| TEXT_NUM_REGEX.is_match(&c.to_string()) || c.is_ascii_alphanumeric());
|
||||||
TEXT_NUM_REGEX.is_match(&c.to_string()) || ALLOWED_CHARS_REGEX.is_match(&c.to_string())
|
|
||||||
});
|
|
||||||
// Fix "mn" -> "III"
|
// Fix "mn" -> "III"
|
||||||
trace!("Text: {}", text);
|
trace!("Text: {}", text);
|
||||||
if text.ends_with("mn") {
|
if text.ends_with("mn") {
|
||||||
|
@ -1,49 +1,44 @@
|
|||||||
pub use leptess::{LepTess, Variable};
|
pub use leptess::{LepTess, Variable};
|
||||||
use std::{
|
use std::sync::{Arc, LazyLock, Mutex};
|
||||||
sync::{Arc, LazyLock, Mutex},
|
use tokio::task;
|
||||||
thread,
|
|
||||||
};
|
|
||||||
|
|
||||||
static TESSERACT: LazyLock<Arc<Mutex<LepTess>>> = LazyLock::new(|| {
|
|
||||||
let mut lep_tess = match LepTess::new(None, "eng") {
|
|
||||||
Ok(lep_tess) => lep_tess,
|
|
||||||
Err(why) => panic!("{}", format!("Failed to initialize Tesseract: {:?}", why)),
|
|
||||||
};
|
|
||||||
// lep_tess.set_variable(Variable::TesseditPagesegMode, "6").unwrap();
|
|
||||||
// Use LSTM only.
|
|
||||||
lep_tess
|
|
||||||
.set_variable(Variable::TesseditOcrEngineMode, "2")
|
|
||||||
.unwrap();
|
|
||||||
Arc::new(Mutex::new(lep_tess))
|
|
||||||
});
|
|
||||||
|
|
||||||
static mut TESSERACT_VEC: Vec<Arc<Mutex<LepTess>>> = Vec::new();
|
static mut TESSERACT_VEC: Vec<Arc<Mutex<LepTess>>> = Vec::new();
|
||||||
|
static mut TESSERACT_NUMERIC_VEC: Vec<Arc<Mutex<LepTess>>> = Vec::new();
|
||||||
|
|
||||||
///
|
pub unsafe fn get_tesseract() -> Arc<Mutex<LepTess>> {
|
||||||
/// Get a Tesseract instance.
|
|
||||||
///
|
|
||||||
/// Deprecated because it provides no performance benefit; if you really need
|
|
||||||
/// one, then use get_tesseract_from_vec.
|
|
||||||
///
|
|
||||||
pub fn get_tesseract(numeric_only: bool) -> Arc<Mutex<LepTess>> {
|
|
||||||
TESSERACT.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub unsafe fn get_tesseract_from_vec(numeric_only: bool) -> Arc<Mutex<LepTess>> {
|
|
||||||
let lep_tess: Arc<Mutex<LepTess>>;
|
let lep_tess: Arc<Mutex<LepTess>>;
|
||||||
if TESSERACT_VEC.len() == 0 {
|
if TESSERACT_VEC.len() == 0 {
|
||||||
for _ in 0..3 {
|
for _ in 0..3 {
|
||||||
let num_only = numeric_only.clone();
|
task::spawn(async move {
|
||||||
thread::spawn(move || {
|
let ocr = init_tesseract(false).unwrap();
|
||||||
let ocr = init_tesseract(num_only).unwrap();
|
|
||||||
TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
|
TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
lep_tess = Arc::new(Mutex::new(init_tesseract(numeric_only).unwrap()));
|
lep_tess = Arc::new(Mutex::new(init_tesseract(false).unwrap()));
|
||||||
} else {
|
} else {
|
||||||
lep_tess = TESSERACT_VEC.pop().unwrap();
|
lep_tess = TESSERACT_VEC.pop().unwrap();
|
||||||
thread::spawn(move || unsafe {
|
task::spawn(async move {
|
||||||
let ocr = init_tesseract(numeric_only).unwrap();
|
let ocr = init_tesseract(false).unwrap();
|
||||||
|
TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
lep_tess
|
||||||
|
}
|
||||||
|
|
||||||
|
pub unsafe fn get_tesseract_numeric() -> Arc<Mutex<LepTess>> {
|
||||||
|
let lep_tess: Arc<Mutex<LepTess>>;
|
||||||
|
if TESSERACT_NUMERIC_VEC.len() == 0 {
|
||||||
|
for _ in 0..3 {
|
||||||
|
task::spawn(async move {
|
||||||
|
let ocr = init_tesseract(false).unwrap();
|
||||||
|
TESSERACT_NUMERIC_VEC.push(Arc::new(Mutex::new(ocr)));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
lep_tess = Arc::new(Mutex::new(init_tesseract(false).unwrap()));
|
||||||
|
} else {
|
||||||
|
lep_tess = TESSERACT_VEC.pop().unwrap();
|
||||||
|
task::spawn(async move {
|
||||||
|
let ocr = init_tesseract(false).unwrap();
|
||||||
TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
|
TESSERACT_VEC.push(Arc::new(Mutex::new(ocr)));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -62,6 +57,8 @@ pub fn init_tesseract(numeric_only: bool) -> Result<LepTess, String> {
|
|||||||
lep_tess
|
lep_tess
|
||||||
.set_variable(Variable::TesseditOcrEngineMode, "1")
|
.set_variable(Variable::TesseditOcrEngineMode, "1")
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
// Set 70 as DPI
|
||||||
|
lep_tess.set_variable(Variable::UserDefinedDpi, "70").unwrap();
|
||||||
if numeric_only {
|
if numeric_only {
|
||||||
match lep_tess.set_variable(Variable::TesseditCharWhitelist, "0123456789") {
|
match lep_tess.set_variable(Variable::TesseditCharWhitelist, "0123456789") {
|
||||||
Ok(_) => (),
|
Ok(_) => (),
|
||||||
|
@ -6,7 +6,7 @@ static TESSERACT_ARGS: LazyLock<Args> = LazyLock::new(|| Args {
|
|||||||
lang: "eng".to_string(),
|
lang: "eng".to_string(),
|
||||||
config_variables: HashMap::new(),
|
config_variables: HashMap::new(),
|
||||||
psm: Some(6),
|
psm: Some(6),
|
||||||
dpi: None,
|
dpi: Some(70),
|
||||||
oem: Some(1),
|
oem: Some(1),
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -14,7 +14,7 @@ static TESSERACT_NUMERIC_ARGS: LazyLock<Args> = LazyLock::new(|| Args {
|
|||||||
lang: "eng".to_string(),
|
lang: "eng".to_string(),
|
||||||
config_variables: HashMap::from([("tessedit_char_whitelist".into(), "0123456789".into())]),
|
config_variables: HashMap::from([("tessedit_char_whitelist".into(), "0123456789".into())]),
|
||||||
psm: Some(6),
|
psm: Some(6),
|
||||||
dpi: None,
|
dpi: Some(70),
|
||||||
oem: Some(1),
|
oem: Some(1),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user