From 8e2a62942702f5b8b3a91095d14a56d40e776e72 Mon Sep 17 00:00:00 2001 From: tretrauit Date: Sat, 10 Feb 2024 21:44:44 +0700 Subject: [PATCH] fix(tesseract/regex): fix wrong string length caused by Unicode The slice bounds for `p_word_unicode` were computed from `word.len()` instead of the length of `p_word_unicode`, the slice actually being indexed; use `p_word_unicode.len()` for them. Also drop a redundant `format!("{}", ...)` around `processed_word`. --- swordfish/src/tesseract/utils.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/swordfish/src/tesseract/utils.rs b/swordfish/src/tesseract/utils.rs index 10798a5..eda0ed4 100644 --- a/swordfish/src/tesseract/utils.rs +++ b/swordfish/src/tesseract/utils.rs @@ -263,11 +263,15 @@ pub fn regexify_text(text: &String) -> String { if !p_word_unicode[0..3] .iter() .any(|c: &&str| REGEX_STRINGS.contains(&c)) - && !p_word_unicode[word.len() - 2..word.len()] + && !p_word_unicode[p_word_unicode.len() - 2..p_word_unicode.len()] .iter() .any(|c: &&str| REGEX_STRINGS.contains(&c)) { - regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str()); + regex.push_str( + p_word_unicode[2..p_word_unicode.len() - 2] + .concat() + .as_str(), + ); } else { regex.push_str(&processed_word.as_str()); } @@ -277,7 +281,7 @@ pub fn regexify_text(text: &String) -> String { if processed_word.chars().all(|c| c.is_ascii_alphanumeric()) { regex.push_str(format!("\\b{}\\b", &processed_word.as_str()).as_str()); } else { - regex.push_str(format!("{}", &processed_word.as_str()).as_str()); + regex.push_str(&processed_word.as_str()); } } regex.push_str(")");