From bc4dcad932f199f6568ae6388563acfef911f5f3 Mon Sep 17 00:00:00 2001 From: tretrauit Date: Wed, 31 Jan 2024 01:25:58 +0700 Subject: [PATCH] fix(regex): break immediately after appending the last character --- swordfish/src/tesseract/utils.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/swordfish/src/tesseract/utils.rs b/swordfish/src/tesseract/utils.rs index 8f765c8..2db0929 100644 --- a/swordfish/src/tesseract/utils.rs +++ b/swordfish/src/tesseract/utils.rs @@ -101,7 +101,7 @@ pub fn fix_tesseract_string(text: &mut String) { } // Remove all non-alphanumeric characters trace!("Text: {}", text); - text.retain(|c| ALLOWED_CHARS.contains(&c) || c.is_ascii_alphanumeric()); + text.retain(|c| ALLOWED_CHARS.binary_search(&c).is_ok() || c.is_ascii_alphanumeric()); // Fix "mn" -> "III" trace!("Text: {}", text); if text.ends_with("mn") { @@ -160,15 +160,15 @@ pub fn regexify_text(text: &String) -> String { for c in text.chars() { // Here comes the workaround... // The character "0" is sometimes used in place of "O" in names - if ['0', 'O'].contains(&c) { + if ['0', 'O'].binary_search(&c).is_ok() { ascii_text.push_str("[0O]"); - } else if ['u', 'v', 'y'].contains(&c) { + } else if ['u', 'v', 'y'].binary_search(&c).is_ok() { ascii_text.push_str("[uvy]"); - } else if ['t'].contains(&c) { + } else if ['t'].binary_search(&c).is_ok() { ascii_text.push_str("[ti]"); - } else if ['I', 'l', '!', '1'].contains(&c) { + } else if ['I', 'l', '!', '1'].binary_search(&c).is_ok() { ascii_text.push_str("[Il!1i]"); - } else if ['.'].contains(&c) { + } else if ['.'].binary_search(&c).is_ok() { if prev_chars.len() > 3 { let prev_char = prev_chars[prev_chars.len() - 1]; let prev_prev_char = prev_chars[prev_chars.len() - 2]; @@ -177,12 +177,14 @@ pub fn regexify_text(text: &String) -> String { } } ascii_text.push(' '); - } else if ['R'].contains(&c) { + } else if ['R'].binary_search(&c).is_ok() { ascii_text.push_str("[Rk]"); - } else if ['m'].contains(&c) { + } else if ['m'].binary_search(&c).is_ok() { ascii_text.push_str("(m|ra)"); - } else if ['a'].contains(&c) { + } else if ['a'].binary_search(&c).is_ok() { ascii_text.push_str("[ao]") + } else if ['H', 'E'].binary_search(&c).is_ok() { + ascii_text.push_str("[HE]") } else if c.is_ascii_alphanumeric() { ascii_text.push(c); } else { @@ -212,6 +214,7 @@ pub fn regexify_text(text: &String) -> String { } else if i == ascii_text.len() - 1 { regex.push_str(".*"); regex.push(char); + break; } if regex_any { if char == ']' { @@ -249,9 +252,9 @@ pub fn regexify_text(text: &String) -> String { trace!("Processed word: {}", processed_word); if partial_match && processed_word.len() > 4 { // Remove first two and last two characters for "partial match" - if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c)) + if !processed_word[0..3].contains(|c: char| REGEX_CHARS.binary_search(&c).is_ok()) && !processed_word[word.len() - 2..word.len()] - .contains(|c: char| REGEX_CHARS.contains(&c)) + .contains(|c: char| REGEX_CHARS.binary_search(&c).is_ok()) { regex.push_str(&processed_word[2..word.len() - 2]); } else {