From 5facb83ce7f321631cc1053798d25033ce6a1949 Mon Sep 17 00:00:00 2001 From: tretrauit Date: Wed, 31 Jan 2024 01:44:59 +0700 Subject: [PATCH] Revert "fix(regex): break immediately after appending the last character" This reverts commit bc4dcad932f199f6568ae6388563acfef911f5f3. Edited to retain the fix in the commit --- swordfish/src/tesseract/utils.rs | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/swordfish/src/tesseract/utils.rs b/swordfish/src/tesseract/utils.rs index 2db0929..8f765c8 100644 --- a/swordfish/src/tesseract/utils.rs +++ b/swordfish/src/tesseract/utils.rs @@ -101,7 +101,7 @@ pub fn fix_tesseract_string(text: &mut String) { } // Remove all non-alphanumeric characters trace!("Text: {}", text); - text.retain(|c| ALLOWED_CHARS.binary_search(&c).is_ok() || c.is_ascii_alphanumeric()); + text.retain(|c| ALLOWED_CHARS.contains(&c) || c.is_ascii_alphanumeric()); // Fix "mn" -> "III" trace!("Text: {}", text); if text.ends_with("mn") { @@ -160,15 +160,15 @@ pub fn regexify_text(text: &String) -> String { for c in text.chars() { // Here comes the workaround... // The character "0" is sometimes used in place of "O" in names - if ['0', 'O'].binary_search(&c).is_ok() { + if ['0', 'O'].contains(&c) { ascii_text.push_str("[0O]"); - } else if ['u', 'v', 'y'].binary_search(&c).is_ok() { + } else if ['u', 'v', 'y'].contains(&c) { ascii_text.push_str("[uvy]"); - } else if ['t'].binary_search(&c).is_ok() { + } else if ['t'].contains(&c) { ascii_text.push_str("[ti]"); - } else if ['I', 'l', '!', '1'].binary_search(&c).is_ok() { + } else if ['I', 'l', '!', '1'].contains(&c) { ascii_text.push_str("[Il!1i]"); - } else if ['.'].binary_search(&c).is_ok() { + } else if ['.'].contains(&c) { if prev_chars.len() > 3 { let prev_char = prev_chars[prev_chars.len() - 1]; let prev_prev_char = prev_chars[prev_chars.len() - 2]; @@ -177,14 +177,12 @@ pub fn regexify_text(text: &String) -> String { } } ascii_text.push(' '); - } else if ['R'].binary_search(&c).is_ok() { + } else if ['R'].contains(&c) { ascii_text.push_str("[Rk]"); - } else if ['m'].binary_search(&c).is_ok() { + } else if ['m'].contains(&c) { ascii_text.push_str("(m|ra)"); - } else if ['a'].binary_search(&c).is_ok() { + } else if ['a'].contains(&c) { ascii_text.push_str("[ao]") - } else if ['H', 'E'].binary_search(&c).is_ok() { - ascii_text.push_str("[HE]") } else if c.is_ascii_alphanumeric() { ascii_text.push(c); } else { @@ -214,7 +212,6 @@ pub fn regexify_text(text: &String) -> String { } else if i == ascii_text.len() - 1 { regex.push_str(".*"); regex.push(char); - break; } if regex_any { if char == ']' { @@ -252,9 +249,9 @@ pub fn regexify_text(text: &String) -> String { trace!("Processed word: {}", processed_word); if partial_match && processed_word.len() > 4 { // Remove first two and last two characters for "partial match" - if !processed_word[0..3].contains(|c: char| REGEX_CHARS.binary_search(&c).is_ok()) + if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c)) && !processed_word[word.len() - 2..word.len()] - .contains(|c: char| REGEX_CHARS.binary_search(&c).is_ok()) + .contains(|c: char| REGEX_CHARS.contains(&c)) { regex.push_str(&processed_word[2..word.len() - 2]); } else {