Revert "fix(regex): break immediately after appending the last character"

This reverts commit bc4dcad932.

Edited to retain the fix from the reverted commit.
This commit is contained in:
tretrauit 2024-01-31 01:44:59 +07:00
parent 588e50af33
commit 5facb83ce7

View File

@@ -101,7 +101,7 @@ pub fn fix_tesseract_string(text: &mut String) {
}
// Remove all non-alphanumeric characters
trace!("Text: {}", text);
text.retain(|c| ALLOWED_CHARS.binary_search(&c).is_ok() || c.is_ascii_alphanumeric());
text.retain(|c| ALLOWED_CHARS.contains(&c) || c.is_ascii_alphanumeric());
// Fix "mn" -> "III"
trace!("Text: {}", text);
if text.ends_with("mn") {
@@ -160,15 +160,15 @@ pub fn regexify_text(text: &String) -> String {
for c in text.chars() {
// Here comes the workaround...
// The character "0" is sometimes used in place of "O" in names
if ['0', 'O'].binary_search(&c).is_ok() {
if ['0', 'O'].contains(&c) {
ascii_text.push_str("[0O]");
} else if ['u', 'v', 'y'].binary_search(&c).is_ok() {
} else if ['u', 'v', 'y'].contains(&c) {
ascii_text.push_str("[uvy]");
} else if ['t'].binary_search(&c).is_ok() {
} else if ['t'].contains(&c) {
ascii_text.push_str("[ti]");
} else if ['I', 'l', '!', '1'].binary_search(&c).is_ok() {
} else if ['I', 'l', '!', '1'].contains(&c) {
ascii_text.push_str("[Il!1i]");
} else if ['.'].binary_search(&c).is_ok() {
} else if ['.'].contains(&c) {
if prev_chars.len() > 3 {
let prev_char = prev_chars[prev_chars.len() - 1];
let prev_prev_char = prev_chars[prev_chars.len() - 2];
@@ -177,14 +177,12 @@ pub fn regexify_text(text: &String) -> String {
}
}
ascii_text.push(' ');
} else if ['R'].binary_search(&c).is_ok() {
} else if ['R'].contains(&c) {
ascii_text.push_str("[Rk]");
} else if ['m'].binary_search(&c).is_ok() {
} else if ['m'].contains(&c) {
ascii_text.push_str("(m|ra)");
} else if ['a'].binary_search(&c).is_ok() {
} else if ['a'].contains(&c) {
ascii_text.push_str("[ao]")
} else if ['H', 'E'].binary_search(&c).is_ok() {
ascii_text.push_str("[HE]")
} else if c.is_ascii_alphanumeric() {
ascii_text.push(c);
} else {
@@ -214,7 +212,6 @@ pub fn regexify_text(text: &String) -> String {
} else if i == ascii_text.len() - 1 {
regex.push_str(".*");
regex.push(char);
break;
}
if regex_any {
if char == ']' {
@@ -252,9 +249,9 @@ pub fn regexify_text(text: &String) -> String {
trace!("Processed word: {}", processed_word);
if partial_match && processed_word.len() > 4 {
// Remove first two and last two characters for "partial match"
if !processed_word[0..3].contains(|c: char| REGEX_CHARS.binary_search(&c).is_ok())
if !processed_word[0..3].contains(|c: char| REGEX_CHARS.contains(&c))
&& !processed_word[word.len() - 2..word.len()]
.contains(|c: char| REGEX_CHARS.binary_search(&c).is_ok())
.contains(|c: char| REGEX_CHARS.contains(&c))
{
regex.push_str(&processed_word[2..word.len() - 2]);
} else {