fix(tesseract/regex): fix wrong string length when the word contains Unicode characters
Yeah.
This commit is contained in:
parent
c72d2cf16b
commit
8e2a629427
@ -263,11 +263,15 @@ pub fn regexify_text(text: &String) -> String {
|
|||||||
if !p_word_unicode[0..3]
|
if !p_word_unicode[0..3]
|
||||||
.iter()
|
.iter()
|
||||||
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
||||||
&& !p_word_unicode[word.len() - 2..word.len()]
|
&& !p_word_unicode[p_word_unicode.len() - 2..p_word_unicode.len()]
|
||||||
.iter()
|
.iter()
|
||||||
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
.any(|c: &&str| REGEX_STRINGS.contains(&c))
|
||||||
{
|
{
|
||||||
regex.push_str(p_word_unicode[2..word.len() - 2].concat().as_str());
|
regex.push_str(
|
||||||
|
p_word_unicode[2..p_word_unicode.len() - 2]
|
||||||
|
.concat()
|
||||||
|
.as_str(),
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
regex.push_str(&processed_word.as_str());
|
regex.push_str(&processed_word.as_str());
|
||||||
}
|
}
|
||||||
@ -277,7 +281,7 @@ pub fn regexify_text(text: &String) -> String {
|
|||||||
if processed_word.chars().all(|c| c.is_ascii_alphanumeric()) {
|
if processed_word.chars().all(|c| c.is_ascii_alphanumeric()) {
|
||||||
regex.push_str(format!("\\b{}\\b", &processed_word.as_str()).as_str());
|
regex.push_str(format!("\\b{}\\b", &processed_word.as_str()).as_str());
|
||||||
} else {
|
} else {
|
||||||
regex.push_str(format!("{}", &processed_word.as_str()).as_str());
|
regex.push_str(&processed_word.as_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
regex.push_str(")");
|
regex.push_str(")");
|
||||||
|
Loading…
Reference in New Issue
Block a user