From d46e0f8e6feeb13ac32fc15e742e70281b5e7dea Mon Sep 17 00:00:00 2001
From: tretrauit <tretrauit@gmail.com>
Date: Wed, 10 Jan 2024 22:59:34 +0700
Subject: [PATCH] fix(katana): add more workarounds

---
 swordfish/src/katana.rs | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/swordfish/src/katana.rs b/swordfish/src/katana.rs
index 2bca7c2..1d9fa63 100644
--- a/swordfish/src/katana.rs
+++ b/swordfish/src/katana.rs
@@ -13,7 +13,7 @@ use swordfish_common::{error, trace, warn};
 use tokio::task;
 use tokio::time::Instant;
 
-const ALLOWED_CHARS: [char; 11] = [' ', '-', '.', '!', ':', '(', ')', '\'', '/', '\'', '@'];
+const ALLOWED_CHARS: [char; 11] = [' ', '-', '.', '!', ':', '(', ')', '\'', '/', '@', '&'];
 const CARD_NAME_X_OFFSET: u32 = 22;
 const CARD_NAME_Y_OFFSET: u32 = 28;
 const CARD_NAME_WIDTH: u32 = 202 - CARD_NAME_X_OFFSET;
@@ -167,19 +167,41 @@ fn fix_tesseract_string(text: &mut String) {
 fn regexify_text(text: &String) -> String {
     let mut regex = String::new();
     let mut ascii_text = String::new();
+    let mut prev_chars: Vec<char> = Vec::new();
     for c in text.chars() {
         // Here comes the workaround...
         // The character "0" is sometimes used in place of "O" in names
         if ['0', 'O'].contains(&c) {
             ascii_text.push_str("[0O]");
+        } else if ['u', 'v'].contains(&c) && prev_chars.len() > 0 {
+            let prev_char = prev_chars[prev_chars.len() - 1];
+            if ['u', 'v'].contains(&prev_char) {
+                ascii_text.pop();
+                ascii_text.push_str("[uv][uv]");
+            } else {
+                ascii_text.push(c);
+            }
+        } else if ['t'].contains(&c) {
+            ascii_text.push_str("[ti]");
+        } else if ['.'].contains(&c) {
+            let prev_char = prev_chars[prev_chars.len() - 1];
+            let prev_prev_char = prev_chars[prev_chars.len() - 2];
+            if prev_char.is_numeric() && prev_prev_char.is_whitespace() {
+                continue;
+            }
         } else if c.is_ascii_alphanumeric() {
             ascii_text.push(c);
         } else {
             ascii_text.push(' ');
         }
+        prev_chars.push(c);
     }
-    for word in ascii_text.split_whitespace() {
-        if word.len() < 2 && regex.len() > 0 {
+    let split = ascii_text.split_whitespace();
+    let len = split.clone().count();
+    for (i, word) in split.enumerate() {
+        if word.len() < 2 && i > 0 && i < len - 1
+            || (word.len() == 1 && word.to_ascii_uppercase() == word)
+        {
             continue;
         }
         regex.push_str("(?=.*\\b");