use hand-rolled strncasecmp, ignore accents

author: Ben Winston 2024-06-03 22:36:11 -0400
committer: Ben Winston 2024-06-03 22:36:11 -0400
commit: 84078b5791036f0aaa3979019638aebe129a9bfd (patch)
tree: 4cb5365356bf37fcd66ce83eabc63ce3eade89b0 /booki.c
parent: 770f194677027c1961bbfd77e536bd4d7e97b40e (diff)
1 files changed, 92 insertions, 4 deletions
diff --git a/booki.c b/booki.c
index b73dc3d..a530f76 100644
--- a/booki.c
+++ b/booki.c
@@ -13,6 +13,94 @@
 #define MAX_SEARCH_OPTS 5
 
 /*** helpers ***/
+bool comparable(const char* pattern, const char* candidate, int len) {  // true when len pattern bytes match candidate, ignoring ASCII case and candidate accents under UTF-8 lead byte 0xC3
+    unsigned char p = *pattern;  // current pattern byte
+    unsigned char c = *candidate;  // current candidate byte
+    while (len > 0) {  // one pattern byte per iteration; len <= 0 falls straight through to "true"
+        // fold an ASCII lowercase pattern byte to uppercase; any other byte is left as-is
+        if (p >= 'a' && p <= 'z')
+            p -= 0x20;
+
+        // ASCII candidate byte: fold it to uppercase too and compare directly with the pattern byte
+        if (c < 0x80) {
+            if (c >= 'a' && c <= 'z')
+                c -= 0x20;
+            if (p != c)
+                return false;
+        }
+        // 0xC3 is the UTF-8 lead byte for U+00C0..U+00FF (the accented letters of Latin-1 Supplement)
+        else if (c == 0xC3) {
+
+            // step to the continuation byte; the candidate uses two bytes for this single pattern byte
+            candidate++;
+            c = *candidate;
+
+            // lowercase continuation bytes (>= 0xA0) sit 0x20 above their uppercase forms, as in ASCII
+            if (c >= 0xA0)
+                c -= 0x20;
+
+            // TODO the "AE" ligature (0x86) only matches a pattern 'A'
+            // TODO latin small y with diaeresis (0xBF) is not covered, as it has no uppercase in this block; it folds to 0x9F and hits the fallthrough
+            // a-ish
+            if (c >= 0x80 && c <= 0x86) {
+                if (p != 'A')
+                    return false;
+            }
+            // c-ish
+            else if (c == 0x87) {
+                if (p != 'C')
+                    return false;
+            }
+            // e-ish
+            else if (c >= 0x88 && c <= 0x8B) {
+                if (p != 'E')
+                    return false;
+            }
+            // i-ish
+            else if (c >= 0x8C && c <= 0x8F) {
+                if (p != 'I')
+                    return false;
+            }
+            // n-ish
+            else if (c == 0x91) {
+                if (p != 'N')
+                    return false;
+            }
+            // o-ish
+            else if (((c >= 0x92 && c <= 0x96) || c == 0x98)) {
+                if (p != 'O')
+                    return false;
+            }
+            // u-ish
+            else if (c >= 0x99 && c <= 0x9C) {
+                if (p != 'U')
+                    return false;
+            }
+            // y-ish
+            else if (c == 0x9D) {
+                if (p != 'Y')
+                    return false;
+            }
+            // fallthrough: no base letter assigned (e.g. 0x90, 0x97, 0x9E, 0x9F); compare the folded byte with the pattern byte
+            else if (p != c) {
+                return false;
+            }
+        }
+        // TODO latin-1 extended (candidate lead bytes other than 0xC3)
+        else {
+            // any other non-ASCII candidate byte is treated as a mismatch
+            return false;
+        }
+
+        pattern++;
+        p = *pattern;
+        candidate++;
+        c = *candidate;
+        len--;
+    }
+    return true;
+}
+
 char* load_file(char* filename) {
     // open the file
     FILE* fp = fopen(filename, "r");
@@ -436,19 +524,19 @@ bool match_string(const char* pattern, const struct es text) {
     bool valid = false;
     if (head_match && tail_match)
         // text must be identical to pattern (minus ^ and $)
-        valid = text.len == (pattern_length - 2) && strncasecmp(pattern + 1, text.ptr, pattern_length - 2) == 0;
+        valid = text.len == (pattern_length - 2) && comparable(pattern + 1, text.ptr, pattern_length - 2);
     else if (head_match)
         // text must match the pattern starting from pattern + 1
-        valid = strncasecmp(pattern + 1, text.ptr, pattern_length - 1) == 0;
+        valid = comparable(pattern + 1, text.ptr, pattern_length - 1);
     else if (tail_match) {
         // text starting from (pattern + 1) from the end must match pattern (without $)
-        valid = strncasecmp(pattern, text.ptr + (text.len - pattern_length + 1), pattern_length - 1) == 0;
+        valid = comparable(pattern, text.ptr + (text.len - pattern_length + 1), pattern_length - 1);
     }
 
     // we only need to compare while remaining text is
     // as long or longer than pattern
     for (int i = 0; i <= (text.len - pattern_length); i++) {
-        if (strncasecmp(pattern, text.ptr + i, pattern_length) == 0) {
+        if (comparable(pattern, text.ptr + i, pattern_length)) {
             valid = true;
             break;
         }
author	Ben Winston	2024-06-03 22:36:11 -0400
committer	Ben Winston	2024-06-03 22:36:11 -0400
commit	84078b5791036f0aaa3979019638aebe129a9bfd (patch)
tree	4cb5365356bf37fcd66ce83eabc63ce3eade89b0 /booki.c
parent	770f194677027c1961bbfd77e536bd4d7e97b40e (diff)