Treat commands as UTF-8 when looking for possible misspellings

2025-08-07 13:29:49 +00:00 · 2016-04-21 00:18:20 +02:00 · 2016-04-21 00:18:20 +02:00 · 653371763b
commit 653371763b
parent eaba94fa13
4 changed files with 180 additions and 14 deletions
--- a/src/main/CMakeLists.txt
+++ b/src/main/CMakeLists.txt
@ -7,6 +7,7 @@ add_executable(${PROJECT_NAME}
 	findactions.c
 	builtin_feats.c
 	damerau_levenshtein.c
+	utf8_ops.c
 )

 target_include_directories(${PROJECT_NAME}
--- a/src/main/damerau_levenshtein.c
+++ b/src/main/damerau_levenshtein.c
@ -64,6 +64,7 @@

 #include "damerau_levenshtein.h"
 #include "pbl_wrapper.h"
+#include "utf8_ops.h"
 #include <string.h>
 #include <iso646.h>
 #include <assert.h>
@ -80,8 +81,6 @@
 	__typeof__ (b) _b = (b); \
 	_a > _b ? _a : _b; })

-typedef wchar_t Character;
-
 static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
 	const int retval = pblMapAdd(
 		parMap,
@ -111,9 +110,9 @@ int damerau_levenshtein (
 {
 	return damerau_levenshtein_with_size(
 		parSource,
-		strlen(parSource),
+		utf8_strlen(parSource),
 		parTarget,
-		strlen(parTarget),
+		utf8_strlen(parTarget),
 		parDeleteCost,
 		parInsertCost,
 		parReplaceCost,
@ -150,6 +149,14 @@ int damerau_levenshtein_with_size (
 	int j_swap;
 	int pre_swap_cost;
 	int retval;
+	const char* source;
+	const char* target;
+	const char* source_index_1;
+	const char* target_index_1;
+	Character source_char;
+	Character target_char;
+	Character source_char_0;
+	Character target_char_0;

 	assert(parSource);
 	assert(parTarget);
@ -175,42 +182,55 @@ int damerau_levenshtein_with_size (
 	sourceIndexByCharacter = pblMapNewHashMap();
 	assert(sourceIndexByCharacter);

-	if (parSource[0] != parTarget[0]) {
+	source = parSource;
+	target = parTarget;
+	source_char_0 = utf8_advance(&source, parSource + parSourceLen);
+	target_char_0 = utf8_advance(&target, parTarget + parTargetLen);
+	source_index_1 = source;
+	target_index_1 = target;
+	if (source_char_0 != target_char_0) {
 		table[0 /*source*/ + 0 /*target*/ * parSourceLen] =
 			min(parReplaceCost, parDeleteCost + parInsertCost);
 	}
-	insert_pair(sourceIndexByCharacter, parSource[0], 0);
+	insert_pair(sourceIndexByCharacter, source_char_0, 0);

+	assert(source = source_index_1);
 	for (i = 1; i < parSourceLen; ++i) {
-		delete_distance = table[i - 1 + 0 * parSourceLen];
+		source_char = utf8_advance(&source, parSource + parSourceLen);
+		delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost;
 		insert_distance = (i + 1) * parDeleteCost + parInsertCost;
 		match_distance = i * parDeleteCost +
-			(parSource[i] == parTarget[i] ? 0 : parReplaceCost);
+			(source_char == target_char_0 ? 0 : parReplaceCost);
 		table[i + 0 * parSourceLen] = min(
 			min(delete_distance, insert_distance), match_distance
 		);
 	}

+	assert(target == target_index_1);
 	for (j = 1; j < parTargetLen; ++j) {
+		target_char = utf8_advance(&target, parTarget + parTargetLen);
 		delete_distance = (j + 1) * parInsertCost + parDeleteCost;
 		insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
 		match_distance = j * parInsertCost +
-			(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
+			(source_char_0 == target_char ? 0 : parReplaceCost);
 		table[0 + j * parSourceLen] = min(
 			min(delete_distance, insert_distance), match_distance
 		);
 	}

+	source = source_index_1;
 	for (i = 1; i < parSourceLen; ++i) {
-		maxSourceLetterMatchIndex = (parSource[i] == parTarget[0] ? 0 : -1);
+		source_char = utf8_advance(&source, parSource + parSourceLen);
+		maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1);
+		target = target_index_1;
 		for (j = 1; j < parTargetLen; ++j) {
-			candidateSwapIndex =
-				get_value(sourceIndexByCharacter, parTarget[j]);
+			target_char = utf8_advance(&target, parTarget + parTargetLen);
+			candidateSwapIndex = get_value(sourceIndexByCharacter, target_char);
 			j_swap = maxSourceLetterMatchIndex;
 			delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
 			insert_distance = table[i + (j - 1) * parSourceLen] + parInsertCost;
 			match_distance = table[(i - 1) + (j - 1) * parSourceLen];
-			if (parSource[i] != parTarget[j])
+			if (source_char != target_char)
 				match_distance += parReplaceCost;
 			else
 				maxSourceLetterMatchIndex = j;
@ -238,7 +258,7 @@ int damerau_levenshtein_with_size (
 				swap_distance
 			);
 		}
-		insert_pair(sourceIndexByCharacter, parSource[i], i);
+		insert_pair(sourceIndexByCharacter, source_char, i);
 	}

 	retval = table[(parSourceLen - 1) + (parTargetLen - 1) * parSourceLen];
--- a/src/main/utf8_ops.c
+++ b/src/main/utf8_ops.c
@ -0,0 +1,115 @@
+/* Copyright 2015, 2016, Michele Santullo
+ * This file is part of "dindexer".
+ *
+ * "dindexer" is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * "dindexer" is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ *  Bits  Pattern
+ *  ----  -------
+ *    7   0xxxxxxx
+ *   11   110xxxxx 10xxxxxx
+ *   16   1110xxxx 10xxxxxx 10xxxxxx
+ *   21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *   26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *   32   111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+#include "utf8_ops.h"
+#include <iso646.h>
+#include <assert.h>
+
+/* Forward declaration; the definition is at the end of this file.
+ * NOTE(review): a_pure is not defined in this file - presumably the
+ * pure-function attribute macro from helpers/compatibility.h; confirm. */
+static size_t sequence_length ( char parChar ) a_pure;
+
+/*
+ * Returns the number of UTF-8 sequences (characters) in the NUL-terminated
+ * string parString, as opposed to strlen() which counts bytes.
+ *
+ * Malformed input is tolerated rather than trusted:
+ *  - a stray continuation byte (10xxxxxx) counts as one character;
+ *  - a sequence that is cut short, either by the terminator or by a byte
+ *    that is not a continuation byte, counts as one character and the scan
+ *    resumes at the byte that interrupted it.
+ * The scan therefore never steps over the terminating NUL and always makes
+ * progress.
+ */
+size_t utf8_strlen (const char* parString) {
+	size_t i = 0;
+	size_t count = 0;
+
+	while (parString[i]) {
+		const uint8_t lead = (uint8_t)parString[i];
+		size_t pending = 0;
+
+		++i;
+		/* Only 11xxxxxx bytes announce continuation bytes; ASCII and
+		 * stray continuation bytes stand on their own. For every value
+		 * >= 0xc0 sequence_length() returns at least 2, so the
+		 * subtraction cannot wrap. */
+		if (lead >= 0xc0)
+			pending = sequence_length((char)lead) - 1;
+
+		/* Consume only bytes that really are continuations. NUL is not
+		 * of the form 10xxxxxx, so this stops at the terminator. */
+		while (pending and ((uint8_t)parString[i] bitand 0xc0) == 0x80) {
+			++i;
+			--pending;
+		}
+
+		++count;
+	}
+	return count;
+}
+
+/*
+ * Decodes the UTF-8 sequence that starts at *parString and moves *parString
+ * past the bytes it consumed. Bytes at or beyond parStringEnd are never read.
+ *
+ * Returns the decoded value, or 0 when:
+ *  - *parString is already at (or past) parStringEnd; *parString is then
+ *    set to parStringEnd;
+ *  - the current byte cannot start a sequence (for example a stray
+ *    10xxxxxx byte); that single byte is skipped;
+ *  - the sequence is cut short by parStringEnd or by a byte that is not a
+ *    10xxxxxx continuation; *parString is left on the first unconsumed
+ *    byte so that the next call resumes there.
+ */
+Character utf8_advance (const char** parString, const char* parStringEnd) {
+	/* Payload bits held by the lead byte, indexed by sequence length - 1.
+	 * The 1-byte (ASCII) entry has to be 0x7f: a zero mask would decode
+	 * every ASCII character to 0. */
+	static const Character masks[6] = {0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x03};
+	Character retval;
+	uint8_t curr_code;
+	int seq_len;
+	int z;
+
+	if (*parString >= parStringEnd) {
+		*parString = parStringEnd;
+		return 0;
+	}
+
+	curr_code = (uint8_t)(**parString);
+	/* The first byte is consumed whatever happens next, which guarantees
+	 * forward progress even on malformed input. */
+	++(*parString);
+	if ((curr_code bitand 0xc0) == 0x80) {
+		/* Stray continuation byte: reject it here, sequence_length()
+		 * only deals with bytes that can start a sequence. */
+		return 0;
+	}
+	seq_len = (int)sequence_length((char)curr_code);
+	if (not seq_len) {
+		return 0;
+	}
+	retval = curr_code bitand masks[seq_len - 1];
+
+	for (z = 1; z < seq_len; ++z) {
+		if (*parString >= parStringEnd)
+			return 0;
+		curr_code = (uint8_t)(**parString);
+		/* The parentheses matter: != binds tighter than bitand. */
+		if ((curr_code bitand 0xc0) != 0x80)
+			return 0;
+		++(*parString);
+		retval = (retval << 6) bitor (curr_code bitand 0x3f);
+	}
+	return retval;
+}
+
+/*
+ * Returns the total length in bytes of the UTF-8 sequence introduced by
+ * parChar, following the bit patterns listed at the top of this file:
+ * 1 for ASCII (0xxxxxxx) and 2 to 6 for lead bytes.
+ *
+ * Returns 0 when parChar is a continuation byte (10xxxxxx), which cannot
+ * start a sequence. This is an ordinary result for malformed input, not a
+ * programming error, so it is reported to the caller instead of asserting.
+ */
+size_t sequence_length (char parChar) {
+	const uint8_t curr_code = (uint8_t)parChar;
+	if (curr_code < 0x80) {
+		return 1;
+	}
+	else if ((curr_code bitand 0xe0) == 0xc0) {
+		return 2;
+	}
+	else if ((curr_code bitand 0xf0) == 0xe0) {
+		return 3;
+	}
+	else if ((curr_code bitand 0xf8) == 0xf0) {
+		return 4;
+	}
+	else if ((curr_code bitand 0xfc) == 0xf8) {
+		return 5;
+	}
+	else if ((curr_code bitand 0xfc) == 0xfc) {
+		return 6;
+	}
+	else {
+		/* Only 10xxxxxx (0x80-0xbf) reaches this point. */
+		return 0;
+	}
+}
--- a/src/main/utf8_ops.h
+++ b/src/main/utf8_ops.h
@ -0,0 +1,30 @@
+/* Copyright 2015, 2016, Michele Santullo
+ * This file is part of "dindexer".
+ *
+ * "dindexer" is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * "dindexer" is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef id810F3D0C21864315B17EF76E83510B6D
+#define id810F3D0C21864315B17EF76E83510B6D
+
+#include <stddef.h>
+#include <stdint.h>
+#include "helpers/compatibility.h"
+
+/* Value of one decoded UTF-8 sequence, as returned by utf8_advance(). */
+typedef uint32_t Character;
+
+/* Counts the UTF-8 sequences (characters, not bytes) in the NUL-terminated
+ * string parString. */
+size_t utf8_strlen ( const char* parString ) a_pure;
+/* Decodes the UTF-8 sequence found at *parString, reading only bytes located
+ * before parStringEnd, and updates *parString as input is consumed.
+ * Returns the decoded value; 0 is returned at end of input or when the
+ * current byte cannot start a sequence. */
+Character utf8_advance ( const char** parString, const char* parStringEnd );
+
+#endif