Treat commands as utf8 when looking for possible misspellings

2025-10-19 17:09:25 +00:00 · 2016-04-21 00:18:20 +02:00 · 2016-04-21 00:18:20 +02:00 · 653371763b
commit 653371763b
parent eaba94fa13
4 changed files with 180 additions and 14 deletions
--- a/src/main/CMakeLists.txt
+++ b/src/main/CMakeLists.txt
@ -7,6 +7,7 @@ add_executable(${PROJECT_NAME}
 	findactions.c
 	builtin_feats.c
 	damerau_levenshtein.c
 	utf8_ops.c
 )
 target_include_directories(${PROJECT_NAME}
--- a/src/main/damerau_levenshtein.c
+++ b/src/main/damerau_levenshtein.c
@ -64,6 +64,7 @@
 #include "damerau_levenshtein.h"
 #include "pbl_wrapper.h"
 #include "utf8_ops.h"
 #include <string.h>
 #include <iso646.h>
 #include <assert.h>
@ -80,8 +81,6 @@
 	__typeof__ (b) _b = (b); \
 	_a > _b ? _a : _b; })
 typedef wchar_t Character;
 static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
 	const int retval = pblMapAdd(
 		parMap,
@ -111,9 +110,9 @@ int damerau_levenshtein (
 {
 	return damerau_levenshtein_with_size(
 		parSource,
-		strlen(parSource),
+		utf8_strlen(parSource),
 		parTarget,
-		strlen(parTarget),
+		utf8_strlen(parTarget),
 		parDeleteCost,
 		parInsertCost,
 		parReplaceCost,
@ -150,6 +149,14 @@ int damerau_levenshtein_with_size (
 	int j_swap;
 	int pre_swap_cost;
 	int retval;
 	const char* source;
 	const char* target;
 	const char* source_index_1;
 	const char* target_index_1;
 	Character source_char;
 	Character target_char;
 	Character source_char_0;
 	Character target_char_0;
 	assert(parSource);
 	assert(parTarget);
@ -175,42 +182,55 @@ int damerau_levenshtein_with_size (
 	sourceIndexByCharacter = pblMapNewHashMap();
 	assert(sourceIndexByCharacter);
-	if (parSource[0] != parTarget[0]) {
+	source = parSource;
 	target = parTarget;
 	source_char_0 = utf8_advance(&source, parSource + parSourceLen);
 	target_char_0 = utf8_advance(&target, parTarget + parTargetLen);
 	source_index_1 = source;
 	target_index_1 = target;
 	if (source_char_0 != target_char_0) {
 		table[0 /*source*/ + 0 /*target*/ * parSourceLen] =
 			min(parReplaceCost, parDeleteCost + parInsertCost);
 	}
-	insert_pair(sourceIndexByCharacter, parSource[0], 0);
+	insert_pair(sourceIndexByCharacter, source_char_0, 0);
 	assert(source = source_index_1);
 	for (i = 1; i < parSourceLen; ++i) {
-		delete_distance = table[i - 1 + 0 * parSourceLen];
+		source_char = utf8_advance(&source, parSource + parSourceLen);
 		delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost;
 		insert_distance = (i + 1) * parDeleteCost + parInsertCost;
 		match_distance = i * parDeleteCost +
-			(parSource[i] == parTarget[i] ? 0 : parReplaceCost);
+			(source_char == target_char_0 ? 0 : parReplaceCost);
 		table[i + 0 * parSourceLen] = min(
 			min(delete_distance, insert_distance), match_distance
 		);
 	}
 	assert(target == target_index_1);
 	for (j = 1; j < parTargetLen; ++j) {
 		target_char = utf8_advance(&target, parTarget + parTargetLen);
 		delete_distance = (j + 1) * parInsertCost + parDeleteCost;
 		insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
 		match_distance = j * parInsertCost +
-			(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
+			(source_char_0 == target_char ? 0 : parReplaceCost);
 		table[0 + j * parSourceLen] = min(
 			min(delete_distance, insert_distance), match_distance
 		);
 	}
 	source = source_index_1;
 	for (i = 1; i < parSourceLen; ++i) {
-		maxSourceLetterMatchIndex = (parSource[i] == parTarget[0] ? 0 : -1);
+		source_char = utf8_advance(&source, parSource + parSourceLen);
 		maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1);
 		target = target_index_1;
 		for (j = 1; j < parTargetLen; ++j) {
-			candidateSwapIndex =
+			target_char = utf8_advance(&target, parTarget + parTargetLen);
-				get_value(sourceIndexByCharacter, parTarget[j]);
+			candidateSwapIndex = get_value(sourceIndexByCharacter, target_char);
 			j_swap = maxSourceLetterMatchIndex;
 			delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
 			insert_distance = table[i + (j - 1) * parSourceLen] + parInsertCost;
 			match_distance = table[(i - 1) + (j - 1) * parSourceLen];
-			if (parSource[i] != parTarget[j])
+			if (source_char != target_char)
 				match_distance += parReplaceCost;
 			else
 				maxSourceLetterMatchIndex = j;
@ -238,7 +258,7 @@ int damerau_levenshtein_with_size (
 				swap_distance
 			);
 		}
-		insert_pair(sourceIndexByCharacter, parSource[i], i);
+		insert_pair(sourceIndexByCharacter, source_char, i);
 	}
 	retval = table[(parSourceLen - 1) + (parTargetLen - 1) * parSourceLen];
--- a/src/main/utf8_ops.c
+++ b/src/main/utf8_ops.c
@ -0,0 +1,115 @@
 /* Copyright 2015, 2016, Michele Santullo
 * This file is part of "dindexer".
 *
 * "dindexer" is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * "dindexer" is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
 */
 /*
 *  Bits  Pattern
 *  ----  -------
 *    7   0xxxxxxx
 *   11   110xxxxx 10xxxxxx
 *   16   1110xxxx 10xxxxxx 10xxxxxx
 *   21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   32   111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
 #include "utf8_ops.h"
 #include <iso646.h>
 #include <assert.h>
 static size_t sequence_length ( char parChar ) a_pure;
 /* See: http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html */
 /*
  * Returns the number of characters (not bytes) in the NUL-terminated UTF-8
  * string parString.
  *
  * Every character starts with exactly one byte that is not a continuation
  * byte (10xxxxxx), so counting those bytes counts the characters. The string
  * is walked one byte at a time, which means the terminator can never be
  * jumped over, even when the last sequence is truncated. Continuation bytes
  * that do not follow a lead byte are not counted.
  */
 size_t utf8_strlen (const char* parString) {
 	const unsigned char* cursor;
 	size_t count = 0;
 	for (cursor = (const unsigned char*)parString; *cursor; ++cursor) {
 		/* Anything but 10xxxxxx opens a new character. */
 		if ((*cursor bitand 0xc0) != 0x80)
 			++count;
 	}
 	return count;
 }
 /*
  * Decodes the UTF-8 sequence starting at *parString and moves *parString
  * just past the bytes it consumed, never reading at or beyond parStringEnd.
  *
  * Returns the decoded value, or 0 when:
  *  - *parString is already at or past parStringEnd (it is then clamped to
  *    parStringEnd);
  *  - the first byte is not a valid lead byte (that byte is skipped);
  *  - a byte that should be a continuation byte is not one (*parString is
  *    left on that byte so the next call can restart from it).
  * If the buffer ends in the middle of a sequence, the bits decoded so far
  * are returned and *parString ends up equal to parStringEnd.
  */
 Character utf8_advance (const char** parString, const char* parStringEnd) {
 	/* Payload bits held by the lead byte, indexed by sequence length - 1. */
 	const Character masks[6] = {0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x03};
 	Character retval;
 	uint8_t curr_code;
 	int seq_len;
 	int z;
 	if (*parString >= parStringEnd) {
 		*parString = parStringEnd;
 		return 0;
 	}
 	curr_code = (uint8_t)(**parString);
 	seq_len = (int)sequence_length((char)curr_code);
 	/* The lead byte is consumed whether or not it turned out to be valid. */
 	++(*parString);
 	if (not seq_len) {
 		return 0;
 	}
 	retval = curr_code bitand masks[seq_len - 1];
 	for (z = 1; z < seq_len and *parString < parStringEnd; ++z, ++(*parString)) {
 		curr_code = (uint8_t)(**parString);
 		/* Parentheses are required: != binds tighter than bitand. */
 		if ((curr_code bitand 0xc0) != 0x80)
 			return 0;
 		retval = (retval << 6) bitor (curr_code bitand 0x3f);
 	}
 	return retval;
 }
 /*
  * Maps the first byte of a UTF-8 sequence to the total number of bytes in
  * that sequence (1 to 6, following the bit patterns listed at the top of
  * this file). A continuation byte (0x80..0xbf) cannot start a sequence:
  * it trips the assertion, and yields 0 when assertions are compiled out.
  */
 size_t sequence_length (char parChar) {
 	const uint8_t lead = (uint8_t)parChar;
 	if (lead < 0x80)
 		return 1;	/* 0xxxxxxx */
 	if (lead >= 0xfc)
 		return 6;	/* 111111xx */
 	if (lead >= 0xf8)
 		return 5;	/* 111110xx */
 	if (lead >= 0xf0)
 		return 4;	/* 11110xxx */
 	if (lead >= 0xe0)
 		return 3;	/* 1110xxxx */
 	if (lead >= 0xc0)
 		return 2;	/* 110xxxxx */
 	/* Only 10xxxxxx is left at this point. */
 	assert(0);
 	return 0;
 }
--- a/src/main/utf8_ops.h
+++ b/src/main/utf8_ops.h
@ -0,0 +1,30 @@
 /* Copyright 2015, 2016, Michele Santullo
 * This file is part of "dindexer".
 *
 * "dindexer" is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * "dindexer" is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with "dindexer".  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef id810F3D0C21864315B17EF76E83510B6D
 #define id810F3D0C21864315B17EF76E83510B6D
 #include <stddef.h>
 #include <stdint.h>
 #include "helpers/compatibility.h"
 /* Value type returned by utf8_advance() for one decoded character. */
 typedef uint32_t Character;
 /* Returns the number of characters in the NUL-terminated string parString. */
 size_t utf8_strlen ( const char* parString ) a_pure;
 /*
  * Decodes the UTF-8 sequence at *parString without reading at or beyond
  * parStringEnd, and may move *parString forward. When *parString is already
  * at or past parStringEnd it is set to parStringEnd and 0 is returned.
  */
 Character utf8_advance ( const char** parString, const char* parStringEnd );
 #endif