1
0
Fork 0
mirror of https://github.com/KingDuckZ/dindexer.git synced 2025-07-02 14:04:22 +00:00

Treat commands as UTF-8 when looking for possible misspellings

This commit is contained in:
King_DuckZ 2016-04-21 00:18:20 +02:00
parent eaba94fa13
commit 653371763b
4 changed files with 180 additions and 14 deletions

View file

@ -7,6 +7,7 @@ add_executable(${PROJECT_NAME}
findactions.c
builtin_feats.c
damerau_levenshtein.c
utf8_ops.c
)
target_include_directories(${PROJECT_NAME}

View file

@ -64,6 +64,7 @@
#include "damerau_levenshtein.h"
#include "pbl_wrapper.h"
#include "utf8_ops.h"
#include <string.h>
#include <iso646.h>
#include <assert.h>
@ -80,8 +81,6 @@
__typeof__ (b) _b = (b); \
_a > _b ? _a : _b; })
typedef wchar_t Character;
static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
const int retval = pblMapAdd(
parMap,
@ -111,9 +110,9 @@ int damerau_levenshtein (
{
return damerau_levenshtein_with_size(
parSource,
strlen(parSource),
utf8_strlen(parSource),
parTarget,
strlen(parTarget),
utf8_strlen(parTarget),
parDeleteCost,
parInsertCost,
parReplaceCost,
@ -150,6 +149,14 @@ int damerau_levenshtein_with_size (
int j_swap;
int pre_swap_cost;
int retval;
const char* source;
const char* target;
const char* source_index_1;
const char* target_index_1;
Character source_char;
Character target_char;
Character source_char_0;
Character target_char_0;
assert(parSource);
assert(parTarget);
@ -175,42 +182,55 @@ int damerau_levenshtein_with_size (
sourceIndexByCharacter = pblMapNewHashMap();
assert(sourceIndexByCharacter);
if (parSource[0] != parTarget[0]) {
source = parSource;
target = parTarget;
source_char_0 = utf8_advance(&source, parSource + parSourceLen);
target_char_0 = utf8_advance(&target, parTarget + parTargetLen);
source_index_1 = source;
target_index_1 = target;
if (source_char_0 != target_char_0) {
table[0 /*source*/ + 0 /*target*/ * parSourceLen] =
min(parReplaceCost, parDeleteCost + parInsertCost);
}
insert_pair(sourceIndexByCharacter, parSource[0], 0);
insert_pair(sourceIndexByCharacter, source_char_0, 0);
assert(source = source_index_1);
for (i = 1; i < parSourceLen; ++i) {
delete_distance = table[i - 1 + 0 * parSourceLen];
source_char = utf8_advance(&source, parSource + parSourceLen);
delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost;
insert_distance = (i + 1) * parDeleteCost + parInsertCost;
match_distance = i * parDeleteCost +
(parSource[i] == parTarget[i] ? 0 : parReplaceCost);
(source_char == target_char_0 ? 0 : parReplaceCost);
table[i + 0 * parSourceLen] = min(
min(delete_distance, insert_distance), match_distance
);
}
assert(target == target_index_1);
for (j = 1; j < parTargetLen; ++j) {
target_char = utf8_advance(&target, parTarget + parTargetLen);
delete_distance = (j + 1) * parInsertCost + parDeleteCost;
insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
match_distance = j * parInsertCost +
(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
(source_char_0 == target_char ? 0 : parReplaceCost);
table[0 + j * parSourceLen] = min(
min(delete_distance, insert_distance), match_distance
);
}
source = source_index_1;
for (i = 1; i < parSourceLen; ++i) {
maxSourceLetterMatchIndex = (parSource[i] == parTarget[0] ? 0 : -1);
source_char = utf8_advance(&source, parSource + parSourceLen);
maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1);
target = target_index_1;
for (j = 1; j < parTargetLen; ++j) {
candidateSwapIndex =
get_value(sourceIndexByCharacter, parTarget[j]);
target_char = utf8_advance(&target, parTarget + parTargetLen);
candidateSwapIndex = get_value(sourceIndexByCharacter, target_char);
j_swap = maxSourceLetterMatchIndex;
delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
insert_distance = table[i + (j - 1) * parSourceLen] + parInsertCost;
match_distance = table[(i - 1) + (j - 1) * parSourceLen];
if (parSource[i] != parTarget[j])
if (source_char != target_char)
match_distance += parReplaceCost;
else
maxSourceLetterMatchIndex = j;
@ -238,7 +258,7 @@ int damerau_levenshtein_with_size (
swap_distance
);
}
insert_pair(sourceIndexByCharacter, parSource[i], i);
insert_pair(sourceIndexByCharacter, source_char, i);
}
retval = table[(parSourceLen - 1) + (parTargetLen - 1) * parSourceLen];

115
src/main/utf8_ops.c Normal file
View file

@ -0,0 +1,115 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Bits Pattern
* ---- -------
* 7 0xxxxxxx
* 11 110xxxxx 10xxxxxx
* 16 1110xxxx 10xxxxxx 10xxxxxx
* 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 32 111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
#include "utf8_ops.h"
#include <iso646.h>
#include <assert.h>
/* Byte length of the UTF-8 sequence whose first byte is parChar.
 * Forward declaration only: the definition is further down in this file. */
static size_t sequence_length ( char parChar ) a_pure;
/* Count the characters (not the bytes) of the NUL-terminated UTF-8 string
 * parString.
 *
 * Originally modelled after
 * http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
 * but written so that malformed input can neither run past the terminator
 * nor stall the loop.
 *
 * Each lead byte counts as one character together with the continuation
 * bytes (10xxxxxx) that follow it, up to the amount announced by the lead
 * byte. A continuation byte that does not belong to any sequence counts as
 * one character on its own.
 *
 * parString must not be NULL.
 * Returns the number of characters found before the terminating NUL.
 */
size_t utf8_strlen (const char* parString) {
	size_t i = 0;
	size_t count = 0;

	while (parString[i]) {
		const unsigned char lead = (unsigned char)parString[i];
		size_t remaining;

		++i;
		++count;

		/* Plain ASCII and stray continuation bytes are exactly one byte
		 * long. Filtering the latter here also means sequence_length() is
		 * never asked about a byte that cannot start a sequence. */
		if (lead < 0x80 or (lead bitand 0xc0) == 0x80)
			continue;

		/* Step over the continuation bytes one by one rather than jumping
		 * ahead by the announced length: the terminating NUL is not a
		 * continuation byte, so a sequence that is cut short at the end of
		 * the string can never move i beyond the terminator. */
		remaining = sequence_length(parString[i - 1]);
		while (remaining > 1 and ((unsigned char)parString[i] bitand 0xc0) == 0x80) {
			++i;
			--remaining;
		}
	}
	return count;
}
Character utf8_advance (const char** parString, const char* parStringEnd) {
const Character masks[6] = {0x00, 0x1f, 0x0f, 0x07, 0x03, 0x03};
Character retval;
uint8_t curr_code;
int seq_len;
int z;
if (*parString >= parStringEnd) {
*parString = parStringEnd;
return 0;
}
curr_code = (uint8_t)(**parString);
seq_len = sequence_length(curr_code);
if (not seq_len) {
++(*parString);
return 0;
}
retval = curr_code bitand masks[seq_len - 1];
for (z = 0; z < (seq_len - 1) and ++(*parString) < parStringEnd; ++z) {
curr_code = **parString;
if (curr_code bitand 0xc0 != 0x80)
return 0;
retval = (retval << 6) bitor (curr_code bitand 0x3f);
}
return retval;
}
/* Tell how many bytes make up the UTF-8 sequence introduced by parChar,
 * following the bit pattern table at the top of this file.
 *
 * Returns a value between 1 and 6 for a byte that can start a sequence.
 * A continuation byte (10xxxxxx, that is 0x80-0xbf) cannot start one: that
 * case trips an assert and, when asserts are compiled out, yields 0.
 */
size_t sequence_length (char parChar) {
	const unsigned int lead = (uint8_t)parChar;

	if (lead <= 0x7f)
		return 1;	/* 0xxxxxxx */
	if (lead >= 0xfc)
		return 6;	/* 111111xx */
	if (lead >= 0xf8)
		return 5;	/* 111110xx */
	if (lead >= 0xf0)
		return 4;	/* 11110xxx */
	if (lead >= 0xe0)
		return 3;	/* 1110xxxx */
	if (lead >= 0xc0)
		return 2;	/* 110xxxxx */

	/* Only 0x80-0xbf is left: not a valid first byte. */
	assert(0);
	return 0;
}

30
src/main/utf8_ops.h Normal file
View file

@ -0,0 +1,30 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id810F3D0C21864315B17EF76E83510B6D
#define id810F3D0C21864315B17EF76E83510B6D
#include <stddef.h>
#include <stdint.h>
#include "helpers/compatibility.h"
/* A single decoded character value, as returned by utf8_advance(). */
typedef uint32_t Character;
/* Number of characters (as opposed to bytes) in parString, which is read as
 * UTF-8 up to its terminating NUL. */
size_t utf8_strlen ( const char* parString ) a_pure;
/* Decodes the UTF-8 sequence found at *parString and returns its value,
 * updating *parString as input is consumed. Nothing at or beyond
 * parStringEnd is read; if *parString is already there, *parString is set
 * to parStringEnd and 0 is returned. 0 is also returned for input that the
 * implementation rejects as malformed. */
Character utf8_advance ( const char** parString, const char* parStringEnd );
#endif