mirror of
https://github.com/KingDuckZ/dindexer.git
synced 2025-07-02 14:04:22 +00:00
Treat commands as UTF-8 when looking for possible misspellings
This commit is contained in:
parent
eaba94fa13
commit
653371763b
4 changed files with 180 additions and 14 deletions
|
@ -7,6 +7,7 @@ add_executable(${PROJECT_NAME}
|
|||
findactions.c
|
||||
builtin_feats.c
|
||||
damerau_levenshtein.c
|
||||
utf8_ops.c
|
||||
)
|
||||
|
||||
target_include_directories(${PROJECT_NAME}
|
||||
|
|
|
@ -64,6 +64,7 @@
|
|||
|
||||
#include "damerau_levenshtein.h"
|
||||
#include "pbl_wrapper.h"
|
||||
#include "utf8_ops.h"
|
||||
#include <string.h>
|
||||
#include <iso646.h>
|
||||
#include <assert.h>
|
||||
|
@ -80,8 +81,6 @@
|
|||
__typeof__ (b) _b = (b); \
|
||||
_a > _b ? _a : _b; })
|
||||
|
||||
typedef wchar_t Character;
|
||||
|
||||
static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
|
||||
const int retval = pblMapAdd(
|
||||
parMap,
|
||||
|
@ -111,9 +110,9 @@ int damerau_levenshtein (
|
|||
{
|
||||
return damerau_levenshtein_with_size(
|
||||
parSource,
|
||||
strlen(parSource),
|
||||
utf8_strlen(parSource),
|
||||
parTarget,
|
||||
strlen(parTarget),
|
||||
utf8_strlen(parTarget),
|
||||
parDeleteCost,
|
||||
parInsertCost,
|
||||
parReplaceCost,
|
||||
|
@ -150,6 +149,14 @@ int damerau_levenshtein_with_size (
|
|||
int j_swap;
|
||||
int pre_swap_cost;
|
||||
int retval;
|
||||
const char* source;
|
||||
const char* target;
|
||||
const char* source_index_1;
|
||||
const char* target_index_1;
|
||||
Character source_char;
|
||||
Character target_char;
|
||||
Character source_char_0;
|
||||
Character target_char_0;
|
||||
|
||||
assert(parSource);
|
||||
assert(parTarget);
|
||||
|
@ -175,42 +182,55 @@ int damerau_levenshtein_with_size (
|
|||
sourceIndexByCharacter = pblMapNewHashMap();
|
||||
assert(sourceIndexByCharacter);
|
||||
|
||||
if (parSource[0] != parTarget[0]) {
|
||||
source = parSource;
|
||||
target = parTarget;
|
||||
source_char_0 = utf8_advance(&source, parSource + parSourceLen);
|
||||
target_char_0 = utf8_advance(&target, parTarget + parTargetLen);
|
||||
source_index_1 = source;
|
||||
target_index_1 = target;
|
||||
if (source_char_0 != target_char_0) {
|
||||
table[0 /*source*/ + 0 /*target*/ * parSourceLen] =
|
||||
min(parReplaceCost, parDeleteCost + parInsertCost);
|
||||
}
|
||||
insert_pair(sourceIndexByCharacter, parSource[0], 0);
|
||||
insert_pair(sourceIndexByCharacter, source_char_0, 0);
|
||||
|
||||
assert(source = source_index_1);
|
||||
for (i = 1; i < parSourceLen; ++i) {
|
||||
delete_distance = table[i - 1 + 0 * parSourceLen];
|
||||
source_char = utf8_advance(&source, parSource + parSourceLen);
|
||||
delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost;
|
||||
insert_distance = (i + 1) * parDeleteCost + parInsertCost;
|
||||
match_distance = i * parDeleteCost +
|
||||
(parSource[i] == parTarget[i] ? 0 : parReplaceCost);
|
||||
(source_char == target_char_0 ? 0 : parReplaceCost);
|
||||
table[i + 0 * parSourceLen] = min(
|
||||
min(delete_distance, insert_distance), match_distance
|
||||
);
|
||||
}
|
||||
|
||||
assert(target == target_index_1);
|
||||
for (j = 1; j < parTargetLen; ++j) {
|
||||
target_char = utf8_advance(&target, parTarget + parTargetLen);
|
||||
delete_distance = (j + 1) * parInsertCost + parDeleteCost;
|
||||
insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
|
||||
match_distance = j * parInsertCost +
|
||||
(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
|
||||
(source_char_0 == target_char ? 0 : parReplaceCost);
|
||||
table[0 + j * parSourceLen] = min(
|
||||
min(delete_distance, insert_distance), match_distance
|
||||
);
|
||||
}
|
||||
|
||||
source = source_index_1;
|
||||
for (i = 1; i < parSourceLen; ++i) {
|
||||
maxSourceLetterMatchIndex = (parSource[i] == parTarget[0] ? 0 : -1);
|
||||
source_char = utf8_advance(&source, parSource + parSourceLen);
|
||||
maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1);
|
||||
target = target_index_1;
|
||||
for (j = 1; j < parTargetLen; ++j) {
|
||||
candidateSwapIndex =
|
||||
get_value(sourceIndexByCharacter, parTarget[j]);
|
||||
target_char = utf8_advance(&target, parTarget + parTargetLen);
|
||||
candidateSwapIndex = get_value(sourceIndexByCharacter, target_char);
|
||||
j_swap = maxSourceLetterMatchIndex;
|
||||
delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
|
||||
insert_distance = table[i + (j - 1) * parSourceLen] + parInsertCost;
|
||||
match_distance = table[(i - 1) + (j - 1) * parSourceLen];
|
||||
if (parSource[i] != parTarget[j])
|
||||
if (source_char != target_char)
|
||||
match_distance += parReplaceCost;
|
||||
else
|
||||
maxSourceLetterMatchIndex = j;
|
||||
|
@ -238,7 +258,7 @@ int damerau_levenshtein_with_size (
|
|||
swap_distance
|
||||
);
|
||||
}
|
||||
insert_pair(sourceIndexByCharacter, parSource[i], i);
|
||||
insert_pair(sourceIndexByCharacter, source_char, i);
|
||||
}
|
||||
|
||||
retval = table[(parSourceLen - 1) + (parTargetLen - 1) * parSourceLen];
|
||||
|
|
115
src/main/utf8_ops.c
Normal file
115
src/main/utf8_ops.c
Normal file
|
@ -0,0 +1,115 @@
|
|||
/* Copyright 2015, 2016, Michele Santullo
|
||||
* This file is part of "dindexer".
|
||||
*
|
||||
* "dindexer" is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* "dindexer" is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Bits Pattern
|
||||
* ---- -------
|
||||
* 7 0xxxxxxx
|
||||
* 11 110xxxxx 10xxxxxx
|
||||
* 16 1110xxxx 10xxxxxx 10xxxxxx
|
||||
* 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
* 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
* 32 111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*/
|
||||
|
||||
#include "utf8_ops.h"
|
||||
#include <iso646.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Forward declaration: sequence_length() is defined after its callers in this
 * file and has internal linkage.
 * NOTE(review): a_pure is presumably a "pure function" attribute macro coming
 * from helpers/compatibility.h - confirm. */
static size_t sequence_length ( char parChar ) a_pure;
|
||||
|
||||
/* See: http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html */
|
||||
/*
 * Count the characters (UTF-8 sequences) in a NUL-terminated string.
 *
 * parString  NUL-terminated, UTF-8 encoded text; must not be NULL.
 * returns    the number of sequences found before the terminator.
 *
 * Malformed input is tolerated rather than trusted:
 *  - a sequence cut short by the terminator counts as one character and
 *    scanning stops at the terminator instead of reading past it;
 *  - a stray continuation byte (10xxxxxx) counts as one character and is
 *    never handed to sequence_length(), which asserts on such bytes.
 */
size_t utf8_strlen (const char* parString) {
	size_t count = 0;
	size_t i = 0;

	while (parString[i]) {
		const uint8_t lead = (uint8_t)parString[i];
		size_t remaining;

		if (lead < 0x80 or (lead bitand 0xc0) == 0x80) {
			/* Plain ASCII, or a continuation byte with no lead byte. */
			remaining = 1;
		}
		else {
			remaining = sequence_length(parString[i]);
			/* Always make progress, even if the length is unknown. */
			if (not remaining)
				remaining = 1;
		}

		/* Skip the sequence, but never step over the NUL terminator. */
		while (remaining and parString[i]) {
			++i;
			--remaining;
		}
		++count;
	}
	return count;
}
|
||||
|
||||
/*
 * Decode the UTF-8 sequence starting at *parString and move *parString just
 * past the bytes that were consumed, so that successive calls walk the string
 * one character at a time.
 *
 * parString     in/out cursor into the text being decoded.
 * parStringEnd  one past the last readable byte; nothing at or beyond this
 *               address is read.
 * returns       the decoded value, or 0 when:
 *                - the cursor is already at or past parStringEnd (the cursor
 *                  is then clamped to parStringEnd);
 *                - the byte under the cursor cannot start a sequence (that
 *                  single byte is skipped);
 *                - a byte inside the sequence is not a continuation byte (the
 *                  cursor is left on that byte so the next call re-reads it
 *                  as the start of a new sequence).
 *               A sequence cut short by parStringEnd yields the bits decoded
 *               so far, with the cursor at parStringEnd.
 */
Character utf8_advance (const char** parString, const char* parStringEnd) {
	/* Payload bits carried by the lead byte, indexed by sequence length - 1.
	 * Single-byte (ASCII) characters keep all 7 bits; the last two entries
	 * follow the bit-pattern table in this file's header comment. */
	static const Character masks[6] = {0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x03};
	Character retval;
	uint8_t curr_code;
	int seq_len;
	int z;

	if (*parString >= parStringEnd) {
		*parString = parStringEnd;
		return 0;
	}

	curr_code = (uint8_t)(**parString);

	/* A continuation byte (10xxxxxx) cannot start a sequence. Skip it here
	 * without calling sequence_length(), which asserts on such bytes. */
	if ((curr_code bitand 0xc0) == 0x80) {
		++(*parString);
		return 0;
	}

	seq_len = (int)sequence_length((char)curr_code);
	++(*parString); /* the lead byte is consumed in every case */
	if (not seq_len) {
		return 0;
	}
	retval = curr_code bitand masks[seq_len - 1];

	for (z = 1; z < seq_len and *parString < parStringEnd; ++z) {
		curr_code = (uint8_t)(**parString);
		/* Parentheses are required: != binds tighter than bitand. */
		if ((curr_code bitand 0xc0) != 0x80)
			return 0;
		retval = (retval << 6) bitor (curr_code bitand 0x3f);
		++(*parString);
	}
	return retval;
}
|
||||
|
||||
/*
 * Tell how many bytes a UTF-8 sequence occupies, judging by its lead byte.
 *
 * parChar  first byte of the sequence.
 * returns  1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx, 4 for 11110xxx,
 *          5 for 111110xx and 6 for 111111xx. A continuation byte (10xxxxxx)
 *          trips an assertion; when assertions are compiled out, 0 is
 *          returned for it instead.
 */
size_t sequence_length (char parChar) {
	const uint8_t lead = (uint8_t)parChar;

	if (lead < 0x80)
		return 1;

	if (lead < 0xc0) {
		/* 10xxxxxx: not a valid way to start a sequence. */
		assert(0);
		return 0;
	}

	if (lead < 0xe0)
		return 2;
	if (lead < 0xf0)
		return 3;
	if (lead < 0xf8)
		return 4;
	if (lead < 0xfc)
		return 5;
	return 6;
}
|
30
src/main/utf8_ops.h
Normal file
30
src/main/utf8_ops.h
Normal file
|
@ -0,0 +1,30 @@
|
|||
/* Copyright 2015, 2016, Michele Santullo
|
||||
* This file is part of "dindexer".
|
||||
*
|
||||
* "dindexer" is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* "dindexer" is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id810F3D0C21864315B17EF76E83510B6D
|
||||
#define id810F3D0C21864315B17EF76E83510B6D
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include "helpers/compatibility.h"
|
||||
|
||||
/* 32-bit unsigned value type returned by utf8_advance(). */
typedef uint32_t Character;
|
||||
|
||||
/* Returns the number of UTF-8 encoded characters (not bytes) in the
 * NUL-terminated string parString. */
size_t utf8_strlen ( const char* parString ) a_pure;
|
||||
/* Decodes one UTF-8 sequence starting at *parString; parStringEnd is one past
 * the last readable byte. Returns 0 when *parString is already at or past
 * parStringEnd, in which case *parString is set to parStringEnd. See
 * utf8_ops.c for how *parString is moved in the other cases. */
Character utf8_advance ( const char** parString, const char* parStringEnd );
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue