mirror of
https://github.com/KingDuckZ/dindexer.git
synced 2025-10-19 17:09:25 +00:00
Treat commands as UTF-8 when looking for possible misspellings
This commit is contained in:
parent
eaba94fa13
commit
653371763b
4 changed files with 180 additions and 14 deletions
|
@ -7,6 +7,7 @@ add_executable(${PROJECT_NAME}
|
||||||
findactions.c
|
findactions.c
|
||||||
builtin_feats.c
|
builtin_feats.c
|
||||||
damerau_levenshtein.c
|
damerau_levenshtein.c
|
||||||
|
utf8_ops.c
|
||||||
)
|
)
|
||||||
|
|
||||||
target_include_directories(${PROJECT_NAME}
|
target_include_directories(${PROJECT_NAME}
|
||||||
|
|
|
@ -64,6 +64,7 @@
|
||||||
|
|
||||||
#include "damerau_levenshtein.h"
|
#include "damerau_levenshtein.h"
|
||||||
#include "pbl_wrapper.h"
|
#include "pbl_wrapper.h"
|
||||||
|
#include "utf8_ops.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <iso646.h>
|
#include <iso646.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
@ -80,8 +81,6 @@
|
||||||
__typeof__ (b) _b = (b); \
|
__typeof__ (b) _b = (b); \
|
||||||
_a > _b ? _a : _b; })
|
_a > _b ? _a : _b; })
|
||||||
|
|
||||||
typedef wchar_t Character;
|
|
||||||
|
|
||||||
static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
|
static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
|
||||||
const int retval = pblMapAdd(
|
const int retval = pblMapAdd(
|
||||||
parMap,
|
parMap,
|
||||||
|
@ -111,9 +110,9 @@ int damerau_levenshtein (
|
||||||
{
|
{
|
||||||
return damerau_levenshtein_with_size(
|
return damerau_levenshtein_with_size(
|
||||||
parSource,
|
parSource,
|
||||||
strlen(parSource),
|
utf8_strlen(parSource),
|
||||||
parTarget,
|
parTarget,
|
||||||
strlen(parTarget),
|
utf8_strlen(parTarget),
|
||||||
parDeleteCost,
|
parDeleteCost,
|
||||||
parInsertCost,
|
parInsertCost,
|
||||||
parReplaceCost,
|
parReplaceCost,
|
||||||
|
@ -150,6 +149,14 @@ int damerau_levenshtein_with_size (
|
||||||
int j_swap;
|
int j_swap;
|
||||||
int pre_swap_cost;
|
int pre_swap_cost;
|
||||||
int retval;
|
int retval;
|
||||||
|
const char* source;
|
||||||
|
const char* target;
|
||||||
|
const char* source_index_1;
|
||||||
|
const char* target_index_1;
|
||||||
|
Character source_char;
|
||||||
|
Character target_char;
|
||||||
|
Character source_char_0;
|
||||||
|
Character target_char_0;
|
||||||
|
|
||||||
assert(parSource);
|
assert(parSource);
|
||||||
assert(parTarget);
|
assert(parTarget);
|
||||||
|
@ -175,42 +182,55 @@ int damerau_levenshtein_with_size (
|
||||||
sourceIndexByCharacter = pblMapNewHashMap();
|
sourceIndexByCharacter = pblMapNewHashMap();
|
||||||
assert(sourceIndexByCharacter);
|
assert(sourceIndexByCharacter);
|
||||||
|
|
||||||
if (parSource[0] != parTarget[0]) {
|
source = parSource;
|
||||||
|
target = parTarget;
|
||||||
|
source_char_0 = utf8_advance(&source, parSource + parSourceLen);
|
||||||
|
target_char_0 = utf8_advance(&target, parTarget + parTargetLen);
|
||||||
|
source_index_1 = source;
|
||||||
|
target_index_1 = target;
|
||||||
|
if (source_char_0 != target_char_0) {
|
||||||
table[0 /*source*/ + 0 /*target*/ * parSourceLen] =
|
table[0 /*source*/ + 0 /*target*/ * parSourceLen] =
|
||||||
min(parReplaceCost, parDeleteCost + parInsertCost);
|
min(parReplaceCost, parDeleteCost + parInsertCost);
|
||||||
}
|
}
|
||||||
insert_pair(sourceIndexByCharacter, parSource[0], 0);
|
insert_pair(sourceIndexByCharacter, source_char_0, 0);
|
||||||
|
|
||||||
|
assert(source = source_index_1);
|
||||||
for (i = 1; i < parSourceLen; ++i) {
|
for (i = 1; i < parSourceLen; ++i) {
|
||||||
delete_distance = table[i - 1 + 0 * parSourceLen];
|
source_char = utf8_advance(&source, parSource + parSourceLen);
|
||||||
|
delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost;
|
||||||
insert_distance = (i + 1) * parDeleteCost + parInsertCost;
|
insert_distance = (i + 1) * parDeleteCost + parInsertCost;
|
||||||
match_distance = i * parDeleteCost +
|
match_distance = i * parDeleteCost +
|
||||||
(parSource[i] == parTarget[i] ? 0 : parReplaceCost);
|
(source_char == target_char_0 ? 0 : parReplaceCost);
|
||||||
table[i + 0 * parSourceLen] = min(
|
table[i + 0 * parSourceLen] = min(
|
||||||
min(delete_distance, insert_distance), match_distance
|
min(delete_distance, insert_distance), match_distance
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
assert(target == target_index_1);
|
||||||
for (j = 1; j < parTargetLen; ++j) {
|
for (j = 1; j < parTargetLen; ++j) {
|
||||||
|
target_char = utf8_advance(&target, parTarget + parTargetLen);
|
||||||
delete_distance = (j + 1) * parInsertCost + parDeleteCost;
|
delete_distance = (j + 1) * parInsertCost + parDeleteCost;
|
||||||
insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
|
insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
|
||||||
match_distance = j * parInsertCost +
|
match_distance = j * parInsertCost +
|
||||||
(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
|
(source_char_0 == target_char ? 0 : parReplaceCost);
|
||||||
table[0 + j * parSourceLen] = min(
|
table[0 + j * parSourceLen] = min(
|
||||||
min(delete_distance, insert_distance), match_distance
|
min(delete_distance, insert_distance), match_distance
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
source = source_index_1;
|
||||||
for (i = 1; i < parSourceLen; ++i) {
|
for (i = 1; i < parSourceLen; ++i) {
|
||||||
maxSourceLetterMatchIndex = (parSource[i] == parTarget[0] ? 0 : -1);
|
source_char = utf8_advance(&source, parSource + parSourceLen);
|
||||||
|
maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1);
|
||||||
|
target = target_index_1;
|
||||||
for (j = 1; j < parTargetLen; ++j) {
|
for (j = 1; j < parTargetLen; ++j) {
|
||||||
candidateSwapIndex =
|
target_char = utf8_advance(&target, parTarget + parTargetLen);
|
||||||
get_value(sourceIndexByCharacter, parTarget[j]);
|
candidateSwapIndex = get_value(sourceIndexByCharacter, target_char);
|
||||||
j_swap = maxSourceLetterMatchIndex;
|
j_swap = maxSourceLetterMatchIndex;
|
||||||
delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
|
delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
|
||||||
insert_distance = table[i + (j - 1) * parSourceLen] + parInsertCost;
|
insert_distance = table[i + (j - 1) * parSourceLen] + parInsertCost;
|
||||||
match_distance = table[(i - 1) + (j - 1) * parSourceLen];
|
match_distance = table[(i - 1) + (j - 1) * parSourceLen];
|
||||||
if (parSource[i] != parTarget[j])
|
if (source_char != target_char)
|
||||||
match_distance += parReplaceCost;
|
match_distance += parReplaceCost;
|
||||||
else
|
else
|
||||||
maxSourceLetterMatchIndex = j;
|
maxSourceLetterMatchIndex = j;
|
||||||
|
@ -238,7 +258,7 @@ int damerau_levenshtein_with_size (
|
||||||
swap_distance
|
swap_distance
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
insert_pair(sourceIndexByCharacter, parSource[i], i);
|
insert_pair(sourceIndexByCharacter, source_char, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
retval = table[(parSourceLen - 1) + (parTargetLen - 1) * parSourceLen];
|
retval = table[(parSourceLen - 1) + (parTargetLen - 1) * parSourceLen];
|
||||||
|
|
115
src/main/utf8_ops.c
Normal file
115
src/main/utf8_ops.c
Normal file
|
@ -0,0 +1,115 @@
|
||||||
|
/* Copyright 2015, 2016, Michele Santullo
|
||||||
|
* This file is part of "dindexer".
|
||||||
|
*
|
||||||
|
* "dindexer" is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* "dindexer" is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Bits Pattern
|
||||||
|
* ---- -------
|
||||||
|
* 7 0xxxxxxx
|
||||||
|
* 11 110xxxxx 10xxxxxx
|
||||||
|
* 16 1110xxxx 10xxxxxx 10xxxxxx
|
||||||
|
* 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
* 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
* 32 111111xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "utf8_ops.h"
|
||||||
|
#include <iso646.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
static size_t sequence_length ( char parChar ) a_pure;
|
||||||
|
|
||||||
|
/* See: http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html */
|
||||||
|
/* Counts the characters (code points) in a NUL-terminated UTF-8 string.
 *
 * parString: NUL-terminated byte string, expected to be UTF-8 encoded.
 * Returns the number of encoded characters, not the number of bytes.
 *
 * A character is one lead byte plus the continuation bytes (10xxxxxx) that
 * actually follow it, up to the amount announced by the lead byte.
 * Malformed input is tolerated: a stray continuation byte counts as one
 * character and a truncated sequence ends at the first byte that is not a
 * continuation byte. The scan therefore always makes progress and never
 * steps over the terminating NUL.
 */
size_t utf8_strlen (const char* parString) {
	const unsigned char* cursor = (const unsigned char*)parString;
	size_t count = 0;

	while (*cursor) {
		const unsigned char lead = *cursor++;
		size_t tail;

		/* Number of continuation bytes announced by the lead byte. */
		if (lead < 0xc0)
			tail = 0; /* ASCII, or a stray continuation byte */
		else if (lead < 0xe0)
			tail = 1;
		else if (lead < 0xf0)
			tail = 2;
		else if (lead < 0xf8)
			tail = 3;
		else if (lead < 0xfc)
			tail = 4;
		else
			tail = 5;

		/* Only consume bytes that really are continuation bytes; the
		 * terminating NUL is not one, so this cannot run off the string. */
		while (tail > 0 and (*cursor bitand 0xc0) == 0x80) {
			++cursor;
			--tail;
		}

		++count;
	}
	return count;
}
|
||||||
|
|
||||||
|
/* Decodes one UTF-8 encoded character and moves the cursor past it.
 *
 * parString:    in/out cursor into the buffer; on return it points to the
 *               first byte that was not consumed.
 * parStringEnd: one past the last readable byte; nothing at or beyond this
 *               address is read.
 *
 * Returns the decoded character value. Returns 0 when:
 *  - the cursor is already at or past parStringEnd (the cursor is clamped
 *    to parStringEnd);
 *  - the byte under the cursor is not a valid lead byte (that single byte
 *    is consumed);
 *  - the sequence is cut short by parStringEnd or by a byte that is not a
 *    continuation byte (the cursor is left on the offending byte so the
 *    next call can decode it as a lead byte).
 */
Character utf8_advance (const char** parString, const char* parStringEnd) {
	/* Payload bits carried by the lead byte, indexed by sequence length - 1.
	 * A one byte sequence (ASCII) carries 7 payload bits. */
	static const Character masks[6] = {0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x03};
	Character retval;
	uint8_t curr_code;
	size_t seq_len;
	size_t z;

	if (*parString >= parStringEnd) {
		*parString = parStringEnd;
		return 0;
	}

	curr_code = (uint8_t)(**parString);
	++(*parString);

	/* A continuation byte cannot start a sequence: skip it on its own. */
	if ((curr_code bitand 0xc0) == 0x80)
		return 0;

	seq_len = sequence_length((char)curr_code);
	if (not seq_len)
		return 0;

	retval = curr_code bitand masks[seq_len - 1];

	for (z = 1; z < seq_len; ++z) {
		if (*parString >= parStringEnd)
			return 0; /* sequence truncated by the end of the buffer */

		curr_code = (uint8_t)(**parString);
		/* The parentheses matter: != binds tighter than bitand. */
		if ((curr_code bitand 0xc0) != 0x80)
			return 0;

		retval = (retval << 6) bitor (curr_code bitand 0x3f);
		++(*parString);
	}
	return retval;
}
|
||||||
|
|
||||||
|
/* Returns the total length in bytes (lead byte included) of the UTF-8
 * sequence introduced by parChar, following the bit patterns listed in the
 * table at the top of this file.
 *
 * Returns 0 when parChar is a continuation byte (10xxxxxx) and therefore
 * cannot start a sequence. This is reported through the return value
 * rather than an assertion because it depends on input data, not on a
 * programming error; callers must handle 0 and advance on their own.
 */
size_t sequence_length (char parChar) {
	const uint8_t curr_code = (uint8_t)parChar;

	if (curr_code < 0x80)
		return 1;
	if ((curr_code bitand 0xc0) == 0x80)
		return 0; /* continuation byte, not a lead byte */
	if ((curr_code bitand 0xe0) == 0xc0)
		return 2;
	if ((curr_code bitand 0xf0) == 0xe0)
		return 3;
	if ((curr_code bitand 0xf8) == 0xf0)
		return 4;
	if ((curr_code bitand 0xfc) == 0xf8)
		return 5;

	/* Only 111111xx is left at this point. */
	return 6;
}
|
30
src/main/utf8_ops.h
Normal file
30
src/main/utf8_ops.h
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
/* Copyright 2015, 2016, Michele Santullo
|
||||||
|
* This file is part of "dindexer".
|
||||||
|
*
|
||||||
|
* "dindexer" is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* "dindexer" is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef id810F3D0C21864315B17EF76E83510B6D
|
||||||
|
#define id810F3D0C21864315B17EF76E83510B6D
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "helpers/compatibility.h"
|
||||||
|
|
||||||
|
typedef uint32_t Character;
|
||||||
|
|
||||||
|
/* Returns the number of UTF-8 encoded characters (not bytes) in the
 * NUL-terminated string parString. */
size_t utf8_strlen ( const char* parString ) a_pure;
|
||||||
|
/* Reads one UTF-8 sequence starting at *parString and moves *parString
 * forward as bytes are consumed. Returns 0, with *parString set to
 * parStringEnd, when *parString is already at or past parStringEnd.
 * Also returns 0 when the byte at *parString cannot start a sequence.
 * NOTE(review): intended to return the decoded character value; confirm
 * against utf8_ops.c where *parString is left after a successful decode. */
Character utf8_advance ( const char** parString, const char* parStringEnd );
|
||||||
|
|
||||||
|
#endif
|
Loading…
Add table
Add a link
Reference in a new issue