mirror of
https://github.com/KingDuckZ/dindexer.git
synced 2024-11-29 01:33:46 +00:00
Bugfixes and improvements
This commit is contained in:
parent
653371763b
commit
e3e704c50f
2 changed files with 11 additions and 9 deletions
|
@ -153,6 +153,8 @@ int damerau_levenshtein_with_size (
|
||||||
const char* target;
|
const char* target;
|
||||||
const char* source_index_1;
|
const char* source_index_1;
|
||||||
const char* target_index_1;
|
const char* target_index_1;
|
||||||
|
const char* source_end = parSource + strlen(parSource);
|
||||||
|
const char* target_end = parTarget + strlen(parTarget);
|
||||||
Character source_char;
|
Character source_char;
|
||||||
Character target_char;
|
Character target_char;
|
||||||
Character source_char_0;
|
Character source_char_0;
|
||||||
|
@ -184,8 +186,8 @@ int damerau_levenshtein_with_size (
|
||||||
|
|
||||||
source = parSource;
|
source = parSource;
|
||||||
target = parTarget;
|
target = parTarget;
|
||||||
source_char_0 = utf8_advance(&source, parSource + parSourceLen);
|
source_char_0 = utf8_advance(&source, source_end);
|
||||||
target_char_0 = utf8_advance(&target, parTarget + parTargetLen);
|
target_char_0 = utf8_advance(&target, target_end);
|
||||||
source_index_1 = source;
|
source_index_1 = source;
|
||||||
target_index_1 = target;
|
target_index_1 = target;
|
||||||
if (source_char_0 != target_char_0) {
|
if (source_char_0 != target_char_0) {
|
||||||
|
@ -196,7 +198,7 @@ int damerau_levenshtein_with_size (
|
||||||
|
|
||||||
assert(source = source_index_1);
|
assert(source = source_index_1);
|
||||||
for (i = 1; i < parSourceLen; ++i) {
|
for (i = 1; i < parSourceLen; ++i) {
|
||||||
source_char = utf8_advance(&source, parSource + parSourceLen);
|
source_char = utf8_advance(&source, source_end);
|
||||||
delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost;
|
delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost;
|
||||||
insert_distance = (i + 1) * parDeleteCost + parInsertCost;
|
insert_distance = (i + 1) * parDeleteCost + parInsertCost;
|
||||||
match_distance = i * parDeleteCost +
|
match_distance = i * parDeleteCost +
|
||||||
|
@ -208,7 +210,7 @@ int damerau_levenshtein_with_size (
|
||||||
|
|
||||||
assert(target == target_index_1);
|
assert(target == target_index_1);
|
||||||
for (j = 1; j < parTargetLen; ++j) {
|
for (j = 1; j < parTargetLen; ++j) {
|
||||||
target_char = utf8_advance(&target, parTarget + parTargetLen);
|
target_char = utf8_advance(&target, target_end);
|
||||||
delete_distance = (j + 1) * parInsertCost + parDeleteCost;
|
delete_distance = (j + 1) * parInsertCost + parDeleteCost;
|
||||||
insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
|
insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
|
||||||
match_distance = j * parInsertCost +
|
match_distance = j * parInsertCost +
|
||||||
|
@ -220,11 +222,11 @@ int damerau_levenshtein_with_size (
|
||||||
|
|
||||||
source = source_index_1;
|
source = source_index_1;
|
||||||
for (i = 1; i < parSourceLen; ++i) {
|
for (i = 1; i < parSourceLen; ++i) {
|
||||||
source_char = utf8_advance(&source, parSource + parSourceLen);
|
source_char = utf8_advance(&source, source_end);
|
||||||
maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1);
|
maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1);
|
||||||
target = target_index_1;
|
target = target_index_1;
|
||||||
for (j = 1; j < parTargetLen; ++j) {
|
for (j = 1; j < parTargetLen; ++j) {
|
||||||
target_char = utf8_advance(&target, parTarget + parTargetLen);
|
target_char = utf8_advance(&target, target_end);
|
||||||
candidateSwapIndex = get_value(sourceIndexByCharacter, target_char);
|
candidateSwapIndex = get_value(sourceIndexByCharacter, target_char);
|
||||||
j_swap = maxSourceLetterMatchIndex;
|
j_swap = maxSourceLetterMatchIndex;
|
||||||
delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
|
delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
|
||||||
|
|
|
@ -60,7 +60,7 @@ ascii:
|
||||||
}
|
}
|
||||||
|
|
||||||
Character utf8_advance (const char** parString, const char* parStringEnd) {
|
Character utf8_advance (const char** parString, const char* parStringEnd) {
|
||||||
const Character masks[6] = {0x00, 0x1f, 0x0f, 0x07, 0x03, 0x03};
|
const Character masks[6] = {0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x03};
|
||||||
Character retval;
|
Character retval;
|
||||||
uint8_t curr_code;
|
uint8_t curr_code;
|
||||||
int seq_len;
|
int seq_len;
|
||||||
|
@ -79,9 +79,9 @@ Character utf8_advance (const char** parString, const char* parStringEnd) {
|
||||||
}
|
}
|
||||||
retval = curr_code bitand masks[seq_len - 1];
|
retval = curr_code bitand masks[seq_len - 1];
|
||||||
|
|
||||||
for (z = 0; z < (seq_len - 1) and ++(*parString) < parStringEnd; ++z) {
|
for (z = 0; ++(*parString) < parStringEnd and z < (seq_len - 1); ++z) {
|
||||||
curr_code = **parString;
|
curr_code = **parString;
|
||||||
if (curr_code bitand 0xc0 != 0x80)
|
if ((curr_code bitand 0xc0) != 0x80)
|
||||||
return 0;
|
return 0;
|
||||||
retval = (retval << 6) bitor (curr_code bitand 0x3f);
|
retval = (retval << 6) bitor (curr_code bitand 0x3f);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue