From e3e704c50fc3ae142391a7aadb519748642a0616 Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Thu, 21 Apr 2016 00:20:00 +0200 Subject: [PATCH] Bugfixes and improvements --- src/main/damerau_levenshtein.c | 14 ++++++++------ src/main/utf8_ops.c | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main/damerau_levenshtein.c b/src/main/damerau_levenshtein.c index 774043c..1aa100f 100644 --- a/src/main/damerau_levenshtein.c +++ b/src/main/damerau_levenshtein.c @@ -153,6 +153,8 @@ int damerau_levenshtein_with_size ( const char* target; const char* source_index_1; const char* target_index_1; + const char* source_end = parSource + strlen(parSource); + const char* target_end = parTarget + strlen(parTarget); Character source_char; Character target_char; Character source_char_0; @@ -184,8 +186,8 @@ int damerau_levenshtein_with_size ( source = parSource; target = parTarget; - source_char_0 = utf8_advance(&source, parSource + parSourceLen); - target_char_0 = utf8_advance(&target, parTarget + parTargetLen); + source_char_0 = utf8_advance(&source, source_end); + target_char_0 = utf8_advance(&target, target_end); source_index_1 = source; target_index_1 = target; if (source_char_0 != target_char_0) { @@ -196,7 +198,7 @@ int damerau_levenshtein_with_size ( assert(source = source_index_1); for (i = 1; i < parSourceLen; ++i) { - source_char = utf8_advance(&source, parSource + parSourceLen); + source_char = utf8_advance(&source, source_end); delete_distance = table[i - 1 + 0 * parSourceLen] + parDeleteCost; insert_distance = (i + 1) * parDeleteCost + parInsertCost; match_distance = i * parDeleteCost + @@ -208,7 +210,7 @@ int damerau_levenshtein_with_size ( assert(target == target_index_1); for (j = 1; j < parTargetLen; ++j) { - target_char = utf8_advance(&target, parTarget + parTargetLen); + target_char = utf8_advance(&target, target_end); delete_distance = (j + 1) * parInsertCost + parDeleteCost; insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost; match_distance = j * parInsertCost + @@ -220,11 +222,11 @@ int damerau_levenshtein_with_size ( source = source_index_1; for (i = 1; i < parSourceLen; ++i) { - source_char = utf8_advance(&source, parSource + parSourceLen); + source_char = utf8_advance(&source, source_end); maxSourceLetterMatchIndex = (source_char == target_char_0 ? 0 : -1); target = target_index_1; for (j = 1; j < parTargetLen; ++j) { - target_char = utf8_advance(&target, parTarget + parTargetLen); + target_char = utf8_advance(&target, target_end); candidateSwapIndex = get_value(sourceIndexByCharacter, target_char); j_swap = maxSourceLetterMatchIndex; delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost; diff --git a/src/main/utf8_ops.c b/src/main/utf8_ops.c index fb4c8bc..1ac3819 100644 --- a/src/main/utf8_ops.c +++ b/src/main/utf8_ops.c @@ -60,7 +60,7 @@ ascii: } Character utf8_advance (const char** parString, const char* parStringEnd) { - const Character masks[6] = {0x00, 0x1f, 0x0f, 0x07, 0x03, 0x03}; + const Character masks[6] = {0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x03}; Character retval; uint8_t curr_code; int seq_len; @@ -79,9 +79,9 @@ Character utf8_advance (const char** parString, const char* parStringEnd) { } retval = curr_code bitand masks[seq_len - 1]; - for (z = 0; z < (seq_len - 1) and ++(*parString) < parStringEnd; ++z) { + for (z = 0; ++(*parString) < parStringEnd and z < (seq_len - 1); ++z) { curr_code = **parString; - if (curr_code bitand 0xc0 != 0x80) + if ((curr_code bitand 0xc0) != 0x80) return 0; retval = (retval << 6) bitor (curr_code bitand 0x3f); }