From 31142a49b32c1c5727c7088b1fab96bb37997aea Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Tue, 19 Apr 2016 00:09:38 +0200 Subject: [PATCH] Implement mispelling suggestions for argument to dindexer --- src/main/CMakeLists.txt | 3 +- src/main/damerau_levenshtein.c | 248 +++++++++++++++++++++++++++++++++ src/main/damerau_levenshtein.h | 44 ++++++ src/main/main.c | 23 ++- src/main/pbl_wrapper.h | 25 ++++ 5 files changed, 340 insertions(+), 3 deletions(-) create mode 100644 src/main/damerau_levenshtein.c create mode 100644 src/main/damerau_levenshtein.h create mode 100644 src/main/pbl_wrapper.h diff --git a/src/main/CMakeLists.txt b/src/main/CMakeLists.txt index 5954f6f..f86ff13 100644 --- a/src/main/CMakeLists.txt +++ b/src/main/CMakeLists.txt @@ -6,11 +6,12 @@ add_executable(${PROJECT_NAME} main.c findactions.c builtin_feats.c + damerau_levenshtein.c ) target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include - PRIVATE ${CMAKE_SOURCE_DIR}/lib/pbl/pbl/src + PRIVATE ${CMAKE_SOURCE_DIR}/lib/pbl/pbl/src/src ) target_link_libraries(${PROJECT_NAME} diff --git a/src/main/damerau_levenshtein.c b/src/main/damerau_levenshtein.c new file mode 100644 index 0000000..25dabe4 --- /dev/null +++ b/src/main/damerau_levenshtein.c @@ -0,0 +1,248 @@ +/* Copyright (c) 2012 Kevin L. Stern + * Copyright (c) 2016 Michele Santullo + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/* + * See + * http://software-and-algorithms.blogspot.ca/2012/09/damerau-levenshtein-edit-distance.html + * and + * https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java + */ + + +/** + * The Damerau-Levenshtein Algorithm is an extension to the Levenshtein + * Algorithm which solves the edit distance problem between a source string and + * a target string with the following operations: + * + * - Character Insertion + * - Character Deletion + * - Character Replacement + * - Adjacent Character Swap + * + * Note that the adjacent character swap operation is an edit that may be + * applied when two adjacent characters in the source string match two adjacent + * characters in the target string, but in reverse order, rather than a general + * allowance for adjacent character swaps. + *

+ * + * This implementation allows the client to specify the costs of the various + * edit operations with the restriction that the cost of two swap operations + * must not be less than the cost of a delete operation followed by an insert + * operation. This restriction is required to preclude two swaps involving the + * same character being required for optimality which, in turn, enables a fast + * dynamic programming solution. + *

+ * + * The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is + * the length of the source string and m is the length of the target string. + * This implementation consumes O(n*m) space. + * + * @author Kevin L. Stern + */ + +#include "damerau_levenshtein.h" +#include "pbl_wrapper.h" +#include +#include +#include +#include +#include + +//See: http://stackoverflow.com/questions/3437404/min-and-max-in-c +#define min(a,b) \ + ({ __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a < _b ? _a : _b; }) +#define max(a,b) \ + ({ __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a > _b ? _a : _b; }) + +typedef wchar_t Character; + +static void insert_pair (PblMap* parMap, Character parKey, int parValue) { + const int retval = pblMapAdd( + parMap, + &parKey, + sizeof(parKey), + &parValue, + sizeof(parValue) + ); + assert(0 <= retval); +} + +static int get_value (PblMap* parMap, Character parKey) { + size_t ret_len; + void* value = pblMapGet(parMap, &parKey, sizeof(parKey), &ret_len); + assert(not value or (sizeof(int) == ret_len)); + return (value ? *(int*)value : -1); +} + +int damerau_levenshtein ( + const char* parSource, + const char* parTarget, + int parDeleteCost, + int parInsertCost, + int parReplaceCost, + int parSwapCost +) +{ + return damerau_levenshtein_with_size( + parSource, + strlen(parSource), + parTarget, + strlen(parTarget), + parDeleteCost, + parInsertCost, + parReplaceCost, + parSwapCost + ); +} + +/** + * Compute the Damerau-Levenshtein distance between the specified source + * string and the specified target string. 
/*
 * Weighted Damerau-Levenshtein distance: the public entry point
 * damerau_levenshtein_with_size() and the private helpers it relies on.
 */

/* Returns the smaller of the two given integers. */
static int dl_min_int (int parLeft, int parRight) {
	return (parLeft < parRight ? parLeft : parRight);
}

/* Returns the larger of the two given integers. */
static int dl_max_int (int parLeft, int parRight) {
	return (parLeft > parRight ? parLeft : parRight);
}

/*
 * Returns the address of the table cell for source index parSourceIdx and
 * target index parTargetIdx. Cells are laid out target-major with a stride
 * of parStride (the source length); the offset is computed in size_t.
 */
static int* dl_cell (int* parTable, size_t parStride, int parSourceIdx, int parTargetIdx) {
	return parTable + ((size_t)parSourceIdx + (size_t)parTargetIdx * parStride);
}

/**
 * Compute the Damerau-Levenshtein distance between the specified source
 * string and the specified target string.
 *
 * Both strings are read byte by byte for exactly the given lengths, so they
 * do not need to be NUL-terminated.
 *
 * @param parSource      bytes to transform, must not be NULL
 * @param parSourceLen   number of bytes to read from parSource
 * @param parTarget      bytes to obtain, must not be NULL
 * @param parTargetLen   number of bytes to read from parTarget
 * @param parDeleteCost  cost of deleting one source character
 * @param parInsertCost  cost of inserting one target character
 * @param parReplaceCost cost of replacing one character with another
 * @param parSwapCost    cost of swapping two adjacent characters
 * @return the cheapest total cost of turning source into target, or -1 when
 *         2 * parSwapCost < parInsertCost + parDeleteCost, when the
 *         inputs are too large for the int-indexed table, or when the table
 *         cannot be allocated.
 */
int damerau_levenshtein_with_size (
	const char* parSource,
	size_t parSourceLen,
	const char* parTarget,
	size_t parTargetLen,
	int parDeleteCost,
	int parInsertCost,
	int parReplaceCost,
	int parSwapCost
)
{
	assert(parSource);
	assert(parTarget);

	/*
	 * Required to facilitate the premise to the algorithm that two swaps of the
	 * same character are never required for optimality.
	 */
	if (2 * parSwapCost < parInsertCost + parDeleteCost)
		return -1;

	/* Indices and the result are ints: refuse lengths that do not fit */
	if (parSourceLen > (size_t)INT_MAX || parTargetLen > (size_t)INT_MAX)
		return -1;
	const int source_len = (int)parSourceLen;
	const int target_len = (int)parTargetLen;

	if (0 == source_len)
		return target_len * parInsertCost;
	if (0 == target_len)
		return source_len * parDeleteCost;

	/* Make sure source_len * target_len * sizeof(int) is representable */
	if (parSourceLen > ((size_t)-1) / sizeof(int) / parTargetLen)
		return -1;

	const size_t stride = parSourceLen;
	/* calloc gives the all-zero table the recurrences below start from */
	int* const table = calloc(parSourceLen * parTargetLen, sizeof *table);
	if (!table)
		return -1;

	/*
	 * last_source_index[c] is the highest index of byte value c among the
	 * source characters of the rows already completed, or -1 if that byte
	 * has not been seen yet.
	 */
	int last_source_index[UCHAR_MAX + 1];
	for (size_t z = 0; z < sizeof(last_source_index) / sizeof(last_source_index[0]); ++z)
		last_source_index[z] = -1;

	if (parSource[0] != parTarget[0]) {
		*dl_cell(table, stride, 0, 0) =
			dl_min_int(parReplaceCost, parDeleteCost + parInsertCost);
	}
	last_source_index[(unsigned char)parSource[0]] = 0;

	/* First column: source prefixes against the first target character */
	for (int i = 1; i < source_len; ++i) {
		const int delete_distance = *dl_cell(table, stride, i - 1, 0) + parDeleteCost;
		const int insert_distance = (i + 1) * parDeleteCost + parInsertCost;
		const int match_distance = i * parDeleteCost +
			(parSource[i] == parTarget[0] ? 0 : parReplaceCost);
		*dl_cell(table, stride, i, 0) = dl_min_int(
			dl_min_int(delete_distance, insert_distance), match_distance
		);
	}

	/* First row: the first source character against target prefixes */
	for (int j = 1; j < target_len; ++j) {
		const int delete_distance = (j + 1) * parInsertCost + parDeleteCost;
		const int insert_distance = *dl_cell(table, stride, 0, j - 1) + parInsertCost;
		const int match_distance = j * parInsertCost +
			(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
		*dl_cell(table, stride, 0, j) = dl_min_int(
			dl_min_int(delete_distance, insert_distance), match_distance
		);
	}

	for (int i = 1; i < source_len; ++i) {
		/* Highest target index seen so far in this row that equals parSource[i] */
		int max_source_letter_match_index = (parSource[i] == parTarget[0] ? 0 : -1);

		for (int j = 1; j < target_len; ++j) {
			const int candidate_swap_index =
				last_source_index[(unsigned char)parTarget[j]];
			const int j_swap = max_source_letter_match_index;
			const int delete_distance = *dl_cell(table, stride, i - 1, j) + parDeleteCost;
			const int insert_distance = *dl_cell(table, stride, i, j - 1) + parInsertCost;
			int match_distance = *dl_cell(table, stride, i - 1, j - 1);
			if (parSource[i] != parTarget[j])
				match_distance += parReplaceCost;
			else
				max_source_letter_match_index = j;

			/* INT_MAX means "no swap available" and never wins the minimum */
			int swap_distance = INT_MAX;
			if (-1 != candidate_swap_index && -1 != j_swap) {
				const int i_swap = candidate_swap_index;
				int pre_swap_cost = 0;
				if (0 != i_swap || 0 != j_swap) {
					pre_swap_cost = *dl_cell(
						table,
						stride,
						dl_max_int(0, i_swap - 1),
						dl_max_int(0, j_swap - 1)
					);
				}
				swap_distance = pre_swap_cost +
					(i - i_swap - 1) * parDeleteCost +
					(j - j_swap - 1) * parInsertCost +
					parSwapCost;
			}

			*dl_cell(table, stride, i, j) = dl_min_int(
				dl_min_int(
					dl_min_int(delete_distance, insert_distance),
					match_distance
				),
				swap_distance
			);
		}
		last_source_index[(unsigned char)parSource[i]] = i;
	}

	const int retval = *dl_cell(table, stride, source_len - 1, target_len - 1);
	free(table);
	return retval;
}
+/* Copyright 2015, 2016, Michele Santullo + * This file is part of "dindexer". + * + * "dindexer" is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * "dindexer" is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with "dindexer". If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef id88738025C6B24BDEB604A5AE3C36EE8D +#define id88738025C6B24BDEB604A5AE3C36EE8D + +#include "helpers/compatibility.h" +#include "stddef.h" + +int damerau_levenshtein ( + const char* parSource, + const char* parTarget, + int parDeleteCost, + int parInsertCost, + int parReplaceCost, + int parSwapCost +) a_pure; + +int damerau_levenshtein_with_size ( + const char* parSource, + size_t parSourceLen, + const char* parTarget, + size_t parTargetLen, + int parDeleteCost, + int parInsertCost, + int parReplaceCost, + int parSwapCost +) a_pure; + +#endif diff --git a/src/main/main.c b/src/main/main.c index 872fbbf..a0376c4 100644 --- a/src/main/main.c +++ b/src/main/main.c @@ -21,6 +21,7 @@ #include "findactions.h" #include "helpers/lengthof.h" #include "builtin_feats.h" +#include "damerau_levenshtein.h" #include #include #include @@ -55,6 +56,7 @@ static size_t foreach_avail_action ( int(*parFunc)(const char*, const void*), ch static int printf_stream ( const char* parMsg, const void* parStream ); static int printf_stream_inplace ( const char* parMsg, const void* parPrintContext ); static int same_action ( const char* parAction1, const void* parAction2 ); +static int find_similar ( const char* parAction, const void* parUserInput ); static void print_usage ( void ); 
static int manage_commandline ( int parArgc, char* parArgv[], char** parActions, size_t parActionCount, int* parShouldQuit ); @@ -88,8 +90,17 @@ int main (int parArgc, char* parArgv[]) { selected_action = foreach_avail_action(&same_action, actions, actions_count, specified_action); if (actions_count == selected_action) { - fprintf(stderr, "Unrecognized action \"%s\" - available actions are:\n", specified_action); - foreach_avail_action(&printf_stream, actions, actions_count, stderr); + //Find a possible misspelling and show a hint to the user if any + selected_action = foreach_avail_action(&find_similar, actions, actions_count, specified_action); + if (selected_action < actions_count) { + fprintf(stderr, "Unrecognized action \"%s\" - maybe you meant \"%s\"?\n", + specified_action, + get_actionname(actions[selected_action]) + ); + } + else { + fprintf(stderr, "Unrecognized action \"%s\"\n", specified_action); + } free_actions(actions, actions_count); return 2; } @@ -177,6 +188,14 @@ static int same_action (const char* parAction1, const void* parAction2) { } } +static int find_similar (const char* parAction, const void* parUserInput) { /* Callback passed to foreach_avail_action: returns 1 when the Damerau-Levenshtein distance (every edit cost set to 1) between the user input and parAction is at most 2, and 0 otherwise. */ + const int distance = damerau_levenshtein((const char*)parUserInput, parAction, 1, 1, 1, 1); + if (distance <= 2) /* NOTE(review): the -1 error value returned by damerau_levenshtein also satisfies this test */ + return 1; + else + return 0; +} + static void print_usage() { printf("--help, -h - show this help\n"); printf("--builtin, -b - show build info\n"); diff --git a/src/main/pbl_wrapper.h b/src/main/pbl_wrapper.h new file mode 100644 index 0000000..6db1714 --- /dev/null +++ b/src/main/pbl_wrapper.h @@ -0,0 +1,25 @@ +/* Copyright 2015, 2016, Michele Santullo + * This file is part of "dindexer". + * + * "dindexer" is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * "dindexer" is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with "dindexer". If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef idDB955D222C4A4AF5B9BD1A4A0BBDD9E3 +#define idDB955D222C4A4AF5B9BD1A4A0BBDD9E3 + +#include +#include +#include "pbl.h" + +#endif