mirror of
https://github.com/KingDuckZ/dindexer.git
synced 2024-11-25 00:53:43 +00:00
Implement misspelling suggestions for argument to dindexer
This commit is contained in:
parent
3e130cd346
commit
31142a49b3
5 changed files with 340 additions and 3 deletions
|
@ -6,11 +6,12 @@ add_executable(${PROJECT_NAME}
|
|||
main.c
|
||||
findactions.c
|
||||
builtin_feats.c
|
||||
damerau_levenshtein.c
|
||||
)
|
||||
|
||||
target_include_directories(${PROJECT_NAME}
|
||||
PRIVATE ${CMAKE_SOURCE_DIR}/include
|
||||
PRIVATE ${CMAKE_SOURCE_DIR}/lib/pbl/pbl/src
|
||||
PRIVATE ${CMAKE_SOURCE_DIR}/lib/pbl/pbl/src/src
|
||||
)
|
||||
|
||||
target_link_libraries(${PROJECT_NAME}
|
||||
|
|
248
src/main/damerau_levenshtein.c
Normal file
248
src/main/damerau_levenshtein.c
Normal file
|
@ -0,0 +1,248 @@
|
|||
/* Copyright (c) 2012 Kevin L. Stern
|
||||
* Copyright (c) 2016 Michele Santullo
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* See
|
||||
* http://software-and-algorithms.blogspot.ca/2012/09/damerau-levenshtein-edit-distance.html
|
||||
* and
|
||||
* https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* The Damerau-Levenshtein Algorithm is an extension to the Levenshtein
|
||||
* Algorithm which solves the edit distance problem between a source string and
|
||||
* a target string with the following operations:
|
||||
*
|
||||
* <ul>
|
||||
* <li>Character Insertion</li>
|
||||
* <li>Character Deletion</li>
|
||||
* <li>Character Replacement</li>
|
||||
* <li>Adjacent Character Swap</li>
|
||||
* </ul>
|
||||
*
|
||||
* Note that the adjacent character swap operation is an edit that may be
|
||||
* applied when two adjacent characters in the source string match two adjacent
|
||||
* characters in the target string, but in reverse order, rather than a general
|
||||
* allowance for adjacent character swaps.
|
||||
* <p>
|
||||
*
|
||||
* This implementation allows the client to specify the costs of the various
|
||||
* edit operations with the restriction that the cost of two swap operations
|
||||
* must not be less than the cost of a delete operation followed by an insert
|
||||
* operation. This restriction is required to preclude two swaps involving the
|
||||
* same character being required for optimality which, in turn, enables a fast
|
||||
* dynamic programming solution.
|
||||
* <p>
|
||||
*
|
||||
* The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is
|
||||
* the length of the source string and m is the length of the target string.
|
||||
* This implementation consumes O(n*m) space.
|
||||
*
|
||||
* @author Kevin L. Stern
|
||||
*/
|
||||
|
||||
#include "damerau_levenshtein.h"
|
||||
#include "pbl_wrapper.h"
|
||||
#include <string.h>
|
||||
#include <iso646.h>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
|
||||
/*
 * Smaller / larger of two ints.
 *
 * These are plain functions rather than the statement-expression macros
 * suggested at http://stackoverflow.com/questions/3437404/min-and-max-in-c
 * because every call site in this file works on int, and functions avoid
 * both the GNU-only extensions (({ ... }) and __typeof__) and the shadowed
 * temporaries that nested macro invocations such as min(min(a, b), c)
 * produce. Each argument is still evaluated exactly once.
 */
static inline int min (int a, int b) {
	return a < b ? a : b;
}

static inline int max (int a, int b) {
	return a > b ? a : b;
}
|
||||
|
||||
typedef wchar_t Character;
|
||||
|
||||
/*
 * Associate parValue with parKey in parMap.
 *
 * The addresses and sizes of the two by-value arguments are handed to
 * pblMapAdd(); a negative return from it is treated as a programming error
 * and only caught by the assert (i.e. not at all when NDEBUG is defined).
 */
static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
	const size_t key_size = sizeof(parKey);
	const size_t value_size = sizeof(parValue);
	const int status = pblMapAdd(parMap, &parKey, key_size, &parValue, value_size);

	assert(status >= 0);
}
|
||||
|
||||
/*
 * Look up the int stored under parKey in parMap.
 *
 * Returns the stored value, or -1 when pblMapGet() yields no entry for the
 * key. When an entry is found its reported length is expected to be
 * sizeof(int); this is checked by assert only.
 */
static int get_value (PblMap* parMap, Character parKey) {
	size_t found_len;
	const int* const found = pblMapGet(parMap, &parKey, sizeof(parKey), &found_len);

	if (not found)
		return -1;

	assert(sizeof(int) == found_len);
	return *found;
}
|
||||
|
||||
/**
 * Convenience wrapper around damerau_levenshtein_with_size() for
 * NUL-terminated strings: the lengths of parSource and parTarget are taken
 * with strlen() and everything else is forwarded unchanged.
 *
 * Both strings must be non-NULL. The return value is whatever
 * damerau_levenshtein_with_size() returns for the same arguments.
 */
int damerau_levenshtein (
	const char* parSource,
	const char* parTarget,
	int parDeleteCost,
	int parInsertCost,
	int parReplaceCost,
	int parSwapCost
)
{
	const size_t source_len = strlen(parSource);
	const size_t target_len = strlen(parTarget);

	return damerau_levenshtein_with_size(
		parSource, source_len,
		parTarget, target_len,
		parDeleteCost, parInsertCost, parReplaceCost, parSwapCost
	);
}
|
||||
|
||||
/**
|
||||
* Compute the Damerau-Levenshtein distance between the specified source
|
||||
* string and the specified target string.
|
||||
*/
|
||||
int damerau_levenshtein_with_size (
|
||||
const char* parSource,
|
||||
size_t parSourceLen,
|
||||
const char* parTarget,
|
||||
size_t parTargetLen,
|
||||
int parDeleteCost,
|
||||
int parInsertCost,
|
||||
int parReplaceCost,
|
||||
int parSwapCost
|
||||
)
|
||||
{
|
||||
int i;
|
||||
int j;
|
||||
int* table;
|
||||
PblMap* sourceIndexByCharacter;
|
||||
int delete_distance;
|
||||
int insert_distance;
|
||||
int match_distance;
|
||||
int swap_distance;
|
||||
int maxSourceLetterMatchIndex;
|
||||
int candidateSwapIndex;
|
||||
int i_swap;
|
||||
int j_swap;
|
||||
int pre_swap_cost;
|
||||
int retval;
|
||||
|
||||
assert(parSource);
|
||||
assert(parTarget);
|
||||
|
||||
/*
|
||||
* Required to facilitate the premise to the algorithm that two swaps of the
|
||||
* same character are never required for optimality.
|
||||
*/
|
||||
if (2 * parSwapCost < parInsertCost + parDeleteCost) {
|
||||
/*throw new IllegalArgumentException("Unsupported cost assignment");*/
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (0 == parSourceLen)
|
||||
return parTargetLen * parInsertCost;
|
||||
if (0 == parTargetLen)
|
||||
return parSourceLen * parDeleteCost;
|
||||
|
||||
const int table_length = parSourceLen * parTargetLen;
|
||||
table = (int*)malloc(sizeof(int) * table_length);
|
||||
memset(table, 0, sizeof(int) * table_length);
|
||||
|
||||
sourceIndexByCharacter = pblMapNewHashMap();
|
||||
assert(sourceIndexByCharacter);
|
||||
|
||||
if (parSource[0] != parTarget[0]) {
|
||||
table[0 /*source*/ + 0 /*target*/ * parSourceLen] =
|
||||
min(parReplaceCost, parDeleteCost + parInsertCost);
|
||||
}
|
||||
insert_pair(sourceIndexByCharacter, parSource[0], 0);
|
||||
|
||||
for (i = 1; i < parSourceLen; ++i) {
|
||||
delete_distance = table[i - 1 + 0 * parSourceLen];
|
||||
insert_distance = (i + 1) * parDeleteCost + parInsertCost;
|
||||
match_distance = i * parDeleteCost +
|
||||
(parSource[i] == parTarget[i] ? 0 : parReplaceCost);
|
||||
table[i + 0 * parSourceLen] = min(
|
||||
min(delete_distance, insert_distance), match_distance
|
||||
);
|
||||
}
|
||||
|
||||
for (j = 1; j < parTargetLen; ++j) {
|
||||
delete_distance = (j + 1) * parInsertCost + parDeleteCost;
|
||||
insert_distance = table[0 + (j - 1) * parSourceLen] + parInsertCost;
|
||||
match_distance = j * parInsertCost +
|
||||
(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
|
||||
table[0 + j * parSourceLen] = min(
|
||||
min(delete_distance, insert_distance), match_distance
|
||||
);
|
||||
}
|
||||
|
||||
for (i = 1; i < parSourceLen; ++i) {
|
||||
maxSourceLetterMatchIndex = (parSource[i] == parTarget[0] ? 0 : -1);
|
||||
for (j = 1; j < parTargetLen; ++j) {
|
||||
candidateSwapIndex =
|
||||
get_value(sourceIndexByCharacter, parTarget[j]);
|
||||
j_swap = maxSourceLetterMatchIndex;
|
||||
delete_distance = table[(i - 1) + j * parSourceLen] + parDeleteCost;
|
||||
insert_distance = table[i + (j - 1) * parSourceLen] + parInsertCost;
|
||||
match_distance = table[(i - 1) + (j - 1) * parSourceLen];
|
||||
if (parSource[i] != parTarget[j])
|
||||
match_distance += parReplaceCost;
|
||||
else
|
||||
maxSourceLetterMatchIndex = j;
|
||||
|
||||
if (-1 != candidateSwapIndex and -1 != j_swap) {
|
||||
i_swap = candidateSwapIndex;
|
||||
if (0 == i_swap and 0 == j_swap)
|
||||
pre_swap_cost = 0;
|
||||
else
|
||||
pre_swap_cost = table[
|
||||
max(0, i_swap - 1) + max(0, j_swap - 1) * parSourceLen
|
||||
];
|
||||
swap_distance = pre_swap_cost + (i - i_swap - 1) *
|
||||
parDeleteCost + (j - j_swap - 1) * parInsertCost +
|
||||
parSwapCost;
|
||||
}
|
||||
else {
|
||||
swap_distance = INT_MAX;
|
||||
}
|
||||
table[i + j * parSourceLen] = min(
|
||||
min(
|
||||
min(delete_distance, insert_distance),
|
||||
match_distance
|
||||
),
|
||||
swap_distance
|
||||
);
|
||||
}
|
||||
insert_pair(sourceIndexByCharacter, parSource[i], i);
|
||||
}
|
||||
|
||||
retval = table[(parSourceLen - 1) + (parTargetLen - 1) * parSourceLen];
|
||||
free(table);
|
||||
pblMapFree(sourceIndexByCharacter);
|
||||
return retval;
|
||||
}
|
44
src/main/damerau_levenshtein.h
Normal file
44
src/main/damerau_levenshtein.h
Normal file
|
@ -0,0 +1,44 @@
|
|||
/* Copyright 2015, 2016, Michele Santullo
|
||||
* This file is part of "dindexer".
|
||||
*
|
||||
* "dindexer" is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* "dindexer" is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef id88738025C6B24BDEB604A5AE3C36EE8D
|
||||
#define id88738025C6B24BDEB604A5AE3C36EE8D
|
||||
|
||||
#include "helpers/compatibility.h"
|
||||
#include "stddef.h"
|
||||
|
||||
int damerau_levenshtein (
|
||||
const char* parSource,
|
||||
const char* parTarget,
|
||||
int parDeleteCost,
|
||||
int parInsertCost,
|
||||
int parReplaceCost,
|
||||
int parSwapCost
|
||||
) a_pure;
|
||||
|
||||
int damerau_levenshtein_with_size (
|
||||
const char* parSource,
|
||||
size_t parSourceLen,
|
||||
const char* parTarget,
|
||||
size_t parTargetLen,
|
||||
int parDeleteCost,
|
||||
int parInsertCost,
|
||||
int parReplaceCost,
|
||||
int parSwapCost
|
||||
) a_pure;
|
||||
|
||||
#endif
|
|
@ -21,6 +21,7 @@
|
|||
#include "findactions.h"
|
||||
#include "helpers/lengthof.h"
|
||||
#include "builtin_feats.h"
|
||||
#include "damerau_levenshtein.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <iso646.h>
|
||||
|
@ -55,6 +56,7 @@ static size_t foreach_avail_action ( int(*parFunc)(const char*, const void*), ch
|
|||
static int printf_stream ( const char* parMsg, const void* parStream );
|
||||
static int printf_stream_inplace ( const char* parMsg, const void* parPrintContext );
|
||||
static int same_action ( const char* parAction1, const void* parAction2 );
|
||||
static int find_similar ( const char* parAction, const void* parUserInput );
|
||||
static void print_usage ( void );
|
||||
static int manage_commandline ( int parArgc, char* parArgv[], char** parActions, size_t parActionCount, int* parShouldQuit );
|
||||
|
||||
|
@ -88,8 +90,17 @@ int main (int parArgc, char* parArgv[]) {
|
|||
selected_action = foreach_avail_action(&same_action, actions, actions_count, specified_action);
|
||||
|
||||
if (actions_count == selected_action) {
|
||||
fprintf(stderr, "Unrecognized action \"%s\" - available actions are:\n", specified_action);
|
||||
foreach_avail_action(&printf_stream, actions, actions_count, stderr);
|
||||
//Find a possible misspelling and show a hint to the user if any
|
||||
selected_action = foreach_avail_action(&find_similar, actions, actions_count, specified_action);
|
||||
if (selected_action < actions_count) {
|
||||
fprintf(stderr, "Unrecognized action \"%s\" - maybe you meant \"%s\"?\n",
|
||||
specified_action,
|
||||
get_actionname(actions[selected_action])
|
||||
);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Unrecognized action \"%s\"\n", specified_action);
|
||||
}
|
||||
free_actions(actions, actions_count);
|
||||
return 2;
|
||||
}
|
||||
|
@ -177,6 +188,14 @@ static int same_action (const char* parAction1, const void* parAction2) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Callback for foreach_avail_action(): tells whether the action name
 * parAction is close enough to what the user typed to be offered as a
 * "maybe you meant" suggestion.
 *
 * parUserInput is the user's text, passed as a NUL-terminated C string.
 * Returns 1 when the unit-cost Damerau-Levenshtein distance between the two
 * is at most 2, and 0 otherwise. A negative distance is the error return of
 * damerau_levenshtein() and is never treated as a match.
 */
static int find_similar (const char* parAction, const void* parUserInput) {
	const int max_distance = 2;
	const int distance = damerau_levenshtein((const char*)parUserInput, parAction, 1, 1, 1, 1);

	return (distance >= 0 and distance <= max_distance) ? 1 : 0;
}
|
||||
|
||||
static void print_usage() {
|
||||
printf("--help, -h - show this help\n");
|
||||
printf("--builtin, -b - show build info\n");
|
||||
|
|
25
src/main/pbl_wrapper.h
Normal file
25
src/main/pbl_wrapper.h
Normal file
|
@ -0,0 +1,25 @@
|
|||
/* Copyright 2015, 2016, Michele Santullo
|
||||
* This file is part of "dindexer".
|
||||
*
|
||||
* "dindexer" is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* "dindexer" is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef idDB955D222C4A4AF5B9BD1A4A0BBDD9E3
|
||||
#define idDB955D222C4A4AF5B9BD1A4A0BBDD9E3
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
#include "pbl.h"
|
||||
|
||||
#endif
|
Loading…
Reference in a new issue