1
0
Fork 0
mirror of https://github.com/KingDuckZ/dindexer.git synced 2024-11-29 01:33:46 +00:00

Implement misspelling suggestions for argument to dindexer

This commit is contained in:
King_DuckZ 2016-04-19 00:09:38 +02:00
parent 3e130cd346
commit 31142a49b3
5 changed files with 340 additions and 3 deletions

View file

@ -6,11 +6,12 @@ add_executable(${PROJECT_NAME}
main.c
findactions.c
builtin_feats.c
damerau_levenshtein.c
)
target_include_directories(${PROJECT_NAME}
PRIVATE ${CMAKE_SOURCE_DIR}/include
PRIVATE ${CMAKE_SOURCE_DIR}/lib/pbl/pbl/src
PRIVATE ${CMAKE_SOURCE_DIR}/lib/pbl/pbl/src/src
)
target_link_libraries(${PROJECT_NAME}

View file

@ -0,0 +1,248 @@
/* Copyright (c) 2012 Kevin L. Stern
* Copyright (c) 2016 Michele Santullo
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* See
* http://software-and-algorithms.blogspot.ca/2012/09/damerau-levenshtein-edit-distance.html
* and
* https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
*/
/**
* The Damerau-Levenshtein Algorithm is an extension to the Levenshtein
* Algorithm which solves the edit distance problem between a source string and
* a target string with the following operations:
*
* <ul>
* <li>Character Insertion</li>
* <li>Character Deletion</li>
* <li>Character Replacement</li>
* <li>Adjacent Character Swap</li>
* </ul>
*
* Note that the adjacent character swap operation is an edit that may be
* applied when two adjacent characters in the source string match two adjacent
* characters in the target string, but in reverse order, rather than a general
* allowance for adjacent character swaps.
* <p>
*
* This implementation allows the client to specify the costs of the various
* edit operations with the restriction that the cost of two swap operations
* must not be less than the cost of a delete operation followed by an insert
* operation. This restriction is required to preclude two swaps involving the
* same character being required for optimality which, in turn, enables a fast
* dynamic programming solution.
* <p>
*
* The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is
* the length of the source string and m is the length of the target string.
* This implementation consumes O(n*m) space.
*
* @author Kevin L. Stern
*/
#include "damerau_levenshtein.h"
#include "pbl_wrapper.h"
#include <string.h>
#include <iso646.h>
#include <assert.h>
#include <stdlib.h>
#include <limits.h>
//See: http://stackoverflow.com/questions/3437404/min-and-max-in-c
//Type-generic min/max. Each argument is first copied into a local variable
//of its own type, so it is evaluated exactly once even if the expression
//passed in has side effects.
//NOTE: both macros rely on the GNU extensions __typeof__ and statement
//expressions ({ ... }), so a GCC-compatible compiler is required.
#define min(a,b) \
({ __typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a < _b ? _a : _b; })
#define max(a,b) \
({ __typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a > _b ? _a : _b; })
/* Key type of the character -> source index map used by the algorithm below.
 * The char values read from the input strings are converted to this type
 * when they are passed to insert_pair() and get_value(). */
typedef wchar_t Character;
/**
 * Associate parValue with parKey inside parMap.
 *
 * The key and the value are handed to pblMapAdd() by address together with
 * their sizes; since both are by-value parameters of this function, PBL
 * presumably copies the bytes - confirm against the PBL documentation.
 * A negative result from pblMapAdd() is only caught by an assertion, so it
 * goes unnoticed in builds compiled with NDEBUG.
 */
static void insert_pair (PblMap* parMap, Character parKey, int parValue) {
	const int status =
		pblMapAdd(parMap, &parKey, sizeof(parKey), &parValue, sizeof(parValue));

	assert(status >= 0);
	/* keeps NDEBUG builds free of an unused-variable warning */
	(void)status;
}
/**
 * Look up the int stored under parKey in parMap.
 *
 * @return the stored value, or -1 when pblMapGet() yields no entry for
 *         the key. When an entry is found, its size is asserted to be
 *         exactly sizeof(int).
 */
static int get_value (PblMap* parMap, Character parKey) {
	size_t stored_size;
	void* const found = pblMapGet(parMap, &parKey, sizeof(parKey), &stored_size);

	if (not found)
		return -1;

	assert(sizeof(int) == stored_size);
	return *(int*)found;
}
/**
 * Damerau-Levenshtein distance between two NUL-terminated strings.
 *
 * Measures both strings with strlen() and forwards everything to
 * damerau_levenshtein_with_size(); the result of that call is returned
 * unchanged. Neither pointer may be NULL, as both are passed to strlen().
 */
int damerau_levenshtein (
	const char* parSource,
	const char* parTarget,
	int parDeleteCost,
	int parInsertCost,
	int parReplaceCost,
	int parSwapCost
)
{
	const size_t source_len = strlen(parSource);
	const size_t target_len = strlen(parTarget);

	return damerau_levenshtein_with_size(
		parSource, source_len,
		parTarget, target_len,
		parDeleteCost, parInsertCost, parReplaceCost, parSwapCost
	);
}
/**
 * Compute the Damerau-Levenshtein distance between the specified source
 * string and the specified target string.
 *
 * @param parSource      source characters; must not be NULL
 * @param parSourceLen   number of characters to read from parSource
 * @param parTarget      target characters; must not be NULL
 * @param parTargetLen   number of characters to read from parTarget
 * @param parDeleteCost  cost of deleting one source character
 * @param parInsertCost  cost of inserting one target character
 * @param parReplaceCost cost of replacing one character with another
 * @param parSwapCost    cost of swapping two adjacent characters
 * @return the edit distance, or -1 when it cannot be computed: the cost
 *         assignment is unsupported (2 * swap < insert + delete), one of
 *         the lengths does not fit the int based table, or memory for the
 *         working data could not be obtained.
 *
 * The table is stored as a flat array in which cell (i, j), i.e. source
 * prefix [0..i] against target prefix [0..j], lives at i + j * parSourceLen.
 */
int damerau_levenshtein_with_size (
	const char* parSource,
	size_t parSourceLen,
	const char* parTarget,
	size_t parTargetLen,
	int parDeleteCost,
	int parInsertCost,
	int parReplaceCost,
	int parSwapCost
)
{
	int i;
	int j;
	int src_len;
	int tgt_len;
	size_t stride;
	int* table;
	PblMap* sourceIndexByCharacter;
	int delete_distance;
	int insert_distance;
	int match_distance;
	int swap_distance;
	int maxSourceLetterMatchIndex;
	int candidateSwapIndex;
	int i_swap;
	int j_swap;
	int pre_swap_cost;
	int retval;

	assert(parSource);
	assert(parTarget);

	/*
	 * Required to facilitate the premise to the algorithm that two swaps of the
	 * same character are never required for optimality.
	 */
	if (2 * parSwapCost < parInsertCost + parDeleteCost) {
		/*throw new IllegalArgumentException("Unsupported cost assignment");*/
		return -1;
	}

	/* Indices and distances are ints, so longer inputs cannot be handled. */
	if (parSourceLen > (size_t)INT_MAX or parTargetLen > (size_t)INT_MAX)
		return -1;
	src_len = (int)parSourceLen;
	tgt_len = (int)parTargetLen;
	stride = parSourceLen;

	if (0 == src_len)
		return tgt_len * parInsertCost;
	if (0 == tgt_len)
		return src_len * parDeleteCost;

	/* Refuse table sizes whose byte count would overflow size_t. */
	if (parTargetLen > ((size_t)-1) / sizeof(*table) / parSourceLen)
		return -1;

	/*
	 * The table must start zeroed: cell (0, 0) is only written below when the
	 * first characters differ.
	 */
	table = calloc(parSourceLen * parTargetLen, sizeof(*table));
	if (not table)
		return -1;

	sourceIndexByCharacter = pblMapNewHashMap();
	if (not sourceIndexByCharacter) {
		free(table);
		return -1;
	}

	if (parSource[0] != parTarget[0]) {
		table[0 /*source*/ + 0 /*target*/ * stride] =
			min(parReplaceCost, parDeleteCost + parInsertCost);
	}
	insert_pair(sourceIndexByCharacter, parSource[0], 0);

	/* First column: source prefix [0..i] against the first target character. */
	for (i = 1; i < src_len; ++i) {
		/* drop source[i] on top of the best edit for the shorter prefix */
		delete_distance = table[(i - 1) + 0 * stride] + parDeleteCost;
		insert_distance = (i + 1) * parDeleteCost + parInsertCost;
		/* line source[i] up with target[0] and delete everything before it */
		match_distance = i * parDeleteCost +
			(parSource[i] == parTarget[0] ? 0 : parReplaceCost);
		table[i + 0 * stride] = min(
			min(delete_distance, insert_distance), match_distance
		);
	}

	/* First row: the first source character against target prefix [0..j]. */
	for (j = 1; j < tgt_len; ++j) {
		delete_distance = (j + 1) * parInsertCost + parDeleteCost;
		insert_distance = table[0 + (size_t)(j - 1) * stride] + parInsertCost;
		match_distance = j * parInsertCost +
			(parSource[0] == parTarget[j] ? 0 : parReplaceCost);
		table[0 + (size_t)j * stride] = min(
			min(delete_distance, insert_distance), match_distance
		);
	}

	for (i = 1; i < src_len; ++i) {
		/* last target index in this row where target[j] matched source[i] */
		maxSourceLetterMatchIndex = (parSource[i] == parTarget[0] ? 0 : -1);
		for (j = 1; j < tgt_len; ++j) {
			const size_t row = (size_t)j * stride;
			const size_t prev_row = row - stride;

			/* last source index before i holding target[j], -1 if none */
			candidateSwapIndex =
				get_value(sourceIndexByCharacter, parTarget[j]);
			j_swap = maxSourceLetterMatchIndex;

			delete_distance = table[(i - 1) + row] + parDeleteCost;
			insert_distance = table[i + prev_row] + parInsertCost;
			match_distance = table[(i - 1) + prev_row];
			if (parSource[i] != parTarget[j])
				match_distance += parReplaceCost;
			else
				maxSourceLetterMatchIndex = j;

			if (-1 != candidateSwapIndex and -1 != j_swap) {
				i_swap = candidateSwapIndex;
				if (0 == i_swap and 0 == j_swap)
					pre_swap_cost = 0;
				else
					pre_swap_cost = table[
						(size_t)max(0, i_swap - 1) +
						(size_t)max(0, j_swap - 1) * stride
					];
				swap_distance = pre_swap_cost + (i - i_swap - 1) *
					parDeleteCost + (j - j_swap - 1) * parInsertCost +
					parSwapCost;
			}
			else {
				/* no swap is possible here: keep it out of the minimum */
				swap_distance = INT_MAX;
			}

			table[i + row] = min(
				min(
					min(delete_distance, insert_distance),
					match_distance
				),
				swap_distance
			);
		}
		insert_pair(sourceIndexByCharacter, parSource[i], i);
	}

	retval = table[(stride - 1) + (parTargetLen - 1) * stride];
	free(table);
	pblMapFree(sourceIndexByCharacter);
	return retval;
}

View file

@ -0,0 +1,44 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef id88738025C6B24BDEB604A5AE3C36EE8D
#define id88738025C6B24BDEB604A5AE3C36EE8D
#include "helpers/compatibility.h"
#include "stddef.h"
/**
 * Damerau-Levenshtein edit distance between two NUL-terminated strings.
 *
 * The lengths of parSource and parTarget are obtained with strlen() and
 * the work is delegated to damerau_levenshtein_with_size().
 *
 * @param parSource      NUL-terminated source string
 * @param parTarget      NUL-terminated target string
 * @param parDeleteCost  cost of a character deletion
 * @param parInsertCost  cost of a character insertion
 * @param parReplaceCost cost of a character replacement
 * @param parSwapCost    cost of an adjacent character swap
 * @return the value returned by damerau_levenshtein_with_size(): the
 *         distance, or -1 on failure.
 */
int damerau_levenshtein (
const char* parSource,
const char* parTarget,
int parDeleteCost,
int parInsertCost,
int parReplaceCost,
int parSwapCost
) a_pure;
/**
 * Damerau-Levenshtein edit distance between two character ranges given as
 * pointer + length.
 *
 * @param parSource      source characters, must not be NULL
 * @param parSourceLen   number of characters in parSource
 * @param parTarget      target characters, must not be NULL
 * @param parTargetLen   number of characters in parTarget
 * @param parDeleteCost  cost of a character deletion
 * @param parInsertCost  cost of a character insertion
 * @param parReplaceCost cost of a character replacement
 * @param parSwapCost    cost of an adjacent character swap
 * @return the distance, or -1 on failure; the implementation rejects cost
 *         assignments where 2 * parSwapCost < parInsertCost + parDeleteCost.
 *         An empty source yields parTargetLen * parInsertCost and an empty
 *         target yields parSourceLen * parDeleteCost.
 */
int damerau_levenshtein_with_size (
const char* parSource,
size_t parSourceLen,
const char* parTarget,
size_t parTargetLen,
int parDeleteCost,
int parInsertCost,
int parReplaceCost,
int parSwapCost
) a_pure;
#endif

View file

@ -21,6 +21,7 @@
#include "findactions.h"
#include "helpers/lengthof.h"
#include "builtin_feats.h"
#include "damerau_levenshtein.h"
#include <string.h>
#include <stdio.h>
#include <iso646.h>
@ -55,6 +56,7 @@ static size_t foreach_avail_action ( int(*parFunc)(const char*, const void*), ch
static int printf_stream ( const char* parMsg, const void* parStream );
static int printf_stream_inplace ( const char* parMsg, const void* parPrintContext );
static int same_action ( const char* parAction1, const void* parAction2 );
static int find_similar ( const char* parAction, const void* parUserInput );
static void print_usage ( void );
static int manage_commandline ( int parArgc, char* parArgv[], char** parActions, size_t parActionCount, int* parShouldQuit );
@ -88,8 +90,17 @@ int main (int parArgc, char* parArgv[]) {
selected_action = foreach_avail_action(&same_action, actions, actions_count, specified_action);
if (actions_count == selected_action) {
fprintf(stderr, "Unrecognized action \"%s\" - available actions are:\n", specified_action);
foreach_avail_action(&printf_stream, actions, actions_count, stderr);
//Find a possible misspelling and show a hint to the user, if any
selected_action = foreach_avail_action(&find_similar, actions, actions_count, specified_action);
if (selected_action < actions_count) {
fprintf(stderr, "Unrecognized action \"%s\" - maybe you meant \"%s\"?\n",
specified_action,
get_actionname(actions[selected_action])
);
}
else {
fprintf(stderr, "Unrecognized action \"%s\"\n", specified_action);
}
free_actions(actions, actions_count);
return 2;
}
@ -177,6 +188,14 @@ static int same_action (const char* parAction1, const void* parAction2) {
}
}
/**
 * Callback telling whether an available action name is close enough to
 * what the user typed to be offered as a "maybe you meant" suggestion.
 *
 * @param parAction    name of the action being examined
 * @param parUserInput the action typed by the user, as a const char*
 * @return 1 when the Damerau-Levenshtein distance (all edit costs set to 1)
 *         is at most MaxSuggestDistance, 0 otherwise.
 *
 * damerau_levenshtein() reports failure with a negative value; that must
 * not be mistaken for a very small distance, so it is treated as "not
 * similar".
 */
static int find_similar (const char* parAction, const void* parUserInput) {
	/* largest number of edits still considered a likely typo */
	enum { MaxSuggestDistance = 2 };
	const int distance = damerau_levenshtein((const char*)parUserInput, parAction, 1, 1, 1, 1);

	if (distance < 0)
		return 0;
	return (distance <= MaxSuggestDistance ? 1 : 0);
}
static void print_usage() {
printf("--help, -h - show this help\n");
printf("--builtin, -b - show build info\n");

25
src/main/pbl_wrapper.h Normal file
View file

@ -0,0 +1,25 @@
/* Copyright 2015, 2016, Michele Santullo
* This file is part of "dindexer".
*
* "dindexer" is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* "dindexer" is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with "dindexer". If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef idDB955D222C4A4AF5B9BD1A4A0BBDD9E3
#define idDB955D222C4A4AF5B9BD1A4A0BBDD9E3
#include <stdio.h>
#include <memory.h>
#include "pbl.h"
#endif