/** * Spell checker * * Does not have any dependencies on the rest of DMD. * * Copyright: Copyright (C) 1999-2022 by The D Language Foundation, All Rights Reserved * Authors: Walter Bright, https://www.digitalmars.com * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/speller.d, root/_speller.d) * Documentation: https://dlang.org/phobos/dmd_root_speller.html * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/speller.d */ module dmd.root.speller; /************************************************** * Looks for correct spelling. * Looks a distance of up to two. * This does an exhaustive search, so can potentially be very slow. * Params: * seed = wrongly spelled word * dg = search delegate of the form `T delegate(const(char)[] p, out int cost)` * Returns: * T.init = no correct spellings found, * otherwise the value returned by dg() for first possible correct spelling */ auto speller(alias dg)(const(char)[] seed) if (isSearchFunction!dg) { const size_t maxdist = seed.length < 4 ? seed.length / 2 : 2; foreach (distance; 0 .. maxdist) { if (auto p = spellerX!dg(seed, distance != 0)) return p; // if (seedlen > 10) // break; } return null; // didn't find it } private: import core.stdc.stdlib; import core.stdc.string; import dmd.common.string : SmallBuffer; enum isSearchFunction(alias fun) = is(searchFunctionType!fun); alias searchFunctionType(alias fun) = typeof(() {int x; return fun("", x);}()); /************************************* * Spell check level 1. * Params: * dg = delegate that looks up string in dictionary AA and returns value found * seed = starting string * flag = if true, do 2 level lookup otherwise 1 level * Returns: * whatever dg returns, null if no match */ auto spellerX(alias dg)(const(char)[] seed, bool flag) { if (!seed.length) return null; /* Need buffer to store trial strings in */ char[30] tmp = void; auto sb = SmallBuffer!char(seed.length + 1, tmp[]); char[] buf = sb[]; int cost = int.max; searchFunctionType!dg p = null; /* Deletions */ buf[0 .. seed.length - 1] = seed[1 .. $]; foreach (i; 0 .. seed.length) { //printf("del buf = '%s'\n", buf); int ncost; auto np = flag ? spellerY!dg(buf[0 .. seed.length - 1], i, ncost) : dg(buf[0 .. seed.length - 1], ncost); if (combineSpellerResult(p, cost, np, ncost)) return p; buf[i] = seed[i]; } /* Transpositions */ if (!flag) { buf[0 .. seed.length] = seed; for (size_t i = 0; i + 1 < seed.length; i++) { // swap [i] and [i + 1] buf[i] = seed[i + 1]; buf[i + 1] = seed[i]; //printf("tra buf = '%s'\n", buf); int ncost; auto np = dg(buf[0 .. seed.length], ncost); if (combineSpellerResult(p, cost, np, ncost)) return p; buf[i] = seed[i]; } } /* Substitutions */ buf[0 .. seed.length] = seed; foreach (i; 0 .. seed.length) { foreach (s; idchars) { buf[i] = s; //printf("sub buf = '%s'\n", buf); int ncost; auto np = flag ? spellerY!dg(buf[0 .. seed.length], i + 1, ncost) : dg(buf[0 .. seed.length], ncost); if (combineSpellerResult(p, cost, np, ncost)) return p; } buf[i] = seed[i]; } /* Insertions */ buf[1 .. seed.length + 1] = seed; foreach (i; 0 .. seed.length + 1) // yes, do seed.length+1 iterations { foreach (s; idchars) { buf[i] = s; //printf("ins buf = '%s'\n", buf); int ncost; auto np = flag ? spellerY!dg(buf[0 .. seed.length + 1], i + 1, ncost) : dg(buf[0 .. seed.length + 1], ncost); if (combineSpellerResult(p, cost, np, ncost)) return p; } if (i < seed.length) buf[i] = seed[i]; } return p; // return "best" result } /********************************************** * Do second level of spell matching. * Params: * dg = delegate that looks up string in dictionary AA and returns value found * seed = starting string * index = index into seed[] that is where we will mutate it * cost = set to cost of match * Returns: * whatever dg returns, null if no match */ auto spellerY(alias dg)(const(char)[] seed, size_t index, out int cost) { if (!seed.length) return null; /* Allocate a buf to store the new string to play with, needs * space for an extra char for insertions */ char[30] tmp = void; // stack allocations are fastest auto sb = SmallBuffer!char(seed.length + 1, tmp[]); char[] buf = sb[]; buf[0 .. index] = seed[0 .. index]; cost = int.max; // start with worst possible match searchFunctionType!dg p = null; /* Delete character at seed[index] */ if (index < seed.length) { buf[index .. seed.length - 1] = seed[index + 1 .. $]; // seed[] with deleted character int ncost; auto np = dg(buf[0 .. seed.length - 1], ncost); // look it up if (combineSpellerResult(p, cost, np, ncost)) // compare with prev match return p; // cannot get any better } /* Substitute character at seed[index] */ if (index < seed.length) { buf[0 .. seed.length] = seed; foreach (s; idchars) { buf[index] = s; // seed[] with substituted character //printf("sub buf = '%s'\n", buf); int ncost; auto np = dg(buf[0 .. seed.length], ncost); if (combineSpellerResult(p, cost, np, ncost)) return p; } } /* Insert character at seed[index] */ buf[index + 1 .. seed.length + 1] = seed[index .. $]; foreach (s; idchars) { buf[index] = s; //printf("ins buf = '%s'\n", buf); int ncost; auto np = dg(buf[0 .. seed.length + 1], ncost); if (combineSpellerResult(p, cost, np, ncost)) return p; } return p; // return "best" result } /* Characters used to substitute ones in the string we're checking * the spelling on. */ immutable string idchars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; /************************************************** * Combine a new result from the spell checker to * find the one with the closest symbol with * respect to the cost defined by the search function * Params: * p = best found spelling so far, T.init if none found yet. * If np is better, p is replaced with np * cost = cost of p (int.max if none found yet). * If np is better, cost is replaced with ncost * np = current spelling to check against p, T.init if none * ncost = cost of np if np is not T.init * Returns: * true if the cost is less or equal 0, meaning we can stop looking * false otherwise */ bool combineSpellerResult(T)(ref T p, ref int cost, T np, int ncost) { if (np && ncost < cost) // if np is better { p = np; // np is new best match cost = ncost; if (cost <= 0) return true; // meaning we can stop looking } return false; } /************************************* Tests ***********************/ unittest { static immutable string[][] cases = [ ["hello", "hell", "y"], ["hello", "hel", "y"], ["hello", "ello", "y"], ["hello", "llo", "y"], ["hello", "hellox", "y"], ["hello", "helloxy", "y"], ["hello", "xhello", "y"], ["hello", "xyhello", "y"], ["hello", "ehllo", "y"], ["hello", "helol", "y"], ["hello", "abcd", "n"], ["hello", "helxxlo", "y"], ["hello", "ehlxxlo", "n"], ["hello", "heaao", "y"], ["_123456789_123456789_123456789_123456789", "_123456789_123456789_123456789_12345678", "y"], ]; //printf("unittest_speller()\n"); string dgarg; string speller_test(const(char)[] s, ref int cost) { assert(s[$-1] != '\0'); //printf("speller_test(%s, %s)\n", dgarg, s); cost = 0; if (dgarg == s) return dgarg; return null; } dgarg = "hell"; auto p = speller!speller_test("hello"); assert(p !is null); foreach (testCase; cases) { //printf("case [%d]\n", i); dgarg = testCase[1]; auto p2 = speller!speller_test(testCase[0]); if (p2) assert(testCase[2][0] == 'y'); else assert(testCase[2][0] == 'n'); } //printf("unittest_speller() success\n"); }