// LevenshteinDistance returns the Levenshtein edit distance between the
// strings pointed to by a and b, i.e. the minimum number of single-element
// insertions, deletions and substitutions that turn one into the other.
//
// The strings are compared byte by byte (not rune by rune), so a difference
// in a multi-byte UTF-8 character may count as more than one edit.
// Both a and b are dereferenced unconditionally and must not be nil.
func LevenshteinDistance(a, b *string) int {
	la := len(*a)
	lb := len(*b)

	// d is a single row of the edit-distance matrix, indexed by prefix
	// length of *a. It starts as row 0: the distance from the empty prefix
	// of *b to each prefix of *a.
	d := make([]int, la+1)
	for j := 1; j <= la; j++ {
		d[j] = j
	}

	for i := 1; i <= lb; i++ {
		d[0] = i
		// lastdiag is the value of the cell diagonally up-left of d[j],
		// which the in-place row update would otherwise overwrite.
		lastdiag := i - 1
		for j := 1; j <= la; j++ {
			olddiag := d[j]

			// Candidate from the cell above (previous row, same column).
			best := d[j] + 1
			// Candidate from the cell to the left (same row).
			if left := d[j-1] + 1; left < best {
				best = left
			}
			// Candidate from the diagonal: free when the bytes match.
			subst := lastdiag
			if (*a)[j-1] != (*b)[i-1] {
				subst++
			}
			if subst < best {
				best = subst
			}

			d[j] = best
			lastdiag = olddiag
		}
	}
	return d[la]
}

// LevenshteinDistanceMax is the same as LevenshteinDistance but attempts to
// bail out early once it knows the distance will be greater than max. In
// that case the first return value is max and the second is true,
// indicating that max was exceeded.
//
// The early exit is best effort: when the second return value is false the
// first is the exact distance, which is not checked against max here, so
// callers should still compare it with max themselves.
//
// As with LevenshteinDistance, the comparison is byte-wise and both a and b
// must be non-nil.
func LevenshteinDistanceMax(a, b *string, max int) (int, bool) {
	la := len(*a)
	lb := len(*b)

	// The distance can never be smaller than the difference in length.
	if ld := int(math.Abs(float64(la - lb))); ld > max {
		return max, true
	}

	// d is a single row of the edit-distance matrix, indexed by prefix
	// length of *a, initialised to row 0.
	d := make([]int, la+1)
	for j := 1; j <= la; j++ {
		d[j] = j
	}

	for i := 1; i <= lb; i++ {
		d[0] = i
		lastdiag := i - 1
		// Seed the row minimum with column 0 so that the row is judged
		// correctly even when *a is empty and the column loop below never
		// runs. Previously an empty *a always reported max as exceeded.
		rowmin := d[0]
		for j := 1; j <= la; j++ {
			olddiag := d[j]

			// Candidate from the cell above (previous row, same column).
			best := d[j] + 1
			// Candidate from the cell to the left (same row).
			if left := d[j-1] + 1; left < best {
				best = left
			}
			// Candidate from the diagonal: free when the bytes match.
			subst := lastdiag
			if (*a)[j-1] != (*b)[i-1] {
				subst++
			}
			if subst < best {
				best = subst
			}

			if best < rowmin {
				rowmin = best
			}
			d[j] = best
			lastdiag = olddiag
		}
		// Every later cell is derived from this row and cannot be smaller
		// than its minimum, so once the whole row exceeds max we can stop.
		if rowmin > max {
			return max, true
		}
	}
	return d[la], false
}
search/levenshtein_test.go index a9bc0269..7547b956 100644 --- a/search/searchers/search_fuzzy_test.go +++ b/search/levenshtein_test.go @@ -7,7 +7,7 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package searchers +package search import ( "testing" @@ -33,7 +33,7 @@ func TestLevenshteinDistance(t *testing.T) { } for _, test := range tests { - actual := levenshteinDistance(&test.a, &test.b) + actual := LevenshteinDistance(&test.a, &test.b) if actual != test.dist { t.Errorf("expected %d, got %d for %s and %s", test.dist, actual, test.a, test.b) } @@ -73,7 +73,7 @@ func TestLevenshteinDistanceMax(t *testing.T) { } for _, test := range tests { - actual, exceeded := levenshteinDistanceMax(&test.a, &test.b, test.max) + actual, exceeded := LevenshteinDistanceMax(&test.a, &test.b, test.max) if actual != test.dist || exceeded != test.exceeded { t.Errorf("expected %d %t, got %d %t for %s and %s", test.dist, test.exceeded, actual, exceeded, test.a, test.b) } @@ -99,7 +99,7 @@ func BenchmarkLevenshteinDistance(b *testing.B) { a := "water" for i := 0; i < b.N; i++ { for _, t := range benchmarkTerms { - levenshteinDistance(&a, &t) + LevenshteinDistance(&a, &t) } } } @@ -108,7 +108,7 @@ func BenchmarkLevenshteinDistanceMax(b *testing.B) { a := "water" for i := 0; i < b.N; i++ { for _, t := range benchmarkTerms { - levenshteinDistanceMax(&a, &t, 2) + LevenshteinDistanceMax(&a, &t, 2) } } } diff --git a/search/searchers/search_fuzzy.go b/search/searchers/search_fuzzy.go index 8498ded7..cbf99a3d 100644 --- a/search/searchers/search_fuzzy.go +++ b/search/searchers/search_fuzzy.go @@ -10,8 +10,6 @@ package searchers import ( - "math" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -41,7 +39,7 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, prefix, fuzzin candidateTerms := make([]string, 0) tfd, err := fieldReader.Next() for err == nil && tfd 
!= nil { - ld, exceeded := levenshteinDistanceMax(&term, &tfd.Term, fuzziness) + ld, exceeded := search.LevenshteinDistanceMax(&term, &tfd.Term, fuzziness) if !exceeded && ld <= fuzziness { candidateTerms = append(candidateTerms, tfd.Term) } @@ -103,89 +101,3 @@ func (s *FuzzySearcher) Close() { func (s *FuzzySearcher) Min() int { return 0 } - -func levenshteinDistance(a, b *string) int { - la := len(*a) - lb := len(*b) - d := make([]int, la+1) - var lastdiag, olddiag, temp int - - for i := 1; i <= la; i++ { - d[i] = i - } - for i := 1; i <= lb; i++ { - d[0] = i - lastdiag = i - 1 - for j := 1; j <= la; j++ { - olddiag = d[j] - min := d[j] + 1 - if (d[j-1] + 1) < min { - min = d[j-1] + 1 - } - if (*a)[j-1] == (*b)[i-1] { - temp = 0 - } else { - temp = 1 - } - if (lastdiag + temp) < min { - min = lastdiag + temp - } - d[j] = min - lastdiag = olddiag - } - } - return d[la] -} - -// levenshteinDistanceMax same as levenshteinDistance but -// attempts to bail early once we know the distance -// will be greater than max -// in which case the first return val will be the max -// and the second will be true, indicating max was exceeded -func levenshteinDistanceMax(a, b *string, max int) (int, bool) { - la := len(*a) - lb := len(*b) - - ld := int(math.Abs(float64(la - lb))) - if ld > max { - return max, true - } - - d := make([]int, la+1) - var lastdiag, olddiag, temp int - - for i := 1; i <= la; i++ { - d[i] = i - } - for i := 1; i <= lb; i++ { - d[0] = i - lastdiag = i - 1 - rowmin := max + 1 - for j := 1; j <= la; j++ { - olddiag = d[j] - min := d[j] + 1 - if (d[j-1] + 1) < min { - min = d[j-1] + 1 - } - if (*a)[j-1] == (*b)[i-1] { - temp = 0 - } else { - temp = 1 - } - if (lastdiag + temp) < min { - min = lastdiag + temp - } - if min < rowmin { - rowmin = min - } - d[j] = min - - lastdiag = olddiag - } - // after each row if rowmin isnt less than max stop - if rowmin > max { - return max, true - } - } - return d[la], false -}