0
0
Fork 0

finished initial impl of fuzzy search

You can do a manual fuzzy term search using the FuzzyQuery struct
or, more suitable for most users, use the MatchQuery, which now
supports some fuzzy options. There you can specify fuzziness and
prefix_length to turn the underlying term search into a fuzzy
term search. This has the benefit that analysis is performed
on your input, just as it was on the analyzed field, prior to
computing the fuzzy variants.

closes #82
This commit is contained in:
Marty Schoch 2014-10-24 13:39:48 -04:00
parent 78467c0836
commit 3a0263bb72
10 changed files with 919 additions and 508 deletions

View File

@ -35,8 +35,9 @@ func ParseQuery(input []byte) (Query, error) {
if err != nil {
return nil, err
}
_, isMatchQuery := tmp["match"]
_, hasFuzziness := tmp["fuzziness"]
if hasFuzziness {
if hasFuzziness && !isMatchQuery {
var rv fuzzyQuery
err := json.Unmarshal(input, &rv)
if err != nil {
@ -59,7 +60,6 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, isMatchQuery := tmp["match"]
if isMatchQuery {
var rv matchQuery
err := json.Unmarshal(input, &rv)

View File

@ -16,11 +16,11 @@ import (
)
type fuzzyQuery struct {
Term string `json:"term"`
Prefix int `json:"prefix_length"`
Fuzziness int `json:"fuzziness"`
FieldVal string `json:"field,omitempty"`
BoostVal float64 `json:"boost,omitempty"`
Term string `json:"term"`
PrefixVal int `json:"prefix_length"`
FuzzinessVal int `json:"fuzziness"`
FieldVal string `json:"field,omitempty"`
BoostVal float64 `json:"boost,omitempty"`
}
// NewPrefixQuery creates a new Query which finds
@ -28,10 +28,10 @@ type fuzzyQuery struct {
// specified prefix.
func NewFuzzyQuery(term string) *fuzzyQuery {
return &fuzzyQuery{
Term: term,
Prefix: 0,
Fuzziness: 1,
BoostVal: 1.0,
Term: term,
PrefixVal: 0,
FuzzinessVal: 1,
BoostVal: 1.0,
}
}
@ -53,12 +53,30 @@ func (q *fuzzyQuery) SetField(f string) Query {
return q
}
// Fuzziness returns the fuzziness currently set on the query
// (FuzzinessVal), the value Searcher hands to the fuzzy searcher.
func (q *fuzzyQuery) Fuzziness() int {
return q.FuzzinessVal
}
// SetFuzziness stores f as the query's fuzziness (FuzzinessVal) and
// returns the same query so that setter calls can be chained.
func (q *fuzzyQuery) SetFuzziness(f int) Query {
q.FuzzinessVal = f
return q
}
// Prefix returns the prefix length currently set on the query
// (PrefixVal, serialized as "prefix_length").
// NOTE(review): presumably the number of leading characters that must
// match exactly; confirm against NewFuzzySearcher.
func (q *fuzzyQuery) Prefix() int {
return q.PrefixVal
}
// SetPrefix stores p as the query's prefix length (PrefixVal) and
// returns the same query so that setter calls can be chained.
func (q *fuzzyQuery) SetPrefix(p int) Query {
q.PrefixVal = p
return q
}
func (q *fuzzyQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
field := q.FieldVal
if q.FieldVal == "" {
field = m.DefaultField
}
return searchers.NewFuzzySearcher(i, q.Term, q.Prefix, q.Fuzziness, field, q.BoostVal, explain)
return searchers.NewFuzzySearcher(i, q.Term, q.PrefixVal, q.FuzzinessVal, field, q.BoostVal, explain)
}
func (q *fuzzyQuery) Validate() error {

View File

@ -11,16 +11,19 @@ package bleve
import (
"fmt"
"log"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
)
type matchQuery struct {
Match string `json:"match"`
FieldVal string `json:"field,omitempty"`
Analyzer string `json:"analyzer,omitempty"`
BoostVal float64 `json:"boost,omitempty"`
Match string `json:"match"`
FieldVal string `json:"field,omitempty"`
Analyzer string `json:"analyzer,omitempty"`
BoostVal float64 `json:"boost,omitempty"`
PrefixVal int `json:"prefix_length"`
FuzzinessVal int `json:"fuzziness"`
}
// NewMatchQuery creates a Query for matching text.
@ -54,6 +57,24 @@ func (q *matchQuery) SetField(f string) Query {
return q
}
// Fuzziness returns the fuzziness currently set on the query
// (FuzzinessVal). Searcher builds fuzzy term queries instead of exact
// term queries when this value is non-zero.
func (q *matchQuery) Fuzziness() int {
return q.FuzzinessVal
}
// SetFuzziness stores f as the query's fuzziness (FuzzinessVal) and
// returns the same query so that setter calls can be chained.
// A non-zero value switches Searcher to fuzzy term queries.
func (q *matchQuery) SetFuzziness(f int) Query {
q.FuzzinessVal = f
return q
}
// Prefix returns the prefix length currently set on the query
// (PrefixVal, serialized as "prefix_length"). Searcher forwards it to
// each fuzzy term query it builds when fuzziness is non-zero.
func (q *matchQuery) Prefix() int {
return q.PrefixVal
}
// SetPrefix stores p as the query's prefix length (PrefixVal) and
// returns the same query so that setter calls can be chained.
func (q *matchQuery) SetPrefix(p int) Query {
q.PrefixVal = p
return q
}
func (q *matchQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
field := q.FieldVal
@ -77,10 +98,22 @@ func (q *matchQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool
if len(tokens) > 0 {
tqs := make([]Query, len(tokens))
for i, token := range tokens {
tqs[i] = NewTermQuery(string(token.Term)).
SetField(field).
SetBoost(q.BoostVal)
if q.FuzzinessVal != 0 {
log.Printf("fuzziness is %d", q.FuzzinessVal)
for i, token := range tokens {
query := NewFuzzyQuery(string(token.Term))
query.SetFuzziness(q.FuzzinessVal)
query.SetPrefix(q.PrefixVal)
query.SetField(field)
query.SetBoost(q.BoostVal)
tqs[i] = query
}
} else {
for i, token := range tokens {
tqs[i] = NewTermQuery(string(token.Term)).
SetField(field).
SetBoost(q.BoostVal)
}
}
shouldQuery := NewDisjunctionQueryMin(tqs, 1).

View File

@ -12,6 +12,13 @@
/>/ { logDebugTokens("GREATER"); return tGREATER }
/</ { logDebugTokens("LESS"); return tLESS }
/=/ { logDebugTokens("EQUAL"); return tEQUAL }
/~([0-9]|[1-9][0-9]*)/
{
lval.s = yylex.Text()[1:]
logDebugTokens("TILDENUMBER - %s", lval.s);
return tTILDENUMBER
}
/~/ { logDebugTokens("TILDE"); return tTILDE }
/-?([0-9]|[1-9][0-9]*)(\.[0-9][0-9]*)?/
{
lval.s = yylex.Text()
@ -19,7 +26,7 @@
return tNUMBER
}
/[ \t\n]+/ { logDebugTokens("WHITESPACE (count=%d)", len(yylex.Text())) /* eat up whitespace */ }
/[^\t\n\f\r :^\+\-><=][^\t\n\f\r :^]*/ {
/[^\t\n\f\r :^\+\-><=~][^\t\n\f\r :^~]*/ {
lval.s = yylex.Text()
logDebugTokens("STRING - %s", lval.s);
return tSTRING

File diff suppressed because it is too large Load Diff

View File

@ -16,11 +16,13 @@ n int
f float64
q Query}
%token tSTRING tPHRASE tPLUS tMINUS tCOLON tBOOST tLPAREN tRPAREN tNUMBER tSTRING tGREATER tLESS tEQUAL
%token tSTRING tPHRASE tPLUS tMINUS tCOLON tBOOST tLPAREN tRPAREN tNUMBER tSTRING tGREATER tLESS
tEQUAL tTILDE tTILDENUMBER
%type <s> tSTRING
%type <s> tPHRASE
%type <s> tNUMBER
%type <s> tTILDENUMBER
%type <q> searchBase
%type <f> searchSuffix
%type <n> searchPrefix
@ -87,6 +89,23 @@ tSTRING {
$$ = q
}
|
tSTRING tTILDE {
str := $1
logDebugGrammar("STRING - %s", str)
q := NewMatchQuery(str)
q.SetFuzziness(1)
$$ = q
}
|
tSTRING tTILDENUMBER {
str := $1
fuzziness, _ := strconv.ParseFloat($2, 64)
logDebugGrammar("STRING - %s", str)
q := NewMatchQuery(str)
q.SetFuzziness(int(fuzziness))
$$ = q
}
|
tNUMBER {
str := $1
logDebugGrammar("STRING - %s", str)
@ -161,7 +180,6 @@ tSTRING tCOLON tLESS tEQUAL tNUMBER {
$$ = q
};
searchBoost:
tBOOST tNUMBER {
boost, _ := strconv.ParseFloat($2, 64)

View File

@ -33,6 +33,8 @@ const tNUMBER = 57354
const tGREATER = 57355
const tLESS = 57356
const tEQUAL = 57357
const tTILDE = 57358
const tTILDENUMBER = 57359
var yyToknames = []string{
"tSTRING",
@ -47,6 +49,8 @@ var yyToknames = []string{
"tGREATER",
"tLESS",
"tEQUAL",
"tTILDE",
"tTILDENUMBER",
}
var yyStatenames = []string{}
@ -64,53 +68,56 @@ var yyExca = []int{
-2, 5,
}
const yyNprod = 22
const yyNprod = 24
const yyPrivate = 57344
var yyTokenNames []string
var yyStates []string
const yyLast = 28
const yyLast = 30
var yyAct = []int{
18, 20, 25, 23, 28, 26, 24, 27, 19, 21,
22, 10, 12, 17, 15, 3, 16, 6, 7, 11,
2, 1, 14, 5, 8, 4, 13, 9,
18, 27, 20, 22, 28, 30, 10, 12, 16, 17,
21, 23, 24, 25, 11, 29, 26, 19, 15, 6,
7, 2, 3, 1, 14, 8, 5, 4, 13, 9,
}
var yyPact = []int{
11, -1000, -1000, 11, 7, -1000, -1000, -1000, -1000, 5,
8, -1000, -1000, -1000, -1000, 1, -4, -1000, -1000, -1000,
-1000, -9, -10, -1000, -5, -1000, -8, -1000, -1000,
13, -1000, -1000, 13, 2, -1000, -1000, -1000, -1000, 9,
-8, -1000, -1000, -1000, -1000, 5, -1000, -1000, -2, -1000,
-1000, -1000, -1000, 1, -11, -1000, 3, -1000, -7, -1000,
-1000,
}
var yyPgo = []int{
0, 27, 26, 25, 23, 22, 21, 20, 15,
0, 29, 28, 27, 26, 24, 23, 21, 22,
}
var yyR1 = []int{
0, 6, 7, 7, 8, 3, 3, 4, 4, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 5,
2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 5, 2, 2,
}
var yyR2 = []int{
0, 1, 2, 1, 3, 0, 1, 1, 1, 1,
1, 1, 3, 3, 3, 4, 5, 4, 5, 2,
0, 1,
2, 2, 1, 1, 3, 3, 3, 4, 5, 4,
5, 2, 0, 1,
}
var yyChk = []int{
-1000, -6, -7, -8, -3, -4, 6, 7, -7, -1,
4, 12, 5, -2, -5, 9, 8, 12, 4, 12,
5, 13, 14, 12, 15, 12, 15, 12, 12,
4, 12, 5, -2, -5, 9, 16, 17, 8, 12,
4, 12, 5, 13, 14, 12, 15, 12, 15, 12,
12,
}
var yyDef = []int{
5, -2, 1, -2, 0, 6, 7, 8, 2, 20,
9, 10, 11, 4, 21, 0, 0, 19, 12, 13,
14, 0, 0, 15, 0, 17, 0, 16, 18,
5, -2, 1, -2, 0, 6, 7, 8, 2, 22,
9, 12, 13, 4, 23, 0, 10, 11, 0, 21,
14, 15, 16, 0, 0, 17, 0, 19, 0, 18,
20,
}
var yyTok1 = []int{
@ -119,7 +126,7 @@ var yyTok1 = []int{
var yyTok2 = []int{
2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15,
12, 13, 14, 15, 16, 17,
}
var yyTok3 = []int{
0,
@ -351,22 +358,22 @@ yydefault:
switch yynt {
case 1:
//line query_string.y:33
//line query_string.y:35
{
logDebugGrammar("INPUT")
}
case 2:
//line query_string.y:38
//line query_string.y:40
{
logDebugGrammar("SEARCH PARTS")
}
case 3:
//line query_string.y:42
//line query_string.y:44
{
logDebugGrammar("SEARCH PART")
}
case 4:
//line query_string.y:47
//line query_string.y:49
{
query := yyS[yypt-1].q
query.SetBoost(yyS[yypt-0].f)
@ -380,29 +387,29 @@ yydefault:
}
}
case 5:
//line query_string.y:62
//line query_string.y:64
{
yyVAL.n = queryShould
}
case 6:
//line query_string.y:66
//line query_string.y:68
{
yyVAL.n = yyS[yypt-0].n
}
case 7:
//line query_string.y:72
//line query_string.y:74
{
logDebugGrammar("PLUS")
yyVAL.n = queryMust
}
case 8:
//line query_string.y:77
//line query_string.y:79
{
logDebugGrammar("MINUS")
yyVAL.n = queryMustNot
}
case 9:
//line query_string.y:83
//line query_string.y:85
{
str := yyS[yypt-0].s
logDebugGrammar("STRING - %s", str)
@ -410,41 +417,60 @@ yydefault:
yyVAL.q = q
}
case 10:
//line query_string.y:90
//line query_string.y:92
{
str := yyS[yypt-1].s
logDebugGrammar("STRING - %s", str)
q := NewMatchQuery(str)
q.SetFuzziness(1)
yyVAL.q = q
}
case 11:
//line query_string.y:100
{
str := yyS[yypt-1].s
fuzziness, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
logDebugGrammar("STRING - %s", str)
q := NewMatchQuery(str)
q.SetFuzziness(int(fuzziness))
yyVAL.q = q
}
case 12:
//line query_string.y:109
{
str := yyS[yypt-0].s
logDebugGrammar("STRING - %s", str)
q := NewMatchQuery(str)
yyVAL.q = q
}
case 11:
//line query_string.y:97
case 13:
//line query_string.y:116
{
phrase := yyS[yypt-0].s
logDebugGrammar("PHRASE - %s", phrase)
q := NewMatchPhraseQuery(phrase)
yyVAL.q = q
}
case 12:
//line query_string.y:104
{
field := yyS[yypt-2].s
str := yyS[yypt-0].s
logDebugGrammar("FIELD - %s STRING - %s", field, str)
q := NewMatchQuery(str).SetField(field)
yyVAL.q = q
}
case 13:
//line query_string.y:112
{
field := yyS[yypt-2].s
str := yyS[yypt-0].s
logDebugGrammar("FIELD - %s STRING - %s", field, str)
q := NewMatchQuery(str).SetField(field)
yyVAL.q = q
}
case 14:
//line query_string.y:120
//line query_string.y:123
{
field := yyS[yypt-2].s
str := yyS[yypt-0].s
logDebugGrammar("FIELD - %s STRING - %s", field, str)
q := NewMatchQuery(str).SetField(field)
yyVAL.q = q
}
case 15:
//line query_string.y:131
{
field := yyS[yypt-2].s
str := yyS[yypt-0].s
logDebugGrammar("FIELD - %s STRING - %s", field, str)
q := NewMatchQuery(str).SetField(field)
yyVAL.q = q
}
case 16:
//line query_string.y:139
{
field := yyS[yypt-2].s
phrase := yyS[yypt-0].s
@ -452,8 +478,8 @@ yydefault:
q := NewMatchPhraseQuery(phrase).SetField(field)
yyVAL.q = q
}
case 15:
//line query_string.y:128
case 17:
//line query_string.y:147
{
field := yyS[yypt-3].s
min, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
@ -462,8 +488,8 @@ yydefault:
q := NewNumericRangeInclusiveQuery(&min, nil, &minInclusive, nil).SetField(field)
yyVAL.q = q
}
case 16:
//line query_string.y:137
case 18:
//line query_string.y:156
{
field := yyS[yypt-4].s
min, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
@ -472,8 +498,8 @@ yydefault:
q := NewNumericRangeInclusiveQuery(&min, nil, &minInclusive, nil).SetField(field)
yyVAL.q = q
}
case 17:
//line query_string.y:146
case 19:
//line query_string.y:165
{
field := yyS[yypt-3].s
max, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
@ -482,8 +508,8 @@ yydefault:
q := NewNumericRangeInclusiveQuery(nil, &max, nil, &maxInclusive).SetField(field)
yyVAL.q = q
}
case 18:
//line query_string.y:155
case 20:
//line query_string.y:174
{
field := yyS[yypt-4].s
max, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
@ -492,20 +518,20 @@ yydefault:
q := NewNumericRangeInclusiveQuery(nil, &max, nil, &maxInclusive).SetField(field)
yyVAL.q = q
}
case 19:
//line query_string.y:166
case 21:
//line query_string.y:184
{
boost, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
yyVAL.f = boost
logDebugGrammar("BOOST %f", boost)
}
case 20:
//line query_string.y:173
case 22:
//line query_string.y:191
{
yyVAL.f = 1.0
}
case 21:
//line query_string.y:177
case 23:
//line query_string.y:195
{
}

View File

@ -165,6 +165,37 @@ func TestQuerySyntaxParserValid(t *testing.T) {
},
nil),
},
{
input: "watex~",
mapping: NewIndexMapping(),
result: NewBooleanQuery(
nil,
[]Query{
NewMatchQuery("watex").SetFuzziness(1),
},
nil),
},
{
input: "watex~2",
mapping: NewIndexMapping(),
result: NewBooleanQuery(
nil,
[]Query{
NewMatchQuery("watex").SetFuzziness(2),
},
nil),
},
{
input: "watex~ 2",
mapping: NewIndexMapping(),
result: NewBooleanQuery(
nil,
[]Query{
NewMatchQuery("watex").SetFuzziness(1),
NewMatchQuery("2"),
},
nil),
},
}
for _, test := range tests {

View File

@ -10,6 +10,8 @@
package searchers
import (
"math"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
)
@ -39,8 +41,8 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, prefix, fuzzin
candidateTerms := make([]string, 0)
tfd, err := fieldReader.Next()
for err == nil && tfd != nil {
ld := levenshteinDistance(&term, &tfd.Term)
if ld <= fuzziness {
ld, exceeded := levenshteinDistanceMax(&term, &tfd.Term, fuzziness)
if !exceeded && ld <= fuzziness {
candidateTerms = append(candidateTerms, tfd.Term)
}
tfd, err = fieldReader.Next()
@ -134,3 +136,56 @@ func levenshteinDistance(a, b *string) int {
}
return d[la]
}
// levenshteinDistanceMax computes the Levenshtein (edit) distance between
// *a and *b, comparing them byte by byte, and gives up early once it knows
// the distance will be greater than max.
//
// It returns (max, true) when it bails out: either the lengths differ by
// more than max, or every cell in a row of the distance table is already
// greater than max. Otherwise it returns the computed distance and false.
// Only the row minimum is checked, so a false result does not by itself
// guarantee that the distance is <= max; callers must still compare the
// returned distance against max.
func levenshteinDistanceMax(a, b *string, max int) (int, bool) {
	la := len(*a)
	lb := len(*b)

	// The distance can never be smaller than the difference in lengths.
	if ld := int(math.Abs(float64(la - lb))); ld > max {
		return max, true
	}

	// d holds a single row of the distance table and is reused for each row.
	d := make([]int, la+1)
	for j := 1; j <= la; j++ {
		d[j] = j
	}

	for i := 1; i <= lb; i++ {
		d[0] = i
		lastdiag := i - 1
		// Seed the row minimum with column 0. Seeding it with max+1 made an
		// empty *a (inner loop never runs) look like it exceeded max even
		// when len(*b) <= max.
		rowmin := d[0]
		for j := 1; j <= la; j++ {
			olddiag := d[j]

			// Cheapest of deletion, insertion and substitution/match.
			cost := d[j] + 1
			if ins := d[j-1] + 1; ins < cost {
				cost = ins
			}
			sub := lastdiag
			if (*a)[j-1] != (*b)[i-1] {
				sub++
			}
			if sub < cost {
				cost = sub
			}

			if cost < rowmin {
				rowmin = cost
			}
			d[j] = cost
			lastdiag = olddiag
		}
		// The minimum of a row is never lower than the minimum of the row
		// before it, so once a whole row is above max the result is too.
		if rowmin > max {
			return max, true
		}
	}
	return d[la], false
}

View File

@ -0,0 +1,114 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package searchers
import (
"testing"
)
// TestLevenshteinDistance checks levenshteinDistance against a small table
// of string pairs with known edit distances.
func TestLevenshteinDistance(t *testing.T) {
	type distanceCase struct {
		a, b string
		dist int
	}
	cases := []distanceCase{
		{a: "water", b: "atec", dist: 2},
		{a: "water", b: "aphex", dist: 4},
	}

	for i := range cases {
		tc := cases[i]
		if got := levenshteinDistance(&tc.a, &tc.b); got != tc.dist {
			t.Errorf("expected %d, got %d for %s and %s", tc.dist, got, tc.a, tc.b)
		}
	}
}
// TestLevenshteinDistanceMax checks both return values of
// levenshteinDistanceMax: the reported distance and whether max was
// exceeded.
func TestLevenshteinDistanceMax(t *testing.T) {
	type maxCase struct {
		a, b     string
		max      int
		dist     int
		exceeded bool
	}
	cases := []maxCase{
		{a: "water", b: "atec", max: 1, dist: 1, exceeded: true},
		{a: "water", b: "christmas", max: 3, dist: 3, exceeded: true},
		{a: "water", b: "water", max: 1, dist: 0, exceeded: false},
	}

	for i := range cases {
		tc := cases[i]
		got, gotExceeded := levenshteinDistanceMax(&tc.a, &tc.b, tc.max)
		if got == tc.dist && gotExceeded == tc.exceeded {
			continue
		}
		t.Errorf("expected %d %t, got %d %t for %s and %s", tc.dist, tc.exceeded, got, gotExceeded, tc.a, tc.b)
	}
}
// benchmarkTerms is the candidate list used by the Levenshtein benchmarks,
// which compare every entry against the term "water".
// The first 5 entries are intended to be close to "water" (edit distance
// around the benchmark's max of 2 or less); the last 5 are intended to be
// farther away than that.
// NOTE(review): the original note said the first group is "less than 2",
// but "aters" looks like distance 2 from "water"; confirm the grouping.
var benchmarkTerms = []string{
"watex",
"aters",
"wayer",
"wbter",
"yater",
"christmas",
"waterwaterwater",
"watcatdogfish",
"q",
"couchbase",
}
// BenchmarkLevenshteinDistance times levenshteinDistance comparing "water"
// against every entry of benchmarkTerms on each iteration.
func BenchmarkLevenshteinDistance(b *testing.B) {
	term := "water"
	for n := 0; n < b.N; n++ {
		for _, candidate := range benchmarkTerms {
			levenshteinDistance(&term, &candidate)
		}
	}
}
// BenchmarkLevenshteinDistanceMax times levenshteinDistanceMax, with a max
// of 2, comparing "water" against every entry of benchmarkTerms on each
// iteration.
func BenchmarkLevenshteinDistanceMax(b *testing.B) {
	term := "water"
	for n := 0; n < b.N; n++ {
		for _, candidate := range benchmarkTerms {
			levenshteinDistanceMax(&term, &candidate, 2)
		}
	}
}