Finished initial implementation of fuzzy search
You can do a manual fuzzy term search using the FuzzyQuery struct or, more suitably for most users, use the MatchQuery, which now supports fuzzy options. There you can specify fuzziness and prefix_length to turn the underlying term search into a fuzzy term search. This has the benefit that analysis is performed on your input, just as it is for the analyzed field, before the fuzzy variants are computed. Closes #82
This commit is contained in:
parent
78467c0836
commit
3a0263bb72
4
query.go
4
query.go
|
@ -35,8 +35,9 @@ func ParseQuery(input []byte) (Query, error) {
|
|||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
_, isMatchQuery := tmp["match"]
|
||||
_, hasFuzziness := tmp["fuzziness"]
|
||||
if hasFuzziness {
|
||||
if hasFuzziness && !isMatchQuery {
|
||||
var rv fuzzyQuery
|
||||
err := json.Unmarshal(input, &rv)
|
||||
if err != nil {
|
||||
|
@ -59,7 +60,6 @@ func ParseQuery(input []byte) (Query, error) {
|
|||
}
|
||||
return &rv, nil
|
||||
}
|
||||
_, isMatchQuery := tmp["match"]
|
||||
if isMatchQuery {
|
||||
var rv matchQuery
|
||||
err := json.Unmarshal(input, &rv)
|
||||
|
|
|
@ -16,11 +16,11 @@ import (
|
|||
)
|
||||
|
||||
type fuzzyQuery struct {
|
||||
Term string `json:"term"`
|
||||
Prefix int `json:"prefix_length"`
|
||||
Fuzziness int `json:"fuzziness"`
|
||||
FieldVal string `json:"field,omitempty"`
|
||||
BoostVal float64 `json:"boost,omitempty"`
|
||||
Term string `json:"term"`
|
||||
PrefixVal int `json:"prefix_length"`
|
||||
FuzzinessVal int `json:"fuzziness"`
|
||||
FieldVal string `json:"field,omitempty"`
|
||||
BoostVal float64 `json:"boost,omitempty"`
|
||||
}
|
||||
|
||||
// NewPrefixQuery creates a new Query which finds
|
||||
|
@ -28,10 +28,10 @@ type fuzzyQuery struct {
|
|||
// specified prefix.
|
||||
func NewFuzzyQuery(term string) *fuzzyQuery {
|
||||
return &fuzzyQuery{
|
||||
Term: term,
|
||||
Prefix: 0,
|
||||
Fuzziness: 1,
|
||||
BoostVal: 1.0,
|
||||
Term: term,
|
||||
PrefixVal: 0,
|
||||
FuzzinessVal: 1,
|
||||
BoostVal: 1.0,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -53,12 +53,30 @@ func (q *fuzzyQuery) SetField(f string) Query {
|
|||
return q
|
||||
}
|
||||
|
||||
// Fuzziness returns the fuzziness configured on this query (FuzzinessVal).
// Searcher hands this value to the fuzzy searcher, where it is used as the
// upper limit on the edit distance of candidate terms.
func (q *fuzzyQuery) Fuzziness() int {
|
||||
return q.FuzzinessVal
|
||||
}
|
||||
|
||||
// SetFuzziness stores f as this query's fuzziness and returns the query
// itself so that setter calls can be chained.
func (q *fuzzyQuery) SetFuzziness(f int) Query {
|
||||
q.FuzzinessVal = f
|
||||
return q
|
||||
}
|
||||
|
||||
// Prefix returns the prefix length configured on this query (PrefixVal,
// serialized as "prefix_length").
// NOTE(review): presumably the number of leading characters that must match
// exactly; confirm against NewFuzzySearcher.
func (q *fuzzyQuery) Prefix() int {
|
||||
return q.PrefixVal
|
||||
}
|
||||
|
||||
// SetPrefix stores p as this query's prefix length and returns the query
// itself so that setter calls can be chained.
func (q *fuzzyQuery) SetPrefix(p int) Query {
|
||||
q.PrefixVal = p
|
||||
return q
|
||||
}
|
||||
|
||||
func (q *fuzzyQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
|
||||
field := q.FieldVal
|
||||
if q.FieldVal == "" {
|
||||
field = m.DefaultField
|
||||
}
|
||||
return searchers.NewFuzzySearcher(i, q.Term, q.Prefix, q.Fuzziness, field, q.BoostVal, explain)
|
||||
return searchers.NewFuzzySearcher(i, q.Term, q.PrefixVal, q.FuzzinessVal, field, q.BoostVal, explain)
|
||||
}
|
||||
|
||||
func (q *fuzzyQuery) Validate() error {
|
||||
|
|
|
@ -11,16 +11,19 @@ package bleve
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
)
|
||||
|
||||
type matchQuery struct {
|
||||
Match string `json:"match"`
|
||||
FieldVal string `json:"field,omitempty"`
|
||||
Analyzer string `json:"analyzer,omitempty"`
|
||||
BoostVal float64 `json:"boost,omitempty"`
|
||||
Match string `json:"match"`
|
||||
FieldVal string `json:"field,omitempty"`
|
||||
Analyzer string `json:"analyzer,omitempty"`
|
||||
BoostVal float64 `json:"boost,omitempty"`
|
||||
PrefixVal int `json:"prefix_length"`
|
||||
FuzzinessVal int `json:"fuzziness"`
|
||||
}
|
||||
|
||||
// NewMatchQuery creates a Query for matching text.
|
||||
|
@ -54,6 +57,24 @@ func (q *matchQuery) SetField(f string) Query {
|
|||
return q
|
||||
}
|
||||
|
||||
// Fuzziness returns the fuzziness configured on this match query
// (FuzzinessVal). When it is 0, Searcher builds exact term queries for the
// analyzed tokens; any other value makes it build fuzzy queries instead.
func (q *matchQuery) Fuzziness() int {
|
||||
return q.FuzzinessVal
|
||||
}
|
||||
|
||||
// SetFuzziness stores f as this match query's fuzziness and returns the
// query itself so that setter calls can be chained.
func (q *matchQuery) SetFuzziness(f int) Query {
|
||||
q.FuzzinessVal = f
|
||||
return q
|
||||
}
|
||||
|
||||
// Prefix returns the prefix length configured on this match query
// (PrefixVal, serialized as "prefix_length"). Searcher forwards it to each
// per-token fuzzy query when fuzziness is non-zero.
func (q *matchQuery) Prefix() int {
|
||||
return q.PrefixVal
|
||||
}
|
||||
|
||||
// SetPrefix stores p as this match query's prefix length and returns the
// query itself so that setter calls can be chained.
func (q *matchQuery) SetPrefix(p int) Query {
|
||||
q.PrefixVal = p
|
||||
return q
|
||||
}
|
||||
|
||||
func (q *matchQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
|
||||
|
||||
field := q.FieldVal
|
||||
|
@ -77,10 +98,22 @@ func (q *matchQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool
|
|||
if len(tokens) > 0 {
|
||||
|
||||
tqs := make([]Query, len(tokens))
|
||||
for i, token := range tokens {
|
||||
tqs[i] = NewTermQuery(string(token.Term)).
|
||||
SetField(field).
|
||||
SetBoost(q.BoostVal)
|
||||
if q.FuzzinessVal != 0 {
|
||||
log.Printf("fuzziness is %d", q.FuzzinessVal)
|
||||
for i, token := range tokens {
|
||||
query := NewFuzzyQuery(string(token.Term))
|
||||
query.SetFuzziness(q.FuzzinessVal)
|
||||
query.SetPrefix(q.PrefixVal)
|
||||
query.SetField(field)
|
||||
query.SetBoost(q.BoostVal)
|
||||
tqs[i] = query
|
||||
}
|
||||
} else {
|
||||
for i, token := range tokens {
|
||||
tqs[i] = NewTermQuery(string(token.Term)).
|
||||
SetField(field).
|
||||
SetBoost(q.BoostVal)
|
||||
}
|
||||
}
|
||||
|
||||
shouldQuery := NewDisjunctionQueryMin(tqs, 1).
|
||||
|
|
|
@ -12,6 +12,13 @@
|
|||
/>/ { logDebugTokens("GREATER"); return tGREATER }
|
||||
/</ { logDebugTokens("LESS"); return tLESS }
|
||||
/=/ { logDebugTokens("EQUAL"); return tEQUAL }
|
||||
/~([0-9]|[1-9][0-9]*)/
|
||||
{
|
||||
lval.s = yylex.Text()[1:]
|
||||
logDebugTokens("TILDENUMBER - %s", lval.s);
|
||||
return tTILDENUMBER
|
||||
}
|
||||
/~/ { logDebugTokens("TILDE"); return tTILDE }
|
||||
/-?([0-9]|[1-9][0-9]*)(\.[0-9][0-9]*)?/
|
||||
{
|
||||
lval.s = yylex.Text()
|
||||
|
@ -19,7 +26,7 @@
|
|||
return tNUMBER
|
||||
}
|
||||
/[ \t\n]+/ { logDebugTokens("WHITESPACE (count=%d)", len(yylex.Text())) /* eat up whitespace */ }
|
||||
/[^\t\n\f\r :^\+\-><=][^\t\n\f\r :^]*/ {
|
||||
/[^\t\n\f\r :^\+\-><=~][^\t\n\f\r :^~]*/ {
|
||||
lval.s = yylex.Text()
|
||||
logDebugTokens("STRING - %s", lval.s);
|
||||
return tSTRING
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -16,11 +16,13 @@ n int
|
|||
f float64
|
||||
q Query}
|
||||
|
||||
%token tSTRING tPHRASE tPLUS tMINUS tCOLON tBOOST tLPAREN tRPAREN tNUMBER tSTRING tGREATER tLESS tEQUAL
|
||||
%token tSTRING tPHRASE tPLUS tMINUS tCOLON tBOOST tLPAREN tRPAREN tNUMBER tSTRING tGREATER tLESS
|
||||
tEQUAL tTILDE tTILDENUMBER
|
||||
|
||||
%type <s> tSTRING
|
||||
%type <s> tPHRASE
|
||||
%type <s> tNUMBER
|
||||
%type <s> tTILDENUMBER
|
||||
%type <q> searchBase
|
||||
%type <f> searchSuffix
|
||||
%type <n> searchPrefix
|
||||
|
@ -87,6 +89,23 @@ tSTRING {
|
|||
$$ = q
|
||||
}
|
||||
|
|
||||
tSTRING tTILDE {
|
||||
str := $1
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(1)
|
||||
$$ = q
|
||||
}
|
||||
|
|
||||
tSTRING tTILDENUMBER {
|
||||
str := $1
|
||||
fuzziness, _ := strconv.ParseFloat($2, 64)
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(int(fuzziness))
|
||||
$$ = q
|
||||
}
|
||||
|
|
||||
tNUMBER {
|
||||
str := $1
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
|
@ -161,7 +180,6 @@ tSTRING tCOLON tLESS tEQUAL tNUMBER {
|
|||
$$ = q
|
||||
};
|
||||
|
||||
|
||||
searchBoost:
|
||||
tBOOST tNUMBER {
|
||||
boost, _ := strconv.ParseFloat($2, 64)
|
||||
|
|
|
@ -33,6 +33,8 @@ const tNUMBER = 57354
|
|||
const tGREATER = 57355
|
||||
const tLESS = 57356
|
||||
const tEQUAL = 57357
|
||||
const tTILDE = 57358
|
||||
const tTILDENUMBER = 57359
|
||||
|
||||
var yyToknames = []string{
|
||||
"tSTRING",
|
||||
|
@ -47,6 +49,8 @@ var yyToknames = []string{
|
|||
"tGREATER",
|
||||
"tLESS",
|
||||
"tEQUAL",
|
||||
"tTILDE",
|
||||
"tTILDENUMBER",
|
||||
}
|
||||
var yyStatenames = []string{}
|
||||
|
||||
|
@ -64,53 +68,56 @@ var yyExca = []int{
|
|||
-2, 5,
|
||||
}
|
||||
|
||||
const yyNprod = 22
|
||||
const yyNprod = 24
|
||||
const yyPrivate = 57344
|
||||
|
||||
var yyTokenNames []string
|
||||
var yyStates []string
|
||||
|
||||
const yyLast = 28
|
||||
const yyLast = 30
|
||||
|
||||
var yyAct = []int{
|
||||
|
||||
18, 20, 25, 23, 28, 26, 24, 27, 19, 21,
|
||||
22, 10, 12, 17, 15, 3, 16, 6, 7, 11,
|
||||
2, 1, 14, 5, 8, 4, 13, 9,
|
||||
18, 27, 20, 22, 28, 30, 10, 12, 16, 17,
|
||||
21, 23, 24, 25, 11, 29, 26, 19, 15, 6,
|
||||
7, 2, 3, 1, 14, 8, 5, 4, 13, 9,
|
||||
}
|
||||
var yyPact = []int{
|
||||
|
||||
11, -1000, -1000, 11, 7, -1000, -1000, -1000, -1000, 5,
|
||||
8, -1000, -1000, -1000, -1000, 1, -4, -1000, -1000, -1000,
|
||||
-1000, -9, -10, -1000, -5, -1000, -8, -1000, -1000,
|
||||
13, -1000, -1000, 13, 2, -1000, -1000, -1000, -1000, 9,
|
||||
-8, -1000, -1000, -1000, -1000, 5, -1000, -1000, -2, -1000,
|
||||
-1000, -1000, -1000, 1, -11, -1000, 3, -1000, -7, -1000,
|
||||
-1000,
|
||||
}
|
||||
var yyPgo = []int{
|
||||
|
||||
0, 27, 26, 25, 23, 22, 21, 20, 15,
|
||||
0, 29, 28, 27, 26, 24, 23, 21, 22,
|
||||
}
|
||||
var yyR1 = []int{
|
||||
|
||||
0, 6, 7, 7, 8, 3, 3, 4, 4, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 5,
|
||||
2, 2,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 5, 2, 2,
|
||||
}
|
||||
var yyR2 = []int{
|
||||
|
||||
0, 1, 2, 1, 3, 0, 1, 1, 1, 1,
|
||||
1, 1, 3, 3, 3, 4, 5, 4, 5, 2,
|
||||
0, 1,
|
||||
2, 2, 1, 1, 3, 3, 3, 4, 5, 4,
|
||||
5, 2, 0, 1,
|
||||
}
|
||||
var yyChk = []int{
|
||||
|
||||
-1000, -6, -7, -8, -3, -4, 6, 7, -7, -1,
|
||||
4, 12, 5, -2, -5, 9, 8, 12, 4, 12,
|
||||
5, 13, 14, 12, 15, 12, 15, 12, 12,
|
||||
4, 12, 5, -2, -5, 9, 16, 17, 8, 12,
|
||||
4, 12, 5, 13, 14, 12, 15, 12, 15, 12,
|
||||
12,
|
||||
}
|
||||
var yyDef = []int{
|
||||
|
||||
5, -2, 1, -2, 0, 6, 7, 8, 2, 20,
|
||||
9, 10, 11, 4, 21, 0, 0, 19, 12, 13,
|
||||
14, 0, 0, 15, 0, 17, 0, 16, 18,
|
||||
5, -2, 1, -2, 0, 6, 7, 8, 2, 22,
|
||||
9, 12, 13, 4, 23, 0, 10, 11, 0, 21,
|
||||
14, 15, 16, 0, 0, 17, 0, 19, 0, 18,
|
||||
20,
|
||||
}
|
||||
var yyTok1 = []int{
|
||||
|
||||
|
@ -119,7 +126,7 @@ var yyTok1 = []int{
|
|||
var yyTok2 = []int{
|
||||
|
||||
2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
||||
12, 13, 14, 15,
|
||||
12, 13, 14, 15, 16, 17,
|
||||
}
|
||||
var yyTok3 = []int{
|
||||
0,
|
||||
|
@ -351,22 +358,22 @@ yydefault:
|
|||
switch yynt {
|
||||
|
||||
case 1:
|
||||
//line query_string.y:33
|
||||
//line query_string.y:35
|
||||
{
|
||||
logDebugGrammar("INPUT")
|
||||
}
|
||||
case 2:
|
||||
//line query_string.y:38
|
||||
//line query_string.y:40
|
||||
{
|
||||
logDebugGrammar("SEARCH PARTS")
|
||||
}
|
||||
case 3:
|
||||
//line query_string.y:42
|
||||
//line query_string.y:44
|
||||
{
|
||||
logDebugGrammar("SEARCH PART")
|
||||
}
|
||||
case 4:
|
||||
//line query_string.y:47
|
||||
//line query_string.y:49
|
||||
{
|
||||
query := yyS[yypt-1].q
|
||||
query.SetBoost(yyS[yypt-0].f)
|
||||
|
@ -380,29 +387,29 @@ yydefault:
|
|||
}
|
||||
}
|
||||
case 5:
|
||||
//line query_string.y:62
|
||||
//line query_string.y:64
|
||||
{
|
||||
yyVAL.n = queryShould
|
||||
}
|
||||
case 6:
|
||||
//line query_string.y:66
|
||||
//line query_string.y:68
|
||||
{
|
||||
yyVAL.n = yyS[yypt-0].n
|
||||
}
|
||||
case 7:
|
||||
//line query_string.y:72
|
||||
//line query_string.y:74
|
||||
{
|
||||
logDebugGrammar("PLUS")
|
||||
yyVAL.n = queryMust
|
||||
}
|
||||
case 8:
|
||||
//line query_string.y:77
|
||||
//line query_string.y:79
|
||||
{
|
||||
logDebugGrammar("MINUS")
|
||||
yyVAL.n = queryMustNot
|
||||
}
|
||||
case 9:
|
||||
//line query_string.y:83
|
||||
//line query_string.y:85
|
||||
{
|
||||
str := yyS[yypt-0].s
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
|
@ -410,41 +417,60 @@ yydefault:
|
|||
yyVAL.q = q
|
||||
}
|
||||
case 10:
|
||||
//line query_string.y:90
|
||||
//line query_string.y:92
|
||||
{
|
||||
str := yyS[yypt-1].s
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(1)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 11:
|
||||
//line query_string.y:100
|
||||
{
|
||||
str := yyS[yypt-1].s
|
||||
fuzziness, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(int(fuzziness))
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 12:
|
||||
//line query_string.y:109
|
||||
{
|
||||
str := yyS[yypt-0].s
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
q := NewMatchQuery(str)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 11:
|
||||
//line query_string.y:97
|
||||
case 13:
|
||||
//line query_string.y:116
|
||||
{
|
||||
phrase := yyS[yypt-0].s
|
||||
logDebugGrammar("PHRASE - %s", phrase)
|
||||
q := NewMatchPhraseQuery(phrase)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 12:
|
||||
//line query_string.y:104
|
||||
{
|
||||
field := yyS[yypt-2].s
|
||||
str := yyS[yypt-0].s
|
||||
logDebugGrammar("FIELD - %s STRING - %s", field, str)
|
||||
q := NewMatchQuery(str).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 13:
|
||||
//line query_string.y:112
|
||||
{
|
||||
field := yyS[yypt-2].s
|
||||
str := yyS[yypt-0].s
|
||||
logDebugGrammar("FIELD - %s STRING - %s", field, str)
|
||||
q := NewMatchQuery(str).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 14:
|
||||
//line query_string.y:120
|
||||
//line query_string.y:123
|
||||
{
|
||||
field := yyS[yypt-2].s
|
||||
str := yyS[yypt-0].s
|
||||
logDebugGrammar("FIELD - %s STRING - %s", field, str)
|
||||
q := NewMatchQuery(str).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 15:
|
||||
//line query_string.y:131
|
||||
{
|
||||
field := yyS[yypt-2].s
|
||||
str := yyS[yypt-0].s
|
||||
logDebugGrammar("FIELD - %s STRING - %s", field, str)
|
||||
q := NewMatchQuery(str).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 16:
|
||||
//line query_string.y:139
|
||||
{
|
||||
field := yyS[yypt-2].s
|
||||
phrase := yyS[yypt-0].s
|
||||
|
@ -452,8 +478,8 @@ yydefault:
|
|||
q := NewMatchPhraseQuery(phrase).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 15:
|
||||
//line query_string.y:128
|
||||
case 17:
|
||||
//line query_string.y:147
|
||||
{
|
||||
field := yyS[yypt-3].s
|
||||
min, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
|
||||
|
@ -462,8 +488,8 @@ yydefault:
|
|||
q := NewNumericRangeInclusiveQuery(&min, nil, &minInclusive, nil).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 16:
|
||||
//line query_string.y:137
|
||||
case 18:
|
||||
//line query_string.y:156
|
||||
{
|
||||
field := yyS[yypt-4].s
|
||||
min, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
|
||||
|
@ -472,8 +498,8 @@ yydefault:
|
|||
q := NewNumericRangeInclusiveQuery(&min, nil, &minInclusive, nil).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 17:
|
||||
//line query_string.y:146
|
||||
case 19:
|
||||
//line query_string.y:165
|
||||
{
|
||||
field := yyS[yypt-3].s
|
||||
max, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
|
||||
|
@ -482,8 +508,8 @@ yydefault:
|
|||
q := NewNumericRangeInclusiveQuery(nil, &max, nil, &maxInclusive).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 18:
|
||||
//line query_string.y:155
|
||||
case 20:
|
||||
//line query_string.y:174
|
||||
{
|
||||
field := yyS[yypt-4].s
|
||||
max, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
|
||||
|
@ -492,20 +518,20 @@ yydefault:
|
|||
q := NewNumericRangeInclusiveQuery(nil, &max, nil, &maxInclusive).SetField(field)
|
||||
yyVAL.q = q
|
||||
}
|
||||
case 19:
|
||||
//line query_string.y:166
|
||||
case 21:
|
||||
//line query_string.y:184
|
||||
{
|
||||
boost, _ := strconv.ParseFloat(yyS[yypt-0].s, 64)
|
||||
yyVAL.f = boost
|
||||
logDebugGrammar("BOOST %f", boost)
|
||||
}
|
||||
case 20:
|
||||
//line query_string.y:173
|
||||
case 22:
|
||||
//line query_string.y:191
|
||||
{
|
||||
yyVAL.f = 1.0
|
||||
}
|
||||
case 21:
|
||||
//line query_string.y:177
|
||||
case 23:
|
||||
//line query_string.y:195
|
||||
{
|
||||
|
||||
}
|
||||
|
|
|
@ -165,6 +165,37 @@ func TestQuerySyntaxParserValid(t *testing.T) {
|
|||
},
|
||||
nil),
|
||||
},
|
||||
{
|
||||
input: "watex~",
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("watex").SetFuzziness(1),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
{
|
||||
input: "watex~2",
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("watex").SetFuzziness(2),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
{
|
||||
input: "watex~ 2",
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("watex").SetFuzziness(1),
|
||||
NewMatchQuery("2"),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
package searchers
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
)
|
||||
|
@ -39,8 +41,8 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, prefix, fuzzin
|
|||
candidateTerms := make([]string, 0)
|
||||
tfd, err := fieldReader.Next()
|
||||
for err == nil && tfd != nil {
|
||||
ld := levenshteinDistance(&term, &tfd.Term)
|
||||
if ld <= fuzziness {
|
||||
ld, exceeded := levenshteinDistanceMax(&term, &tfd.Term, fuzziness)
|
||||
if !exceeded && ld <= fuzziness {
|
||||
candidateTerms = append(candidateTerms, tfd.Term)
|
||||
}
|
||||
tfd, err = fieldReader.Next()
|
||||
|
@ -134,3 +136,56 @@ func levenshteinDistance(a, b *string) int {
|
|||
}
|
||||
return d[la]
|
||||
}
|
||||
|
||||
// levenshteinDistanceMax computes the Levenshtein (edit) distance between the
// strings pointed to by a and b, but stops as soon as the distance is known
// to be greater than max.
//
// The strings are compared byte by byte (they are indexed directly rather
// than decoded into runes).
//
// It returns (distance, false) when the distance is at most max, and
// (max, true) when the distance is greater than max. The second value
// therefore always tells the caller whether max was exceeded.
func levenshteinDistanceMax(a, b *string, max int) (int, bool) {
	la := len(*a)
	lb := len(*b)

	// The distance can never be smaller than the difference in length, so a
	// large enough gap settles the answer before any table work is done.
	ld := int(math.Abs(float64(la - lb)))
	if ld > max {
		return max, true
	}

	// d is a single row of the dynamic-programming table; it is overwritten
	// in place while walking over the bytes of b.
	d := make([]int, la+1)
	var lastdiag, olddiag, temp int

	for i := 1; i <= la; i++ {
		d[i] = i
	}
	for i := 1; i <= lb; i++ {
		d[0] = i
		lastdiag = i - 1
		// Seed the row minimum with column 0 so that the check below also
		// holds when a is empty and the inner loop never runs.
		rowmin := d[0]
		for j := 1; j <= la; j++ {
			olddiag = d[j]
			// cheapest of deletion, insertion and substitution/match
			cost := d[j] + 1
			if (d[j-1] + 1) < cost {
				cost = d[j-1] + 1
			}
			if (*a)[j-1] == (*b)[i-1] {
				temp = 0
			} else {
				temp = 1
			}
			if (lastdiag + temp) < cost {
				cost = lastdiag + temp
			}
			if cost < rowmin {
				rowmin = cost
			}
			d[j] = cost

			lastdiag = olddiag
		}
		// Values never decrease from one row to the next, so once every
		// entry of a row is greater than max the final distance must be too.
		if rowmin > max {
			return max, true
		}
	}

	// A row can contain a small value in an earlier column while the last
	// column, which is the actual distance, is still above max.
	if d[la] > max {
		return max, true
	}
	return d[la], false
}
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package searchers
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestLevenshteinDistance checks levenshteinDistance against a small table
// of string pairs and their expected edit distances.
func TestLevenshteinDistance(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
a string
|
||||
b string
|
||||
dist int
|
||||
}{
|
||||
{
|
||||
"water",
|
||||
"atec",
|
||||
2,
|
||||
},
|
||||
{
|
||||
"water",
|
||||
"aphex",
|
||||
4,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
actual := levenshteinDistance(&test.a, &test.b)
|
||||
if actual != test.dist {
|
||||
t.Errorf("expected %d, got %d for %s and %s", test.dist, actual, test.a, test.b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestLevenshteinDistanceMax checks levenshteinDistanceMax against a table
// of (a, b, max) inputs, comparing both the returned distance and the
// exceeded flag. In the cases where max is exceeded, the expected distance
// is max itself.
func TestLevenshteinDistanceMax(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
a string
|
||||
b string
|
||||
max int
|
||||
dist int
|
||||
exceeded bool
|
||||
}{
|
||||
{
|
||||
a: "water",
|
||||
b: "atec",
|
||||
max: 1,
|
||||
dist: 1,
|
||||
exceeded: true,
|
||||
},
|
||||
{
|
||||
a: "water",
|
||||
b: "christmas",
|
||||
max: 3,
|
||||
dist: 3,
|
||||
exceeded: true,
|
||||
},
|
||||
{
|
||||
a: "water",
|
||||
b: "water",
|
||||
max: 1,
|
||||
dist: 0,
|
||||
exceeded: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
actual, exceeded := levenshteinDistanceMax(&test.a, &test.b, test.max)
|
||||
if actual != test.dist || exceeded != test.exceeded {
|
||||
t.Errorf("expected %d %t, got %d %t for %s and %s", test.dist, test.exceeded, actual, exceeded, test.a, test.b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5 terms within edit distance 2 of "water"
|
||||
// 5 terms more than edit distance 2 from "water"
|
||||
var benchmarkTerms = []string{
|
||||
"watex",
|
||||
"aters",
|
||||
"wayer",
|
||||
"wbter",
|
||||
"yater",
|
||||
"christmas",
|
||||
"waterwaterwater",
|
||||
"watcatdogfish",
|
||||
"q",
|
||||
"couchbase",
|
||||
}
|
||||
|
||||
// BenchmarkLevenshteinDistance measures levenshteinDistance by comparing
// "water" against every entry of benchmarkTerms on each iteration. The
// computed distances are discarded.
func BenchmarkLevenshteinDistance(b *testing.B) {
|
||||
a := "water"
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, t := range benchmarkTerms {
|
||||
levenshteinDistance(&a, &t)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkLevenshteinDistanceMax measures levenshteinDistanceMax with a
// max of 2 by comparing "water" against every entry of benchmarkTerms on
// each iteration. The results are discarded.
func BenchmarkLevenshteinDistanceMax(b *testing.B) {
|
||||
a := "water"
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, t := range benchmarkTerms {
|
||||
levenshteinDistanceMax(&a, &t, 2)
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue