vellum adoption for regex and fuzzy queries
This commit is contained in:
parent
6693a89441
commit
1ef41101ba
|
@ -96,6 +96,13 @@ type IndexReader interface {
|
||||||
Close() error
|
Close() error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IndexReaderAdv is an optional interface for advanced users. An IndexReader
// implementation may additionally satisfy it to hand out field dictionaries
// that are already narrowed by a regular expression or by a fuzzy match.
// The regexp and fuzzy searchers in this change detect it with a type
// assertion on their index.IndexReader and use their older code paths
// when the assertion fails.
|
||||||
|
// Hope to have a better name here...
|
||||||
|
type IndexReaderAdv interface {
|
||||||
|
// FieldDictRegex returns a FieldDict for field built from the regex
// pattern, which is passed as raw bytes.
FieldDictRegex(field string, regex []byte) (FieldDict, error)
|
||||||
|
// FieldDictFuzzy returns a FieldDict for field built from term and the
// fuzziness value (presumably the maximum edit distance — confirm
// against the implementation in use).
FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error)
|
||||||
|
}
|
||||||
|
|
||||||
// FieldTerms contains the terms used by a document, keyed by field
|
// FieldTerms contains the terms used by a document, keyed by field
|
||||||
type FieldTerms map[string][]string
|
type FieldTerms map[string][]string
|
||||||
|
|
||||||
|
|
|
@ -76,6 +76,15 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
|
||||||
return &EmptyDictionaryIterator{}
|
return &EmptyDictionaryIterator{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RegexIterator returns a new EmptyDictionaryIterator; the argument is
// ignored.
// NOTE(review): the parameter is named start, but TermDictionary declares
// RegexIterator(regex string), so it carries the regex pattern — the name
// looks like a leftover from RangeIterator.
func (e *EmptyDictionary) RegexIterator(start string) DictionaryIterator {
|
||||||
|
return &EmptyDictionaryIterator{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuzzyIterator returns a new EmptyDictionaryIterator; both term and
// fuzziness are ignored.
func (e *EmptyDictionary) FuzzyIterator(term string,
|
||||||
|
fuzziness int) DictionaryIterator {
|
||||||
|
return &EmptyDictionaryIterator{}
|
||||||
|
}
|
||||||
|
|
||||||
type EmptyDictionaryIterator struct{}
|
type EmptyDictionaryIterator struct{}
|
||||||
|
|
||||||
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
|
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
|
||||||
|
|
|
@ -98,6 +98,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RegexIterator is intended to return an iterator which only visits terms
// matching the given regular expression.
|
||||||
|
// The current body is a placeholder: the pattern is never compiled.
|
||||||
|
// TODO complete the implementation
|
||||||
|
func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
|
||||||
|
// Locate the pattern text itself among this field's dictionary keys
// (assumes DictKeys[fieldID] is sorted, as sort.SearchStrings requires).
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], regex)
|
||||||
|
return &DictionaryIterator{
|
||||||
|
d: d,
|
||||||
|
offset: offset,
|
||||||
|
// The raw pattern is stored in the prefix field, so the iterator
// presumably treats it as a literal prefix — confirm against
// DictionaryIterator.Next.
prefix: regex,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuzzyIterator is intended to return an iterator which only visits terms
// within the given edit distance of term.
|
||||||
|
// The current body is a placeholder: fuzziness is not used at all.
|
||||||
|
// TODO complete the implementation
|
||||||
|
func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator {
|
||||||
|
// Locate term among this field's dictionary keys (assumes
// DictKeys[fieldID] is sorted, as sort.SearchStrings requires).
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], term)
|
||||||
|
return &DictionaryIterator{
|
||||||
|
d: d,
|
||||||
|
offset: offset,
|
||||||
|
// term is stored in the prefix field, so the iterator presumably
// behaves like a prefix iterator for term — confirm against
// DictionaryIterator.Next.
prefix: term,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// DictionaryIterator is an iterator for term dictionary
|
// DictionaryIterator is an iterator for term dictionary
|
||||||
type DictionaryIterator struct {
|
type DictionaryIterator struct {
|
||||||
d *Dictionary
|
d *Dictionary
|
||||||
|
|
|
@ -48,6 +48,8 @@ type TermDictionary interface {
|
||||||
Iterator() DictionaryIterator
|
Iterator() DictionaryIterator
|
||||||
PrefixIterator(prefix string) DictionaryIterator
|
PrefixIterator(prefix string) DictionaryIterator
|
||||||
RangeIterator(start, end string) DictionaryIterator
|
RangeIterator(start, end string) DictionaryIterator
|
||||||
|
RegexIterator(regex string) DictionaryIterator
|
||||||
|
FuzzyIterator(term string, fuzziness int) DictionaryIterator
|
||||||
}
|
}
|
||||||
|
|
||||||
type DictionaryIterator interface {
|
type DictionaryIterator interface {
|
||||||
|
|
|
@ -21,6 +21,7 @@ import (
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
"github.com/couchbase/vellum"
|
"github.com/couchbase/vellum"
|
||||||
|
"github.com/couchbase/vellum/levenshtein"
|
||||||
"github.com/couchbase/vellum/regexp"
|
"github.com/couchbase/vellum/regexp"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -148,6 +149,47 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RegexIterator returns an iterator which only visits terms matching
|
||||||
|
// the specified regex.
// NOTE(review): errors from compiling the pattern or from starting the
// search are discarded; in that case the returned iterator has no itr set.
|
||||||
|
func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
|
||||||
|
rv := &DictionaryIterator{
|
||||||
|
d: d,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Without an FST there is nothing to search; rv is returned with only d set.
if d.fst != nil {
|
||||||
|
// regexp is presumably the vellum regexp package imported by this
// file, not the standard library one — confirm against the imports.
r, err := regexp.New(regex)
|
||||||
|
if err == nil {
|
||||||
|
// The two nil arguments are presumably the (absent) start/end key
// bounds of the search — see vellum's FST.Search.
itr, err := d.fst.Search(r, nil, nil)
|
||||||
|
if err == nil {
|
||||||
|
rv.itr = itr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuzzyIterator returns an iterator which only visits terms within
|
||||||
|
// the specified edit/levenshtein distance of term.
// NOTE(review): errors from building the automaton or from starting the
// search are discarded; in that case the returned iterator has no itr set.
|
||||||
|
func (d *Dictionary) FuzzyIterator(term string,
|
||||||
|
fuzziness int) segment.DictionaryIterator {
|
||||||
|
rv := &DictionaryIterator{
|
||||||
|
d: d,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Without an FST there is nothing to search; rv is returned with only d set.
if d.fst != nil {
|
||||||
|
// Build the automaton for term with the requested fuzziness.
la, err := levenshtein.New(term, fuzziness)
|
||||||
|
if err == nil {
|
||||||
|
// The two nil arguments are presumably the (absent) start/end key
// bounds of the search — see vellum's FST.Search.
itr, err := d.fst.Search(la, nil, nil)
|
||||||
|
if err == nil {
|
||||||
|
rv.itr = itr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
// DictionaryIterator is an iterator for term dictionary
|
// DictionaryIterator is an iterator for term dictionary
|
||||||
type DictionaryIterator struct {
|
type DictionaryIterator struct {
|
||||||
d *Dictionary
|
d *Dictionary
|
||||||
|
|
|
@ -175,6 +175,20 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FieldDictRegex returns a field dictionary for field built by
// newIndexSnapshotFieldDict, using each term dictionary's RegexIterator
// with termRegex converted to a string.
func (i *IndexSnapshot) FieldDictRegex(field string,
|
||||||
|
termRegex []byte) (index.FieldDict, error) {
|
||||||
|
// Note: the callback parameter i shadows the receiver i; inside the
// closure i is the segment.TermDictionary.
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||||
|
return i.RegexIterator(string(termRegex))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// FieldDictFuzzy returns a field dictionary for field built by
// newIndexSnapshotFieldDict, using each term dictionary's FuzzyIterator
// with term converted to a string and the given fuzziness.
func (i *IndexSnapshot) FieldDictFuzzy(field string,
|
||||||
|
term []byte, fuzziness int) (index.FieldDict, error) {
|
||||||
|
// Note: the callback parameter i shadows the receiver i; inside the
// closure i is the segment.TermDictionary.
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||||
|
return i.FuzzyIterator(string(term), fuzziness)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
|
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
|
||||||
results := make(chan *asynchSegmentResult)
|
results := make(chan *asynchSegmentResult)
|
||||||
for index, segment := range i.segment {
|
for index, segment := range i.segment {
|
||||||
|
|
|
@ -48,6 +48,15 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic
|
||||||
return s.d.RangeIterator(start, end)
|
return s.d.RangeIterator(start, end)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RegexIterator forwards regex unchanged to the wrapped dictionary s.d and
// returns its iterator.
func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator {
|
||||||
|
return s.d.RegexIterator(regex)
|
||||||
|
}
|
||||||
|
|
||||||
|
// FuzzyIterator forwards term and fuzziness unchanged to the wrapped
// dictionary s.d and returns its iterator.
func (s *SegmentDictionarySnapshot) FuzzyIterator(term string,
|
||||||
|
fuzziness int) segment.DictionaryIterator {
|
||||||
|
return s.d.FuzzyIterator(term, fuzziness)
|
||||||
|
}
|
||||||
|
|
||||||
type SegmentSnapshot struct {
|
type SegmentSnapshot struct {
|
||||||
id uint64
|
id uint64
|
||||||
segment segment.Segment
|
segment segment.Segment
|
||||||
|
|
|
@ -15,6 +15,9 @@
|
||||||
package searcher
|
package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"log"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
)
|
)
|
||||||
|
@ -31,9 +34,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
t := time.Now()
|
||||||
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
|
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
|
||||||
field, prefixTerm)
|
field, prefixTerm)
|
||||||
|
log.Printf("time taken-> %f", time.Since(t).Seconds())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -49,6 +53,22 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
|
||||||
if len(prefixTerm) > 0 {
|
if len(prefixTerm) > 0 {
|
||||||
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
|
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
|
||||||
} else {
|
} else {
|
||||||
|
// in case of advanced reader implementations directly call
|
||||||
|
// the levenshtein automaton based iterator to collect the
|
||||||
|
// candidate terms
|
||||||
|
if ir, ok := indexReader.(index.IndexReaderAdv); ok {
|
||||||
|
fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness)
|
||||||
|
if err != nil {
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
tfd, err := fieldDict.Next()
|
||||||
|
for err == nil && tfd != nil {
|
||||||
|
rv = append(rv, tfd.Term)
|
||||||
|
tfd, err = fieldDict.Next()
|
||||||
|
}
|
||||||
|
log.Printf("candidate FSA fuzzy terms: %+v", rv)
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
fieldDict, err = indexReader.FieldDict(field)
|
fieldDict, err = indexReader.FieldDict(field)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
|
|
|
@ -15,7 +15,9 @@
|
||||||
package searcher
|
package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"log"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
@ -29,19 +31,40 @@ import (
|
||||||
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
|
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
|
||||||
field string, boost float64, options search.SearcherOptions) (
|
field string, boost float64, options search.SearcherOptions) (
|
||||||
search.Searcher, error) {
|
search.Searcher, error) {
|
||||||
|
|
||||||
prefixTerm, complete := pattern.LiteralPrefix()
|
|
||||||
var candidateTerms []string
|
var candidateTerms []string
|
||||||
if complete {
|
t := time.Now()
|
||||||
// there is no pattern
|
if ir, ok := indexReader.(index.IndexReaderAdv); ok {
|
||||||
candidateTerms = []string{prefixTerm}
|
fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String()))
|
||||||
} else {
|
|
||||||
var err error
|
|
||||||
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
|
|
||||||
prefixTerm)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
defer func() {
|
||||||
|
if cerr := fieldDict.Close(); cerr != nil && err == nil {
|
||||||
|
err = cerr
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// enumerate the terms and check against regexp
|
||||||
|
tfd, err := fieldDict.Next()
|
||||||
|
for err == nil && tfd != nil {
|
||||||
|
candidateTerms = append(candidateTerms, tfd.Term)
|
||||||
|
tfd, err = fieldDict.Next()
|
||||||
|
}
|
||||||
|
log.Printf("fsa time took-> %f", time.Since(t).Seconds())
|
||||||
|
} else {
|
||||||
|
prefixTerm, complete := pattern.LiteralPrefix()
|
||||||
|
if complete {
|
||||||
|
// there is no pattern
|
||||||
|
candidateTerms = []string{prefixTerm}
|
||||||
|
} else {
|
||||||
|
var err error
|
||||||
|
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
|
||||||
|
prefixTerm)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.Printf("time took-> %f", time.Since(t).Seconds())
|
||||||
}
|
}
|
||||||
|
|
||||||
return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,
|
return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,
|
||||||
|
|
Loading…
Reference in New Issue