vellum adoption for regex and fuzzy queries
This commit is contained in:
parent
6693a89441
commit
1ef41101ba
|
@ -96,6 +96,13 @@ type IndexReader interface {
|
|||
Close() error
|
||||
}
|
||||
|
||||
// IndexReaderAdv is an optional interface for advanced users. Readers that
// implement it can hand back field dictionaries selected by a regular
// expression or by fuzziness around a term.
|
||||
// TODO: hope to have a better name here...
|
||||
type IndexReaderAdv interface {
|
||||
// FieldDictRegex returns a FieldDict for field, selected using regex.
FieldDictRegex(field string, regex []byte) (FieldDict, error)
|
||||
// FieldDictFuzzy returns a FieldDict for field, selected using term and
// the given fuzziness.
FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error)
|
||||
}
|
||||
|
||||
// FieldTerms contains the terms used by a document, keyed by field
// name; each value is the list of terms for that field.
|
||||
type FieldTerms map[string][]string
|
||||
|
||||
|
|
|
@ -76,6 +76,15 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
|
|||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyDictionary) RegexIterator(start string) DictionaryIterator {
|
||||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyDictionary) FuzzyIterator(term string,
|
||||
fuzziness int) DictionaryIterator {
|
||||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
// EmptyDictionaryIterator is the iterator type handed out by the
// EmptyDictionary iterator constructors.
type EmptyDictionaryIterator struct{}
|
||||
|
||||
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
|
|
|
@ -98,6 +98,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
|
|||
}
|
||||
}
|
||||
|
||||
// RegexIterator returns an iterator which only visits terms matching
|
||||
// the given regex expression.
|
||||
// TODO complete the implementation
|
||||
func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], regex)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
offset: offset,
|
||||
prefix: regex,
|
||||
}
|
||||
}
|
||||
|
||||
// FuzzyIterator returns an iterator which only visits terms matching
|
||||
// the given edit distance.
|
||||
// TODO complete the implementation
|
||||
func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], term)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
offset: offset,
|
||||
prefix: term,
|
||||
}
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
|
||||
type DictionaryIterator struct {
|
||||
d *Dictionary
|
||||
|
|
|
@ -48,6 +48,8 @@ type TermDictionary interface {
|
|||
Iterator() DictionaryIterator
|
||||
PrefixIterator(prefix string) DictionaryIterator
|
||||
RangeIterator(start, end string) DictionaryIterator
|
||||
RegexIterator(regex string) DictionaryIterator
|
||||
FuzzyIterator(term string, fuzziness int) DictionaryIterator
|
||||
}
|
||||
|
||||
type DictionaryIterator interface {
|
||||
|
|
|
@ -21,6 +21,7 @@ import (
|
|||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/couchbase/vellum/levenshtein"
|
||||
"github.com/couchbase/vellum/regexp"
|
||||
)
|
||||
|
||||
|
@ -148,6 +149,47 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
|
|||
return rv
|
||||
}
|
||||
|
||||
// RegexIterator returns an iterator which only visits terms having the
|
||||
// the specified regex
|
||||
func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
|
||||
if d.fst != nil {
|
||||
r, err := regexp.New(regex)
|
||||
if err == nil {
|
||||
itr, err := d.fst.Search(r, nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// FuzzyIterator returns an iterator which only visits terms having the
|
||||
// the specified edit/levenshtein distance
|
||||
func (d *Dictionary) FuzzyIterator(term string,
|
||||
fuzziness int) segment.DictionaryIterator {
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
|
||||
if d.fst != nil {
|
||||
la, err := levenshtein.New(term, fuzziness)
|
||||
if err == nil {
|
||||
itr, err := d.fst.Search(la, nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
|
||||
type DictionaryIterator struct {
|
||||
d *Dictionary
|
||||
|
|
|
@ -175,6 +175,20 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
|
|||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictRegex(field string,
|
||||
termRegex []byte) (index.FieldDict, error) {
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.RegexIterator(string(termRegex))
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictFuzzy(field string,
|
||||
term []byte, fuzziness int) (index.FieldDict, error) {
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.FuzzyIterator(string(term), fuzziness)
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
|
||||
results := make(chan *asynchSegmentResult)
|
||||
for index, segment := range i.segment {
|
||||
|
|
|
@ -48,6 +48,15 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic
|
|||
return s.d.RangeIterator(start, end)
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator {
|
||||
return s.d.RegexIterator(regex)
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) FuzzyIterator(term string,
|
||||
fuzziness int) segment.DictionaryIterator {
|
||||
return s.d.FuzzyIterator(term, fuzziness)
|
||||
}
|
||||
|
||||
type SegmentSnapshot struct {
|
||||
id uint64
|
||||
segment segment.Segment
|
||||
|
|
|
@ -15,6 +15,9 @@
|
|||
package searcher
|
||||
|
||||
import (
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
)
|
||||
|
@ -31,9 +34,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
|||
break
|
||||
}
|
||||
}
|
||||
|
||||
t := time.Now()
|
||||
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
|
||||
field, prefixTerm)
|
||||
log.Printf("time taken-> %f", time.Since(t).Seconds())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -49,6 +53,22 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
|
|||
if len(prefixTerm) > 0 {
|
||||
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
|
||||
} else {
|
||||
// in case of advanced reader implementations directly call
|
||||
// the levenshtein automaton based iterator to collect the
|
||||
// candidate terms
|
||||
if ir, ok := indexReader.(index.IndexReaderAdv); ok {
|
||||
fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness)
|
||||
if err != nil {
|
||||
return rv, err
|
||||
}
|
||||
tfd, err := fieldDict.Next()
|
||||
for err == nil && tfd != nil {
|
||||
rv = append(rv, tfd.Term)
|
||||
tfd, err = fieldDict.Next()
|
||||
}
|
||||
log.Printf("candidate FSA fuzzy terms: %+v", rv)
|
||||
return rv, nil
|
||||
}
|
||||
fieldDict, err = indexReader.FieldDict(field)
|
||||
}
|
||||
defer func() {
|
||||
|
|
|
@ -15,7 +15,9 @@
|
|||
package searcher
|
||||
|
||||
import (
|
||||
"log"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
|
@ -29,19 +31,40 @@ import (
|
|||
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
|
||||
field string, boost float64, options search.SearcherOptions) (
|
||||
search.Searcher, error) {
|
||||
|
||||
prefixTerm, complete := pattern.LiteralPrefix()
|
||||
var candidateTerms []string
|
||||
if complete {
|
||||
// there is no pattern
|
||||
candidateTerms = []string{prefixTerm}
|
||||
} else {
|
||||
var err error
|
||||
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
|
||||
prefixTerm)
|
||||
t := time.Now()
|
||||
if ir, ok := indexReader.(index.IndexReaderAdv); ok {
|
||||
fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if cerr := fieldDict.Close(); cerr != nil && err == nil {
|
||||
err = cerr
|
||||
}
|
||||
}()
|
||||
|
||||
// enumerate the terms and check against regexp
|
||||
tfd, err := fieldDict.Next()
|
||||
for err == nil && tfd != nil {
|
||||
candidateTerms = append(candidateTerms, tfd.Term)
|
||||
tfd, err = fieldDict.Next()
|
||||
}
|
||||
log.Printf("fsa time took-> %f", time.Since(t).Seconds())
|
||||
} else {
|
||||
prefixTerm, complete := pattern.LiteralPrefix()
|
||||
if complete {
|
||||
// there is no pattern
|
||||
candidateTerms = []string{prefixTerm}
|
||||
} else {
|
||||
var err error
|
||||
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
|
||||
prefixTerm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
log.Printf("time took-> %f", time.Since(t).Seconds())
|
||||
}
|
||||
|
||||
return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,
|
||||
|
|
Loading…
Reference in New Issue