0
0
Fork 0

vellum adoption for regex and fuzzy queries

This commit is contained in:
Sreekanth Sivasankaran 2018-03-19 17:20:49 +05:30
parent 6693a89441
commit 1ef41101ba
9 changed files with 160 additions and 10 deletions

View File

@ -96,6 +96,13 @@ type IndexReader interface {
Close() error
}
// IndexReaderAdv is an optional interface for advanced users.
// It is declared separately from IndexReader, so an IndexReader
// implementation is not required to provide these methods; the fuzzy and
// regexp searchers in this change detect it with a type assertion on the
// IndexReader they are given.
// Hope to have a better name here...
type IndexReaderAdv interface {
// FieldDictRegex returns a FieldDict for the named field, driven by the
// regular expression supplied as raw bytes in regex.
FieldDictRegex(field string, regex []byte) (FieldDict, error)
// FieldDictFuzzy returns a FieldDict for the named field, driven by
// term and the allowed fuzziness (passed on as an edit distance by the
// scorch snapshot implementation in this change).
FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error)
}
// FieldTerms contains the terms used by a document, keyed by field
type FieldTerms map[string][]string

View File

@ -76,6 +76,15 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
return &EmptyDictionaryIterator{}
}
// RegexIterator returns an iterator for the given regex over an empty
// dictionary. The pattern is ignored: the result is always a fresh
// EmptyDictionaryIterator.
//
// The parameter is named regex (it was previously called start, a leftover
// from RangeIterator) so that the signature reads the same as
// TermDictionary.RegexIterator and the other implementations.
func (e *EmptyDictionary) RegexIterator(regex string) DictionaryIterator {
	return &EmptyDictionaryIterator{}
}
// FuzzyIterator returns an iterator for the given term and fuzziness over
// an empty dictionary. Both arguments are ignored: the result is always a
// fresh EmptyDictionaryIterator.
func (e *EmptyDictionary) FuzzyIterator(term string, fuzziness int) DictionaryIterator {
	var empty EmptyDictionaryIterator
	return &empty
}
// EmptyDictionaryIterator is the field-less iterator type handed out by the
// EmptyDictionary iterator constructors (RangeIterator, RegexIterator and
// FuzzyIterator all return a pointer to a new value of this type).
type EmptyDictionaryIterator struct{}
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {

View File

@ -98,6 +98,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
}
}
// RegexIterator is intended to return an iterator which only visits terms
// matching the given regex expression.
//
// TODO complete the implementation: no regular expression is compiled
// here. The pattern string is used as-is to binary-search the field's
// sorted dictionary keys for a starting offset, and is stored in the
// iterator's prefix field.
func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
	keys := d.segment.DictKeys[d.fieldID]
	itr := &DictionaryIterator{d: d, prefix: regex}
	itr.offset = sort.SearchStrings(keys, regex)
	return itr
}
// FuzzyIterator is intended to return an iterator which only visits terms
// within the given edit distance of term.
//
// TODO complete the implementation: fuzziness is currently unused. The
// term is used as-is to binary-search the field's sorted dictionary keys
// for a starting offset, and is stored in the iterator's prefix field.
func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator {
	keys := d.segment.DictKeys[d.fieldID]
	itr := &DictionaryIterator{d: d, prefix: term}
	itr.offset = sort.SearchStrings(keys, term)
	return itr
}
// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
d *Dictionary

View File

@ -48,6 +48,8 @@ type TermDictionary interface {
Iterator() DictionaryIterator
PrefixIterator(prefix string) DictionaryIterator
RangeIterator(start, end string) DictionaryIterator
RegexIterator(regex string) DictionaryIterator
FuzzyIterator(term string, fuzziness int) DictionaryIterator
}
type DictionaryIterator interface {

View File

@ -21,6 +21,7 @@ import (
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbase/vellum"
"github.com/couchbase/vellum/levenshtein"
"github.com/couchbase/vellum/regexp"
)
@ -148,6 +149,47 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
return rv
}
// RegexIterator returns an iterator which only visits terms matching the
// specified regex.
//
// The pattern is compiled with the vellum regexp package and handed to the
// dictionary's FST as a search automaton. When the dictionary has no FST,
// the pattern fails to compile, or the FST search cannot be started, the
// returned iterator is left without an underlying FST iterator.
// NOTE(review): both of those errors are dropped here rather than reported.
func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
	rv := &DictionaryIterator{d: d}
	if d.fst == nil {
		return rv
	}

	automaton, err := regexp.New(regex)
	if err != nil {
		return rv
	}

	if fstItr, err := d.fst.Search(automaton, nil, nil); err == nil {
		rv.itr = fstItr
	}
	return rv
}
// FuzzyIterator returns an iterator which only visits terms within the
// specified edit/levenshtein distance of term.
//
// A levenshtein automaton is built from term and fuzziness and handed to
// the dictionary's FST as a search automaton. When the dictionary has no
// FST, the automaton cannot be built, or the FST search cannot be started,
// the returned iterator is left without an underlying FST iterator.
// NOTE(review): both of those errors are dropped here rather than reported.
func (d *Dictionary) FuzzyIterator(term string,
	fuzziness int) segment.DictionaryIterator {
	rv := &DictionaryIterator{d: d}
	if d.fst == nil {
		return rv
	}

	automaton, err := levenshtein.New(term, fuzziness)
	if err != nil {
		return rv
	}

	if fstItr, err := d.fst.Search(automaton, nil, nil); err == nil {
		rv.itr = fstItr
	}
	return rv
}
// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
d *Dictionary

View File

@ -175,6 +175,20 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
})
}
// FieldDictRegex builds a FieldDict for field by asking each segment term
// dictionary for its RegexIterator over termRegex (converted to a string).
func (i *IndexSnapshot) FieldDictRegex(field string,
	termRegex []byte) (index.FieldDict, error) {
	// The callback argument is named dict so it does not shadow the
	// snapshot receiver.
	makeIterator := func(dict segment.TermDictionary) segment.DictionaryIterator {
		return dict.RegexIterator(string(termRegex))
	}
	return i.newIndexSnapshotFieldDict(field, makeIterator)
}
// FieldDictFuzzy builds a FieldDict for field by asking each segment term
// dictionary for its FuzzyIterator over term (converted to a string) with
// the given fuzziness.
func (i *IndexSnapshot) FieldDictFuzzy(field string,
	term []byte, fuzziness int) (index.FieldDict, error) {
	// The callback argument is named dict so it does not shadow the
	// snapshot receiver.
	makeIterator := func(dict segment.TermDictionary) segment.DictionaryIterator {
		return dict.FuzzyIterator(string(term), fuzziness)
	}
	return i.newIndexSnapshotFieldDict(field, makeIterator)
}
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
results := make(chan *asynchSegmentResult)
for index, segment := range i.segment {

View File

@ -48,6 +48,15 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic
return s.d.RangeIterator(start, end)
}
// RegexIterator forwards the regex to the wrapped term dictionary and
// returns whatever iterator it produces.
func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator {
	wrapped := s.d
	return wrapped.RegexIterator(regex)
}
// FuzzyIterator forwards the term and fuzziness to the wrapped term
// dictionary and returns whatever iterator it produces.
func (s *SegmentDictionarySnapshot) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator {
	wrapped := s.d
	return wrapped.FuzzyIterator(term, fuzziness)
}
type SegmentSnapshot struct {
id uint64
segment segment.Segment

View File

@ -15,6 +15,9 @@
package searcher
import (
"log"
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
)
@ -31,9 +34,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
break
}
}
t := time.Now()
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
field, prefixTerm)
log.Printf("time taken-> %f", time.Since(t).Seconds())
if err != nil {
return nil, err
}
@ -49,6 +53,22 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
if len(prefixTerm) > 0 {
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
} else {
// in case of advanced reader implementations directly call
// the levenshtein automaton based iterator to collect the
// candidate terms
if ir, ok := indexReader.(index.IndexReaderAdv); ok {
fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness)
if err != nil {
return rv, err
}
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
rv = append(rv, tfd.Term)
tfd, err = fieldDict.Next()
}
log.Printf("candidate FSA fuzzy terms: %+v", rv)
return rv, nil
}
fieldDict, err = indexReader.FieldDict(field)
}
defer func() {

View File

@ -15,7 +15,9 @@
package searcher
import (
"log"
"regexp"
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
@ -29,19 +31,40 @@ import (
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
field string, boost float64, options search.SearcherOptions) (
search.Searcher, error) {
prefixTerm, complete := pattern.LiteralPrefix()
var candidateTerms []string
if complete {
// there is no pattern
candidateTerms = []string{prefixTerm}
} else {
var err error
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
prefixTerm)
t := time.Now()
if ir, ok := indexReader.(index.IndexReaderAdv); ok {
fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String()))
if err != nil {
return nil, err
}
defer func() {
if cerr := fieldDict.Close(); cerr != nil && err == nil {
err = cerr
}
}()
// enumerate the terms and check against regexp
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
candidateTerms = append(candidateTerms, tfd.Term)
tfd, err = fieldDict.Next()
}
log.Printf("fsa time took-> %f", time.Since(t).Seconds())
} else {
prefixTerm, complete := pattern.LiteralPrefix()
if complete {
// there is no pattern
candidateTerms = []string{prefixTerm}
} else {
var err error
candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
prefixTerm)
if err != nil {
return nil, err
}
}
log.Printf("time took-> %f", time.Since(t).Seconds())
}
return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,