vellum adoption for regex and fuzzy queries

2018-03-19 17:20:49 +05:30 · 2018-03-19 17:20:49 +05:30 · 1ef41101ba
parent 6693a89441
commit 1ef41101ba
9 changed files with 160 additions and 10 deletions
--- a/index/index.go
+++ b/index/index.go
@ -96,6 +96,13 @@ type IndexReader interface {
 	Close() error
 }
 // IndexReaderAdv is an optional interface for advanced users. An index
 // reader may implement it in addition to IndexReader; callers discover it
 // with a type assertion on the reader they were given.
 // TODO: find a better name for this interface.
 type IndexReaderAdv interface {
 	// FieldDictRegex returns a FieldDict for field derived from the given
 	// regular expression, supplied as raw bytes.
 	FieldDictRegex(field string, regex []byte) (FieldDict, error)
 	// FieldDictFuzzy returns a FieldDict for field derived from the given
 	// term and fuzziness.
 	// NOTE(review): fuzziness is presumably a maximum edit distance —
 	// confirm against the implementations.
 	FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error)
 }
 // FieldTerms contains the terms used by a document, keyed by field
 type FieldTerms map[string][]string
--- a/index/scorch/segment/empty.go
+++ b/index/scorch/segment/empty.go
@ -76,6 +76,15 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
 	return &EmptyDictionaryIterator{}
 }
 // RegexIterator returns an EmptyDictionaryIterator. The regex argument is
 // accepted only to satisfy the dictionary interface and is never consulted.
 func (e *EmptyDictionary) RegexIterator(regex string) DictionaryIterator {
 	return &EmptyDictionaryIterator{}
 }
 // FuzzyIterator returns an EmptyDictionaryIterator. Neither term nor
 // fuzziness is consulted.
 func (e *EmptyDictionary) FuzzyIterator(term string,
 	fuzziness int) DictionaryIterator {
 	return &EmptyDictionaryIterator{}
 }
 type EmptyDictionaryIterator struct{}
 func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
--- a/index/scorch/segment/mem/dict.go
+++ b/index/scorch/segment/mem/dict.go
@ -98,6 +98,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
 	}
 }
 // RegexIterator is meant to return an iterator that only visits terms
 // matching the given regular expression.
 // TODO complete the implementation: the pattern is not compiled here. It is
 // used as a plain string, both to pick the starting offset within the
 // field's keys and as the prefix stored on the returned iterator.
 func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
 	keys := d.segment.DictKeys[d.fieldID]
 	it := &DictionaryIterator{d: d, prefix: regex}
 	// SearchStrings reports the index at which regex would be inserted.
 	it.offset = sort.SearchStrings(keys, regex)
 	return it
 }
 // FuzzyIterator is meant to return an iterator that only visits terms
 // within the given edit distance of term.
 // TODO complete the implementation: fuzziness is not used here. The term is
 // used as a plain string, both to pick the starting offset within the
 // field's keys and as the prefix stored on the returned iterator.
 func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator {
 	keys := d.segment.DictKeys[d.fieldID]
 	it := &DictionaryIterator{d: d, prefix: term}
 	// SearchStrings reports the index at which term would be inserted.
 	it.offset = sort.SearchStrings(keys, term)
 	return it
 }
 // DictionaryIterator is an iterator for term dictionary
 type DictionaryIterator struct {
 	d      *Dictionary
--- a/index/scorch/segment/segment.go
+++ b/index/scorch/segment/segment.go
@ -48,6 +48,8 @@ type TermDictionary interface {
 	Iterator() DictionaryIterator
 	PrefixIterator(prefix string) DictionaryIterator
 	RangeIterator(start, end string) DictionaryIterator
 	RegexIterator(regex string) DictionaryIterator
 	FuzzyIterator(term string, fuzziness int) DictionaryIterator
 }
 type DictionaryIterator interface {
--- a/index/scorch/segment/zap/dict.go
+++ b/index/scorch/segment/zap/dict.go
@ -21,6 +21,7 @@ import (
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/index/scorch/segment"
 	"github.com/couchbase/vellum"
 	"github.com/couchbase/vellum/levenshtein"
 	"github.com/couchbase/vellum/regexp"
 )
@ -148,6 +149,47 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
 	return rv
 }
 // RegexIterator returns an iterator which only visits terms matching the
 // specified regex.
 // NOTE(review): an error from building the regex automaton or from starting
 // the FST search is not surfaced to the caller; in those cases, and when
 // d.fst is nil, the returned iterator has no underlying FST iterator set.
 func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
 	rv := &DictionaryIterator{d: d}
 	if d.fst == nil {
 		return rv
 	}
 	automaton, err := regexp.New(regex)
 	if err != nil {
 		return rv
 	}
 	fstItr, err := d.fst.Search(automaton, nil, nil)
 	if err != nil {
 		return rv
 	}
 	rv.itr = fstItr
 	return rv
 }
 // FuzzyIterator returns an iterator which only visits terms within the
 // specified edit/levenshtein distance of term.
 // NOTE(review): an error from building the levenshtein automaton or from
 // starting the FST search is not surfaced to the caller; in those cases, and
 // when d.fst is nil, the returned iterator has no underlying FST iterator
 // set.
 func (d *Dictionary) FuzzyIterator(term string,
 	fuzziness int) segment.DictionaryIterator {
 	rv := &DictionaryIterator{d: d}
 	if d.fst == nil {
 		return rv
 	}
 	automaton, err := levenshtein.New(term, fuzziness)
 	if err != nil {
 		return rv
 	}
 	fstItr, err := d.fst.Search(automaton, nil, nil)
 	if err != nil {
 		return rv
 	}
 	rv.itr = fstItr
 	return rv
 }
 // DictionaryIterator is an iterator for term dictionary
 type DictionaryIterator struct {
 	d   *Dictionary
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@ -175,6 +175,20 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
 	})
 }
 // FieldDictRegex returns the field dictionary produced by
 // newIndexSnapshotFieldDict for field. The callback passed to it turns a
 // segment.TermDictionary into that dictionary's RegexIterator for termRegex.
 func (i *IndexSnapshot) FieldDictRegex(field string,
 	termRegex []byte) (index.FieldDict, error) {
 	// The callback parameter is named dict so it does not shadow the receiver.
 	regexItr := func(dict segment.TermDictionary) segment.DictionaryIterator {
 		return dict.RegexIterator(string(termRegex))
 	}
 	return i.newIndexSnapshotFieldDict(field, regexItr)
 }
 // FieldDictFuzzy returns the field dictionary produced by
 // newIndexSnapshotFieldDict for field. The callback passed to it turns a
 // segment.TermDictionary into that dictionary's FuzzyIterator for term and
 // fuzziness.
 func (i *IndexSnapshot) FieldDictFuzzy(field string,
 	term []byte, fuzziness int) (index.FieldDict, error) {
 	// The callback parameter is named dict so it does not shadow the receiver.
 	fuzzyItr := func(dict segment.TermDictionary) segment.DictionaryIterator {
 		return dict.FuzzyIterator(string(term), fuzziness)
 	}
 	return i.newIndexSnapshotFieldDict(field, fuzzyItr)
 }
 func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
 	results := make(chan *asynchSegmentResult)
 	for index, segment := range i.segment {
--- a/index/scorch/snapshot_segment.go
+++ b/index/scorch/snapshot_segment.go
@ -48,6 +48,15 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic
 	return s.d.RangeIterator(start, end)
 }
 // RegexIterator forwards regex to s.d.RegexIterator and returns the
 // resulting iterator unchanged.
 func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator {
 	return s.d.RegexIterator(regex)
 }
 // FuzzyIterator forwards term and fuzziness to s.d.FuzzyIterator and returns
 // the resulting iterator unchanged.
 func (s *SegmentDictionarySnapshot) FuzzyIterator(term string,
 	fuzziness int) segment.DictionaryIterator {
 	return s.d.FuzzyIterator(term, fuzziness)
 }
 type SegmentSnapshot struct {
 	id      uint64
 	segment segment.Segment
--- a/search/searcher/search_fuzzy.go
+++ b/search/searcher/search_fuzzy.go
@ -15,6 +15,9 @@
 package searcher
 import (
 	"log"
 	"time"
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/search"
 )
@ -31,9 +34,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
 			break
 		}
 	}
-
+	t := time.Now()
 	candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
 		field, prefixTerm)
 	log.Printf("time taken-> %f", time.Since(t).Seconds())
 	if err != nil {
 		return nil, err
 	}
@ -49,6 +53,22 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
 	if len(prefixTerm) > 0 {
 		fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
 	} else {
 		// in case of advanced reader implementations directly call
 		// the levenshtein automaton based iterator to collect the
 		// candidate terms
 		if ir, ok := indexReader.(index.IndexReaderAdv); ok {
 			fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness)
 			if err != nil {
 				return rv, err
 			}
 			tfd, err := fieldDict.Next()
 			for err == nil && tfd != nil {
 				rv = append(rv, tfd.Term)
 				tfd, err = fieldDict.Next()
 			}
 			log.Printf("candidate FSA fuzzy terms: %+v", rv)
 			return rv, nil
 		}
 		fieldDict, err = indexReader.FieldDict(field)
 	}
 	defer func() {
--- a/search/searcher/search_regexp.go
+++ b/search/searcher/search_regexp.go
@ -15,7 +15,9 @@
 package searcher
 import (
 	"log"
 	"regexp"
 	"time"
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/search"
@ -29,19 +31,40 @@ import (
 func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
 	field string, boost float64, options search.SearcherOptions) (
 	search.Searcher, error) {
 	prefixTerm, complete := pattern.LiteralPrefix()
 	var candidateTerms []string
-	if complete {
+	t := time.Now()
-		// there is no pattern
+	if ir, ok := indexReader.(index.IndexReaderAdv); ok {
-		candidateTerms = []string{prefixTerm}
+		fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String()))
 	} else {
 		var err error
 		candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
 			prefixTerm)
 		if err != nil {
 			return nil, err
 		}
 		defer func() {
 			if cerr := fieldDict.Close(); cerr != nil && err == nil {
 				err = cerr
 			}
 		}()
 		// enumerate the terms and check against regexp
 		tfd, err := fieldDict.Next()
 		for err == nil && tfd != nil {
 			candidateTerms = append(candidateTerms, tfd.Term)
 			tfd, err = fieldDict.Next()
 		}
 		log.Printf("fsa time took-> %f", time.Since(t).Seconds())
 	} else {
 		prefixTerm, complete := pattern.LiteralPrefix()
 		if complete {
 			// there is no pattern
 			candidateTerms = []string{prefixTerm}
 		} else {
 			var err error
 			candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
 				prefixTerm)
 			if err != nil {
 				return nil, err
 			}
 		}
 		log.Printf("time took-> %f", time.Since(t).Seconds())
 	}
 	return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,