vellum adoption for regex and fuzzy queries

2018-03-19 17:20:49 +05:30 · 2018-03-19 17:20:49 +05:30 · 1ef41101ba
parent 6693a89441
commit 1ef41101ba
9 changed files with 160 additions and 10 deletions
--- a/index/index.go
+++ b/index/index.go
@ -96,6 +96,13 @@ type IndexReader interface {
 	Close() error
 }

+// IndexReaderAdv is an optional interface that index readers may implement
+// to offer regex and fuzzy field dictionaries. TODO: find a better name.
+type IndexReaderAdv interface {
+	FieldDictRegex(field string, regex []byte) (FieldDict, error)
+	FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error)
+}
+
 // FieldTerms contains the terms used by a document, keyed by field
 type FieldTerms map[string][]string

--- a/index/scorch/segment/empty.go
+++ b/index/scorch/segment/empty.go
@ -76,6 +76,15 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
 	return &EmptyDictionaryIterator{}
 }

+func (e *EmptyDictionary) RegexIterator(start string) DictionaryIterator {
+	return &EmptyDictionaryIterator{}
+}
+
+func (e *EmptyDictionary) FuzzyIterator(term string,
+	fuzziness int) DictionaryIterator {
+	return &EmptyDictionaryIterator{}
+}
+
 type EmptyDictionaryIterator struct{}

 func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
--- a/index/scorch/segment/mem/dict.go
+++ b/index/scorch/segment/mem/dict.go
@ -98,6 +98,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
 	}
 }

+// RegexIterator returns an iterator which only visits terms matching
+// the given regular expression.
+// TODO complete the implementation (regex is only used as a literal seek key and prefix)
+func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
+	offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], regex)
+	return &DictionaryIterator{
+		d:      d,
+		offset: offset,
+		prefix: regex,
+	}
+}
+
+// FuzzyIterator returns an iterator which only visits terms within
+// the given edit distance of term.
+// TODO complete the implementation (fuzziness is currently ignored)
+func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator {
+	offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], term)
+	return &DictionaryIterator{
+		d:      d,
+		offset: offset,
+		prefix: term,
+	}
+}
+
 // DictionaryIterator is an iterator for term dictionary
 type DictionaryIterator struct {
 	d      *Dictionary
--- a/index/scorch/segment/segment.go
+++ b/index/scorch/segment/segment.go
@ -48,6 +48,8 @@ type TermDictionary interface {
 	Iterator() DictionaryIterator
 	PrefixIterator(prefix string) DictionaryIterator
 	RangeIterator(start, end string) DictionaryIterator
+	RegexIterator(regex string) DictionaryIterator
+	FuzzyIterator(term string, fuzziness int) DictionaryIterator
 }

 type DictionaryIterator interface {
--- a/index/scorch/segment/zap/dict.go
+++ b/index/scorch/segment/zap/dict.go
@ -21,6 +21,7 @@ import (
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/index/scorch/segment"
 	"github.com/couchbase/vellum"
+	"github.com/couchbase/vellum/levenshtein"
 	"github.com/couchbase/vellum/regexp"
 )

@ -148,6 +149,47 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
 	return rv
 }

+// RegexIterator returns an iterator which only visits terms matching the
+// specified regex; a compile or search error leaves the iterator's itr unset
+func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator {
+	rv := &DictionaryIterator{
+		d: d,
+	}
+
+	if d.fst != nil {
+		r, err := regexp.New(regex)
+		if err == nil {
+			itr, err := d.fst.Search(r, nil, nil)
+			if err == nil {
+				rv.itr = itr
+			}
+		}
+	}
+
+	return rv
+}
+
+// FuzzyIterator returns an iterator which only visits terms within the
+// specified edit/levenshtein distance; errors leave the iterator's itr unset
+func (d *Dictionary) FuzzyIterator(term string,
+	fuzziness int) segment.DictionaryIterator {
+	rv := &DictionaryIterator{
+		d: d,
+	}
+
+	if d.fst != nil {
+		la, err := levenshtein.New(term, fuzziness)
+		if err == nil {
+			itr, err := d.fst.Search(la, nil, nil)
+			if err == nil {
+				rv.itr = itr
+			}
+		}
+	}
+
+	return rv
+}
+
 // DictionaryIterator is an iterator for term dictionary
 type DictionaryIterator struct {
 	d   *Dictionary
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@ -175,6 +175,20 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
 	})
 }

+func (i *IndexSnapshot) FieldDictRegex(field string,
+	termRegex []byte) (index.FieldDict, error) {
+	return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
+		return i.RegexIterator(string(termRegex))
+	})
+}
+
+func (i *IndexSnapshot) FieldDictFuzzy(field string,
+	term []byte, fuzziness int) (index.FieldDict, error) {
+	return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
+		return i.FuzzyIterator(string(term), fuzziness)
+	})
+}
+
 func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
 	results := make(chan *asynchSegmentResult)
 	for index, segment := range i.segment {
--- a/index/scorch/snapshot_segment.go
+++ b/index/scorch/snapshot_segment.go
@ -48,6 +48,15 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic
 	return s.d.RangeIterator(start, end)
 }

+func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator {
+	return s.d.RegexIterator(regex)
+}
+
+func (s *SegmentDictionarySnapshot) FuzzyIterator(term string,
+	fuzziness int) segment.DictionaryIterator {
+	return s.d.FuzzyIterator(term, fuzziness)
+}
+
 type SegmentSnapshot struct {
 	id      uint64
 	segment segment.Segment
--- a/search/searcher/search_fuzzy.go
+++ b/search/searcher/search_fuzzy.go
@ -15,6 +15,9 @@
 package searcher

 import (
+	"log"
+	"time"
+
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/search"
 )
@ -31,9 +34,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
 			break
 		}
 	}
-
+	t := time.Now()
 	candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
 		field, prefixTerm)
+	log.Printf("time taken-> %f", time.Since(t).Seconds())
 	if err != nil {
 		return nil, err
 	}
@ -49,6 +53,22 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
 	if len(prefixTerm) > 0 {
 		fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
 	} else {
+		// advanced readers: gather levenshtein-automaton candidates, close, return early
+		if ir, ok := indexReader.(index.IndexReaderAdv); ok {
+			fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness)
+			if err != nil {
+				return rv, err
+			}
+			tfd, err := fieldDict.Next()
+			for err == nil && tfd != nil {
+				rv = append(rv, tfd.Term)
+				tfd, err = fieldDict.Next()
+			}
+			if cerr := fieldDict.Close(); cerr != nil && err == nil {
+				err = cerr
+			}
+			return rv, err
+		}
 		fieldDict, err = indexReader.FieldDict(field)
 	}
 	defer func() {
--- a/search/searcher/search_regexp.go
+++ b/search/searcher/search_regexp.go
@ -15,7 +15,9 @@
 package searcher

 import (
+	"log"
 	"regexp"
+	"time"

 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/search"
@ -29,19 +31,40 @@ import (
 func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp,
 	field string, boost float64, options search.SearcherOptions) (
 	search.Searcher, error) {
-
-	prefixTerm, complete := pattern.LiteralPrefix()
 	var candidateTerms []string
-	if complete {
-		// there is no pattern
-		candidateTerms = []string{prefixTerm}
-	} else {
-		var err error
-		candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
-			prefixTerm)
+	t := time.Now()
+	if ir, ok := indexReader.(index.IndexReaderAdv); ok {
+		fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String()))
 		if err != nil {
 			return nil, err
 		}
+		// collect the dictionary's terms; close eagerly so errors reach the caller
+		tfd, err := fieldDict.Next()
+		for err == nil && tfd != nil {
+			candidateTerms = append(candidateTerms, tfd.Term)
+			tfd, err = fieldDict.Next()
+		}
+		log.Printf("fsa time took-> %f", time.Since(t).Seconds())
+		if cerr := fieldDict.Close(); cerr != nil && err == nil {
+			err = cerr
+		}
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		prefixTerm, complete := pattern.LiteralPrefix()
+		if complete {
+			// there is no pattern
+			candidateTerms = []string{prefixTerm}
+		} else {
+			var err error
+			candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field,
+				prefixTerm)
+			if err != nil {
+				return nil, err
+			}
+		}
+		log.Printf("time took-> %f", time.Since(t).Seconds())
 	}

 	return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,