From 1ef41101bac733727e9930eeab96c458b5ebe68a Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 19 Mar 2018 17:20:49 +0530 Subject: [PATCH] vellum adoption for regex and fuzzy queries --- index/index.go | 7 ++++++ index/scorch/segment/empty.go | 9 +++++++ index/scorch/segment/mem/dict.go | 24 ++++++++++++++++++ index/scorch/segment/segment.go | 2 ++ index/scorch/segment/zap/dict.go | 42 ++++++++++++++++++++++++++++++++ index/scorch/snapshot_index.go | 14 +++++++++++ index/scorch/snapshot_segment.go | 9 +++++++ search/searcher/search_fuzzy.go | 22 ++++++++++++++++- search/searcher/search_regexp.go | 41 ++++++++++++++++++++++++------- 9 files changed, 160 insertions(+), 10 deletions(-) diff --git a/index/index.go b/index/index.go index e5a69297..96fb7d25 100644 --- a/index/index.go +++ b/index/index.go @@ -96,6 +96,13 @@ type IndexReader interface { Close() error } +// IndexReaderAdv is an optional interface for advanced users +// Hope to have a better name here... +type IndexReaderAdv interface { + FieldDictRegex(field string, regex []byte) (FieldDict, error) + FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error) +} + // FieldTerms contains the terms used by a document, keyed by field type FieldTerms map[string][]string diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 6c19f60f..918875e1 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -76,6 +76,15 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { return &EmptyDictionaryIterator{} } +func (e *EmptyDictionary) RegexIterator(start string) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + +func (e *EmptyDictionary) FuzzyIterator(term string, + fuzziness int) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + type EmptyDictionaryIterator struct{} func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { diff --git a/index/scorch/segment/mem/dict.go 
b/index/scorch/segment/mem/dict.go index 9f5a873a..2877f945 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -98,6 +98,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator } } +// RegexIterator returns an iterator which only visits terms matching +// the given regular expression. +// TODO complete the implementation +func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator { + offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], regex) + return &DictionaryIterator{ + d: d, + offset: offset, + prefix: regex, + } +} + +// FuzzyIterator returns an iterator which only visits terms matching +// the given edit distance. +// TODO complete the implementation +func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator { + offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], term) + return &DictionaryIterator{ + d: d, + offset: offset, + prefix: term, + } +} + // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 8eee5f75..cf3b21f4 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -48,6 +48,8 @@ type TermDictionary interface { Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator + RegexIterator(regex string) DictionaryIterator + FuzzyIterator(term string, fuzziness int) DictionaryIterator } type DictionaryIterator interface { diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3b8132f2..2281ec04 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -21,6 +21,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/couchbase/vellum" + 
"github.com/couchbase/vellum/levenshtein" "github.com/couchbase/vellum/regexp" ) @@ -148,6 +149,47 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator return rv } +// RegexIterator returns an iterator which only visits terms matching the +// specified regex +func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + r, err := regexp.New(regex) + if err == nil { + itr, err := d.fst.Search(r, nil, nil) + if err == nil { + rv.itr = itr + } + } + } + + return rv +} + +// FuzzyIterator returns an iterator which only visits terms within the +// specified edit/levenshtein distance +func (d *Dictionary) FuzzyIterator(term string, + fuzziness int) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + la, err := levenshtein.New(term, fuzziness) + if err == nil { + itr, err := d.fst.Search(la, nil, nil) + if err == nil { + rv.itr = itr + } + } + } + + return rv +} + // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 6f4b0288..17208698 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -175,6 +175,20 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, }) } +func (i *IndexSnapshot) FieldDictRegex(field string, + termRegex []byte) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.RegexIterator(string(termRegex)) + }) +} + +func (i *IndexSnapshot) FieldDictFuzzy(field string, + term []byte, fuzziness int) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.FuzzyIterator(string(term), fuzziness) + }) +} + func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) 
{ results := make(chan *asynchSegmentResult) for index, segment := range i.segment { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index edf52a6e..6edc6ae6 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -48,6 +48,15 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic return s.d.RangeIterator(start, end) } +func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator { + return s.d.RegexIterator(regex) +} + +func (s *SegmentDictionarySnapshot) FuzzyIterator(term string, + fuzziness int) segment.DictionaryIterator { + return s.d.FuzzyIterator(term, fuzziness) +} + type SegmentSnapshot struct { id uint64 segment segment.Segment diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 90abaa0a..69aab2f7 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -15,6 +15,9 @@ package searcher import ( + "log" + "time" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -31,9 +34,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, break } } - + t := time.Now() candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, field, prefixTerm) + log.Printf("time taken-> %f", time.Since(t).Seconds()) if err != nil { return nil, err } @@ -49,6 +53,22 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if len(prefixTerm) > 0 { fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) } else { + // in case of advanced reader implementations directly call + // the levenshtein automaton based iterator to collect the + // candidate terms + if ir, ok := indexReader.(index.IndexReaderAdv); ok { + fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness) + if err != nil { + return rv, err + } + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + rv = append(rv, tfd.Term) + 
tfd, err = fieldDict.Next() + } + log.Printf("candidate FSA fuzzy terms: %+v", rv) + return rv, nil + } fieldDict, err = indexReader.FieldDict(field) } defer func() { diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index b7cf520a..806f135a 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -15,7 +15,9 @@ package searcher import ( + "log" "regexp" + "time" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" @@ -29,19 +31,40 @@ import ( func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { - - prefixTerm, complete := pattern.LiteralPrefix() var candidateTerms []string - if complete { - // there is no pattern - candidateTerms = []string{prefixTerm} - } else { - var err error - candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field, - prefixTerm) + t := time.Now() + if ir, ok := indexReader.(index.IndexReaderAdv); ok { + fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String())) if err != nil { return nil, err } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + + // enumerate the terms and check against regexp + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + candidateTerms = append(candidateTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + log.Printf("fsa time took-> %f", time.Since(t).Seconds()) + } else { + prefixTerm, complete := pattern.LiteralPrefix() + if complete { + // there is no pattern + candidateTerms = []string{prefixTerm} + } else { + var err error + candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field, + prefixTerm) + if err != nil { + return nil, err + } + } + log.Printf("time took-> %f", time.Since(t).Seconds()) } return NewMultiTermSearcher(indexReader, candidateTerms, field, boost,