From 5164e70f6e7f53e0f4847eaf1327d847301ccd7a Mon Sep 17 00:00:00 2001 From: Danny Tylman Date: Tue, 9 Aug 2016 16:18:53 +0300 Subject: [PATCH 01/24] Adding sort to SearchRequest. --- index_impl.go | 5 +- index_test.go | 49 ++++++ search.go | 5 + search/collectors/collector_heap.go | 225 ++++++++++++++++++++++++++++ search/search.go | 7 + 5 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 search/collectors/collector_heap.go diff --git a/index_impl.go b/index_impl.go index f914ec94..bd0aeeb1 100644 --- a/index_impl.go +++ b/index_impl.go @@ -384,8 +384,6 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr return nil, ErrorIndexClosed } - collector := collectors.NewTopScorerSkipCollector(req.Size, req.From) - // open a reader for this search indexReader, err := i.i.Reader() if err != nil { @@ -407,6 +405,9 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } }() + collector := collectors.NewHeapCollector(req.Size, req.From, indexReader, req.Sort) + //collector := collectors.NewTopScorerSkipCollector(req.Size, req.From) + if req.Facets != nil { facetsBuilder := search.NewFacetsBuilder(indexReader) for facetName, facetRequest := range req.Facets { diff --git a/index_test.go b/index_test.go index ae0f21ae..1a20f456 100644 --- a/index_test.go +++ b/index_test.go @@ -29,6 +29,9 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store/null" "github.com/blevesearch/bleve/search" + + "github.com/Pallinder/go-randomdata" + "github.com/divan/num2words" ) func TestCrud(t *testing.T) { @@ -715,6 +718,52 @@ func TestIndexMetadataRaceBug198(t *testing.T) { close(done) } +func TestSortMatchSearch(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + index, err := New("testidx", NewIndexMapping()) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < 200; i++ { + doc := make(map[string]interface{}) + doc["Name"] = randomdata.SillyName() + doc["Day"] = randomdata.Day() + doc["Number"] = num2words.Convert(i) + err = index.Index(fmt.Sprintf("%d", i), doc) + if err != nil { + t.Fatal(err) + } + } + + req := NewSearchRequest(NewMatchQuery("one")) + req.SortBy("Day", true) + req.SortBy("Name", true) + req.Fields = []string{"*"} + sr, err := index.Search(req) + if err != nil { + t.Fatal(err) + } + prev := "" + for _, hit := range sr.Hits { + val := hit.Fields["Day"].(string) + if prev > val { + t.Errorf("Hits must be sorted by 'Day'. Found '%s' before '%s'", prev, val) + } + prev = val + } + err = index.Close() + if err != nil { + t.Fatal(err) + } +} + func TestIndexCountMatchSearch(t *testing.T) { defer func() { err := os.RemoveAll("testidx") diff --git a/search.go b/search.go index e9ca34be..7f228e5d 100644 --- a/search.go +++ b/search.go @@ -201,6 +201,7 @@ type SearchRequest struct { Fields []string `json:"fields"` Facets FacetsRequest `json:"facets"` Explain bool `json:"explain"` + Sort search.SortOrder `json:"sort"` } func (sr *SearchRequest) Validate() error { @@ -220,6 +221,10 @@ func (r *SearchRequest) AddFacet(facetName string, f *FacetRequest) { r.Facets[facetName] = f } +func (r *SearchRequest) SortBy(fieldName string, ascends bool) { + r.Sort = append(r.Sort, search.SearchSort{Field: fieldName, Ascends: ascends}) +} + // UnmarshalJSON deserializes a JSON representation of // a SearchRequest func (r *SearchRequest) UnmarshalJSON(input []byte) error { diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go new file mode 100644 index 00000000..b4624f05 --- /dev/null +++ b/search/collectors/collector_heap.go @@ -0,0 +1,225 @@ +package collectors + +import ( + "container/heap" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/search" + "golang.org/x/net/context" + "time" +) + +type collectedDoc struct { + match search.DocumentMatch + doc *document.Document +} + +type HeapCollector struct { + size int + skip int + total uint64 + took time.Duration + sort search.SortOrder + results []*collectedDoc + facetsBuilder *search.FacetsBuilder + reader index.IndexReader +} + +func NewHeapCollector(size int, skip int, reader index.IndexReader, sort search.SortOrder) *HeapCollector { + hc := &HeapCollector{size: size, skip: skip, reader: reader, sort: sort} + heap.Init(hc) + return hc +} + +func (hc *HeapCollector) Collect(ctx context.Context, searcher search.Searcher) error { + startTime := time.Now() + var err error + var pre search.DocumentMatch // A single pre-alloc'ed, reused instance. + var next *search.DocumentMatch + select { + case <-ctx.Done(): + return ctx.Err() + default: + next, err = searcher.Next(&pre) + } + for err == nil && next != nil { + if hc.total%COLLECT_CHECK_DONE_EVERY == 0 { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + } + hc.collectSingle(next) + if hc.facetsBuilder != nil { + err = hc.facetsBuilder.Update(next) + if err != nil { + break + } + } + next, err = searcher.Next(pre.Reset()) + } + // compute search duration + hc.took = time.Since(startTime) + if err != nil { + return err + } + return nil +} + +func (hc *HeapCollector) collectSingle(dmIn *search.DocumentMatch) error { + // increment total hits + hc.total++ + single := new(collectedDoc) + single.match = *dmIn + var err error + if len(hc.sort) > 0 { + single.doc, err = hc.reader.Document(dmIn.ID) + if err != nil { + return err + } + } + if hc.Len() >= hc.size { + hc.Pop() + } + heap.Push(hc, single) + return nil + +} + +func (hc *HeapCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) { + hc.facetsBuilder = facetsBuilder +} + +func (hc *HeapCollector) Results() search.DocumentMatchCollection { + rv := make(search.DocumentMatchCollection, hc.Len()) + for i := 0; hc.Len() > 0; i++ { + doc := heap.Pop(hc).(*collectedDoc) + rv[i] = &doc.match + } + return rv +} + +func (hc *HeapCollector) Total() uint64 { + return hc.total +} + +func (hc *HeapCollector) MaxScore() float64 { + return 0 +} + +func (hc *HeapCollector) FacetResults() search.FacetResults { + if hc.facetsBuilder != nil { + return hc.facetsBuilder.Results() + } + return search.FacetResults{} +} + +func (hc *HeapCollector) Len() int { + return len(hc.results) +} + +func field(doc *document.Document, field string) document.Field { + if doc == nil { + return nil + } + for _, f := range doc.Fields { + if f.Name() == field { + return f + } + } + return nil +} + +func textFieldCompare(i, j *document.TextField, ascends bool) (bool, bool) { + ivalue := string(i.Value()) + jvalue := string(j.Value()) + if ivalue == jvalue { + return true, false + } + if ascends { + return false, ivalue < jvalue + } + return false, ivalue > jvalue +} + +func numericFieldCompare(i, j *document.NumericField, ascends bool) (bool, bool) { + ivalue, _ := i.Number() + jvalue, _ := i.Number() + if ivalue == jvalue { + return true, false + } + if ascends { + return false, ivalue < jvalue + } + return false, ivalue > jvalue +} + +func dateTimeFieldCompare(i, j *document.DateTimeField, ascends bool) (bool, bool) { + ivalue, _ := i.DateTime() + jvalue, _ := i.DateTime() + if ivalue.Equal(jvalue) { + return true, false + } + if ascends { + return false, ivalue.Before(jvalue) + } + return false, ivalue.After(jvalue) +} + +func boolFieldCompare(i, j *document.BooleanField, ascends bool) (bool, bool) { + ivalue, _ := i.Boolean() + jvalue, _ := i.Boolean() + if ivalue == jvalue { + return true, false + } + return false, ascends && jvalue +} + +//returns: equals, less +func fieldCompare(i, j document.Field, ascends bool) (bool, bool) { + switch ft := i.(type) { + case *document.TextField: + return textFieldCompare(ft, j.(*document.TextField), ascends) + case *document.NumericField: + return numericFieldCompare(ft, j.(*document.NumericField), ascends) + case *document.DateTimeField: + return dateTimeFieldCompare(ft, j.(*document.DateTimeField), ascends) + case *document.BooleanField: + return boolFieldCompare(ft, j.(*document.BooleanField), ascends) + } + return false, false +} + +func (hc *HeapCollector) Less(i, j int) bool { + if len(hc.sort) > 0 { + doci := hc.results[i].doc + docj := hc.results[j].doc + if doci != nil && docj != nil { + for _, so := range hc.sort { + fieldi := field(doci, so.Field) + fieldj := field(docj, so.Field) + equals, lt := fieldCompare(fieldi, fieldj, so.Ascends) + if !equals { + return lt + } + } + } + } + return hc.results[i].match.Score > hc.results[j].match.Score +} + +func (hc *HeapCollector) Swap(i, j int) { + hc.results[i], hc.results[j] = hc.results[j], hc.results[i] +} + +func (hc *HeapCollector) Push(x interface{}) { + hc.results = append(hc.results, x.(*collectedDoc)) +} + +func (hc *HeapCollector) Pop() interface{} { + n := len(hc.results) + doc := hc.results[n-1] + hc.results = hc.results[0 : n-1] + return doc +} diff --git a/search/search.go b/search/search.go index 4b3b988b..af004499 100644 --- a/search/search.go +++ b/search/search.go @@ -105,3 +105,10 @@ type Searcher interface { Count() uint64 Min() int } + +type SearchSort struct { + Field string `json:"field"` + Ascends bool `json:"ascends"` +} + +type SortOrder []SearchSort From 154d1b904be201d6529e5b5562d6c2adf38e3f43 Mon Sep 17 00:00:00 2001 From: Danny Tylman Date: Wed, 10 Aug 2016 11:13:38 +0300 Subject: [PATCH 02/24] Adding sort to SearchRequest. --- search/collectors/collector_heap.go | 54 ++++-- ...p_score_test.go => collector_heap_test.go} | 18 +- search/collectors/collector_top_score.go | 165 ------------------ 3 files changed, 52 insertions(+), 185 deletions(-) rename search/collectors/{collector_top_score_test.go => collector_heap_test.go} (94%) delete mode 100644 search/collectors/collector_top_score.go diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index b4624f05..7a6e542a 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -1,12 +1,23 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +// package collectors import ( "container/heap" + "math" + "time" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "golang.org/x/net/context" - "time" ) type collectedDoc struct { @@ -25,6 +36,8 @@ type HeapCollector struct { reader index.IndexReader } +var COLLECT_CHECK_DONE_EVERY = uint64(1024) + func NewHeapCollector(size int, skip int, reader index.IndexReader, sort search.SortOrder) *HeapCollector { hc := &HeapCollector{size: size, skip: skip, reader: reader, sort: sort} heap.Init(hc) @@ -79,12 +92,11 @@ func (hc *HeapCollector) collectSingle(dmIn *search.DocumentMatch) error { return err } } - if hc.Len() >= hc.size { - hc.Pop() - } heap.Push(hc, single) + if hc.Len() > hc.size+hc.skip { + heap.Pop(hc) + } return nil - } func (hc *HeapCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) { @@ -92,10 +104,16 @@ func (hc *HeapCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) { } func (hc *HeapCollector) Results() search.DocumentMatchCollection { - rv := make(search.DocumentMatchCollection, hc.Len()) - for i := 0; hc.Len() > 0; i++ { - doc := heap.Pop(hc).(*collectedDoc) - rv[i] = &doc.match + count := hc.Len() + size := count - hc.skip + rv := make(search.DocumentMatchCollection, size) + for count > 0 { + count-- + if count >= hc.skip { + size-- + doc := heap.Pop(hc).(*collectedDoc) + rv[size] = &doc.match + } } return rv } @@ -105,7 +123,15 @@ func (hc *HeapCollector) Total() uint64 { } func (hc *HeapCollector) MaxScore() float64 { - return 0 + var max float64 + for _, res := range hc.results { + max = math.Max(max, res.match.Score) + } + return max +} + +func (hc *HeapCollector) Took() time.Duration { + return hc.took } func (hc *HeapCollector) FacetResults() search.FacetResults { @@ -206,7 +232,13 @@ func (hc *HeapCollector) Less(i, j int) bool { } } } - return hc.results[i].match.Score > hc.results[j].match.Score + scori := hc.results[i].match.Score + scorj := hc.results[j].match.Score + // make sure the list is ordered if everything else is the same... + if scori==scorj{ + return hc.results[i].match.ID < hc.results[j].match.ID + } + return scori < scorj } func (hc *HeapCollector) Swap(i, j int) { diff --git a/search/collectors/collector_top_score_test.go b/search/collectors/collector_heap_test.go similarity index 94% rename from search/collectors/collector_top_score_test.go rename to search/collectors/collector_heap_test.go index 4bf76140..d5308098 100644 --- a/search/collectors/collector_top_score_test.go +++ b/search/collectors/collector_heap_test.go @@ -83,7 +83,8 @@ func TestTop10Scores(t *testing.T) { }, } - collector := NewTopScorerCollector(10) + collector := NewHeapCollector(10, 0, nil, nil) + //collector:=NewTopScorerCollector(10) err := collector.Collect(context.Background(), searcher) if err != nil { t.Fatal(err) @@ -191,7 +192,7 @@ func TestTop10ScoresSkip10(t *testing.T) { }, } - collector := NewTopScorerSkipCollector(10, 10) + collector := NewHeapCollector(10, 10, nil, nil) err := collector.Collect(context.Background(), searcher) if err != nil { t.Fatal(err) @@ -288,7 +289,7 @@ func TestPaginationSameScores(t *testing.T) { } // first get first 5 hits - collector := NewTopScorerSkipCollector(5, 0) + collector := NewHeapCollector(5, 0, nil, nil) err := collector.Collect(context.Background(), searcher) if err != nil { t.Fatal(err) @@ -374,7 +375,7 @@ func TestPaginationSameScores(t *testing.T) { } // now get next 5 hits - collector = NewTopScorerSkipCollector(5, 5) + collector = NewHeapCollector(5, 5, nil, nil) err = collector.Collect(context.Background(), searcher) if err != nil { t.Fatal(err) @@ -397,21 +398,20 @@ func TestPaginationSameScores(t *testing.T) { t.Errorf("doc ID %s is in top 5 and next 5 result sets", hit.ID) } } - } func BenchmarkTop10of100000Scores(b *testing.B) { - benchHelper(10000, NewTopScorerCollector(10), b) + benchHelper(10000, NewHeapCollector(10, 0, nil, nil), b) } func BenchmarkTop100of100000Scores(b *testing.B) { - benchHelper(10000, NewTopScorerCollector(100), b) + benchHelper(10000, NewHeapCollector(100, 0, nil, nil), b) } func BenchmarkTop10of1000000Scores(b *testing.B) { - benchHelper(100000, NewTopScorerCollector(10), b) + benchHelper(100000, NewHeapCollector(10, 0, nil, nil), b) } func BenchmarkTop100of1000000Scores(b *testing.B) { - benchHelper(100000, NewTopScorerCollector(100), b) + benchHelper(100000, NewHeapCollector(100, 0, nil, nil), b) } diff --git a/search/collectors/collector_top_score.go b/search/collectors/collector_top_score.go deleted file mode 100644 index 90b0c75e..00000000 --- a/search/collectors/collector_top_score.go +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) 2014 Couchbase, Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file -// except in compliance with the License. You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software distributed under the -// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -// either express or implied. See the License for the specific language governing permissions -// and limitations under the License. - -package collectors - -import ( - "container/list" - "time" - - "golang.org/x/net/context" - - "github.com/blevesearch/bleve/search" -) - -type TopScoreCollector struct { - k int - skip int - results *list.List - took time.Duration - maxScore float64 - minScore float64 - total uint64 - facetsBuilder *search.FacetsBuilder -} - -func NewTopScorerCollector(k int) *TopScoreCollector { - return &TopScoreCollector{ - k: k, - skip: 0, - results: list.New(), - } -} - -func NewTopScorerSkipCollector(k, skip int) *TopScoreCollector { - return &TopScoreCollector{ - k: k, - skip: skip, - results: list.New(), - } -} - -func (tksc *TopScoreCollector) Total() uint64 { - return tksc.total -} - -func (tksc *TopScoreCollector) MaxScore() float64 { - return tksc.maxScore -} - -func (tksc *TopScoreCollector) Took() time.Duration { - return tksc.took -} - -var COLLECT_CHECK_DONE_EVERY = uint64(1024) - -func (tksc *TopScoreCollector) Collect(ctx context.Context, searcher search.Searcher) error { - startTime := time.Now() - var err error - var pre search.DocumentMatch // A single pre-alloc'ed, reused instance. - var next *search.DocumentMatch - select { - case <-ctx.Done(): - return ctx.Err() - default: - next, err = searcher.Next(&pre) - } - for err == nil && next != nil { - if tksc.total%COLLECT_CHECK_DONE_EVERY == 0 { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - } - tksc.collectSingle(next) - if tksc.facetsBuilder != nil { - err = tksc.facetsBuilder.Update(next) - if err != nil { - break - } - } - next, err = searcher.Next(pre.Reset()) - } - // compute search duration - tksc.took = time.Since(startTime) - if err != nil { - return err - } - return nil -} - -func (tksc *TopScoreCollector) collectSingle(dmIn *search.DocumentMatch) { - // increment total hits - tksc.total++ - - // update max score - if dmIn.Score > tksc.maxScore { - tksc.maxScore = dmIn.Score - } - - if dmIn.Score <= tksc.minScore { - return - } - - // Because the dmIn will be the single, pre-allocated, reused - // instance, we need to copy the dmIn into a new, standalone - // instance before inserting into our candidate results list. - dm := &search.DocumentMatch{} - *dm = *dmIn - - for e := tksc.results.Front(); e != nil; e = e.Next() { - curr := e.Value.(*search.DocumentMatch) - if dm.Score <= curr.Score { - - tksc.results.InsertBefore(dm, e) - // if we just made the list too long - if tksc.results.Len() > (tksc.k + tksc.skip) { - // remove the head - tksc.minScore = tksc.results.Remove(tksc.results.Front()).(*search.DocumentMatch).Score - } - return - } - } - // if we got to the end, we still have to add it - tksc.results.PushBack(dm) - if tksc.results.Len() > (tksc.k + tksc.skip) { - // remove the head - tksc.minScore = tksc.results.Remove(tksc.results.Front()).(*search.DocumentMatch).Score - } -} - -func (tksc *TopScoreCollector) Results() search.DocumentMatchCollection { - if tksc.results.Len()-tksc.skip > 0 { - rv := make(search.DocumentMatchCollection, tksc.results.Len()-tksc.skip) - i := 0 - skipped := 0 - for e := tksc.results.Back(); e != nil; e = e.Prev() { - if skipped < tksc.skip { - skipped++ - continue - } - rv[i] = e.Value.(*search.DocumentMatch) - i++ - } - return rv - } - return search.DocumentMatchCollection{} -} - -func (tksc *TopScoreCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) { - tksc.facetsBuilder = facetsBuilder -} - -func (tksc *TopScoreCollector) FacetResults() search.FacetResults { - if tksc.facetsBuilder != nil { - return tksc.facetsBuilder.Results() - } - return search.FacetResults{} -} From 176a6e7a0d270f08fcc97ba702dbf882b812a570 Mon Sep 17 00:00:00 2001 From: Danny Tylman Date: Wed, 10 Aug 2016 11:35:47 +0300 Subject: [PATCH 03/24] Adding sort to SearchRequest. --- search/collectors/collector_heap.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 7a6e542a..4a02e4d0 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -235,8 +235,8 @@ func (hc *HeapCollector) Less(i, j int) bool { scori := hc.results[i].match.Score scorj := hc.results[j].match.Score // make sure the list is ordered if everything else is the same... - if scori==scorj{ - return hc.results[i].match.ID < hc.results[j].match.ID + if scori == scorj { + return hc.results[i].match.ID > hc.results[j].match.ID } return scori < scorj } From cb8a1e95a84b58f51d9b3122d34f365c96e953eb Mon Sep 17 00:00:00 2001 From: Danny Tylman Date: Wed, 10 Aug 2016 11:41:16 +0300 Subject: [PATCH 04/24] closes #110 --- search/collectors/collector_heap_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/search/collectors/collector_heap_test.go b/search/collectors/collector_heap_test.go index d5308098..58599915 100644 --- a/search/collectors/collector_heap_test.go +++ b/search/collectors/collector_heap_test.go @@ -84,7 +84,6 @@ func TestTop10Scores(t *testing.T) { } collector := NewHeapCollector(10, 0, nil, nil) - //collector:=NewTopScorerCollector(10) err := collector.Collect(context.Background(), searcher) if err != nil { t.Fatal(err) From 0d6a2b565f66d8e194c531cf8a232b25dca797bb Mon Sep 17 00:00:00 2001 From: Danny Tylman Date: Wed, 10 Aug 2016 11:44:31 +0300 Subject: [PATCH 05/24] closes #110 --- index_impl.go | 1 - 1 file changed, 1 deletion(-) diff --git a/index_impl.go b/index_impl.go index bd0aeeb1..06161c76 100644 --- a/index_impl.go +++ b/index_impl.go @@ -406,7 +406,6 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr }() collector := collectors.NewHeapCollector(req.Size, req.From, indexReader, req.Sort) - //collector := collectors.NewTopScorerSkipCollector(req.Size, req.From) if req.Facets != nil { facetsBuilder := search.NewFacetsBuilder(indexReader) From 7d4b7fb67d7ca832291c8d37a7c2b9b7766321f7 Mon Sep 17 00:00:00 2001 From: Danny Tylman Date: Wed, 10 Aug 2016 16:16:58 +0300 Subject: [PATCH 06/24] Adding sort to SearchRequest. --- search/collectors/collector_heap.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 4a02e4d0..be58a4d4 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -63,7 +63,10 @@ func (hc *HeapCollector) Collect(ctx context.Context, searcher search.Searcher) default: } } - hc.collectSingle(next) + err = hc.collectSingle(next) + if err != nil { + break + } if hc.facetsBuilder != nil { err = hc.facetsBuilder.Update(next) if err != nil { From b585c5786b61d695f696e46c04ec4692b6021a6a Mon Sep 17 00:00:00 2001 From: Danny Tylman Date: Thu, 11 Aug 2016 11:35:08 +0300 Subject: [PATCH 07/24] removing mock data generation packages from unit-tests fixing wrong sort order on certain fields --- index_test.go | 14 +++++++------- search/collectors/collector_heap.go | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/index_test.go b/index_test.go index 1a20f456..e2970eb6 100644 --- a/index_test.go +++ b/index_test.go @@ -29,9 +29,6 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store/null" "github.com/blevesearch/bleve/search" - - "github.com/Pallinder/go-randomdata" - "github.com/divan/num2words" ) func TestCrud(t *testing.T) { @@ -731,18 +728,21 @@ func TestSortMatchSearch(t *testing.T) { t.Fatal(err) } + names := []string{"Noam", "Uri", "David", "Yosef", "Eitan", "Itay", "Ariel", "Daniel", "Omer", "Yogev", "Yehonatan", "Moshe", "Mohammed", "Yusuf", "Omar"} + days := []string{"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"} + numbers := []string{"One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Ten", "Eleven", "Twelve"} for i := 0; i < 200; i++ { doc := make(map[string]interface{}) - doc["Name"] = randomdata.SillyName() - doc["Day"] = randomdata.Day() - doc["Number"] = num2words.Convert(i) + doc["Name"] = names[i%len(names)] + doc["Day"] = days[i%len(days)] + doc["Number"] = numbers[i%len(numbers)] err = index.Index(fmt.Sprintf("%d", i), doc) if err != nil { t.Fatal(err) } } - req := NewSearchRequest(NewMatchQuery("one")) + req := NewSearchRequest(NewMatchQuery("One")) req.SortBy("Day", true) req.SortBy("Name", true) req.Fields = []string{"*"} diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index be58a4d4..e6364b6f 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -167,9 +167,9 @@ func textFieldCompare(i, j *document.TextField, ascends bool) (bool, bool) { return true, false } if ascends { - return false, ivalue < jvalue + return false, ivalue > jvalue } - return false, ivalue > jvalue + return false, ivalue < jvalue } func numericFieldCompare(i, j *document.NumericField, ascends bool) (bool, bool) { @@ -179,9 +179,9 @@ func numericFieldCompare(i, j *document.NumericField, ascends bool) (bool, bool) return true, false } if ascends { - return false, ivalue < jvalue + return false, ivalue > jvalue } - return false, ivalue > jvalue + return false, ivalue < jvalue } func dateTimeFieldCompare(i, j *document.DateTimeField, ascends bool) (bool, bool) { @@ -191,9 +191,9 @@ func dateTimeFieldCompare(i, j *document.DateTimeField, ascends bool) (bool, boo return true, false } if ascends { - return false, ivalue.Before(jvalue) + return false, ivalue.After(jvalue) } - return false, ivalue.After(jvalue) + return false, ivalue.Before(jvalue) } func boolFieldCompare(i, j *document.BooleanField, ascends bool) (bool, bool) { From be56380833c50a383cbe2467147e98f6ea83f57d Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 12 Aug 2016 14:49:22 -0400 Subject: [PATCH 08/24] fix SearchRequest parsing to default to proper default sort order --- search.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/search.go b/search.go index 9c5dd786..8caa1f8c 100644 --- a/search.go +++ b/search.go @@ -238,6 +238,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { Fields []string `json:"fields"` Facets FacetsRequest `json:"facets"` Explain bool `json:"explain"` + Sort search.SortOrder `json:"sort"` } err := json.Unmarshal(input, &temp) @@ -250,6 +251,9 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { } else { r.Size = *temp.Size } + if temp.Sort == nil { + r.Sort = search.SortOrder{&search.SortScore{Descending: true}} + } r.From = temp.From r.Explain = temp.Explain r.Highlight = temp.Highlight From 0d873916f076d27f9685cd6d5cbeb838a6e76731 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 12 Aug 2016 19:16:24 -0400 Subject: [PATCH 09/24] support JSON marshal/unmarshal of search request sort The syntax used is an array of strings. The strings "_id" and "_score" are special and reserved to mean sorting on the document id and score repsectively. All other strings refer to the literal field name with that value. If the string is prefixed with "-" the order of that sort is descending, without it, it defaults to ascending. Examples: "sort":["-abv","-_score"] This will sort results in decreasing order of the "abv" field. Results which have the same value of the "abv" field will then be sorted by their score, also decreasing. If no value for "sort" is provided in the search request the default soring is the same as before, which is decreasing score. --- search.go | 7 ++- search/search.go | 56 ---------------------- search/sort.go | 120 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 125 insertions(+), 58 deletions(-) diff --git a/search.go b/search.go index 8caa1f8c..7d793002 100644 --- a/search.go +++ b/search.go @@ -238,7 +238,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { Fields []string `json:"fields"` Facets FacetsRequest `json:"facets"` Explain bool `json:"explain"` - Sort search.SortOrder `json:"sort"` + Sort []json.RawMessage `json:"sort"` } err := json.Unmarshal(input, &temp) @@ -253,6 +253,11 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { } if temp.Sort == nil { r.Sort = search.SortOrder{&search.SortScore{Descending: true}} + } else { + r.Sort, err = search.ParseSortOrder(temp.Sort) + if err != nil { + return err + } } r.From = temp.From r.Explain = temp.Explain diff --git a/search/search.go b/search/search.go index d0b9d842..b6a2f6be 100644 --- a/search/search.go +++ b/search/search.go @@ -136,59 +136,3 @@ type Searcher interface { type SearchContext struct { DocumentMatchPool *DocumentMatchPool } - -type SearchSort interface { - Compare(a, b *DocumentMatch) int - - RequiresDocID() bool - RequiresScoring() bool - RequiresStoredFields() []string -} - -type SortOrder []SearchSort - -func (so SortOrder) Compare(i, j *DocumentMatch) int { - // compare the documents on all search sorts until a differences is found - for _, soi := range so { - c := soi.Compare(i, j) - if c == 0 { - continue - } - return c - } - // if they are the same at this point, impose order based on index natural sort order - if i.HitNumber == j.HitNumber { - return 0 - } else if i.HitNumber > j.HitNumber { - return 1 - } - return -1 -} - -func (so SortOrder) RequiresScore() bool { - rv := false - for _, soi := range so { - if soi.RequiresScoring() { - rv = true - } - } - return rv -} - -func (so SortOrder) RequiresDocID() bool { - rv := false - for _, soi := range so { - if soi.RequiresDocID() { - rv = true - } - } - return rv -} - -func (so SortOrder) RequiredStoredFields() []string { - var rv []string - for _, soi := range so { - rv = append(rv, soi.RequiresStoredFields()...) - } - return rv -} diff --git a/search/sort.go b/search/sort.go index 3746d41f..921482fe 100644 --- a/search/sort.go +++ b/search/sort.go @@ -9,7 +9,104 @@ package search -import "strings" +import ( + "encoding/json" + "strings" +) + +type SearchSort interface { + Compare(a, b *DocumentMatch) int + + RequiresDocID() bool + RequiresScoring() bool + RequiresStoredFields() []string +} + +func ParseSearchSort(input json.RawMessage) (SearchSort, error) { + var tmp string + err := json.Unmarshal(input, &tmp) + if err != nil { + return nil, err + } + descending := false + if strings.HasPrefix(tmp, "-") { + descending = true + tmp = tmp[1:] + } + if tmp == "_id" { + return &SortDocID{ + Descending: descending, + }, nil + } else if tmp == "_score" { + return &SortScore{ + Descending: descending, + }, nil + } + return &SortStoredField{ + Field: tmp, + Descending: descending, + }, nil +} + +func ParseSortOrder(in []json.RawMessage) (SortOrder, error) { + rv := make(SortOrder, 0, len(in)) + for _, i := range in { + ss, err := ParseSearchSort(i) + if err != nil { + return nil, err + } + rv = append(rv, ss) + } + return rv, nil +} + +type SortOrder []SearchSort + +func (so SortOrder) Compare(i, j *DocumentMatch) int { + // compare the documents on all search sorts until a differences is found + for _, soi := range so { + c := soi.Compare(i, j) + if c == 0 { + continue + } + return c + } + // if they are the same at this point, impose order based on index natural sort order + if i.HitNumber == j.HitNumber { + return 0 + } else if i.HitNumber > j.HitNumber { + return 1 + } + return -1 +} + +func (so SortOrder) RequiresScore() bool { + rv := false + for _, soi := range so { + if soi.RequiresScoring() { + rv = true + } + } + return rv +} + +func (so SortOrder) RequiresDocID() bool { + rv := false + for _, soi := range so { + if soi.RequiresDocID() { + rv = true + } + } + return rv +} + +func (so SortOrder) RequiredStoredFields() []string { + var rv []string + for _, soi := range so { + rv = append(rv, soi.RequiresStoredFields()...) + } + return rv +} // SortStoredField will sort results by the value of a stored field type SortStoredField struct { @@ -31,6 +128,13 @@ func (s *SortStoredField) RequiresScoring() bool { return false } // RequiresStoredFields says this SearchStore requires the specified stored field func (s *SortStoredField) RequiresStoredFields() []string { return []string{s.Field} } +func (s *SortStoredField) MarshalJSON() ([]byte, error) { + if s.Descending { + return json.Marshal("-" + s.Field) + } + return json.Marshal(s.Field) +} + // SortDocID will sort results by the document identifier type SortDocID struct { Descending bool @@ -53,6 +157,13 @@ func (s *SortDocID) RequiresScoring() bool { return false } // RequiresStoredFields says this SearchStore does not require any stored fields func (s *SortDocID) RequiresStoredFields() []string { return nil } +func (s *SortDocID) MarshalJSON() ([]byte, error) { + if s.Descending { + return json.Marshal("-_id") + } + return json.Marshal("_id") +} + // SortScore will sort results by the document match score type SortScore struct { Descending bool @@ -76,3 +187,10 @@ func (s *SortScore) RequiresScoring() bool { return true } // RequiresStoredFields says this SearchStore does not require any store fields func (s *SortScore) RequiresStoredFields() []string { return nil } + +func (s *SortScore) MarshalJSON() ([]byte, error) { + if s.Descending { + return json.Marshal("-_score") + } + return json.Marshal("_score") +} From 750e0ac16c7019b94d7340b33d73bcd900a0dff3 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 17 Aug 2016 09:20:12 -0700 Subject: [PATCH 10/24] change sort field impl to use indexed values not stored values --- document/document.go | 15 - document/field.go | 83 ------ document/field_boolean.go | 11 - document/field_datetime.go | 11 - document/field_numeric.go | 11 - document/field_test.go | 383 -------------------------- document/field_text.go | 8 - index/index.go | 24 +- index/upside_down/index_reader.go | 20 +- index/upside_down/upside_down_test.go | 2 +- index_test.go | 4 +- numeric_util/prefix_coded.go | 19 +- numeric_util/prefix_coded_test.go | 39 +++ search/collectors/collector_heap.go | 19 +- search/collectors/search_test.go | 6 +- search/facets_builder.go | 2 +- search/search.go | 4 + search/sort.go | 151 ++++++++-- 18 files changed, 231 insertions(+), 581 deletions(-) delete mode 100644 document/field_test.go diff --git a/document/document.go b/document/document.go index e5e015c1..db6a1e0c 100644 --- a/document/document.go +++ b/document/document.go @@ -36,21 +36,6 @@ func (d *Document) AddField(f Field) *Document { return d } -func (d *Document) FieldNamed(field string) Field { - for _, f := range d.Fields { - if f.Name() == field { - return f - } - } - return nil -} - -func (d *Document) CompareFieldsNamed(other *Document, field string, descending bool) int { - fieldi := d.FieldNamed(field) - fieldj := other.FieldNamed(field) - return CompareFieldValues(fieldi, fieldj, descending) -} - func (d *Document) GoString() string { fields := "" for i, field := range d.Fields { diff --git a/document/field.go b/document/field.go index 78fb2df8..9b4c1ccc 100644 --- a/document/field.go +++ b/document/field.go @@ -32,86 +32,3 @@ type Field interface { // the rate of indexing NumPlainTextBytes() uint64 } - -// CompareFieldValues provides ordering amongst stored field values -// when trying compare field values of different types, -// we impose the following order: -// - nil (missing field) -// - boolean -// - number -// - text -// - date -func CompareFieldValues(i, j Field, descending bool) int { - - lower := func() int { - if descending { - return 1 - } - return -1 - } - - higher := func() int { - if descending { - return -1 - } - return 1 - } - - switch i := i.(type) { - case nil: - switch j.(type) { - case nil: - return 0 - default: - return lower() - } - case *BooleanField: - switch j := j.(type) { - case nil: - return higher() - case *BooleanField: - return i.Compare(j, descending) - default: - return lower() - } - case *NumericField: - switch j := j.(type) { - case nil: - return higher() - case *BooleanField: - return higher() - case *NumericField: - return i.Compare(j, descending) - default: - return lower() - } - case *TextField: - switch j := j.(type) { - case nil: - return higher() - case *BooleanField: - return higher() - case *NumericField: - return higher() - case *TextField: - return i.Compare(j, descending) - default: - return lower() - } - case *DateTimeField: - switch j := j.(type) { - case nil: - return higher() - case *BooleanField: - return higher() - case *NumericField: - return higher() - case *TextField: - return higher() - case *DateTimeField: - return i.Compare(j, descending) - } - } - - return 0 -} diff --git a/document/field_boolean.go b/document/field_boolean.go index 7674f824..9860197f 100644 --- a/document/field_boolean.go +++ b/document/field_boolean.go @@ -71,17 +71,6 @@ func (b *BooleanField) NumPlainTextBytes() uint64 { return b.numPlainTextBytes } -func (b *BooleanField) Compare(other *BooleanField, descending bool) int { - bv, _ := b.Boolean() - otherbv, _ := other.Boolean() - if bv == otherbv { - return 0 - } else if (otherbv && !descending) || (bv && descending) { - return -1 - } - return 1 -} - func NewBooleanFieldFromBytes(name string, arrayPositions []uint64, value []byte) *BooleanField { return &BooleanField{ name: name, diff --git a/document/field_datetime.go b/document/field_datetime.go index ee85fca2..e5050c33 100644 --- a/document/field_datetime.go +++ b/document/field_datetime.go @@ -100,17 +100,6 @@ func (n *DateTimeField) NumPlainTextBytes() uint64 { return n.numPlainTextBytes } -func (n *DateTimeField) Compare(other *DateTimeField, descending bool) int { - dt, _ := n.DateTime() - otherdt, _ := other.DateTime() - if dt.Equal(otherdt) { - return 0 - } else if (dt.Before(otherdt) && !descending) || (otherdt.Before(dt) && descending) { - return -1 - } - return 1 -} - func NewDateTimeFieldFromBytes(name string, arrayPositions []uint64, value []byte) *DateTimeField { return &DateTimeField{ name: name, diff --git a/document/field_numeric.go b/document/field_numeric.go index 17ce5f7b..e7e15684 100644 --- a/document/field_numeric.go +++ b/document/field_numeric.go @@ -96,17 +96,6 @@ func (n *NumericField) NumPlainTextBytes() uint64 { return n.numPlainTextBytes } -func (n *NumericField) Compare(other *NumericField, descending bool) int { - num, _ := n.Number() - othernum, _ := other.Number() - if num == othernum { - return 0 - } else if (num < othernum && !descending) || (num > othernum && descending) { - return -1 - } - return 1 -} - func NewNumericFieldFromBytes(name string, arrayPositions []uint64, value []byte) *NumericField { return &NumericField{ name: name, diff --git a/document/field_test.go b/document/field_test.go deleted file mode 100644 index 6e0f890a..00000000 --- a/document/field_test.go +++ /dev/null @@ -1,383 +0,0 @@ -// Copyright (c) 2014 Couchbase, Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file -// except in compliance with the License. You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software distributed under the -// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -// either express or implied. See the License for the specific language governing permissions -// and limitations under the License. - -package document - -import ( - "testing" - "time" -) - -func TestCompareFieldValues(t *testing.T) { - - t1 := time.Now() - t2 := t1.Add(1 * time.Hour) - - dtf1, _ := NewDateTimeField("", nil, t1) - dtf2, _ := NewDateTimeField("", nil, t2) - - tests := []struct { - l Field - r Field - desc bool - res int - }{ - // nil simple - { - l: nil, - r: nil, - res: 0, - }, - // boolean simple - { - l: NewBooleanField("", nil, true), - r: NewBooleanField("", nil, true), - res: 0, - }, - { - l: NewBooleanField("", nil, true), - r: NewBooleanField("", nil, false), - res: 1, - }, - { - l: NewBooleanField("", nil, false), - r: NewBooleanField("", nil, true), - res: -1, - }, - { - l: NewBooleanField("", nil, true), - r: NewBooleanField("", nil, false), - desc: true, - res: -1, - }, - { - l: NewBooleanField("", nil, false), - r: NewBooleanField("", nil, true), - desc: true, - res: 1, - }, - // numeric simple - { - l: NewNumericField("", nil, 3.14), - r: NewNumericField("", nil, 3.14), - res: 0, - }, - { - l: NewNumericField("", nil, 5.14), - r: NewNumericField("", nil, 3.14), - res: 1, - }, - { - l: NewNumericField("", nil, 3.14), - r: NewNumericField("", nil, 5.14), - res: -1, - }, - { - l: NewNumericField("", nil, 5.14), - r: NewNumericField("", nil, 3.14), - desc: true, - res: -1, - }, - { - l: NewNumericField("", nil, 3.14), - r: NewNumericField("", nil, 5.14), - desc: true, - res: 1, - }, - // text simple - { - l: NewTextField("", nil, []byte("cat")), - r: NewTextField("", nil, []byte("cat")), - res: 0, - }, - { - l: NewTextField("", nil, []byte("dog")), - r: NewTextField("", nil, []byte("cat")), - res: 1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: NewTextField("", nil, []byte("dog")), - res: -1, - }, - { - l: NewTextField("", nil, []byte("dog")), - r: NewTextField("", nil, []byte("cat")), - desc: true, - res: -1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: NewTextField("", nil, []byte("dog")), - desc: true, - res: 1, - }, - // datetime simple - { - l: dtf1, - r: dtf1, - res: 0, - }, - { - l: dtf2, - r: dtf1, - res: 1, - }, - { - l: dtf1, - r: dtf2, - res: -1, - }, - { - l: dtf2, - r: dtf1, - desc: true, - res: -1, - }, - { - l: dtf1, - r: dtf2, - desc: true, - res: 1, - }, - // mixed types, nil left - { - l: nil, - r: NewBooleanField("", nil, true), - res: -1, - }, - { - l: nil, - r: NewBooleanField("", nil, true), - desc: true, - res: 1, - }, - { - l: nil, - r: NewNumericField("", nil, 3.14), - res: -1, - }, - { - l: nil, - r: NewNumericField("", nil, 3.14), - desc: true, - res: 1, - }, - { - l: nil, - r: NewTextField("", nil, []byte("cat")), - res: -1, - }, - { - l: nil, - r: NewTextField("", nil, []byte("cat")), - desc: true, - res: 1, - }, - { - l: nil, - r: dtf1, - res: -1, - }, - { - l: nil, - r: dtf1, - desc: true, - res: 1, - }, - // mixed types, boolean left - { - l: NewBooleanField("", nil, true), - r: nil, - res: 1, - }, - { - l: NewBooleanField("", nil, true), - r: nil, - desc: true, - res: -1, - }, - { - l: NewBooleanField("", nil, true), - r: NewNumericField("", nil, 3.14), - res: -1, - }, - { - l: NewBooleanField("", nil, true), - r: NewNumericField("", nil, 3.14), - desc: true, - res: 1, - }, - { - l: NewBooleanField("", nil, true), - r: NewTextField("", nil, []byte("cat")), - res: -1, - }, - { - l: NewBooleanField("", nil, true), - r: NewTextField("", nil, []byte("cat")), - desc: true, - res: 1, - }, - { - l: NewBooleanField("", nil, true), - r: dtf1, - res: -1, - }, - { - l: NewBooleanField("", nil, true), - r: dtf1, - desc: true, - res: 1, - }, - // mixed types, number left - { - l: NewNumericField("", nil, 3.14), - r: nil, - res: 1, - }, - { - l: NewNumericField("", nil, 3.14), - r: nil, - desc: true, - res: -1, - }, - { - l: NewNumericField("", nil, 3.14), - r: NewBooleanField("", nil, true), - res: 1, - }, - { - l: NewNumericField("", nil, 3.14), - r: NewBooleanField("", nil, true), - desc: true, - res: -1, - }, - { - l: NewNumericField("", nil, 3.14), - r: NewTextField("", nil, []byte("cat")), - res: -1, - }, - { - l: NewNumericField("", nil, 3.14), - r: NewTextField("", nil, []byte("cat")), - desc: true, - res: 1, - }, - { - l: NewNumericField("", nil, 3.14), - r: dtf1, - res: -1, - }, - { - l: NewNumericField("", nil, 3.14), - r: dtf1, - desc: true, - res: 1, - }, - // mixed types, text left - { - l: NewTextField("", nil, []byte("cat")), - r: nil, - res: 1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: nil, - desc: true, - res: -1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: NewBooleanField("", nil, true), - res: 1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: NewBooleanField("", nil, true), - desc: true, - res: -1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: NewNumericField("", nil, 3.14), - res: 1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: NewNumericField("", nil, 3.14), - desc: true, - res: -1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: dtf1, - res: -1, - }, - { - l: NewTextField("", nil, []byte("cat")), - r: dtf1, - desc: true, - res: 1, - }, - // mixed types, datetimes left - { - l: dtf1, - r: nil, - res: 1, - }, - { - l: dtf1, - r: nil, - desc: true, - res: -1, - }, - { - l: dtf1, - r: NewBooleanField("", nil, true), - res: 1, - }, - { - l: dtf1, - r: NewBooleanField("", nil, true), - desc: true, - res: -1, - }, - { - l: dtf1, - r: NewNumericField("", nil, 3.14), - res: 1, - }, - { - l: dtf1, - r: NewNumericField("", nil, 3.14), - desc: true, - res: -1, - }, - { - l: dtf1, - r: NewTextField("", nil, []byte("cat")), - res: 1, - }, - { - l: dtf1, - r: NewTextField("", nil, []byte("cat")), - desc: true, - res: -1, - }, - } - - for i, test := range tests { - actual := CompareFieldValues(test.l, test.r, test.desc) - if actual != test.res { - t.Errorf("expected %d, got %d for case %d", test.res, actual, i) - } - } - -} diff --git a/document/field_text.go b/document/field_text.go index 4cfa656e..37ddf170 100644 --- a/document/field_text.go +++ b/document/field_text.go @@ -11,7 +11,6 @@ package document import ( "fmt" - "strings" "github.com/blevesearch/bleve/analysis" ) @@ -78,13 +77,6 @@ func (t *TextField) NumPlainTextBytes() uint64 { return t.numPlainTextBytes } -func (t *TextField) Compare(other *TextField, descending bool) int { - if descending { - return strings.Compare(string(other.value), string(t.value)) - } - return strings.Compare(string(t.value), string(other.value)) -} - func NewTextField(name string, arrayPositions []uint64, value []byte) *TextField { return NewTextFieldWithIndexingOptions(name, arrayPositions, value, DefaultTextIndexingOptions) } diff --git a/index/index.go b/index/index.go index ac86f20a..0d1383a4 100644 --- a/index/index.go +++ b/index/index.go @@ -79,8 +79,7 @@ type IndexReader interface { FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error) Document(id string) (*document.Document, error) - DocumentFieldTerms(id IndexInternalID) (FieldTerms, error) - DocumentFieldTermsForFields(id IndexInternalID, fields []string) (FieldTerms, error) + DocumentFieldTerms(id IndexInternalID, fields []string) (FieldTerms, error) Fields() ([]string, error) @@ -93,8 +92,29 @@ type IndexReader interface { Close() error } +// FieldTerms contains the terms used by a document, keyed by field type FieldTerms map[string][]string +// FieldsNotYetCached returns a list of fields not yet cached out of a larger list of fields +func (f FieldTerms) FieldsNotYetCached(fields []string) []string { + var rv []string + for _, field := range fields { + if _, ok := f[field]; !ok { + rv = append(rv, field) + } + } + return rv +} + +// Merge will combine two FieldTerms +// it assumes that the terms lists are complete (thus do not need to be merged) +// field terms from the other list always replace the ones in the receiver +func (f FieldTerms) Merge(other FieldTerms) { + for field, terms := range other { + f[field] = terms + } +} + type TermFieldVector struct { Field string ArrayPositions []uint64 diff --git a/index/upside_down/index_reader.go b/index/upside_down/index_reader.go index 181b8b14..72210fbf 100644 --- a/index/upside_down/index_reader.go +++ b/index/upside_down/index_reader.go @@ -98,25 +98,7 @@ func (i *IndexReader) Document(id string) (doc *document.Document, err error) { return } -func (i *IndexReader) DocumentFieldTerms(id index.IndexInternalID) (index.FieldTerms, error) { - back, err := i.index.backIndexRowForDoc(i.kvreader, id) - if err != nil { - return nil, err - } - rv := make(index.FieldTerms, len(back.termEntries)) - for _, entry := range back.termEntries { - fieldName := i.index.fieldCache.FieldIndexed(uint16(*entry.Field)) - terms, ok := rv[fieldName] - if !ok { - terms = make([]string, 0) - } - terms = append(terms, *entry.Term) - rv[fieldName] = terms - } - return rv, nil -} - -func (i *IndexReader) DocumentFieldTermsForFields(id index.IndexInternalID, fields []string) (index.FieldTerms, error) { +func (i *IndexReader) DocumentFieldTerms(id index.IndexInternalID, fields []string) (index.FieldTerms, error) { back, err := i.index.backIndexRowForDoc(i.kvreader, id) if err != nil { return nil, err diff --git a/index/upside_down/upside_down_test.go b/index/upside_down/upside_down_test.go index 099a2b0c..cbfdab54 100644 --- a/index/upside_down/upside_down_test.go +++ b/index/upside_down/upside_down_test.go @@ -1179,7 +1179,7 @@ func TestIndexDocumentFieldTerms(t *testing.T) { } }() - fieldTerms, err := indexReader.DocumentFieldTerms(index.IndexInternalID("1")) + fieldTerms, err := indexReader.DocumentFieldTerms(index.IndexInternalID("1"), []string{"name", "title"}) if err != nil { t.Error(err) } diff --git a/index_test.go b/index_test.go index dbd1789d..198ccda3 100644 --- a/index_test.go +++ b/index_test.go @@ -744,8 +744,8 @@ func TestSortMatchSearch(t *testing.T) { req := NewSearchRequest(NewMatchQuery("One")) req.SortBy(search.SortOrder{ - &search.SortStoredField{Field: "Day"}, - &search.SortStoredField{Field: "Name"}, + &search.SortField{Field: "Day"}, + &search.SortField{Field: "Name"}, }) req.Fields = []string{"*"} sr, err := index.Search(req) diff --git a/numeric_util/prefix_coded.go b/numeric_util/prefix_coded.go index 25931151..4cf36dec 100644 --- a/numeric_util/prefix_coded.go +++ b/numeric_util/prefix_coded.go @@ -9,9 +9,7 @@ package numeric_util -import ( - "fmt" -) +import "fmt" const ShiftStartInt64 byte = 0x20 @@ -72,3 +70,18 @@ func (p PrefixCoded) Int64() (int64, error) { } return int64(uint64((sortableBits << shift)) ^ 0x8000000000000000), nil } + +func ValidPrefixCodedTerm(p string) (bool, int) { + if len(p) > 0 { + if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 { + return false, 0 + } + shift := p[0] - ShiftStartInt64 + nChars := ((63 - int(shift)) / 7) + 1 + if len(p) != nChars+1 { + return false, 0 + } + return true, int(shift) + } + return false, 0 +} diff --git a/numeric_util/prefix_coded_test.go b/numeric_util/prefix_coded_test.go index fe55475d..6f064fdd 100644 --- a/numeric_util/prefix_coded_test.go +++ b/numeric_util/prefix_coded_test.go @@ -98,6 +98,45 @@ func TestPrefixCoded(t *testing.T) { } } +func TestPrefixCodedValid(t *testing.T) { + // all of the shared tests should be valid + for _, test := range tests { + valid, _ := ValidPrefixCodedTerm(string(test.output)) + if !valid { + t.Errorf("expected %s to be valid prefix coded, is not", string(test.output)) + } + } + + invalidTests := []struct { + data PrefixCoded + }{ + // first byte invalid skip (too low) + { + data: PrefixCoded{0x19, 'c', 'a', 't'}, + }, + // first byte invalid skip (too high) + { + data: PrefixCoded{0x20 + 64, 'c'}, + }, + // length of trailing bytes wrong (too long) + { + data: PrefixCoded{0x20, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, + }, + // length of trailing bytes wrong (too short) + { + data: PrefixCoded{0x20 + 63}, + }, + } + + // all of the shared tests should be valid + for _, test := range invalidTests { + valid, _ := ValidPrefixCodedTerm(string(test.data)) + if valid { + t.Errorf("expected %s to be invalid prefix coded, it is", string(test.data)) + } + } +} + func BenchmarkTestPrefixCoded(b *testing.B) { for i := 0; i < b.N; i++ { diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 7f6e3f35..86a6c046 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -113,18 +113,19 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I } // see if we need to load the stored fields - if len(hc.sort.RequiredStoredFields()) > 0 { - if d.ID == "" { - // look up the id since we need it for lookup - d.ID, err = reader.FinalizeDocID(d.IndexInternalID) - if err != nil { - return err - } - } - d.Document, err = reader.Document(d.ID) + if len(hc.sort.RequiredFields()) > 0 { + // find out which fields haven't been loaded yet + fieldsToLoad := d.CachedFieldTerms.FieldsNotYetCached(hc.sort.RequiredFields()) + // look them up + fieldTerms, err := reader.DocumentFieldTerms(d.IndexInternalID, fieldsToLoad) if err != nil { return err } + // cache these as well + if d.CachedFieldTerms == nil { + d.CachedFieldTerms = make(map[string][]string) + } + d.CachedFieldTerms.Merge(fieldTerms) } // optimization, we track lowest sorting hit already removed from heap diff --git a/search/collectors/search_test.go b/search/collectors/search_test.go index c5faa243..26de7c53 100644 --- a/search/collectors/search_test.go +++ b/search/collectors/search_test.go @@ -95,11 +95,7 @@ func (sr *stubReader) Document(id string) (*document.Document, error) { return nil, nil } -func (sr *stubReader) DocumentFieldTerms(id index.IndexInternalID) (index.FieldTerms, error) { - return nil, nil -} - -func (sr *stubReader) DocumentFieldTermsForFields(id index.IndexInternalID, fields []string) (index.FieldTerms, error) { +func (sr *stubReader) DocumentFieldTerms(id index.IndexInternalID, fields []string) (index.FieldTerms, error) { return nil, nil } diff --git a/search/facets_builder.go b/search/facets_builder.go index 55cb33ab..4709e454 100644 --- a/search/facets_builder.go +++ b/search/facets_builder.go @@ -42,7 +42,7 @@ func (fb *FacetsBuilder) Update(docMatch *DocumentMatch) error { for _, facetBuilder := range fb.facets { fields = append(fields, facetBuilder.Field()) } - fieldTerms, err := fb.indexReader.DocumentFieldTermsForFields(docMatch.IndexInternalID, fields) + fieldTerms, err := fb.indexReader.DocumentFieldTerms(docMatch.IndexInternalID, fields) if err != nil { return err } diff --git a/search/search.go b/search/search.go index b6a2f6be..355184b1 100644 --- a/search/search.go +++ b/search/search.go @@ -71,6 +71,10 @@ type DocumentMatch struct { // fields as float64s and date fields as time.RFC3339 formatted strings. Fields map[string]interface{} `json:"fields,omitempty"` + // as we learn field terms, we can cache important ones for later use + // for example, sorting and building facets need these values + CachedFieldTerms index.FieldTerms `json:"-"` + // if we load the document for this hit, remember it so we dont load again Document *document.Document `json:"-"` diff --git a/search/sort.go b/search/sort.go index 921482fe..92d5e893 100644 --- a/search/sort.go +++ b/search/sort.go @@ -11,15 +11,21 @@ package search import ( "encoding/json" + "sort" "strings" + + "github.com/blevesearch/bleve/numeric_util" ) +var HighTerm = strings.Repeat(string([]byte{0xff}), 10) +var LowTerm = string([]byte{0x00}) + type SearchSort interface { Compare(a, b *DocumentMatch) int RequiresDocID() bool RequiresScoring() bool - RequiresStoredFields() []string + RequiresFields() []string } func ParseSearchSort(input json.RawMessage) (SearchSort, error) { @@ -42,7 +48,7 @@ func ParseSearchSort(input json.RawMessage) (SearchSort, error) { Descending: descending, }, nil } - return &SortStoredField{ + return &SortField{ Field: tmp, Descending: descending, }, nil @@ -100,35 +106,146 @@ func (so SortOrder) RequiresDocID() bool { return rv } -func (so SortOrder) RequiredStoredFields() []string { +func (so SortOrder) RequiredFields() []string { var rv []string for _, soi := range so { - rv = append(rv, soi.RequiresStoredFields()...) + rv = append(rv, soi.RequiresFields()...) } return rv } -// SortStoredField will sort results by the value of a stored field -type SortStoredField struct { +// SortFieldType lets you control some internal sort behavior +// normally leaving this to the zero-value of SortFieldAuto is fine +type SortFieldType int + +const ( + // SortFieldAuto applies heuristics attempt to automatically sort correctly + SortFieldAuto SortFieldType = iota + // SortFieldAsString forces sort as string (no prefix coded terms removed) + SortFieldAsString + // SortFieldAsNumber forces sort as string (prefix coded terms with shift > 0 removed) + SortFieldAsNumber + // SortFieldAsDate forces sort as string (prefix coded terms with shift > 0 removed) + SortFieldAsDate +) + +// SortFieldMode describes the behavior if the field has multiple values +type SortFieldMode int + +const ( + // SortFieldFirst uses the first (or only) value, this is the default zero-value + SortFieldFirst SortFieldMode = iota // FIXME name is confusing + // SortFieldMin uses the minimum value + SortFieldMin + // SortFieldMax uses the maximum value + SortFieldMax +) + +const SortFieldMissingLast = "_last" +const SortFieldMissingFirst = "_first" + +// SortField will sort results by the value of a stored field +// Field is the name of the field +// Descending reverse the sort order (default false) +// Type allows forcing of string/number/date behavior (default auto) +// Mode controls behavior for multi-values fields (default first) +// Missing controls behavior of missing values (default last) +type SortField struct { Field string Descending bool + Type SortFieldType + Mode SortFieldMode + Missing string } // Compare orders DocumentMatch instances by stored field values -func (s *SortStoredField) Compare(i, j *DocumentMatch) int { - return i.Document.CompareFieldsNamed(j.Document, s.Field, s.Descending) +func (s *SortField) Compare(i, j *DocumentMatch) int { + iTerms := i.CachedFieldTerms[s.Field] + iTerms = s.filterTermsByType(iTerms) + iTerm := s.filterTermsByMode(iTerms) + jTerms := j.CachedFieldTerms[s.Field] + jTerms = s.filterTermsByType(jTerms) + jTerm := s.filterTermsByMode(jTerms) + rv := strings.Compare(iTerm, jTerm) + if s.Descending { + rv = -rv + } + return rv +} + +func (s *SortField) filterTermsByMode(terms []string) string { + if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldFirst) { + return terms[0] + } else if len(terms) > 1 { + switch s.Mode { + case SortFieldMin: + sort.Strings(terms) + return terms[0] + case SortFieldMax: + sort.Strings(terms) + return terms[len(terms)-1] + } + } + + // handle missing terms + if s.Missing == "" || s.Missing == SortFieldMissingLast { + if s.Descending { + return LowTerm + } + return HighTerm + } else if s.Missing == SortFieldMissingFirst { + if s.Descending { + return HighTerm + } + return LowTerm + } + return s.Missing +} + +// filterTermsByType attempts to make one pass on the terms +// if we are in auto-mode AND all the terms look like prefix-coded numbers +// return only the terms which had shift of 0 +// if we are in explicit number or date mode, return only valid +// prefix coded numbers with shift of 0 +func (s *SortField) filterTermsByType(terms []string) []string { + stype := s.Type + if stype == SortFieldAuto { + allTermsPrefixCoded := true + var termsWithShiftZero []string + for _, term := range terms { + valid, shift := numeric_util.ValidPrefixCodedTerm(term) + if valid && shift == 0 { + termsWithShiftZero = append(termsWithShiftZero, term) + } else if !valid { + allTermsPrefixCoded = false + } + } + if allTermsPrefixCoded { + terms = termsWithShiftZero + } + } else if stype == SortFieldAsNumber || stype == SortFieldAsDate { + var termsWithShiftZero []string + for _, term := range terms { + valid, shift := numeric_util.ValidPrefixCodedTerm(term) + if valid && shift == 0 { + termsWithShiftZero = append(termsWithShiftZero) + } + } + terms = termsWithShiftZero + } + return terms } // RequiresDocID says this SearchSort does not require the DocID be loaded -func (s *SortStoredField) RequiresDocID() bool { return false } +func (s *SortField) RequiresDocID() bool { return false } // RequiresScoring says this SearchStore does not require scoring -func (s *SortStoredField) RequiresScoring() bool { return false } +func (s *SortField) RequiresScoring() bool { return false } -// RequiresStoredFields says this SearchStore requires the specified stored field -func (s *SortStoredField) RequiresStoredFields() []string { return []string{s.Field} } +// RequiresFields says this SearchStore requires the specified stored field +func (s *SortField) RequiresFields() []string { return []string{s.Field} } -func (s *SortStoredField) MarshalJSON() ([]byte, error) { +func (s *SortField) MarshalJSON() ([]byte, error) { if s.Descending { return json.Marshal("-" + s.Field) } @@ -154,8 +271,8 @@ func (s *SortDocID) RequiresDocID() bool { return true } // RequiresScoring says this SearchStore does not require scoring func (s *SortDocID) RequiresScoring() bool { return false } -// RequiresStoredFields says this SearchStore does not require any stored fields -func (s *SortDocID) RequiresStoredFields() []string { return nil } +// RequiresFields says this SearchStore does not require any stored fields +func (s *SortDocID) RequiresFields() []string { return nil } func (s *SortDocID) MarshalJSON() ([]byte, error) { if s.Descending { @@ -185,8 +302,8 @@ func (s *SortScore) RequiresDocID() bool { return false } // RequiresScoring says this SearchStore does require scoring func (s *SortScore) RequiresScoring() bool { return true } -// RequiresStoredFields says this SearchStore does not require any store fields -func (s *SortScore) RequiresStoredFields() []string { return nil } +// RequiresFields says this SearchStore does not require any store fields +func (s *SortScore) RequiresFields() []string { return nil } func (s *SortScore) MarshalJSON() ([]byte, error) { if s.Descending { From 27ba6187bc314c70d4cd85cb26e5050cf79bc370 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 17 Aug 2016 14:33:51 -0700 Subject: [PATCH 11/24] adds support for more complex field sorts with object (not string) previously from JSON we would just deserialize strings like "-abv" or "city" or "_id" or "_score" as simple sorts on fields, ids or scores respectively while this is simple and compact, it can be ambiguous (for example if you have a field starting with - or if you have a field named "_id" already. also, this simple syntax doesnt allow us to specify more cmoplex options to deal with type/mode/missing we keep support for the simple string syntax, but now also recognize a more expressive syntax like: { "by": "field", "field": "abv", "desc": true, "type": "string", "mode": "min", "missing": "first" } type, mode and missing are optional and default to "auto", "default", and "last" respectively --- index/upside_down/index_reader.go | 7 +- search.go | 2 +- search/facets_builder.go | 19 ++- search/sort.go | 188 +++++++++++++++++++++++++----- 4 files changed, 175 insertions(+), 41 deletions(-) diff --git a/index/upside_down/index_reader.go b/index/upside_down/index_reader.go index 72210fbf..2349b749 100644 --- a/index/upside_down/index_reader.go +++ b/index/upside_down/index_reader.go @@ -10,8 +10,6 @@ package upside_down import ( - "fmt" - "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" @@ -107,10 +105,9 @@ func (i *IndexReader) DocumentFieldTerms(id index.IndexInternalID, fields []stri fieldsMap := make(map[uint16]string, len(fields)) for _, f := range fields { id, ok := i.index.fieldCache.FieldNamed(f, false) - if !ok { - return nil, fmt.Errorf("Field %s was not found in cache", f) + if ok { + fieldsMap[id] = f } - fieldsMap[id] = f } for _, entry := range back.termEntries { if field, ok := fieldsMap[uint16(*entry.Field)]; ok { diff --git a/search.go b/search.go index 7d793002..9c6bce85 100644 --- a/search.go +++ b/search.go @@ -254,7 +254,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { if temp.Sort == nil { r.Sort = search.SortOrder{&search.SortScore{Descending: true}} } else { - r.Sort, err = search.ParseSortOrder(temp.Sort) + r.Sort, err = search.ParseSortOrderJSON(temp.Sort) if err != nil { return err } diff --git a/search/facets_builder.go b/search/facets_builder.go index 4709e454..66f96a6a 100644 --- a/search/facets_builder.go +++ b/search/facets_builder.go @@ -42,12 +42,23 @@ func (fb *FacetsBuilder) Update(docMatch *DocumentMatch) error { for _, facetBuilder := range fb.facets { fields = append(fields, facetBuilder.Field()) } - fieldTerms, err := fb.indexReader.DocumentFieldTerms(docMatch.IndexInternalID, fields) - if err != nil { - return err + + if len(fields) > 0 { + // find out which fields haven't been loaded yet + fieldsToLoad := docMatch.CachedFieldTerms.FieldsNotYetCached(fields) + // look them up + fieldTerms, err := fb.indexReader.DocumentFieldTerms(docMatch.IndexInternalID, fieldsToLoad) + if err != nil { + return err + } + // cache these as well + if docMatch.CachedFieldTerms == nil { + docMatch.CachedFieldTerms = make(map[string][]string) + } + docMatch.CachedFieldTerms.Merge(fieldTerms) } for _, facetBuilder := range fb.facets { - facetBuilder.Update(fieldTerms) + facetBuilder.Update(docMatch.CachedFieldTerms) } return nil } diff --git a/search/sort.go b/search/sort.go index 92d5e893..0459d6e0 100644 --- a/search/sort.go +++ b/search/sort.go @@ -11,6 +11,7 @@ package search import ( "encoding/json" + "fmt" "sort" "strings" @@ -28,36 +29,117 @@ type SearchSort interface { RequiresFields() []string } -func ParseSearchSort(input json.RawMessage) (SearchSort, error) { - var tmp string - err := json.Unmarshal(input, &tmp) - if err != nil { - return nil, err +func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) { + descending, ok := input["desc"].(bool) + by, ok := input["by"].(string) + if !ok { + return nil, fmt.Errorf("search sort must specify by") } - descending := false - if strings.HasPrefix(tmp, "-") { - descending = true - tmp = tmp[1:] - } - if tmp == "_id" { + switch by { + case "id": return &SortDocID{ Descending: descending, }, nil - } else if tmp == "_score" { + case "score": + return &SortScore{ + Descending: descending, + }, nil + case "field": + field, ok := input["field"].(string) + if !ok { + return nil, fmt.Errorf("search sort mode field must specify field") + } + rv := &SortField{ + Field: field, + Descending: descending, + } + typ, ok := input["type"].(string) + if ok { + switch typ { + case "auto": + rv.Type = SortFieldAuto + case "string": + rv.Type = SortFieldAsString + case "number": + rv.Type = SortFieldAsNumber + case "date": + rv.Type = SortFieldAsDate + default: + return nil, fmt.Errorf("unkown sort field type: %s", typ) + } + } + mode, ok := input["mode"].(string) + if ok { + switch mode { + case "default": + rv.Mode = SortFieldDefault + case "min": + rv.Mode = SortFieldMin + case "max": + rv.Mode = SortFieldMax + default: + return nil, fmt.Errorf("unknown sort field mode: %s", mode) + } + } + missing, ok := input["missing"].(string) + if ok { + switch missing { + case "first": + rv.Missing = SortFieldMissingFirst + case "last": + rv.Missing = SortFieldMissingLast + default: + return nil, fmt.Errorf("unknown sort field missing: %s", missing) + } + } + return rv, nil + } + + return nil, fmt.Errorf("unknown search sort by: %s", by) +} + +func ParseSearchSortString(input string) (SearchSort, error) { + descending := false + if strings.HasPrefix(input, "-") { + descending = true + input = input[1:] + } else if strings.HasPrefix(input, "+") { + input = input[1:] + } + if input == "_id" { + return &SortDocID{ + Descending: descending, + }, nil + } else if input == "_score" { return &SortScore{ Descending: descending, }, nil } return &SortField{ - Field: tmp, + Field: input, Descending: descending, }, nil } -func ParseSortOrder(in []json.RawMessage) (SortOrder, error) { +func ParseSearchSortJSON(input json.RawMessage) (SearchSort, error) { + // first try to parse it as string + var sortString string + err := json.Unmarshal(input, &sortString) + if err != nil { + var sortObj map[string]interface{} + err = json.Unmarshal(input, &sortObj) + if err != nil { + return nil, err + } + return ParseSearchSortObj(sortObj) + } + return ParseSearchSortString(sortString) +} + +func ParseSortOrderJSON(in []json.RawMessage) (SortOrder, error) { rv := make(SortOrder, 0, len(in)) for _, i := range in { - ss, err := ParseSearchSort(i) + ss, err := ParseSearchSortJSON(i) if err != nil { return nil, err } @@ -133,16 +215,24 @@ const ( type SortFieldMode int const ( - // SortFieldFirst uses the first (or only) value, this is the default zero-value - SortFieldFirst SortFieldMode = iota // FIXME name is confusing + // SortFieldDefault uses the first (or only) value, this is the default zero-value + SortFieldDefault SortFieldMode = iota // FIXME name is confusing // SortFieldMin uses the minimum value SortFieldMin // SortFieldMax uses the maximum value SortFieldMax ) -const SortFieldMissingLast = "_last" -const SortFieldMissingFirst = "_first" +// SortFieldMissing controls where documents missing a field value should be sorted +type SortFieldMissing int + +const ( + // SortFieldMissingLast sorts documents missing a field at the end + SortFieldMissingLast SortFieldMissing = iota + + // SortFieldMissingFirst sorts documents missing a field at the beginning + SortFieldMissingFirst +) // SortField will sort results by the value of a stored field // Field is the name of the field @@ -155,7 +245,7 @@ type SortField struct { Descending bool Type SortFieldType Mode SortFieldMode - Missing string + Missing SortFieldMissing } // Compare orders DocumentMatch instances by stored field values @@ -174,7 +264,7 @@ func (s *SortField) Compare(i, j *DocumentMatch) int { } func (s *SortField) filterTermsByMode(terms []string) string { - if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldFirst) { + if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldDefault) { return terms[0] } else if len(terms) > 1 { switch s.Mode { @@ -188,18 +278,16 @@ func (s *SortField) filterTermsByMode(terms []string) string { } // handle missing terms - if s.Missing == "" || s.Missing == SortFieldMissingLast { + if s.Missing == SortFieldMissingLast { if s.Descending { return LowTerm } return HighTerm - } else if s.Missing == SortFieldMissingFirst { - if s.Descending { - return HighTerm - } - return LowTerm } - return s.Missing + if s.Descending { + return HighTerm + } + return LowTerm } // filterTermsByType attempts to make one pass on the terms @@ -246,10 +334,48 @@ func (s *SortField) RequiresScoring() bool { return false } func (s *SortField) RequiresFields() []string { return []string{s.Field} } func (s *SortField) MarshalJSON() ([]byte, error) { - if s.Descending { - return json.Marshal("-" + s.Field) + // see if simple format can be used + if s.Missing == SortFieldMissingLast && + s.Mode == SortFieldDefault && + s.Type == SortFieldAuto { + if s.Descending { + return json.Marshal("-" + s.Field) + } + return json.Marshal(s.Field) } - return json.Marshal(s.Field) + sfm := map[string]interface{}{ + "by": "field", + "field": s.Field, + } + if s.Descending { + sfm["desc"] = true + } + if s.Missing > SortFieldMissingLast { + switch s.Missing { + case SortFieldMissingFirst: + sfm["missing"] = "first" + } + } + if s.Mode > SortFieldDefault { + switch s.Mode { + case SortFieldMin: + sfm["mode"] = "min" + case SortFieldMax: + sfm["mode"] = "max" + } + } + if s.Type > SortFieldAuto { + switch s.Type { + case SortFieldAsString: + sfm["type"] = "string" + case SortFieldAsNumber: + sfm["type"] = "number" + case SortFieldAsDate: + sfm["type"] = "date" + } + } + + return json.Marshal(sfm) } // SortDocID will sort results by the document identifier From 27f5c6ec92397fc571e66b939ebae743adadeff9 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 17 Aug 2016 14:49:06 -0700 Subject: [PATCH 12/24] expose simple string slice based sorting to top-level bleve this change means simple sort requirements no longer require importing the search package (high-level API goal) also the sort test at the top-level was changed to use this form --- index_test.go | 5 +---- search.go | 12 +++++++++++- search/sort.go | 19 ++++++++++++++----- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/index_test.go b/index_test.go index 198ccda3..5e070366 100644 --- a/index_test.go +++ b/index_test.go @@ -743,10 +743,7 @@ func TestSortMatchSearch(t *testing.T) { } req := NewSearchRequest(NewMatchQuery("One")) - req.SortBy(search.SortOrder{ - &search.SortField{Field: "Day"}, - &search.SortField{Field: "Name"}, - }) + req.SortBy([]string{"Day", "Name"}) req.Fields = []string{"*"} sr, err := index.Search(req) if err != nil { diff --git a/search.go b/search.go index 9c6bce85..22e49c10 100644 --- a/search.go +++ b/search.go @@ -223,7 +223,17 @@ func (r *SearchRequest) AddFacet(facetName string, f *FacetRequest) { } // SortBy changes the request to use the requested sort order -func (r *SearchRequest) SortBy(order search.SortOrder) { +// this form uses the simplified syntax with an array of strings +// each string can either be a field name +// or the magic value _id and _score which refer to the doc id and search score +// any of these values can optionally be prefixed with - to reverse the order +func (r *SearchRequest) SortBy(order []string) { + so := search.ParseSortOrderStrings(order) + r.Sort = so +} + +// SortByCustom changes the request to use the requested sort order +func (r *SearchRequest) SortByCustom(order search.SortOrder) { r.Sort = order } diff --git a/search/sort.go b/search/sort.go index 0459d6e0..5b162831 100644 --- a/search/sort.go +++ b/search/sort.go @@ -98,7 +98,7 @@ func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) { return nil, fmt.Errorf("unknown search sort by: %s", by) } -func ParseSearchSortString(input string) (SearchSort, error) { +func ParseSearchSortString(input string) SearchSort { descending := false if strings.HasPrefix(input, "-") { descending = true @@ -109,16 +109,16 @@ func ParseSearchSortString(input string) (SearchSort, error) { if input == "_id" { return &SortDocID{ Descending: descending, - }, nil + } } else if input == "_score" { return &SortScore{ Descending: descending, - }, nil + } } return &SortField{ Field: input, Descending: descending, - }, nil + } } func ParseSearchSortJSON(input json.RawMessage) (SearchSort, error) { @@ -133,7 +133,16 @@ func ParseSearchSortJSON(input json.RawMessage) (SearchSort, error) { } return ParseSearchSortObj(sortObj) } - return ParseSearchSortString(sortString) + return ParseSearchSortString(sortString), nil +} + +func ParseSortOrderStrings(in []string) SortOrder { + rv := make(SortOrder, 0, len(in)) + for _, i := range in { + ss := ParseSearchSortString(i) + rv = append(rv, ss) + } + return rv } func ParseSortOrderJSON(in []json.RawMessage) (SortOrder, error) { From 2311d060d113e8549e00602a09c4bbad00dcfffb Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 18 Aug 2016 13:03:48 -0700 Subject: [PATCH 13/24] add example usage of SortBy and SortByCustom --- examples_test.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/examples_test.go b/examples_test.go index f33968cf..a0e8e097 100644 --- a/examples_test.go +++ b/examples_test.go @@ -15,6 +15,7 @@ import ( "testing" "time" + "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/highlight/highlighters/ansi" ) @@ -61,11 +62,13 @@ func ExampleIndex_indexing() { data := struct { Name string Created time.Time - }{Name: "named one", Created: time.Now()} + Age int + }{Name: "named one", Created: time.Now(), Age: 50} data2 := struct { Name string Created time.Time - }{Name: "great nameless one", Created: time.Now()} + Age int + }{Name: "great nameless one", Created: time.Now(), Age: 25} // index some data err = example_index.Index("document id 1", data) @@ -504,3 +507,46 @@ func ExampleDocumentMapping_AddFieldMappingsAt() { // Output: // 1 } + +func ExampleSearchRequest_SortBy() { + // find docs containing "one", order by Age instead of score + query := NewMatchQuery("one") + searchRequest := NewSearchRequest(query) + searchRequest.SortBy([]string{"Age"}) + searchResults, err := example_index.Search(searchRequest) + if err != nil { + panic(err) + } + + fmt.Println(searchResults.Hits[0].ID) + fmt.Println(searchResults.Hits[1].ID) + // Output: + // document id 2 + // document id 1 +} + +func ExampleSearchRequest_SortByCustom() { + // find all docs, order by Age, with docs missing Age field first + query := NewMatchAllQuery() + searchRequest := NewSearchRequest(query) + searchRequest.SortByCustom(search.SortOrder{ + &search.SortField{ + Field: "Age", + Missing: search.SortFieldMissingFirst, + }, + }) + searchResults, err := example_index.Search(searchRequest) + if err != nil { + panic(err) + } + + fmt.Println(searchResults.Hits[0].ID) + fmt.Println(searchResults.Hits[1].ID) + fmt.Println(searchResults.Hits[2].ID) + fmt.Println(searchResults.Hits[3].ID) + // Output: + // document id 3 + // document id 4 + // document id 2 + // document id 1 +} From 1ae938b781c4f2238b77d22417f4de73c77cb61b Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 20 Aug 2016 14:45:53 -0400 Subject: [PATCH 14/24] add integration tests for sorting --- test/tests/sort/data/a.json | 8 + test/tests/sort/data/b.json | 8 + test/tests/sort/data/c.json | 8 + test/tests/sort/data/d.json | 7 + test/tests/sort/data/e.json | 7 + test/tests/sort/data/f.json | 7 + test/tests/sort/mapping.json | 3 + test/tests/sort/searches.json | 443 ++++++++++++++++++++++++++++++++++ 8 files changed, 491 insertions(+) create mode 100644 test/tests/sort/data/a.json create mode 100644 test/tests/sort/data/b.json create mode 100644 test/tests/sort/data/c.json create mode 100644 test/tests/sort/data/d.json create mode 100644 test/tests/sort/data/e.json create mode 100644 test/tests/sort/data/f.json create mode 100644 test/tests/sort/mapping.json create mode 100644 test/tests/sort/searches.json diff --git a/test/tests/sort/data/a.json b/test/tests/sort/data/a.json new file mode 100644 index 00000000..66ac5969 --- /dev/null +++ b/test/tests/sort/data/a.json @@ -0,0 +1,8 @@ +{ + "id": "a", + "name": "marty", + "age": 19, + "born": "2014-11-25", + "title": "mista", + "tags": ["gopher", "belieber"] +} diff --git a/test/tests/sort/data/b.json b/test/tests/sort/data/b.json new file mode 100644 index 00000000..0b84ce89 --- /dev/null +++ b/test/tests/sort/data/b.json @@ -0,0 +1,8 @@ +{ + "id": "b", + "name": "steve", + "age": 21, + "born": "2000-09-11", + "title": "zebra", + "tags": ["thought-leader", "futurist"] +} diff --git a/test/tests/sort/data/c.json b/test/tests/sort/data/c.json new file mode 100644 index 00000000..a1b17b00 --- /dev/null +++ b/test/tests/sort/data/c.json @@ -0,0 +1,8 @@ +{ + "id": "c", + "name": "aster", + "age": 21, + "born": "1954-02-02", + "title": "blogger", + "tags": ["red", "blue", "green"] +} diff --git a/test/tests/sort/data/d.json b/test/tests/sort/data/d.json new file mode 100644 index 00000000..926869a8 --- /dev/null +++ b/test/tests/sort/data/d.json @@ -0,0 +1,7 @@ +{ + "id": "d", + "age": 65, + "born": "1978-12-02", + "title": "agent", + "tags": ["cats"] +} diff --git a/test/tests/sort/data/e.json b/test/tests/sort/data/e.json new file mode 100644 index 00000000..436f010b --- /dev/null +++ b/test/tests/sort/data/e.json @@ -0,0 +1,7 @@ +{ + "id": "e", + "name": "nancy", + "born": "1954-10-22", + "title": "rapstar", + "tags": ["pain"] +} diff --git a/test/tests/sort/data/f.json b/test/tests/sort/data/f.json new file mode 100644 index 00000000..14f0921a --- /dev/null +++ b/test/tests/sort/data/f.json @@ -0,0 +1,7 @@ +{ + "id": "f", + "name": "frank", + "age": 1, + "title": "taxman", + "tags": ["vitamin","purple"] +} diff --git a/test/tests/sort/mapping.json b/test/tests/sort/mapping.json new file mode 100644 index 00000000..0db3279e --- /dev/null +++ b/test/tests/sort/mapping.json @@ -0,0 +1,3 @@ +{ + +} diff --git a/test/tests/sort/searches.json b/test/tests/sort/searches.json new file mode 100644 index 00000000..a679f0b8 --- /dev/null +++ b/test/tests/sort/searches.json @@ -0,0 +1,443 @@ +[ + { + "comment": "default order, all have same score, then by natural index order", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + } + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "a" + }, + { + "id": "b" + }, + { + "id": "c" + }, + { + "id": "d" + }, + { + "id": "e" + }, + { + "id": "f" + } + ] + } + }, + { + "comment": "sort by name, ascending", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["name"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "c" + }, + { + "id": "f" + }, + { + "id": "a" + }, + { + "id": "e" + }, + { + "id": "b" + }, + { + "id": "d" + } + ] + } + }, + { + "comment": "sort by name, descending", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["-name"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "b" + }, + { + "id": "e" + }, + { + "id": "a" + }, + { + "id": "f" + }, + { + "id": "c" + }, + { + "id": "d" + } + ] + } + }, + { + "comment": "sort by name, descending, missing first", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": [{"by":"field","field":"name","missing":"first","desc":true}] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "d" + }, + { + "id": "b" + }, + { + "id": "e" + }, + { + "id": "a" + }, + { + "id": "f" + }, + { + "id": "c" + } + ] + } + }, + { + "comment": "sort by age, ascending", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["age"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "f" + }, + { + "id": "a" + }, + { + "id": "b" + }, + { + "id": "c" + }, + { + "id": "d" + }, + { + "id": "e" + } + ] + } + }, + { + "comment": "sort by age, descending", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["-age"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "d" + }, + { + "id": "b" + }, + { + "id": "c" + }, + { + "id": "a" + }, + { + "id": "f" + }, + { + "id": "e" + } + ] + } + }, + { + "comment": "sort by age, descending, missing first", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": [{"by":"field","field":"age","missing":"first","desc":true}] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "e" + }, + { + "id": "d" + }, + { + "id": "b" + }, + { + "id": "c" + }, + { + "id": "a" + }, + { + "id": "f" + } + ] + } + }, + { + "comment": "sort by born, ascending", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["born"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "c" + }, + { + "id": "e" + }, + { + "id": "d" + }, + { + "id": "b" + }, + { + "id": "a" + }, + { + "id": "f" + } + ] + } + }, + { + "comment": "sort by born, descending", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["-born"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "a" + }, + { + "id": "b" + }, + { + "id": "d" + }, + { + "id": "e" + }, + { + "id": "c" + }, + { + "id": "f" + } + ] + } + }, + { + "comment": "sort by born, descending, missing first", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": [{"by":"field","field":"born","missing":"first","desc":true}] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "f" + }, + { + "id": "a" + }, + { + "id": "b" + }, + { + "id": "d" + }, + { + "id": "e" + }, + { + "id": "c" + } + ] + } + }, + { + "comment": "sort on multi-valued field", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": [{"by":"field","field":"tags","mode":"min"}] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "a" + }, + { + "id": "c" + }, + { + "id": "d" + }, + { + "id": "b" + }, + { + "id": "e" + }, + { + "id": "f" + } + ] + } + }, + { + "comment": "multi-column sort by age, ascending, name, ascending (flips b and c which have same age)", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["age", "name"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "f" + }, + { + "id": "a" + }, + { + "id": "c" + }, + { + "id": "b" + }, + { + "id": "d" + }, + { + "id": "e" + } + ] + } + }, + { + "comment": "sort by docid descending", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["-_id"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "f" + }, + { + "id": "e" + }, + { + "id": "d" + }, + { + "id": "c" + }, + { + "id": "b" + }, + { + "id": "a" + } + ] + } + } +] From 0322ecd4418820c7d3d1d686645c5d9303ca3e2a Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 24 Aug 2016 14:07:10 -0400 Subject: [PATCH 15/24] adjust new sort functionality to also work with MultiSearch --- index_alias_impl.go | 23 +++- index_alias_impl_test.go | 134 +++++++++++++++++++++++ search.go | 4 +- search/collectors/collector_heap.go | 3 + search/collectors/collector_heap_test.go | 16 +-- search/search.go | 1 + search/sort.go | 113 +++++++++++-------- 7 files changed, 234 insertions(+), 60 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 03f30f5a..7bd6cba4 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -474,6 +474,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest { Fields: req.Fields, Facets: req.Facets, Explain: req.Explain, + Sort: req.Sort, } return &rv } @@ -568,8 +569,14 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se } } - // first sort it by score - sort.Sort(sr.Hits) + // sort all hits with the requested order + if len(req.Sort) > 0 { + sorter := &multiSearchHitSorter{ + hits: sr.Hits, + sort: req.Sort, + } + sort.Sort(sorter) + } // now skip over the correct From if req.From > 0 && len(sr.Hits) > req.From { @@ -645,3 +652,15 @@ func (f *indexAliasImplFieldDict) Close() error { defer f.index.mutex.RUnlock() return f.fieldDict.Close() } + +type multiSearchHitSorter struct { + hits search.DocumentMatchCollection + sort search.SortOrder +} + +func (m *multiSearchHitSorter) Len() int { return len(m.hits) } +func (m *multiSearchHitSorter) Swap(i, j int) { m.hits[i], m.hits[j] = m.hits[j], m.hits[i] } +func (m *multiSearchHitSorter) Less(i, j int) bool { + c := m.sort.Compare(m.hits[i], m.hits[j]) + return c < 0 +} diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index 1fb67647..7aa2dd46 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -11,6 +11,7 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/numeric_util" "github.com/blevesearch/bleve/search" ) @@ -451,6 +452,8 @@ func TestIndexAliasEmpty(t *testing.T) { } func TestIndexAliasMulti(t *testing.T) { + score1, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(1.0), 0) + score2, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(2.0), 0) ei1Count := uint64(7) ei1 := &stubIndex{ err: nil, @@ -466,6 +469,7 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -485,6 +489,7 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -572,10 +577,12 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, { ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 2.0, @@ -601,6 +608,8 @@ func TestIndexAliasMulti(t *testing.T) { // TestMultiSearchNoError func TestMultiSearchNoError(t *testing.T) { + score1, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(1.0), 0) + score2, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(2.0), 0) ei1 := &stubIndex{err: nil, searchResult: &SearchResult{ Status: &SearchStatus{ Total: 1, @@ -613,6 +622,7 @@ func TestMultiSearchNoError(t *testing.T) { Index: "1", ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -629,6 +639,7 @@ func TestMultiSearchNoError(t *testing.T) { Index: "2", ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -648,11 +659,13 @@ func TestMultiSearchNoError(t *testing.T) { Index: "2", ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, { Index: "1", ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 2.0, @@ -784,6 +797,8 @@ func TestMultiSearchSecondPage(t *testing.T) { // 2. no searchers finish before the timeout // 3. no searches finish before cancellation func TestMultiSearchTimeout(t *testing.T) { + score1, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(1.0), 0) + score2, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(2.0), 0) ei1 := &stubIndex{ name: "ei1", checkRequest: func(req *SearchRequest) error { @@ -803,6 +818,7 @@ func TestMultiSearchTimeout(t *testing.T) { Index: "1", ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -826,6 +842,7 @@ func TestMultiSearchTimeout(t *testing.T) { Index: "2", ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -909,6 +926,9 @@ func TestMultiSearchTimeout(t *testing.T) { // TestMultiSearchTimeoutPartial tests the case where some indexes exceed // the timeout, while others complete successfully func TestMultiSearchTimeoutPartial(t *testing.T) { + score1, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(1.0), 0) + score2, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(2.0), 0) + score3, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(3.0), 0) ei1 := &stubIndex{ name: "ei1", err: nil, @@ -924,6 +944,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "1", ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -943,6 +964,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "2", ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -967,6 +989,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "3", ID: "c", Score: 3.0, + Sort: []string{string(score3)}, }, }, MaxScore: 3.0, @@ -993,11 +1016,13 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "2", ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, { Index: "1", ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 2.0, @@ -1014,6 +1039,10 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { } func TestIndexAliasMultipleLayer(t *testing.T) { + score1, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(1.0), 0) + score2, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(2.0), 0) + score3, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(3.0), 0) + score4, _ := numeric_util.NewPrefixCodedInt64(numeric_util.Float64ToInt64(4.0), 0) ei1 := &stubIndex{ name: "ei1", err: nil, @@ -1029,6 +1058,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "1", ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -1052,6 +1082,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "2", ID: "b", Score: 2.0, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -1076,6 +1107,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "3", ID: "c", Score: 3.0, + Sort: []string{string(score3)}, }, }, MaxScore: 3.0, @@ -1096,6 +1128,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "4", ID: "d", Score: 4.0, + Sort: []string{string(score4)}, }, }, MaxScore: 4.0, @@ -1129,11 +1162,13 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "4", ID: "d", Score: 4.0, + Sort: []string{string(score4)}, }, { Index: "1", ID: "a", Score: 1.0, + Sort: []string{string(score1)}, }, }, MaxScore: 4.0, @@ -1149,6 +1184,105 @@ func TestIndexAliasMultipleLayer(t *testing.T) { } } +// TestMultiSearchNoError +func TestMultiSearchCustomSort(t *testing.T) { + ei1 := &stubIndex{err: nil, searchResult: &SearchResult{ + Status: &SearchStatus{ + Total: 1, + Successful: 1, + Errors: make(map[string]error), + }, + Total: 2, + Hits: search.DocumentMatchCollection{ + { + Index: "1", + ID: "a", + Score: 1.0, + Sort: []string{"albert"}, + }, + { + Index: "1", + ID: "b", + Score: 2.0, + Sort: []string{"crown"}, + }, + }, + MaxScore: 2.0, + }} + ei2 := &stubIndex{err: nil, searchResult: &SearchResult{ + Status: &SearchStatus{ + Total: 1, + Successful: 1, + Errors: make(map[string]error), + }, + Total: 2, + Hits: search.DocumentMatchCollection{ + { + Index: "2", + ID: "c", + Score: 2.5, + Sort: []string{"frank"}, + }, + { + Index: "2", + ID: "d", + Score: 3.0, + Sort: []string{"zombie"}, + }, + }, + MaxScore: 3.0, + }} + + sr := NewSearchRequest(NewTermQuery("test")) + sr.SortBy([]string{"name"}) + expected := &SearchResult{ + Status: &SearchStatus{ + Total: 2, + Successful: 2, + Errors: make(map[string]error), + }, + Request: sr, + Total: 4, + Hits: search.DocumentMatchCollection{ + { + Index: "1", + ID: "a", + Score: 1.0, + Sort: []string{"albert"}, + }, + { + Index: "1", + ID: "b", + Score: 2.0, + Sort: []string{"crown"}, + }, + { + Index: "2", + ID: "c", + Score: 2.5, + Sort: []string{"frank"}, + }, + { + Index: "2", + ID: "d", + Score: 3.0, + Sort: []string{"zombie"}, + }, + }, + MaxScore: 3.0, + } + + results, err := MultiSearch(context.Background(), sr, ei1, ei2) + if err != nil { + t.Error(err) + } + // cheat and ensure that Took field matches since it invovles time + expected.Took = results.Took + if !reflect.DeepEqual(results, expected) { + t.Errorf("expected %v, got %v", expected, results) + } +} + // stubIndex is an Index impl for which all operations // return the configured error value, unless the // corresponding operation result value has been diff --git a/search.go b/search.go index 22e49c10..dc6bc26e 100644 --- a/search.go +++ b/search.go @@ -262,7 +262,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { r.Size = *temp.Size } if temp.Sort == nil { - r.Sort = search.SortOrder{&search.SortScore{Descending: true}} + r.Sort = search.SortOrder{&search.SortScore{Desc: true}} } else { r.Sort, err = search.ParseSortOrderJSON(temp.Sort) if err != nil { @@ -307,7 +307,7 @@ func NewSearchRequestOptions(q Query, size, from int, explain bool) *SearchReque Size: size, From: from, Explain: explain, - Sort: search.SortOrder{&search.SortScore{Descending: true}}, + Sort: search.SortOrder{&search.SortScore{Desc: true}}, } } diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 86a6c046..6c91c1dd 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -128,6 +128,9 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I d.CachedFieldTerms.Merge(fieldTerms) } + // compute this hits sort value + d.Sort = hc.sort.Value(d) + // optimization, we track lowest sorting hit already removed from heap // with this one comparision, we can avoid all heap operations if // this hit would have been added and then immediately removed diff --git a/search/collectors/collector_heap_test.go b/search/collectors/collector_heap_test.go index 4a0d2ad6..69bbf46a 100644 --- a/search/collectors/collector_heap_test.go +++ b/search/collectors/collector_heap_test.go @@ -84,7 +84,7 @@ func TestTop10Scores(t *testing.T) { }, } - collector := NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Descending: true}}) + collector := NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) err := collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -192,7 +192,7 @@ func TestTop10ScoresSkip10(t *testing.T) { }, } - collector := NewHeapCollector(10, 10, search.SortOrder{&search.SortScore{Descending: true}}) + collector := NewHeapCollector(10, 10, search.SortOrder{&search.SortScore{Desc: true}}) err := collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -289,7 +289,7 @@ func TestPaginationSameScores(t *testing.T) { } // first get first 5 hits - collector := NewHeapCollector(5, 0, search.SortOrder{&search.SortScore{Descending: true}}) + collector := NewHeapCollector(5, 0, search.SortOrder{&search.SortScore{Desc: true}}) err := collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -375,7 +375,7 @@ func TestPaginationSameScores(t *testing.T) { } // now get next 5 hits - collector = NewHeapCollector(5, 5, search.SortOrder{&search.SortScore{Descending: true}}) + collector = NewHeapCollector(5, 5, search.SortOrder{&search.SortScore{Desc: true}}) err = collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -401,17 +401,17 @@ func TestPaginationSameScores(t *testing.T) { } func BenchmarkTop10of100000Scores(b *testing.B) { - benchHelper(10000, NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Descending: true}}), b) + benchHelper(10000, NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) } func BenchmarkTop100of100000Scores(b *testing.B) { - benchHelper(10000, NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Descending: true}}), b) + benchHelper(10000, NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) } func BenchmarkTop10of1000000Scores(b *testing.B) { - benchHelper(100000, NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Descending: true}}), b) + benchHelper(100000, NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) } func BenchmarkTop100of1000000Scores(b *testing.B) { - benchHelper(100000, NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Descending: true}}), b) + benchHelper(100000, NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) } diff --git a/search/search.go b/search/search.go index 355184b1..34eb620d 100644 --- a/search/search.go +++ b/search/search.go @@ -65,6 +65,7 @@ type DocumentMatch struct { Expl *Explanation `json:"explanation,omitempty"` Locations FieldTermLocationMap `json:"locations,omitempty"` Fragments FieldFragmentMap `json:"fragments,omitempty"` + Sort []string `json:"sort,omitempty"` // Fields contains the values for document fields listed in // SearchRequest.Fields. Text fields are returned as strings, numeric diff --git a/search/sort.go b/search/sort.go index 5b162831..ead0132f 100644 --- a/search/sort.go +++ b/search/sort.go @@ -22,7 +22,8 @@ var HighTerm = strings.Repeat(string([]byte{0xff}), 10) var LowTerm = string([]byte{0x00}) type SearchSort interface { - Compare(a, b *DocumentMatch) int + Value(a *DocumentMatch) string + Descending() bool RequiresDocID() bool RequiresScoring() bool @@ -38,11 +39,11 @@ func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) { switch by { case "id": return &SortDocID{ - Descending: descending, + Desc: descending, }, nil case "score": return &SortScore{ - Descending: descending, + Desc: descending, }, nil case "field": field, ok := input["field"].(string) @@ -50,8 +51,8 @@ func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) { return nil, fmt.Errorf("search sort mode field must specify field") } rv := &SortField{ - Field: field, - Descending: descending, + Field: field, + Desc: descending, } typ, ok := input["type"].(string) if ok { @@ -108,16 +109,16 @@ func ParseSearchSortString(input string) SearchSort { } if input == "_id" { return &SortDocID{ - Descending: descending, + Desc: descending, } } else if input == "_score" { return &SortScore{ - Descending: descending, + Desc: descending, } } return &SortField{ - Field: input, - Descending: descending, + Field: input, + Desc: descending, } } @@ -159,13 +160,27 @@ func ParseSortOrderJSON(in []json.RawMessage) (SortOrder, error) { type SortOrder []SearchSort +func (so SortOrder) Value(doc *DocumentMatch) []string { + rv := make([]string, len(so)) + for i, soi := range so { + rv[i] = soi.Value(doc) + } + return rv +} + func (so SortOrder) Compare(i, j *DocumentMatch) int { // compare the documents on all search sorts until a differences is found - for _, soi := range so { - c := soi.Compare(i, j) + for x, soi := range so { + iVal := i.Sort[x] + jVal := j.Sort[x] + c := strings.Compare(iVal, jVal) if c == 0 { continue } + if soi.Descending() { + c = -c + } + //c := soi.Compare(i, j) return c } // if they are the same at this point, impose order based on index natural sort order @@ -250,26 +265,24 @@ const ( // Mode controls behavior for multi-values fields (default first) // Missing controls behavior of missing values (default last) type SortField struct { - Field string - Descending bool - Type SortFieldType - Mode SortFieldMode - Missing SortFieldMissing + Field string + Desc bool + Type SortFieldType + Mode SortFieldMode + Missing SortFieldMissing } -// Compare orders DocumentMatch instances by stored field values -func (s *SortField) Compare(i, j *DocumentMatch) int { +// Value returns the sort value of the DocumentMatch +func (s *SortField) Value(i *DocumentMatch) string { iTerms := i.CachedFieldTerms[s.Field] iTerms = s.filterTermsByType(iTerms) iTerm := s.filterTermsByMode(iTerms) - jTerms := j.CachedFieldTerms[s.Field] - jTerms = s.filterTermsByType(jTerms) - jTerm := s.filterTermsByMode(jTerms) - rv := strings.Compare(iTerm, jTerm) - if s.Descending { - rv = -rv - } - return rv + return iTerm +} + +// Descending determines the order of the sort +func (s *SortField) Descending() bool { + return s.Desc } func (s *SortField) filterTermsByMode(terms []string) string { @@ -288,12 +301,12 @@ func (s *SortField) filterTermsByMode(terms []string) string { // handle missing terms if s.Missing == SortFieldMissingLast { - if s.Descending { + if s.Desc { return LowTerm } return HighTerm } - if s.Descending { + if s.Desc { return HighTerm } return LowTerm @@ -347,7 +360,7 @@ func (s *SortField) MarshalJSON() ([]byte, error) { if s.Missing == SortFieldMissingLast && s.Mode == SortFieldDefault && s.Type == SortFieldAuto { - if s.Descending { + if s.Desc { return json.Marshal("-" + s.Field) } return json.Marshal(s.Field) @@ -356,7 +369,7 @@ func (s *SortField) MarshalJSON() ([]byte, error) { "by": "field", "field": s.Field, } - if s.Descending { + if s.Desc { sfm["desc"] = true } if s.Missing > SortFieldMissingLast { @@ -389,15 +402,17 @@ func (s *SortField) MarshalJSON() ([]byte, error) { // SortDocID will sort results by the document identifier type SortDocID struct { - Descending bool + Desc bool } -// Compare orders DocumentMatch instances by document identifiers -func (s *SortDocID) Compare(i, j *DocumentMatch) int { - if s.Descending { - return strings.Compare(j.ID, i.ID) - } - return strings.Compare(i.ID, j.ID) +// Value returns the sort value of the DocumentMatch +func (s *SortDocID) Value(i *DocumentMatch) string { + return i.ID +} + +// Descending determines the order of the sort +func (s *SortDocID) Descending() bool { + return s.Desc } // RequiresDocID says this SearchSort does require the DocID be loaded @@ -410,7 +425,7 @@ func (s *SortDocID) RequiresScoring() bool { return false } func (s *SortDocID) RequiresFields() []string { return nil } func (s *SortDocID) MarshalJSON() ([]byte, error) { - if s.Descending { + if s.Desc { return json.Marshal("-_id") } return json.Marshal("_id") @@ -418,17 +433,19 @@ func (s *SortDocID) MarshalJSON() ([]byte, error) { // SortScore will sort results by the document match score type SortScore struct { - Descending bool + Desc bool } -// Compare orders DocumentMatch instances by computed scores -func (s *SortScore) Compare(i, j *DocumentMatch) int { - if i.Score == j.Score { - return 0 - } else if (i.Score < j.Score && !s.Descending) || (j.Score < i.Score && s.Descending) { - return -1 - } - return 1 +// Value returns the sort value of the DocumentMatch +func (s *SortScore) Value(i *DocumentMatch) string { + scoreInt := numeric_util.Float64ToInt64(i.Score) + prefixCodedScore, _ := numeric_util.NewPrefixCodedInt64(scoreInt, 0) + return string(prefixCodedScore) +} + +// Descending determines the order of the sort +func (s *SortScore) Descending() bool { + return s.Desc } // RequiresDocID says this SearchSort does not require the DocID be loaded @@ -441,7 +458,7 @@ func (s *SortScore) RequiresScoring() bool { return true } func (s *SortScore) RequiresFields() []string { return nil } func (s *SortScore) MarshalJSON() ([]byte, error) { - if s.Descending { + if s.Desc { return json.Marshal("-_score") } return json.Marshal("_score") From 5e94145cf4da8f2524085384f493bc9a63f726d2 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 24 Aug 2016 15:56:26 -0400 Subject: [PATCH 16/24] apply same colletor benchmark change --- search/collectors/bench_test.go | 5 ++++- search/collectors/collector_heap_test.go | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/search/collectors/bench_test.go b/search/collectors/bench_test.go index cddf5e7f..43ceb764 100644 --- a/search/collectors/bench_test.go +++ b/search/collectors/bench_test.go @@ -10,7 +10,9 @@ import ( "golang.org/x/net/context" ) -func benchHelper(numOfMatches int, collector search.Collector, b *testing.B) { +type createCollector func() search.Collector + +func benchHelper(numOfMatches int, cc createCollector, b *testing.B) { matches := make([]*search.DocumentMatch, 0, numOfMatches) for i := 0; i < numOfMatches; i++ { matches = append(matches, &search.DocumentMatch{ @@ -25,6 +27,7 @@ func benchHelper(numOfMatches int, collector search.Collector, b *testing.B) { searcher := &stubSearcher{ matches: matches, } + collector := cc() err := collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { b.Fatal(err) diff --git a/search/collectors/collector_heap_test.go b/search/collectors/collector_heap_test.go index 69bbf46a..fad0fe04 100644 --- a/search/collectors/collector_heap_test.go +++ b/search/collectors/collector_heap_test.go @@ -401,17 +401,25 @@ func TestPaginationSameScores(t *testing.T) { } func BenchmarkTop10of100000Scores(b *testing.B) { - benchHelper(10000, NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) + benchHelper(10000, func() search.Collector { + return NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) + }, b) } func BenchmarkTop100of100000Scores(b *testing.B) { - benchHelper(10000, NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) + benchHelper(10000, func() search.Collector { + return NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}) + }, b) } func BenchmarkTop10of1000000Scores(b *testing.B) { - benchHelper(100000, NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) + benchHelper(100000, func() search.Collector { + return NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) + }, b) } func BenchmarkTop100of1000000Scores(b *testing.B) { - benchHelper(100000, NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}), b) + benchHelper(100000, func() search.Collector { + return NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}) + }, b) } From ce0b299d6f12e39b4b4d4946c1f69fcb69c85e8e Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 24 Aug 2016 19:02:22 -0400 Subject: [PATCH 17/24] switch sort impl to use interface this improves perf in the case where we're not doing any sorting as we avoid allocating memory and converting scores into numeric terms --- index_alias_impl_test.go | 58 +++++++++--------- search/collectors/bench_test.go | 9 +-- search/collectors/collector_heap.go | 21 +++++-- search/pool.go | 3 +- search/pool_test.go | 2 +- search/scorers/scorer_constant_test.go | 6 +- search/scorers/scorer_term_test.go | 8 ++- search/search.go | 10 ++- search/searchers/search_boolean_test.go | 2 +- search/searchers/search_conjunction_test.go | 2 +- search/searchers/search_disjunction_test.go | 4 +- search/searchers/search_docid_test.go | 2 +- search/searchers/search_fuzzy_test.go | 2 +- search/searchers/search_match_all_test.go | 2 +- search/searchers/search_match_none_test.go | 2 +- search/searchers/search_phrase_test.go | 2 +- search/searchers/search_regexp_test.go | 2 +- search/searchers/search_term_test.go | 2 +- search/sort.go | 67 ++++++++++++++++----- 19 files changed, 134 insertions(+), 72 deletions(-) diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index 7aa2dd46..adbc478d 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -469,7 +469,7 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 1.0, @@ -489,7 +489,7 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, }, MaxScore: 2.0, @@ -577,12 +577,12 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, { ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 2.0, @@ -622,7 +622,7 @@ func TestMultiSearchNoError(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 1.0, @@ -639,7 +639,7 @@ func TestMultiSearchNoError(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, }, MaxScore: 2.0, @@ -659,13 +659,13 @@ func TestMultiSearchNoError(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, { Index: "1", ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 2.0, @@ -818,7 +818,7 @@ func TestMultiSearchTimeout(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 1.0, @@ -842,7 +842,7 @@ func TestMultiSearchTimeout(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, }, MaxScore: 2.0, @@ -944,7 +944,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 1.0, @@ -964,7 +964,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, }, MaxScore: 2.0, @@ -989,7 +989,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "3", ID: "c", Score: 3.0, - Sort: []string{string(score3)}, + Sort: []interface{}{string(score3)}, }, }, MaxScore: 3.0, @@ -1016,13 +1016,13 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, { Index: "1", ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 2.0, @@ -1058,7 +1058,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 1.0, @@ -1082,7 +1082,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []string{string(score2)}, + Sort: []interface{}{string(score2)}, }, }, MaxScore: 2.0, @@ -1107,7 +1107,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "3", ID: "c", Score: 3.0, - Sort: []string{string(score3)}, + Sort: []interface{}{string(score3)}, }, }, MaxScore: 3.0, @@ -1128,7 +1128,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "4", ID: "d", Score: 4.0, - Sort: []string{string(score4)}, + Sort: []interface{}{string(score4)}, }, }, MaxScore: 4.0, @@ -1162,13 +1162,13 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "4", ID: "d", Score: 4.0, - Sort: []string{string(score4)}, + Sort: []interface{}{string(score4)}, }, { Index: "1", ID: "a", Score: 1.0, - Sort: []string{string(score1)}, + Sort: []interface{}{string(score1)}, }, }, MaxScore: 4.0, @@ -1198,13 +1198,13 @@ func TestMultiSearchCustomSort(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []string{"albert"}, + Sort: []interface{}{"albert"}, }, { Index: "1", ID: "b", Score: 2.0, - Sort: []string{"crown"}, + Sort: []interface{}{"crown"}, }, }, MaxScore: 2.0, @@ -1221,13 +1221,13 @@ func TestMultiSearchCustomSort(t *testing.T) { Index: "2", ID: "c", Score: 2.5, - Sort: []string{"frank"}, + Sort: []interface{}{"frank"}, }, { Index: "2", ID: "d", Score: 3.0, - Sort: []string{"zombie"}, + Sort: []interface{}{"zombie"}, }, }, MaxScore: 3.0, @@ -1248,25 +1248,25 @@ func TestMultiSearchCustomSort(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []string{"albert"}, + Sort: []interface{}{"albert"}, }, { Index: "1", ID: "b", Score: 2.0, - Sort: []string{"crown"}, + Sort: []interface{}{"crown"}, }, { Index: "2", ID: "c", Score: 2.5, - Sort: []string{"frank"}, + Sort: []interface{}{"frank"}, }, { Index: "2", ID: "d", Score: 3.0, - Sort: []string{"zombie"}, + Sort: []interface{}{"zombie"}, }, }, MaxScore: 3.0, diff --git a/search/collectors/bench_test.go b/search/collectors/bench_test.go index 43ceb764..8367a693 100644 --- a/search/collectors/bench_test.go +++ b/search/collectors/bench_test.go @@ -13,12 +13,13 @@ import ( type createCollector func() search.Collector func benchHelper(numOfMatches int, cc createCollector, b *testing.B) { + dp := search.NewDocumentMatchPool(numOfMatches, 1) matches := make([]*search.DocumentMatch, 0, numOfMatches) for i := 0; i < numOfMatches; i++ { - matches = append(matches, &search.DocumentMatch{ - IndexInternalID: index.IndexInternalID(strconv.Itoa(i)), - Score: rand.Float64(), - }) + match := dp.Get() + match.IndexInternalID = index.IndexInternalID(strconv.Itoa(i)) + match.Score = rand.Float64() + matches = append(matches, match) } b.ResetTimer() diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 6c91c1dd..43a53757 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -28,6 +28,9 @@ type HeapCollector struct { results search.DocumentMatchCollection facetsBuilder *search.FacetsBuilder + needDocIds bool + neededFields []string + lowestMatchOutsideResults *search.DocumentMatch } @@ -35,7 +38,15 @@ var COLLECT_CHECK_DONE_EVERY = uint64(1024) func NewHeapCollector(size int, skip int, sort search.SortOrder) *HeapCollector { hc := &HeapCollector{size: size, skip: skip, sort: sort} + hc.results = make(search.DocumentMatchCollection, 0, size+skip) heap.Init(hc) + + // these lookups traverse an interface, so do once up-front + if sort.RequiresDocID() { + hc.needDocIds = true + } + hc.neededFields = sort.RequiredFields() + return hc } @@ -49,7 +60,7 @@ func (hc *HeapCollector) Collect(ctx context.Context, searcher search.Searcher, // plus possibly one extra for the highestMatchOutsideResults // plus the amount required by the searcher tree searchContext := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(hc.size + hc.skip + 1 + searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(hc.size+hc.skip+1+searcher.DocumentMatchPoolSize(), len(hc.sort)), } select { @@ -105,7 +116,7 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I var err error // see if we need to load ID (at this early stage, for example to sort on it) - if hc.sort.RequiresDocID() { + if hc.needDocIds { d.ID, err = reader.FinalizeDocID(d.IndexInternalID) if err != nil { return err @@ -113,9 +124,9 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I } // see if we need to load the stored fields - if len(hc.sort.RequiredFields()) > 0 { + if len(hc.neededFields) > 0 { // find out which fields haven't been loaded yet - fieldsToLoad := d.CachedFieldTerms.FieldsNotYetCached(hc.sort.RequiredFields()) + fieldsToLoad := d.CachedFieldTerms.FieldsNotYetCached(hc.neededFields) // look them up fieldTerms, err := reader.DocumentFieldTerms(d.IndexInternalID, fieldsToLoad) if err != nil { @@ -129,7 +140,7 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I } // compute this hits sort value - d.Sort = hc.sort.Value(d) + hc.sort.Value(d) // optimization, we track lowest sorting hit already removed from heap // with this one comparision, we can avoid all heap operations if diff --git a/search/pool.go b/search/pool.go index 108d494b..89e1841d 100644 --- a/search/pool.go +++ b/search/pool.go @@ -31,12 +31,13 @@ func defaultDocumentMatchPoolTooSmall(p *DocumentMatchPool) *DocumentMatch { // NewDocumentMatchPool will build a DocumentMatchPool with memory // pre-allocated to accomodate the requested number of DocumentMatch // instances -func NewDocumentMatchPool(size int) *DocumentMatchPool { +func NewDocumentMatchPool(size, sortsize int) *DocumentMatchPool { avail := make(DocumentMatchCollection, 0, size) // pre-allocate the expected number of instances startBlock := make([]DocumentMatch, size) // make these initial instances available for i := range startBlock { + startBlock[i].Sort = make([]interface{}, 0, sortsize) avail = append(avail, &startBlock[i]) } return &DocumentMatchPool{ diff --git a/search/pool_test.go b/search/pool_test.go index 875f607c..51001757 100644 --- a/search/pool_test.go +++ b/search/pool_test.go @@ -16,7 +16,7 @@ func TestDocumentMatchPool(t *testing.T) { tooManyCalled := false // create a pool - dmp := NewDocumentMatchPool(10) + dmp := NewDocumentMatchPool(10, 0) dmp.TooSmall = func(inner *DocumentMatchPool) *DocumentMatch { tooManyCalled = true return &DocumentMatch{} diff --git a/search/scorers/scorer_constant_test.go b/search/scorers/scorer_constant_test.go index e1e6e9c6..8757a779 100644 --- a/search/scorers/scorer_constant_test.go +++ b/search/scorers/scorer_constant_test.go @@ -47,13 +47,14 @@ func TestConstantScorer(t *testing.T) { Value: 1.0, Message: "ConstantScore()", }, + Sort: []interface{}{}, }, }, } for _, test := range tests { ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(1), + DocumentMatchPool: search.NewDocumentMatchPool(1, 0), } actual := scorer.Score(ctx, test.termMatch.ID) @@ -82,6 +83,7 @@ func TestConstantScorerWithQueryNorm(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: 2.0, + Sort: []interface{}{}, Expl: &search.Explanation{ Value: 2.0, Message: "weight(^1.000000), product of:", @@ -112,7 +114,7 @@ func TestConstantScorerWithQueryNorm(t *testing.T) { for _, test := range tests { ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(1), + DocumentMatchPool: search.NewDocumentMatchPool(1, 0), } actual := scorer.Score(ctx, test.termMatch.ID) diff --git a/search/scorers/scorer_term_test.go b/search/scorers/scorer_term_test.go index 0241d163..3d47a9d3 100644 --- a/search/scorers/scorer_term_test.go +++ b/search/scorers/scorer_term_test.go @@ -50,6 +50,7 @@ func TestTermScorer(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(1.0) * idf, + Sort: []interface{}{}, Expl: &search.Explanation{ Value: math.Sqrt(1.0) * idf, Message: "fieldWeight(desc:beer in one), product of:", @@ -91,6 +92,7 @@ func TestTermScorer(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(1.0) * idf, + Sort: []interface{}{}, Expl: &search.Explanation{ Value: math.Sqrt(1.0) * idf, Message: "fieldWeight(desc:beer in one), product of:", @@ -121,6 +123,7 @@ func TestTermScorer(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(65) * idf, + Sort: []interface{}{}, Expl: &search.Explanation{ Value: math.Sqrt(65) * idf, Message: "fieldWeight(desc:beer in one), product of:", @@ -145,7 +148,7 @@ func TestTermScorer(t *testing.T) { for _, test := range tests { ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(1), + DocumentMatchPool: search.NewDocumentMatchPool(1, 0), } actual := scorer.Score(ctx, test.termMatch) @@ -187,6 +190,7 @@ func TestTermScorerWithQueryNorm(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(1.0) * idf * 3.0 * idf * 2.0, + Sort: []interface{}{}, Expl: &search.Explanation{ Value: math.Sqrt(1.0) * idf * 3.0 * idf * 2.0, Message: "weight(desc:beer^3.000000 in one), product of:", @@ -235,7 +239,7 @@ func TestTermScorerWithQueryNorm(t *testing.T) { for _, test := range tests { ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(1), + DocumentMatchPool: search.NewDocumentMatchPool(1, 0), } actual := scorer.Score(ctx, test.termMatch) diff --git a/search/search.go b/search/search.go index 34eb620d..69a806e8 100644 --- a/search/search.go +++ b/search/search.go @@ -65,7 +65,7 @@ type DocumentMatch struct { Expl *Explanation `json:"explanation,omitempty"` Locations FieldTermLocationMap `json:"locations,omitempty"` Fragments FieldFragmentMap `json:"fragments,omitempty"` - Sort []string `json:"sort,omitempty"` + Sort []interface{} `json:"sort,omitempty"` // Fields contains the values for document fields listed in // SearchRequest.Fields. Text fields are returned as strings, numeric @@ -107,11 +107,15 @@ func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { // Reset allows an already allocated DocumentMatch to be reused func (dm *DocumentMatch) Reset() *DocumentMatch { // remember the []byte used for the IndexInternalID - indexInternalId := dm.IndexInternalID + indexInternalID := dm.IndexInternalID + // remember the []interface{} used for sort + sort := dm.Sort // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) - dm.IndexInternalID = indexInternalId[:0] + dm.IndexInternalID = indexInternalID[:0] + // reuse the []interface{} already allocated (and reset len to 0) + dm.Sort = sort[:0] return dm } diff --git a/search/searchers/search_boolean_test.go b/search/searchers/search_boolean_test.go index ddd31d2e..c9aea8a2 100644 --- a/search/searchers/search_boolean_test.go +++ b/search/searchers/search_boolean_test.go @@ -344,7 +344,7 @@ func TestBooleanSearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize(), 0), } next, err := test.searcher.Next(ctx) i := 0 diff --git a/search/searchers/search_conjunction_test.go b/search/searchers/search_conjunction_test.go index 227a70ba..8554e0bf 100644 --- a/search/searchers/search_conjunction_test.go +++ b/search/searchers/search_conjunction_test.go @@ -189,7 +189,7 @@ func TestConjunctionSearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(10), + DocumentMatchPool: search.NewDocumentMatchPool(10, 0), } next, err := test.searcher.Next(ctx) i := 0 diff --git a/search/searchers/search_disjunction_test.go b/search/searchers/search_disjunction_test.go index 5f194065..9fe7cc62 100644 --- a/search/searchers/search_disjunction_test.go +++ b/search/searchers/search_disjunction_test.go @@ -110,7 +110,7 @@ func TestDisjunctionSearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize(), 0), } next, err := test.searcher.Next(ctx) i := 0 @@ -164,7 +164,7 @@ func TestDisjunctionAdvance(t *testing.T) { } ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(martyOrDustinSearcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(martyOrDustinSearcher.DocumentMatchPoolSize(), 0), } match, err := martyOrDustinSearcher.Advance(ctx, index.IndexInternalID("3")) if err != nil { diff --git a/search/searchers/search_docid_test.go b/search/searchers/search_docid_test.go index 00fe3db9..56b2da97 100644 --- a/search/searchers/search_docid_test.go +++ b/search/searchers/search_docid_test.go @@ -64,7 +64,7 @@ func testDocIDSearcher(t *testing.T, indexed, searched, wanted []string) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(searcher.DocumentMatchPoolSize(), 0), } // Check the sequence diff --git a/search/searchers/search_fuzzy_test.go b/search/searchers/search_fuzzy_test.go index b4469666..086c1f72 100644 --- a/search/searchers/search_fuzzy_test.go +++ b/search/searchers/search_fuzzy_test.go @@ -107,7 +107,7 @@ func TestFuzzySearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize(), 0), } next, err := test.searcher.Next(ctx) i := 0 diff --git a/search/searchers/search_match_all_test.go b/search/searchers/search_match_all_test.go index 26e8a3c2..04b9421a 100644 --- a/search/searchers/search_match_all_test.go +++ b/search/searchers/search_match_all_test.go @@ -111,7 +111,7 @@ func TestMatchAllSearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize(), 0), } next, err := test.searcher.Next(ctx) i := 0 diff --git a/search/searchers/search_match_none_test.go b/search/searchers/search_match_none_test.go index 90ec526c..e7c8876a 100644 --- a/search/searchers/search_match_none_test.go +++ b/search/searchers/search_match_none_test.go @@ -52,7 +52,7 @@ func TestMatchNoneSearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize(), 0), } next, err := test.searcher.Next(ctx) i := 0 diff --git a/search/searchers/search_phrase_test.go b/search/searchers/search_phrase_test.go index 8c9a76d2..2a6a485f 100644 --- a/search/searchers/search_phrase_test.go +++ b/search/searchers/search_phrase_test.go @@ -70,7 +70,7 @@ func TestPhraseSearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize(), 0), } next, err := test.searcher.Next(ctx) i := 0 diff --git a/search/searchers/search_regexp_test.go b/search/searchers/search_regexp_test.go index f2859cb9..8ea1c720 100644 --- a/search/searchers/search_regexp_test.go +++ b/search/searchers/search_regexp_test.go @@ -87,7 +87,7 @@ func TestRegexpSearch(t *testing.T) { }() ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize()), + DocumentMatchPool: search.NewDocumentMatchPool(test.searcher.DocumentMatchPoolSize(), 0), } next, err := test.searcher.Next(ctx) i := 0 diff --git a/search/searchers/search_term_test.go b/search/searchers/search_term_test.go index c94803e1..9220d725 100644 --- a/search/searchers/search_term_test.go +++ b/search/searchers/search_term_test.go @@ -165,7 +165,7 @@ func TestTermSearcher(t *testing.T) { } ctx := &search.SearchContext{ - DocumentMatchPool: search.NewDocumentMatchPool(1), + DocumentMatchPool: search.NewDocumentMatchPool(1, 0), } docMatch, err := searcher.Next(ctx) if err != nil { diff --git a/search/sort.go b/search/sort.go index ead0132f..62632ca8 100644 --- a/search/sort.go +++ b/search/sort.go @@ -22,7 +22,7 @@ var HighTerm = strings.Repeat(string([]byte{0xff}), 10) var LowTerm = string([]byte{0x00}) type SearchSort interface { - Value(a *DocumentMatch) string + Value(a *DocumentMatch) interface{} Descending() bool RequiresDocID() bool @@ -160,27 +160,55 @@ func ParseSortOrderJSON(in []json.RawMessage) (SortOrder, error) { type SortOrder []SearchSort -func (so SortOrder) Value(doc *DocumentMatch) []string { - rv := make([]string, len(so)) - for i, soi := range so { - rv[i] = soi.Value(doc) +func (so SortOrder) Value(doc *DocumentMatch) { + for _, soi := range so { + doc.Sort = append(doc.Sort, soi.Value(doc)) } - return rv } +// Compare will compare two document matches using the specified sort order +// if both are numbers, we avoid converting back to term func (so SortOrder) Compare(i, j *DocumentMatch) int { // compare the documents on all search sorts until a differences is found for x, soi := range so { iVal := i.Sort[x] jVal := j.Sort[x] - c := strings.Compare(iVal, jVal) + c := 0 + switch iVal := iVal.(type) { + case string: + switch jVal := jVal.(type) { + case string: + // both string + c = strings.Compare(iVal, jVal) + case float64: + // i is string, j is number, i sorts higher + ji := numeric_util.Float64ToInt64(jVal) + jt, _ := numeric_util.NewPrefixCodedInt64(ji, 0) + c = strings.Compare(iVal, string(jt)) + } + case float64: + switch jVal := jVal.(type) { + case string: + // i is number, j is string + ii := numeric_util.Float64ToInt64(iVal) + it, _ := numeric_util.NewPrefixCodedInt64(ii, 0) + c = strings.Compare(string(it), jVal) + case float64: + // numeric comparison + if iVal < jVal { + c = -1 + } else if iVal > jVal { + c = 1 + } + } + } + if c == 0 { continue } if soi.Descending() { c = -c } - //c := soi.Compare(i, j) return c } // if they are the same at this point, impose order based on index natural sort order @@ -273,10 +301,23 @@ type SortField struct { } // Value returns the sort value of the DocumentMatch -func (s *SortField) Value(i *DocumentMatch) string { +func (s *SortField) Value(i *DocumentMatch) interface{} { iTerms := i.CachedFieldTerms[s.Field] iTerms = s.filterTermsByType(iTerms) iTerm := s.filterTermsByMode(iTerms) + if s.Type == SortFieldAsNumber || s.Type == SortFieldAsDate { + // explicitly asked for numeric sort + rv, _ := numeric_util.PrefixCoded(iTerm).Int64() + return rv + } else if s.Type == SortFieldAuto { + // asked for auto, looks like a number + valid, shift := numeric_util.ValidPrefixCodedTerm(iTerm) + if valid && shift == 0 { + ri, _ := numeric_util.PrefixCoded(iTerm).Int64() + rv := numeric_util.Int64ToFloat64(ri) + return rv + } + } return iTerm } @@ -406,7 +447,7 @@ type SortDocID struct { } // Value returns the sort value of the DocumentMatch -func (s *SortDocID) Value(i *DocumentMatch) string { +func (s *SortDocID) Value(i *DocumentMatch) interface{} { return i.ID } @@ -437,10 +478,8 @@ type SortScore struct { } // Value returns the sort value of the DocumentMatch -func (s *SortScore) Value(i *DocumentMatch) string { - scoreInt := numeric_util.Float64ToInt64(i.Score) - prefixCodedScore, _ := numeric_util.NewPrefixCodedInt64(scoreInt, 0) - return string(prefixCodedScore) +func (s *SortScore) Value(i *DocumentMatch) interface{} { + return i.Score } // Descending determines the order of the sort From 60750c16140fe90492143f0eb9f39560e75f6124 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 25 Aug 2016 15:47:07 -0400 Subject: [PATCH 18/24] improved implementation to address perf regressions primary change is going back to sort values be []string and not []interface{}, this avoid allocatiosn converting into the interface{} that sounds obvious, so why didn't we just do that first? because a common (default) sort is score, which is naturally a number, not a string (like terms). converting into the number was also expensive, and the common case. so, this solution also makes the change to NOT put the score into the sort value list. instead you see the dummy value "_score". this is just a placeholder, the actual sort impl knows that field of the sort is the score, and will sort using the actual score. also, several other aspets of the benchmark were cleaned up so that unnecessary allocations do not pollute the cpu profiles Here are the updated benchmarks: $ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op PASS ok github.com/blevesearch/bleve/search/collectors 7.188s Prior to this PR, master reported: $ go test -run=xxx -bench=. -benchmem BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op PASS ok github.com/blevesearch/bleve/search/collectors 7.385s So, we're pretty close on the smaller datasets, and we scale better on the larger datasets. We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup). --- index_alias_impl_test.go | 58 ++++++++++++------------- search/collectors/bench_test.go | 9 ++-- search/collectors/collector_heap.go | 4 +- search/collectors/search_test.go | 8 +++- search/pool.go | 2 +- search/scorers/scorer_constant_test.go | 4 +- search/scorers/scorer_term_test.go | 8 ++-- search/search.go | 2 +- search/sort.go | 60 ++++++-------------------- 9 files changed, 64 insertions(+), 91 deletions(-) diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index adbc478d..7aa2dd46 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -469,7 +469,7 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -489,7 +489,7 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -577,12 +577,12 @@ func TestIndexAliasMulti(t *testing.T) { { ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, { ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 2.0, @@ -622,7 +622,7 @@ func TestMultiSearchNoError(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -639,7 +639,7 @@ func TestMultiSearchNoError(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -659,13 +659,13 @@ func TestMultiSearchNoError(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 2.0, @@ -818,7 +818,7 @@ func TestMultiSearchTimeout(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -842,7 +842,7 @@ func TestMultiSearchTimeout(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -944,7 +944,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -964,7 +964,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -989,7 +989,7 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "3", ID: "c", Score: 3.0, - Sort: []interface{}{string(score3)}, + Sort: []string{string(score3)}, }, }, MaxScore: 3.0, @@ -1016,13 +1016,13 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 2.0, @@ -1058,7 +1058,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 1.0, @@ -1082,7 +1082,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "2", ID: "b", Score: 2.0, - Sort: []interface{}{string(score2)}, + Sort: []string{string(score2)}, }, }, MaxScore: 2.0, @@ -1107,7 +1107,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "3", ID: "c", Score: 3.0, - Sort: []interface{}{string(score3)}, + Sort: []string{string(score3)}, }, }, MaxScore: 3.0, @@ -1128,7 +1128,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "4", ID: "d", Score: 4.0, - Sort: []interface{}{string(score4)}, + Sort: []string{string(score4)}, }, }, MaxScore: 4.0, @@ -1162,13 +1162,13 @@ func TestIndexAliasMultipleLayer(t *testing.T) { Index: "4", ID: "d", Score: 4.0, - Sort: []interface{}{string(score4)}, + Sort: []string{string(score4)}, }, { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{string(score1)}, + Sort: []string{string(score1)}, }, }, MaxScore: 4.0, @@ -1198,13 +1198,13 @@ func TestMultiSearchCustomSort(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{"albert"}, + Sort: []string{"albert"}, }, { Index: "1", ID: "b", Score: 2.0, - Sort: []interface{}{"crown"}, + Sort: []string{"crown"}, }, }, MaxScore: 2.0, @@ -1221,13 +1221,13 @@ func TestMultiSearchCustomSort(t *testing.T) { Index: "2", ID: "c", Score: 2.5, - Sort: []interface{}{"frank"}, + Sort: []string{"frank"}, }, { Index: "2", ID: "d", Score: 3.0, - Sort: []interface{}{"zombie"}, + Sort: []string{"zombie"}, }, }, MaxScore: 3.0, @@ -1248,25 +1248,25 @@ func TestMultiSearchCustomSort(t *testing.T) { Index: "1", ID: "a", Score: 1.0, - Sort: []interface{}{"albert"}, + Sort: []string{"albert"}, }, { Index: "1", ID: "b", Score: 2.0, - Sort: []interface{}{"crown"}, + Sort: []string{"crown"}, }, { Index: "2", ID: "c", Score: 2.5, - Sort: []interface{}{"frank"}, + Sort: []string{"frank"}, }, { Index: "2", ID: "d", Score: 3.0, - Sort: []interface{}{"zombie"}, + Sort: []string{"zombie"}, }, }, MaxScore: 3.0, diff --git a/search/collectors/bench_test.go b/search/collectors/bench_test.go index 8367a693..43ceb764 100644 --- a/search/collectors/bench_test.go +++ b/search/collectors/bench_test.go @@ -13,13 +13,12 @@ import ( type createCollector func() search.Collector func benchHelper(numOfMatches int, cc createCollector, b *testing.B) { - dp := search.NewDocumentMatchPool(numOfMatches, 1) matches := make([]*search.DocumentMatch, 0, numOfMatches) for i := 0; i < numOfMatches; i++ { - match := dp.Get() - match.IndexInternalID = index.IndexInternalID(strconv.Itoa(i)) - match.Score = rand.Float64() - matches = append(matches, match) + matches = append(matches, &search.DocumentMatch{ + IndexInternalID: index.IndexInternalID(strconv.Itoa(i)), + Score: rand.Float64(), + }) } b.ResetTimer() diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 43a53757..0d0154d3 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -38,7 +38,9 @@ var COLLECT_CHECK_DONE_EVERY = uint64(1024) func NewHeapCollector(size int, skip int, sort search.SortOrder) *HeapCollector { hc := &HeapCollector{size: size, skip: skip, sort: sort} - hc.results = make(search.DocumentMatchCollection, 0, size+skip) + // pre-allocate space on the heap, we need size+skip results + // +1 additional while figuring out which to evict + hc.results = make(search.DocumentMatchCollection, 0, size+skip+1) heap.Init(hc) // these lookups traverse an interface, so do once up-front diff --git a/search/collectors/search_test.go b/search/collectors/search_test.go index 26de7c53..035486b5 100644 --- a/search/collectors/search_test.go +++ b/search/collectors/search_test.go @@ -22,7 +22,9 @@ type stubSearcher struct { func (ss *stubSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { if ss.index < len(ss.matches) { - rv := ss.matches[ss.index] + rv := ctx.DocumentMatchPool.Get() + rv.IndexInternalID = ss.matches[ss.index].IndexInternalID + rv.Score = ss.matches[ss.index].Score ss.index++ return rv, nil } @@ -35,7 +37,9 @@ func (ss *stubSearcher) Advance(ctx *search.SearchContext, ID index.IndexInterna ss.index++ } if ss.index < len(ss.matches) { - rv := ss.matches[ss.index] + rv := ctx.DocumentMatchPool.Get() + rv.IndexInternalID = ss.matches[ss.index].IndexInternalID + rv.Score = ss.matches[ss.index].Score ss.index++ return rv, nil } diff --git a/search/pool.go b/search/pool.go index 89e1841d..5600f488 100644 --- a/search/pool.go +++ b/search/pool.go @@ -37,7 +37,7 @@ func NewDocumentMatchPool(size, sortsize int) *DocumentMatchPool { startBlock := make([]DocumentMatch, size) // make these initial instances available for i := range startBlock { - startBlock[i].Sort = make([]interface{}, 0, sortsize) + startBlock[i].Sort = make([]string, 0, sortsize) avail = append(avail, &startBlock[i]) } return &DocumentMatchPool{ diff --git a/search/scorers/scorer_constant_test.go b/search/scorers/scorer_constant_test.go index 8757a779..8238b07b 100644 --- a/search/scorers/scorer_constant_test.go +++ b/search/scorers/scorer_constant_test.go @@ -47,7 +47,7 @@ func TestConstantScorer(t *testing.T) { Value: 1.0, Message: "ConstantScore()", }, - Sort: []interface{}{}, + Sort: []string{}, }, }, } @@ -83,7 +83,7 @@ func TestConstantScorerWithQueryNorm(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: 2.0, - Sort: []interface{}{}, + Sort: []string{}, Expl: &search.Explanation{ Value: 2.0, Message: "weight(^1.000000), product of:", diff --git a/search/scorers/scorer_term_test.go b/search/scorers/scorer_term_test.go index 3d47a9d3..592463a4 100644 --- a/search/scorers/scorer_term_test.go +++ b/search/scorers/scorer_term_test.go @@ -50,7 +50,7 @@ func TestTermScorer(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(1.0) * idf, - Sort: []interface{}{}, + Sort: []string{}, Expl: &search.Explanation{ Value: math.Sqrt(1.0) * idf, Message: "fieldWeight(desc:beer in one), product of:", @@ -92,7 +92,7 @@ func TestTermScorer(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(1.0) * idf, - Sort: []interface{}{}, + Sort: []string{}, Expl: &search.Explanation{ Value: math.Sqrt(1.0) * idf, Message: "fieldWeight(desc:beer in one), product of:", @@ -123,7 +123,7 @@ func TestTermScorer(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(65) * idf, - Sort: []interface{}{}, + Sort: []string{}, Expl: &search.Explanation{ Value: math.Sqrt(65) * idf, Message: "fieldWeight(desc:beer in one), product of:", @@ -190,7 +190,7 @@ func TestTermScorerWithQueryNorm(t *testing.T) { result: &search.DocumentMatch{ IndexInternalID: index.IndexInternalID("one"), Score: math.Sqrt(1.0) * idf * 3.0 * idf * 2.0, - Sort: []interface{}{}, + Sort: []string{}, Expl: &search.Explanation{ Value: math.Sqrt(1.0) * idf * 3.0 * idf * 2.0, Message: "weight(desc:beer^3.000000 in one), product of:", diff --git a/search/search.go b/search/search.go index 69a806e8..5e43b748 100644 --- a/search/search.go +++ b/search/search.go @@ -65,7 +65,7 @@ type DocumentMatch struct { Expl *Explanation `json:"explanation,omitempty"` Locations FieldTermLocationMap `json:"locations,omitempty"` Fragments FieldFragmentMap `json:"fragments,omitempty"` - Sort []interface{} `json:"sort,omitempty"` + Sort []string `json:"sort,omitempty"` // Fields contains the values for document fields listed in // SearchRequest.Fields. Text fields are returned as strings, numeric diff --git a/search/sort.go b/search/sort.go index 62632ca8..6a34d2c1 100644 --- a/search/sort.go +++ b/search/sort.go @@ -22,7 +22,7 @@ var HighTerm = strings.Repeat(string([]byte{0xff}), 10) var LowTerm = string([]byte{0x00}) type SearchSort interface { - Value(a *DocumentMatch) interface{} + Value(a *DocumentMatch) string Descending() bool RequiresDocID() bool @@ -171,36 +171,17 @@ func (so SortOrder) Value(doc *DocumentMatch) { func (so SortOrder) Compare(i, j *DocumentMatch) int { // compare the documents on all search sorts until a differences is found for x, soi := range so { - iVal := i.Sort[x] - jVal := j.Sort[x] c := 0 - switch iVal := iVal.(type) { - case string: - switch jVal := jVal.(type) { - case string: - // both string - c = strings.Compare(iVal, jVal) - case float64: - // i is string, j is number, i sorts higher - ji := numeric_util.Float64ToInt64(jVal) - jt, _ := numeric_util.NewPrefixCodedInt64(ji, 0) - c = strings.Compare(iVal, string(jt)) - } - case float64: - switch jVal := jVal.(type) { - case string: - // i is number, j is string - ii := numeric_util.Float64ToInt64(iVal) - it, _ := numeric_util.NewPrefixCodedInt64(ii, 0) - c = strings.Compare(string(it), jVal) - case float64: - // numeric comparison - if iVal < jVal { - c = -1 - } else if iVal > jVal { - c = 1 - } + if soi.RequiresScoring() { + if i.Score < j.Score { + c = -1 + } else if i.Score > j.Score { + c = 1 } + } else { + iVal := i.Sort[x] + jVal := j.Sort[x] + c = strings.Compare(iVal, jVal) } if c == 0 { @@ -301,23 +282,10 @@ type SortField struct { } // Value returns the sort value of the DocumentMatch -func (s *SortField) Value(i *DocumentMatch) interface{} { +func (s *SortField) Value(i *DocumentMatch) string { iTerms := i.CachedFieldTerms[s.Field] iTerms = s.filterTermsByType(iTerms) iTerm := s.filterTermsByMode(iTerms) - if s.Type == SortFieldAsNumber || s.Type == SortFieldAsDate { - // explicitly asked for numeric sort - rv, _ := numeric_util.PrefixCoded(iTerm).Int64() - return rv - } else if s.Type == SortFieldAuto { - // asked for auto, looks like a number - valid, shift := numeric_util.ValidPrefixCodedTerm(iTerm) - if valid && shift == 0 { - ri, _ := numeric_util.PrefixCoded(iTerm).Int64() - rv := numeric_util.Int64ToFloat64(ri) - return rv - } - } return iTerm } @@ -447,7 +415,7 @@ type SortDocID struct { } // Value returns the sort value of the DocumentMatch -func (s *SortDocID) Value(i *DocumentMatch) interface{} { +func (s *SortDocID) Value(i *DocumentMatch) string { return i.ID } @@ -478,8 +446,8 @@ type SortScore struct { } // Value returns the sort value of the DocumentMatch -func (s *SortScore) Value(i *DocumentMatch) interface{} { - return i.Score +func (s *SortScore) Value(i *DocumentMatch) string { + return "_score" } // Descending determines the order of the sort From 127f37212be89173dca1a909cfc90df6fbad1562 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 25 Aug 2016 16:24:26 -0400 Subject: [PATCH 19/24] cache values to avoid dynamic dispatch inside hot loop --- index_alias_impl.go | 22 +++++++++++++++------- search/collectors/collector_heap.go | 14 +++++++++----- search/sort.go | 24 ++++++++++++++++++++---- 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 7bd6cba4..4367f548 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -571,10 +571,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se // sort all hits with the requested order if len(req.Sort) > 0 { - sorter := &multiSearchHitSorter{ - hits: sr.Hits, - sort: req.Sort, - } + sorter := newMultiSearchHitSorter(req.Sort, sr.Hits) sort.Sort(sorter) } @@ -654,13 +651,24 @@ func (f *indexAliasImplFieldDict) Close() error { } type multiSearchHitSorter struct { - hits search.DocumentMatchCollection - sort search.SortOrder + hits search.DocumentMatchCollection + sort search.SortOrder + cachedScoring []bool + cachedDesc []bool +} + +func newMultiSearchHitSorter(sort search.SortOrder, hits search.DocumentMatchCollection) *multiSearchHitSorter { + return &multiSearchHitSorter{ + sort: sort, + hits: hits, + cachedScoring: sort.CacheIsScore(), + cachedDesc: sort.CacheDescending(), + } } func (m *multiSearchHitSorter) Len() int { return len(m.hits) } func (m *multiSearchHitSorter) Swap(i, j int) { m.hits[i], m.hits[j] = m.hits[j], m.hits[i] } func (m *multiSearchHitSorter) Less(i, j int) bool { - c := m.sort.Compare(m.hits[i], m.hits[j]) + c := m.sort.Compare(m.cachedScoring, m.cachedDesc, m.hits[i], m.hits[j]) return c < 0 } diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 0d0154d3..97b2faba 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -28,8 +28,10 @@ type HeapCollector struct { results search.DocumentMatchCollection facetsBuilder *search.FacetsBuilder - needDocIds bool - neededFields []string + needDocIds bool + neededFields []string + cachedScoring []bool + cachedDesc []bool lowestMatchOutsideResults *search.DocumentMatch } @@ -48,6 +50,8 @@ func NewHeapCollector(size int, skip int, sort search.SortOrder) *HeapCollector hc.needDocIds = true } hc.neededFields = sort.RequiredFields() + hc.cachedScoring = sort.CacheIsScore() + hc.cachedDesc = sort.CacheDescending() return hc } @@ -148,7 +152,7 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I // with this one comparision, we can avoid all heap operations if // this hit would have been added and then immediately removed if hc.lowestMatchOutsideResults != nil { - cmp := hc.sort.Compare(d, hc.lowestMatchOutsideResults) + cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults) if cmp >= 0 { // this hit can't possibly be in the result set, so avoid heap ops ctx.DocumentMatchPool.Put(d) @@ -162,7 +166,7 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I if hc.lowestMatchOutsideResults == nil { hc.lowestMatchOutsideResults = removed } else { - cmp := hc.sort.Compare(removed, hc.lowestMatchOutsideResults) + cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults) if cmp < 0 { tmp := hc.lowestMatchOutsideResults hc.lowestMatchOutsideResults = removed @@ -239,7 +243,7 @@ func (hc *HeapCollector) Len() int { } func (hc *HeapCollector) Less(i, j int) bool { - so := hc.sort.Compare(hc.results[i], hc.results[j]) + so := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, hc.results[i], hc.results[j]) return -so < 0 } diff --git a/search/sort.go b/search/sort.go index 6a34d2c1..64287422 100644 --- a/search/sort.go +++ b/search/sort.go @@ -168,11 +168,11 @@ func (so SortOrder) Value(doc *DocumentMatch) { // Compare will compare two document matches using the specified sort order // if both are numbers, we avoid converting back to term -func (so SortOrder) Compare(i, j *DocumentMatch) int { +func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatch) int { // compare the documents on all search sorts until a differences is found - for x, soi := range so { + for x := range so { c := 0 - if soi.RequiresScoring() { + if cachedScoring[x] { if i.Score < j.Score { c = -1 } else if i.Score > j.Score { @@ -187,7 +187,7 @@ func (so SortOrder) Compare(i, j *DocumentMatch) int { if c == 0 { continue } - if soi.Descending() { + if cachedDesc[x] { c = -c } return c @@ -229,6 +229,22 @@ func (so SortOrder) RequiredFields() []string { return rv } +func (so SortOrder) CacheIsScore() []bool { + var rv []bool + for _, soi := range so { + rv = append(rv, soi.RequiresScoring()) + } + return rv +} + +func (so SortOrder) CacheDescending() []bool { + var rv []bool + for _, soi := range so { + rv = append(rv, soi.Descending()) + } + return rv +} + // SortFieldType lets you control some internal sort behavior // normally leaving this to the zero-value of SortFieldAuto is fine type SortFieldType int From 931ec677c43c0da2c3faf4bbe1fbf7ae3e0adfe5 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 25 Aug 2016 22:59:08 -0400 Subject: [PATCH 20/24] completely avoid dynamic dispatch if only sorting on score --- search/collectors/collector_heap.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 97b2faba..0aae88a7 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -146,7 +146,9 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I } // compute this hits sort value - hc.sort.Value(d) + if len(hc.sort) > 1 || len(hc.sort) == 1 && !hc.cachedScoring[0] { + hc.sort.Value(d) + } // optimization, we track lowest sorting hit already removed from heap // with this one comparision, we can avoid all heap operations if From 3f8757c05be957fba675cef714e679ea5efbb550 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 25 Aug 2016 23:13:22 -0400 Subject: [PATCH 21/24] slight fixup to last change to set the sort value i'd like the sort value to be correct even with the optimizations not using it --- search/collectors/collector_heap.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/search/collectors/collector_heap.go b/search/collectors/collector_heap.go index 0aae88a7..163ca8d1 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/collector_heap.go @@ -110,6 +110,8 @@ func (hc *HeapCollector) Collect(ctx context.Context, searcher search.Searcher, return nil } +var sortByScoreOpt = []string{"_score"} + func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { // increment total hits hc.total++ @@ -146,7 +148,9 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I } // compute this hits sort value - if len(hc.sort) > 1 || len(hc.sort) == 1 && !hc.cachedScoring[0] { + if len(hc.sort) == 1 && hc.cachedScoring[0] { + d.Sort = sortByScoreOpt + } else { hc.sort.Value(d) } From 47c239ca7be47b2ee16e2138fa5558787bcd1dc4 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 26 Aug 2016 10:29:50 -0400 Subject: [PATCH 22/24] refactored data structure out of collector the TopNCollector now can either use a heap or a list i did not code it to use an interface, because this is a very hot loop during searching. rather, it lets bleve developers easily toggle between the two (or other ideas) by changing 2 lines The list is faster in the benchmark, but causes more allocations. The list is once again the default (for now). To switch to the heap implementation, change: store *collectStoreList to store *collectStoreHeap and newStoreList(... to newStoreHeap(... --- index_impl.go | 2 +- search/collectors/heap.go | 83 ++++++++++++ search/collectors/list.go | 73 +++++++++++ .../collectors/{collector_heap.go => topn.go} | 118 ++++++++---------- .../{collector_heap_test.go => topn_test.go} | 17 +-- 5 files changed, 216 insertions(+), 77 deletions(-) create mode 100644 search/collectors/heap.go create mode 100644 search/collectors/list.go rename search/collectors/{collector_heap.go => topn.go} (70%) rename search/collectors/{collector_heap_test.go => topn_test.go} (95%) diff --git a/index_impl.go b/index_impl.go index 63485a4b..6093ec97 100644 --- a/index_impl.go +++ b/index_impl.go @@ -384,7 +384,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr return nil, ErrorIndexClosed } - collector := collectors.NewHeapCollector(req.Size, req.From, req.Sort) + collector := collectors.NewTopNCollector(req.Size, req.From, req.Sort) // open a reader for this search indexReader, err := i.i.Reader() diff --git a/search/collectors/heap.go b/search/collectors/heap.go new file mode 100644 index 00000000..1c834e9c --- /dev/null +++ b/search/collectors/heap.go @@ -0,0 +1,83 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package collectors + +import ( + "container/heap" + + "github.com/blevesearch/bleve/search" +) + +type collectStoreHeap struct { + heap search.DocumentMatchCollection + compare collectorCompare +} + +func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap { + rv := &collectStoreHeap{ + heap: make(search.DocumentMatchCollection, 0, cap), + compare: compare, + } + heap.Init(rv) + return rv +} + +func (c *collectStoreHeap) Add(doc *search.DocumentMatch) { + heap.Push(c, doc) +} + +func (c *collectStoreHeap) RemoveLast() *search.DocumentMatch { + return heap.Pop(c).(*search.DocumentMatch) +} + +func (c *collectStoreHeap) Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error) { + count := c.Len() + size := count - skip + rv := make(search.DocumentMatchCollection, size) + for count > 0 { + count-- + + if count >= skip { + size-- + doc := heap.Pop(c).(*search.DocumentMatch) + rv[size] = doc + err := fixup(doc) + if err != nil { + return nil, err + } + } + } + return rv, nil +} + +// heap interface implementation + +func (c *collectStoreHeap) Len() int { + return len(c.heap) +} + +func (c *collectStoreHeap) Less(i, j int) bool { + so := c.compare(c.heap[i], c.heap[j]) + return -so < 0 +} + +func (c *collectStoreHeap) Swap(i, j int) { + c.heap[i], c.heap[j] = c.heap[j], c.heap[i] +} + +func (c *collectStoreHeap) Push(x interface{}) { + c.heap = append(c.heap, x.(*search.DocumentMatch)) +} + +func (c *collectStoreHeap) Pop() interface{} { + var rv *search.DocumentMatch + rv, c.heap = c.heap[len(c.heap)-1], c.heap[:len(c.heap)-1] + return rv +} diff --git a/search/collectors/list.go b/search/collectors/list.go new file mode 100644 index 00000000..d3f49410 --- /dev/null +++ b/search/collectors/list.go @@ -0,0 +1,73 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package collectors + +import ( + "container/list" + + "github.com/blevesearch/bleve/search" +) + +type collectStoreList struct { + results *list.List + compare collectorCompare +} + +func newStoreList(cap int, compare collectorCompare) *collectStoreList { + rv := &collectStoreList{ + results: list.New(), + compare: compare, + } + + return rv +} + +func (c *collectStoreList) Add(doc *search.DocumentMatch) { + for e := c.results.Front(); e != nil; e = e.Next() { + curr := e.Value.(*search.DocumentMatch) + if c.compare(doc, curr) >= 0 { + c.results.InsertBefore(doc, e) + return + } + } + // if we got to the end, we still have to add it + c.results.PushBack(doc) +} + +func (c *collectStoreList) RemoveLast() *search.DocumentMatch { + return c.results.Remove(c.results.Front()).(*search.DocumentMatch) +} + +func (c *collectStoreList) Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error) { + if c.results.Len()-skip > 0 { + rv := make(search.DocumentMatchCollection, c.results.Len()-skip) + i := 0 + skipped := 0 + for e := c.results.Back(); e != nil; e = e.Prev() { + if skipped < skip { + skipped++ + continue + } + + rv[i] = e.Value.(*search.DocumentMatch) + err := fixup(rv[i]) + if err != nil { + return nil, err + } + i++ + } + return rv, nil + } + return search.DocumentMatchCollection{}, nil +} + +func (c *collectStoreList) Len() int { + return c.results.Len() +} diff --git a/search/collectors/collector_heap.go b/search/collectors/topn.go similarity index 70% rename from search/collectors/collector_heap.go rename to search/collectors/topn.go index 163ca8d1..15ce77c8 100644 --- a/search/collectors/collector_heap.go +++ b/search/collectors/topn.go @@ -6,11 +6,10 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -// + package collectors import ( - "container/heap" "time" "github.com/blevesearch/bleve/index" @@ -18,7 +17,12 @@ import ( "golang.org/x/net/context" ) -type HeapCollector struct { +type collectorCompare func(i, j *search.DocumentMatch) int + +type collectorFixup func(d *search.DocumentMatch) error + +// TopNCollector collects the top N hits, optionally skipping some results +type TopNCollector struct { size int skip int total uint64 @@ -28,6 +32,8 @@ type HeapCollector struct { results search.DocumentMatchCollection facetsBuilder *search.FacetsBuilder + store *collectStoreList + needDocIds bool neededFields []string cachedScoring []bool @@ -36,14 +42,19 @@ type HeapCollector struct { lowestMatchOutsideResults *search.DocumentMatch } -var COLLECT_CHECK_DONE_EVERY = uint64(1024) +// CheckDoneEvery controls how frequently we check the context deadline +const CheckDoneEvery = uint64(1024) -func NewHeapCollector(size int, skip int, sort search.SortOrder) *HeapCollector { - hc := &HeapCollector{size: size, skip: skip, sort: sort} +// NewTopNCollector builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { + hc := &TopNCollector{size: size, skip: skip, sort: sort} // pre-allocate space on the heap, we need size+skip results // +1 additional while figuring out which to evict - hc.results = make(search.DocumentMatchCollection, 0, size+skip+1) - heap.Init(hc) + hc.store = newStoreList(size+skip+1, func(i, j *search.DocumentMatch) int { + return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) + }) // these lookups traverse an interface, so do once up-front if sort.RequiresDocID() { @@ -56,7 +67,8 @@ func NewHeapCollector(size int, skip int, sort search.SortOrder) *HeapCollector return hc } -func (hc *HeapCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { +// Collect goes to the index to find the matching documents +func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { startTime := time.Now() var err error var next *search.DocumentMatch @@ -76,7 +88,7 @@ func (hc *HeapCollector) Collect(ctx context.Context, searcher search.Searcher, next, err = searcher.Next(searchContext) } for err == nil && next != nil { - if hc.total%COLLECT_CHECK_DONE_EVERY == 0 { + if hc.total%CheckDoneEvery == 0 { select { case <-ctx.Done(): return ctx.Err() @@ -112,7 +124,7 @@ func (hc *HeapCollector) Collect(ctx context.Context, searcher search.Searcher, var sortByScoreOpt = []string{"_score"} -func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { +func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { // increment total hits hc.total++ d.HitNumber = hc.total @@ -166,9 +178,9 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I } } - heap.Push(hc, d) - if hc.Len() > hc.size+hc.skip { - removed := heap.Pop(hc).(*search.DocumentMatch) + hc.store.Add(d) + if hc.store.Len() > hc.size+hc.skip { + removed := hc.store.RemoveLast() if hc.lowestMatchOutsideResults == nil { hc.lowestMatchOutsideResults = removed } else { @@ -184,85 +196,55 @@ func (hc *HeapCollector) collectSingle(ctx *search.SearchContext, reader index.I return nil } -func (hc *HeapCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) { +// SetFacetsBuilder registers a facet builder for this collector +func (hc *TopNCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) { hc.facetsBuilder = facetsBuilder } // finalizeResults starts with the heap containing the final top size+skip // it now throws away the results to be skipped // and does final doc id lookup (if necessary) -func (hc *HeapCollector) finalizeResults(r index.IndexReader) error { - count := hc.Len() - size := count - hc.skip - rv := make(search.DocumentMatchCollection, size) - for count > 0 { - count-- - - if count >= hc.skip { - size-- - doc := heap.Pop(hc).(*search.DocumentMatch) - rv[size] = doc - if doc.ID == "" { - // look up the id since we need it for lookup - var err error - doc.ID, err = r.FinalizeDocID(doc.IndexInternalID) - if err != nil { - return err - } +func (hc *TopNCollector) finalizeResults(r index.IndexReader) error { + var err error + hc.results, err = hc.store.Final(hc.skip, func(doc *search.DocumentMatch) error { + if doc.ID == "" { + // look up the id since we need it for lookup + var err error + doc.ID, err = r.FinalizeDocID(doc.IndexInternalID) + if err != nil { + return err } } - } + return nil + }) - // no longer a heap - hc.results = rv - - return nil + return err } -func (hc *HeapCollector) Results() search.DocumentMatchCollection { +// Results returns the collected hits +func (hc *TopNCollector) Results() search.DocumentMatchCollection { return hc.results } -func (hc *HeapCollector) Total() uint64 { +// Total returns the total number of hits +func (hc *TopNCollector) Total() uint64 { return hc.total } -func (hc *HeapCollector) MaxScore() float64 { +// MaxScore returns the maximum score seen across all the hits +func (hc *TopNCollector) MaxScore() float64 { return hc.maxScore } -func (hc *HeapCollector) Took() time.Duration { +// Took returns the time spent collecting hits +func (hc *TopNCollector) Took() time.Duration { return hc.took } -func (hc *HeapCollector) FacetResults() search.FacetResults { +// FacetResults returns the computed facets results +func (hc *TopNCollector) FacetResults() search.FacetResults { if hc.facetsBuilder != nil { return hc.facetsBuilder.Results() } return search.FacetResults{} } - -// heap interface implementation - -func (hc *HeapCollector) Len() int { - return len(hc.results) -} - -func (hc *HeapCollector) Less(i, j int) bool { - so := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, hc.results[i], hc.results[j]) - return -so < 0 -} - -func (hc *HeapCollector) Swap(i, j int) { - hc.results[i], hc.results[j] = hc.results[j], hc.results[i] -} - -func (hc *HeapCollector) Push(x interface{}) { - hc.results = append(hc.results, x.(*search.DocumentMatch)) -} - -func (hc *HeapCollector) Pop() interface{} { - var rv *search.DocumentMatch - rv, hc.results = hc.results[len(hc.results)-1], hc.results[:len(hc.results)-1] - return rv -} diff --git a/search/collectors/collector_heap_test.go b/search/collectors/topn_test.go similarity index 95% rename from search/collectors/collector_heap_test.go rename to search/collectors/topn_test.go index fad0fe04..4c7b25b0 100644 --- a/search/collectors/collector_heap_test.go +++ b/search/collectors/topn_test.go @@ -84,7 +84,7 @@ func TestTop10Scores(t *testing.T) { }, } - collector := NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) + collector := NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) err := collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -103,6 +103,7 @@ func TestTop10Scores(t *testing.T) { results := collector.Results() if len(results) != 10 { + t.Logf("results: %v", results) t.Fatalf("expected 10 results, got %d", len(results)) } @@ -192,7 +193,7 @@ func TestTop10ScoresSkip10(t *testing.T) { }, } - collector := NewHeapCollector(10, 10, search.SortOrder{&search.SortScore{Desc: true}}) + collector := NewTopNCollector(10, 10, search.SortOrder{&search.SortScore{Desc: true}}) err := collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -289,7 +290,7 @@ func TestPaginationSameScores(t *testing.T) { } // first get first 5 hits - collector := NewHeapCollector(5, 0, search.SortOrder{&search.SortScore{Desc: true}}) + collector := NewTopNCollector(5, 0, search.SortOrder{&search.SortScore{Desc: true}}) err := collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -375,7 +376,7 @@ func TestPaginationSameScores(t *testing.T) { } // now get next 5 hits - collector = NewHeapCollector(5, 5, search.SortOrder{&search.SortScore{Desc: true}}) + collector = NewTopNCollector(5, 5, search.SortOrder{&search.SortScore{Desc: true}}) err = collector.Collect(context.Background(), searcher, &stubReader{}) if err != nil { t.Fatal(err) @@ -402,24 +403,24 @@ func TestPaginationSameScores(t *testing.T) { func BenchmarkTop10of100000Scores(b *testing.B) { benchHelper(10000, func() search.Collector { - return NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) + return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) }, b) } func BenchmarkTop100of100000Scores(b *testing.B) { benchHelper(10000, func() search.Collector { - return NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}) + return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}) }, b) } func BenchmarkTop10of1000000Scores(b *testing.B) { benchHelper(100000, func() search.Collector { - return NewHeapCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) + return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) }, b) } func BenchmarkTop100of1000000Scores(b *testing.B) { benchHelper(100000, func() search.Collector { - return NewHeapCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}) + return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}}) }, b) } From c9310b906d6bb591e18fe33ceb8d7ceb30826aaf Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 26 Aug 2016 11:50:38 -0400 Subject: [PATCH 23/24] introduced new collector store impl based on slice counter-intuitively the list impl was faster than the heap the theory was the heap did more comparisons and swapping so even though it benefited from no interface and some cache locality, it was still slower the idea was to just use a raw slice kept in order this avoids the need for interface, but can take same comparison approach as the list it seems to work out: go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out BenchmarkTop10of100000Scores-4 5000 299959 ns/op 2600 B/op 36 allocs/op BenchmarkTop100of100000Scores-4 2000 601104 ns/op 20720 B/op 216 allocs/op BenchmarkTop10of1000000Scores-4 500 3450196 ns/op 2616 B/op 36 allocs/op BenchmarkTop100of1000000Scores-4 500 3874276 ns/op 20856 B/op 216 allocs/op PASS ok github.com/blevesearch/bleve/search/collectors 7.440s --- search/collectors/slice.go | 63 ++++++++++++++++++++++++++++++++++++++ search/collectors/topn.go | 4 +-- 2 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 search/collectors/slice.go diff --git a/search/collectors/slice.go b/search/collectors/slice.go new file mode 100644 index 00000000..24eba815 --- /dev/null +++ b/search/collectors/slice.go @@ -0,0 +1,63 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package collectors + +import "github.com/blevesearch/bleve/search" + +type collectStoreSlice struct { + slice search.DocumentMatchCollection + compare collectorCompare +} + +func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice { + rv := &collectStoreSlice{ + slice: make(search.DocumentMatchCollection, 0, cap), + compare: compare, + } + return rv +} + +func (c *collectStoreSlice) Add(doc *search.DocumentMatch) { + // find where to insert, starting at end (lowest) + i := len(c.slice) + for ; i > 0; i-- { + cmp := c.compare(doc, c.slice[i-1]) + if cmp >= 0 { + break + } + } + if i < 0 { + i = 0 + } + // insert at i + c.slice = append(c.slice, nil) + copy(c.slice[i+1:], c.slice[i:]) + c.slice[i] = doc +} + +func (c *collectStoreSlice) RemoveLast() *search.DocumentMatch { + var rv *search.DocumentMatch + rv, c.slice = c.slice[len(c.slice)-1], c.slice[:len(c.slice)-1] + return rv +} + +func (c *collectStoreSlice) Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error) { + for i := skip; i < len(c.slice); i++ { + err := fixup(c.slice[i]) + if err != nil { + return nil, err + } + } + return c.slice[skip:], nil +} + +func (c *collectStoreSlice) Len() int { + return len(c.slice) +} diff --git a/search/collectors/topn.go b/search/collectors/topn.go index 15ce77c8..636b7593 100644 --- a/search/collectors/topn.go +++ b/search/collectors/topn.go @@ -32,7 +32,7 @@ type TopNCollector struct { results search.DocumentMatchCollection facetsBuilder *search.FacetsBuilder - store *collectStoreList + store *collectStoreSlice needDocIds bool neededFields []string @@ -52,7 +52,7 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector hc := &TopNCollector{size: size, skip: skip, sort: sort} // pre-allocate space on the heap, we need size+skip results // +1 additional while figuring out which to evict - hc.store = newStoreList(size+skip+1, func(i, j *search.DocumentMatch) int { + hc.store = newStoreSlice(size+skip+1, func(i, j *search.DocumentMatch) int { return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) }) From b1b93d5ff9fb6a9ee30a69c73606ea7915e1f4d1 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 26 Aug 2016 17:27:19 -0400 Subject: [PATCH 24/24] remove unneeded code fixes code review comment from @steveyen --- search/collectors/slice.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/search/collectors/slice.go b/search/collectors/slice.go index 24eba815..dee08c81 100644 --- a/search/collectors/slice.go +++ b/search/collectors/slice.go @@ -33,9 +33,6 @@ func (c *collectStoreSlice) Add(doc *search.DocumentMatch) { break } } - if i < 0 { - i = 0 - } // insert at i c.slice = append(c.slice, nil) copy(c.slice[i+1:], c.slice[i:])