0
0
bleve/search/collector/topn.go
Marty Schoch a4a34cc3b2 topn collector switch approach based on size+skip
we now use the slice store when size+skip <= 10
and use the heap store when size+skip > 10

here are the new perf numbers:

go test -run=xxx -bench=. -benchmem
BenchmarkTop10of0Scores-4            	 1000000	      1150 ns/op	    2304 B/op	      15 allocs/op
BenchmarkTop10of3Scores-4            	 1000000	      1417 ns/op	    2304 B/op	      18 allocs/op
BenchmarkTop10of10Scores-4           	 1000000	      2133 ns/op	    2312 B/op	      25 allocs/op
BenchmarkTop10of25Scores-4           	  500000	      3410 ns/op	    2464 B/op	      26 allocs/op
BenchmarkTop10of50Scores-4           	  300000	      5174 ns/op	    2464 B/op	      26 allocs/op
BenchmarkTop10of10000Scores-4        	    5000	    342955 ns/op	    2488 B/op	      26 allocs/op
BenchmarkTop100of0Scores-4           	  300000	      4796 ns/op	   18320 B/op	      15 allocs/op
BenchmarkTop100of3Scores-4           	  300000	      5160 ns/op	   18352 B/op	      19 allocs/op
BenchmarkTop100of10Scores-4          	  200000	      6354 ns/op	   18408 B/op	      26 allocs/op
BenchmarkTop100of25Scores-4          	  200000	     10023 ns/op	   18568 B/op	      41 allocs/op
BenchmarkTop100of50Scores-4          	  100000	     16821 ns/op	   18832 B/op	      66 allocs/op
BenchmarkTop100of10000Scores-4       	    3000	    508989 ns/op	   19760 B/op	     117 allocs/op
BenchmarkTop1000of10000Scores-4      	    1000	   1814198 ns/op	  184768 B/op	    1017 allocs/op
BenchmarkTop10000of100000Scores-4    	      50	  26623920 ns/op	 1939592 B/op	   19024 allocs/op
BenchmarkTop10of100000Scores-4       	     500	   3730204 ns/op	    2496 B/op	      26 allocs/op
BenchmarkTop100of100000Scores-4      	     300	   4057127 ns/op	   19912 B/op	     117 allocs/op
BenchmarkTop1000of100000Scores-4     	     200	   6390180 ns/op	  186200 B/op	    1017 allocs/op
BenchmarkTop10000of1000000Scores-4   	      20	  82785756 ns/op	 1963897 B/op	   19024 allocs/op
PASS
ok  	github.com/blevesearch/bleve/search/collector	31.537s

Previously with heap:

go test -run=xxx -bench=. -benchmem
BenchmarkTop10of0Scores-4            	 1000000	      1216 ns/op	    2288 B/op	      15 allocs/op
BenchmarkTop10of3Scores-4            	 1000000	      1593 ns/op	    2320 B/op	      19 allocs/op
BenchmarkTop10of10Scores-4           	  500000	      2734 ns/op	    2376 B/op	      26 allocs/op
BenchmarkTop10of25Scores-4           	  300000	      5077 ns/op	    2520 B/op	      27 allocs/op
BenchmarkTop10of50Scores-4           	  200000	      6875 ns/op	    2528 B/op	      27 allocs/op
BenchmarkTop10of10000Scores-4        	    3000	    351210 ns/op	    2552 B/op	      27 allocs/op
BenchmarkTop100of0Scores-4           	  300000	      4846 ns/op	   18304 B/op	      15 allocs/op
BenchmarkTop100of3Scores-4           	  300000	      5357 ns/op	   18336 B/op	      19 allocs/op
BenchmarkTop100of10Scores-4          	  200000	      6462 ns/op	   18392 B/op	      26 allocs/op
BenchmarkTop100of25Scores-4          	  200000	     10012 ns/op	   18552 B/op	      41 allocs/op
BenchmarkTop100of50Scores-4          	  100000	     17089 ns/op	   18816 B/op	      66 allocs/op
BenchmarkTop100of10000Scores-4       	    3000	    528193 ns/op	   19744 B/op	     117 allocs/op
BenchmarkTop1000of10000Scores-4      	    1000	   1859447 ns/op	  184752 B/op	    1017 allocs/op
BenchmarkTop10000of100000Scores-4    	      50	  28005664 ns/op	 1939576 B/op	   19024 allocs/op
BenchmarkTop10of100000Scores-4       	     300	   4120091 ns/op	    2560 B/op	      27 allocs/op
BenchmarkTop100of100000Scores-4      	     300	   4325227 ns/op	   19896 B/op	     117 allocs/op
BenchmarkTop1000of100000Scores-4     	     200	   6799804 ns/op	  186184 B/op	    1017 allocs/op
BenchmarkTop10000of1000000Scores-4   	      20	  88494230 ns/op	 1963881 B/op	   19024 allocs/op
PASS
ok  	github.com/blevesearch/bleve/search/collector	30.198s

Previously with slice:

go test -run=xxx -bench=. -benchmem
BenchmarkTop10of0Scores-4            	 1000000	      1202 ns/op	    2288 B/op	      15 allocs/op
BenchmarkTop10of3Scores-4            	 1000000	      1453 ns/op	    2288 B/op	      18 allocs/op
BenchmarkTop10of10Scores-4           	 1000000	      2162 ns/op	    2296 B/op	      25 allocs/op
BenchmarkTop10of25Scores-4           	  500000	      3420 ns/op	    2448 B/op	      26 allocs/op
BenchmarkTop10of50Scores-4           	  300000	      5336 ns/op	    2448 B/op	      26 allocs/op
BenchmarkTop10of10000Scores-4        	    5000	    356733 ns/op	    2472 B/op	      26 allocs/op
BenchmarkTop100of0Scores-4           	  300000	      4877 ns/op	   18304 B/op	      15 allocs/op
BenchmarkTop100of3Scores-4           	  300000	      5132 ns/op	   18304 B/op	      18 allocs/op
BenchmarkTop100of10Scores-4          	  200000	      5787 ns/op	   18312 B/op	      25 allocs/op
BenchmarkTop100of25Scores-4          	  200000	      8083 ns/op	   18344 B/op	      40 allocs/op
BenchmarkTop100of50Scores-4          	  100000	     14419 ns/op	   18400 B/op	      65 allocs/op
BenchmarkTop100of10000Scores-4       	    2000	    665401 ns/op	   18848 B/op	     116 allocs/op
BenchmarkTop1000of10000Scores-4      	     100	  15417063 ns/op	  176560 B/op	    1016 allocs/op
BenchmarkTop10000of100000Scores-4    	       1	1860011022 ns/op	 1857960 B/op	   19023 allocs/op
BenchmarkTop10of100000Scores-4       	     300	   4099276 ns/op	    2480 B/op	      26 allocs/op
BenchmarkTop100of100000Scores-4      	     300	   4533645 ns/op	   18984 B/op	     116 allocs/op
BenchmarkTop1000of100000Scores-4     	      50	  30519235 ns/op	  178008 B/op	    1016 allocs/op
BenchmarkTop10000of1000000Scores-4   	       1	3483977385 ns/op	 1882072 B/op	   19023 allocs/op
PASS
ok  	github.com/blevesearch/bleve/search/collector	31.666s

It appears that this successfully gets the best of both, at these particular benchmark sizes.
2017-04-27 08:57:13 -04:00

293 lines
8.1 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collector
import (
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"golang.org/x/net/context"
)
// collectorStore is the container the collector uses to hold candidate
// hits while a search is running. NewTopNCollector chooses between a
// heap-backed and a slice-backed implementation based on size+skip.
type collectorStore interface {
// Add the document, and if the new store size exceeds the provided size
// the last element is removed and returned. If the size has not been
// exceeded, nil is returned.
AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch
// Final is handed the number of hits to skip and a fixup callback, and
// returns the final result collection or an error.
// NOTE(review): implementations are not visible in this file; presumably
// the first skip entries are dropped and fixup is applied to each hit
// that is returned - confirm against the store implementations.
Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error)
}
// PreAllocSizeSkipCap caps preallocation: when size+skip exceeds this
// value, NewTopNCollector and Collect preallocate PreAllocSizeSkipCap+1
// entries instead of size+skip+1. Larger requests still work, storage is
// then grown or allocated on demand.
var PreAllocSizeSkipCap = 1000
// collectorCompare orders two document matches and reports the result as
// an int. In this file the value always comes from search.SortOrder.Compare;
// a negative result is treated as i sorting ahead of j, and a result >= 0
// as i sorting at or after j.
type collectorCompare func(i, j *search.DocumentMatch) int
// collectorFixup is a callback that post-processes a single document
// match and may fail. It is passed to collectorStore.Final; the collector
// uses it to fill in document IDs that have not been loaded yet.
type collectorFixup func(d *search.DocumentMatch) error
// TopNCollector collects the top N hits, optionally skipping some results
type TopNCollector struct {
// size is the number of hits to return
size int
// skip is the number of leading hits to skip over
skip int
// total counts every hit processed by collectSingle
total uint64
// maxScore is the highest score seen across processed hits
maxScore float64
// took is the duration measured by Collect
took time.Duration
// sort is the order used to rank hits
sort search.SortOrder
// results holds the final hits, set by finalizeResults
results search.DocumentMatchCollection
// facetsBuilder is optional; nil means no facets are computed
facetsBuilder *search.FacetsBuilder
// store holds the candidate hits during collection
store collectorStore
// needDocIds is set when the sort order reports RequiresDocID()
needDocIds bool
// neededFields lists the fields whose terms are visited for every hit
// (required by the sort order, plus any required by the facets builder)
neededFields []string
// cachedScoring caches sort.CacheIsScore(), looked up once at construction
cachedScoring []bool
// cachedDesc caches sort.CacheDescending(), looked up once at construction
cachedDesc []bool
// lowestMatchOutsideResults is the lowest-sorting hit evicted from the
// store so far; a new hit that does not sort ahead of it is rejected
// without touching the store
lowestMatchOutsideResults *search.DocumentMatch
}
// CheckDoneEvery controls how frequently we check the context deadline:
// inside the collection loop, Collect polls ctx.Done() only when the
// running hit total is a multiple of this value.
const CheckDoneEvery = uint64(1024)
// NewTopNCollector returns a collector that keeps the best 'size' hits
// found after the first 'skip' hits, ranked by the given sort order.
func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
	hc := &TopNCollector{size: size, skip: skip, sort: sort}

	// size the store up-front for size+skip+1 entries so it does not have
	// to grow; very large requests are capped and the store simply grows
	// on demand instead
	wanted := size + skip
	capacity := wanted + 1
	if wanted > PreAllocSizeSkipCap {
		capacity = PreAllocSizeSkipCap + 1
	}

	// both store flavours rank hits using the collector's sort order
	ranker := func(a, b *search.DocumentMatch) int {
		return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, a, b)
	}

	// small result windows use the slice store, larger ones the heap store
	if wanted <= 10 {
		hc.store = newStoreSlice(capacity, ranker)
	} else {
		hc.store = newStoreHeap(capacity, ranker)
	}

	// look these up once here rather than for every hit
	hc.needDocIds = sort.RequiresDocID()
	hc.neededFields = sort.RequiredFields()
	hc.cachedScoring = sort.CacheIsScore()
	hc.cachedDesc = sort.CacheDescending()

	return hc
}
// Collect pulls every match from the searcher, feeds each one through the
// collector, and then finalizes the result set. It returns ctx.Err() if the
// context is done before the first match is fetched or at one of the
// periodic checks inside the loop, and otherwise the first error reported
// by the searcher, by hit processing, or by finalization.
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
	began := time.Now()

	// size the DocumentMatchPool for size+skip+1 matches plus whatever the
	// searcher asks for; large requests are capped and further matches are
	// allocated on demand
	poolSize := hc.size + hc.skip + 1
	if hc.size+hc.skip > PreAllocSizeSkipCap {
		poolSize = PreAllocSizeSkipCap + 1
	}
	poolSize += searcher.DocumentMatchPoolSize()
	searchContext := &search.SearchContext{
		DocumentMatchPool: search.NewDocumentMatchPool(poolSize, len(hc.sort)),
	}

	// bail out before doing any work if the context is already done
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	hit, err := searcher.Next(searchContext)
	for err == nil && hit != nil {
		// only poll the context every CheckDoneEvery hits
		if hc.total%CheckDoneEvery == 0 {
			select {
			case <-ctx.Done():
				return ctx.Err()
			default:
			}
		}

		// a failure here leaves err set, which ends the loop
		if err = hc.collectSingle(searchContext, reader, hit); err == nil {
			hit, err = searcher.Next(searchContext)
		}
	}

	// the reported duration covers the search loop, not finalization
	hc.took = time.Since(began)
	if err != nil {
		return err
	}

	return hc.finalizeResults(reader)
}
// sortByScoreOpt is a shared sort value assigned to a hit's Sort field by
// collectSingle when the sort order has a single entry that is the score,
// instead of computing the sort value per hit.
var sortByScoreOpt = []string{"_score"}
// collectSingle processes one hit: it visits the field terms needed for
// sorting and facets, updates the running total and max score, computes the
// hit's sort value and then offers the hit to the store. Hits that cannot
// make it into the result set are handed back to the DocumentMatchPool.
func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error {
	// features such as sort and facets need the hit's field terms
	if len(hc.neededFields) > 0 {
		if err := hc.visitFieldTerms(reader, d); err != nil {
			return err
		}
	}

	// count the hit and remember its position in the stream
	hc.total++
	d.HitNumber = hc.total

	// keep track of the best score seen so far
	if d.Score > hc.maxScore {
		hc.maxScore = d.Score
	}

	// the ID is loaded this early only when needed, e.g. to sort on it
	if hc.needDocIds {
		var err error
		d.ID, err = reader.ExternalID(d.IndexInternalID)
		if err != nil {
			return err
		}
	}

	// work out the sort value; a pure score sort shares one constant value
	if len(hc.sort) != 1 || !hc.cachedScoring[0] {
		hc.sort.Value(d)
	} else {
		d.Sort = sortByScoreOpt
	}

	// shortcut: if this hit does not sort ahead of the lowest-sorting hit
	// already evicted from the store, adding it would only evict it again,
	// so skip the store entirely
	if lowest := hc.lowestMatchOutsideResults; lowest != nil {
		if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, lowest) >= 0 {
			ctx.DocumentMatchPool.Put(d)
			return nil
		}
	}

	evicted := hc.store.AddNotExceedingSize(d, hc.size+hc.skip)
	if evicted == nil {
		return nil
	}

	// remember the lowest-sorting evicted hit for the shortcut above
	previous := hc.lowestMatchOutsideResults
	if previous == nil {
		hc.lowestMatchOutsideResults = evicted
		return nil
	}
	if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, evicted, previous) < 0 {
		hc.lowestMatchOutsideResults = evicted
		// the previous marker is no longer referenced, recycle it
		ctx.DocumentMatchPool.Put(previous)
	}
	return nil
}
// visitFieldTerms walks the terms of the needed fields for the given hit
// and forwards every (field, term) pair to the sort order and, when one is
// registered, to the facets builder. The facets builder is additionally
// told when the document starts and ends. The error from the reader's
// visit is returned as-is.
func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error {
	fb := hc.facetsBuilder

	// every visited term goes to the facets builder (if any) first,
	// then to the sort order
	visit := func(field string, term []byte) {
		if fb != nil {
			fb.UpdateVisitor(field, term)
		}
		hc.sort.UpdateVisitor(field, term)
	}

	if fb != nil {
		fb.StartDoc()
	}
	err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, visit)
	if fb != nil {
		// the document is closed out even when the visit failed
		fb.EndDoc()
	}
	return err
}
// SetFacetsBuilder registers a facet builder for this collector and adds
// the fields it requires to the set of fields visited for every hit.
//
// Passing nil leaves the collector without a facets builder, which is the
// state the other methods already treat as "no facets"; the list of needed
// fields is left untouched in that case rather than calling through the
// nil builder.
func (hc *TopNCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) {
	hc.facetsBuilder = facetsBuilder
	if facetsBuilder == nil {
		return
	}
	hc.neededFields = append(hc.neededFields, facetsBuilder.RequiredFields()...)
}
// finalizeResults asks the store for the final result collection, passing
// the number of hits to skip and a fixup that fills in the external ID of
// any hit whose ID has not been loaded yet. Whatever the store returns is
// kept as the collector's results, and the store's error is passed on.
func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
	// hits that already carry an ID (e.g. loaded for sorting) are left alone
	ensureID := func(doc *search.DocumentMatch) error {
		if doc.ID != "" {
			return nil
		}
		var lookupErr error
		doc.ID, lookupErr = r.ExternalID(doc.IndexInternalID)
		return lookupErr
	}

	final, err := hc.store.Final(hc.skip, ensureID)
	hc.results = final
	return err
}
// Results returns the collected hits, i.e. the collection stored by
// finalizeResults at the end of Collect. It is the zero value until
// Collect has reached finalization.
func (hc *TopNCollector) Results() search.DocumentMatchCollection {
return hc.results
}
// Total returns the total number of hits processed by the collector. Every
// hit is counted, including those that did not make it into the results.
func (hc *TopNCollector) Total() uint64 {
return hc.total
}
// MaxScore returns the maximum score seen across all the hits. It starts
// at zero and is only raised by hits scoring above the current maximum.
func (hc *TopNCollector) MaxScore() float64 {
return hc.maxScore
}
// Took returns the time spent collecting hits, measured from the start of
// Collect until the hit loop ends; finalization is not included. It is not
// updated when Collect returns early because the context is done.
func (hc *TopNCollector) Took() time.Duration {
return hc.took
}
// FacetResults returns the facet results computed by the registered facets
// builder, or an empty search.FacetResults when no builder is registered.
func (hc *TopNCollector) FacetResults() search.FacetResults {
	if hc.facetsBuilder == nil {
		return search.FacetResults{}
	}
	return hc.facetsBuilder.Results()
}