2016-08-12 20:23:55 +02:00
|
|
|
// Copyright (c) 2014 Couchbase, Inc.
|
2016-10-02 16:13:14 +02:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2016-08-12 20:23:55 +02:00
|
|
|
|
|
|
|
package search
|
|
|
|
|
2016-08-13 01:16:24 +02:00
|
|
|
import (
|
|
|
|
"encoding/json"
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already). also, this simple syntax doesn't allow us
to specify more complex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
"fmt"
|
2016-08-17 18:20:12 +02:00
|
|
|
"sort"
|
2016-08-13 01:16:24 +02:00
|
|
|
"strings"
|
2016-08-17 18:20:12 +02:00
|
|
|
|
2016-09-30 17:35:22 +02:00
|
|
|
"github.com/blevesearch/bleve/numeric"
|
2016-08-13 01:16:24 +02:00
|
|
|
)
|
|
|
|
|
2016-08-17 18:20:12 +02:00
|
|
|
// Sentinel term values at the two extremes of byte-wise string ordering.
// NOTE(review): presumably substituted for documents that have no value for
// a sort field so they sort first or last — confirm against the users of
// these variables.
var (
	// HighTerm is ten 0xff bytes.
	HighTerm = strings.Repeat("\xff", 10)

	// LowTerm is a single 0x00 byte.
	LowTerm = "\x00"
)
|
|
|
|
|
2016-08-13 01:16:24 +02:00
|
|
|
type SearchSort interface {
|
improved implementation to address perf regressions
primary change is going back to sort values be []string
and not []interface{}, this avoid allocatiosn converting
into the interface{}
that sounds obvious, so why didn't we just do that first?
because a common (default) sort is score, which is naturally
a number, not a string (like terms). converting into the
number was also expensive, and the common case.
so, this solution also makes the change to NOT put the score
into the sort value list. instead you see the dummy value
"_score". this is just a placeholder, the actual sort impl
knows that field of the sort is the score, and will sort
using the actual score.
also, several other aspets of the benchmark were cleaned up
so that unnecessary allocations do not pollute the cpu profiles
Here are the updated benchmarks:
$ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out
BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op
BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op
BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op
BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.188s
Prior to this PR, master reported:
$ go test -run=xxx -bench=. -benchmem
BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op
BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op
BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op
BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.385s
So, we're pretty close on the smaller datasets, and we scale better on the larger datasets.
We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup).
2016-08-25 21:47:07 +02:00
|
|
|
Value(a *DocumentMatch) string
|
2016-08-24 20:07:10 +02:00
|
|
|
Descending() bool
|
2016-08-13 01:16:24 +02:00
|
|
|
|
|
|
|
RequiresDocID() bool
|
|
|
|
RequiresScoring() bool
|
2016-08-17 18:20:12 +02:00
|
|
|
RequiresFields() []string
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
|
|
|
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already). also, this simple syntax doesn't allow us
to specify more complex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) {
|
|
|
|
descending, ok := input["desc"].(bool)
|
|
|
|
by, ok := input["by"].(string)
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("search sort must specify by")
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
switch by {
|
|
|
|
case "id":
|
|
|
|
return &SortDocID{
|
2016-08-24 20:07:10 +02:00
|
|
|
Desc: descending,
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
}, nil
|
|
|
|
case "score":
|
|
|
|
return &SortScore{
|
2016-08-24 20:07:10 +02:00
|
|
|
Desc: descending,
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
}, nil
|
|
|
|
case "field":
|
|
|
|
field, ok := input["field"].(string)
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("search sort mode field must specify field")
|
|
|
|
}
|
|
|
|
rv := &SortField{
|
2016-08-24 20:07:10 +02:00
|
|
|
Field: field,
|
|
|
|
Desc: descending,
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
}
|
|
|
|
typ, ok := input["type"].(string)
|
|
|
|
if ok {
|
|
|
|
switch typ {
|
|
|
|
case "auto":
|
|
|
|
rv.Type = SortFieldAuto
|
|
|
|
case "string":
|
|
|
|
rv.Type = SortFieldAsString
|
|
|
|
case "number":
|
|
|
|
rv.Type = SortFieldAsNumber
|
|
|
|
case "date":
|
|
|
|
rv.Type = SortFieldAsDate
|
|
|
|
default:
|
2016-10-02 18:11:15 +02:00
|
|
|
return nil, fmt.Errorf("unknown sort field type: %s", typ)
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
mode, ok := input["mode"].(string)
|
|
|
|
if ok {
|
|
|
|
switch mode {
|
|
|
|
case "default":
|
|
|
|
rv.Mode = SortFieldDefault
|
|
|
|
case "min":
|
|
|
|
rv.Mode = SortFieldMin
|
|
|
|
case "max":
|
|
|
|
rv.Mode = SortFieldMax
|
|
|
|
default:
|
|
|
|
return nil, fmt.Errorf("unknown sort field mode: %s", mode)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
missing, ok := input["missing"].(string)
|
|
|
|
if ok {
|
|
|
|
switch missing {
|
|
|
|
case "first":
|
|
|
|
rv.Missing = SortFieldMissingFirst
|
|
|
|
case "last":
|
|
|
|
rv.Missing = SortFieldMissingLast
|
|
|
|
default:
|
|
|
|
return nil, fmt.Errorf("unknown sort field missing: %s", missing)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, fmt.Errorf("unknown search sort by: %s", by)
|
|
|
|
}
|
|
|
|
|
2016-08-17 23:49:06 +02:00
|
|
|
func ParseSearchSortString(input string) SearchSort {
|
2016-08-13 01:16:24 +02:00
|
|
|
descending := false
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
if strings.HasPrefix(input, "-") {
|
2016-08-13 01:16:24 +02:00
|
|
|
descending = true
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
input = input[1:]
|
|
|
|
} else if strings.HasPrefix(input, "+") {
|
|
|
|
input = input[1:]
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
if input == "_id" {
|
2016-08-13 01:16:24 +02:00
|
|
|
return &SortDocID{
|
2016-08-24 20:07:10 +02:00
|
|
|
Desc: descending,
|
2016-08-17 23:49:06 +02:00
|
|
|
}
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
} else if input == "_score" {
|
2016-08-13 01:16:24 +02:00
|
|
|
return &SortScore{
|
2016-08-24 20:07:10 +02:00
|
|
|
Desc: descending,
|
2016-08-17 23:49:06 +02:00
|
|
|
}
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
2016-08-17 18:20:12 +02:00
|
|
|
return &SortField{
|
2016-08-24 20:07:10 +02:00
|
|
|
Field: input,
|
|
|
|
Desc: descending,
|
2016-08-17 23:49:06 +02:00
|
|
|
}
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
|
|
|
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already). also, this simple syntax doesn't allow us
to specify more complex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
func ParseSearchSortJSON(input json.RawMessage) (SearchSort, error) {
|
|
|
|
// first try to parse it as string
|
|
|
|
var sortString string
|
|
|
|
err := json.Unmarshal(input, &sortString)
|
|
|
|
if err != nil {
|
|
|
|
var sortObj map[string]interface{}
|
|
|
|
err = json.Unmarshal(input, &sortObj)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return ParseSearchSortObj(sortObj)
|
|
|
|
}
|
2016-08-17 23:49:06 +02:00
|
|
|
return ParseSearchSortString(sortString), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func ParseSortOrderStrings(in []string) SortOrder {
|
|
|
|
rv := make(SortOrder, 0, len(in))
|
|
|
|
for _, i := range in {
|
|
|
|
ss := ParseSearchSortString(i)
|
|
|
|
rv = append(rv, ss)
|
|
|
|
}
|
|
|
|
return rv
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func ParseSortOrderJSON(in []json.RawMessage) (SortOrder, error) {
|
2016-08-13 01:16:24 +02:00
|
|
|
rv := make(SortOrder, 0, len(in))
|
|
|
|
for _, i := range in {
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
ss, err := ParseSearchSortJSON(i)
|
2016-08-13 01:16:24 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
rv = append(rv, ss)
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// SortOrder is an ordered list of SearchSort criteria. Compare consults the
// criteria in index order and stops at the first one that distinguishes the
// two document matches.
type SortOrder []SearchSort
|
|
|
|
|
2016-08-25 01:02:22 +02:00
|
|
|
func (so SortOrder) Value(doc *DocumentMatch) {
|
|
|
|
for _, soi := range so {
|
|
|
|
doc.Sort = append(doc.Sort, soi.Value(doc))
|
2016-08-24 20:07:10 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-25 01:02:22 +02:00
|
|
|
// Compare will compare two document matches using the specified sort order
|
|
|
|
// if both are numbers, we avoid converting back to term
|
2016-08-25 22:24:26 +02:00
|
|
|
func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatch) int {
|
2016-08-13 01:16:24 +02:00
|
|
|
// compare the documents on all search sorts until a differences is found
|
2016-08-25 22:24:26 +02:00
|
|
|
for x := range so {
|
2016-08-25 01:02:22 +02:00
|
|
|
c := 0
|
2016-08-25 22:24:26 +02:00
|
|
|
if cachedScoring[x] {
|
improved implementation to address perf regressions
primary change is going back to sort values be []string
and not []interface{}, this avoid allocatiosn converting
into the interface{}
that sounds obvious, so why didn't we just do that first?
because a common (default) sort is score, which is naturally
a number, not a string (like terms). converting into the
number was also expensive, and the common case.
so, this solution also makes the change to NOT put the score
into the sort value list. instead you see the dummy value
"_score". this is just a placeholder, the actual sort impl
knows that field of the sort is the score, and will sort
using the actual score.
also, several other aspets of the benchmark were cleaned up
so that unnecessary allocations do not pollute the cpu profiles
Here are the updated benchmarks:
$ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out
BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op
BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op
BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op
BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.188s
Prior to this PR, master reported:
$ go test -run=xxx -bench=. -benchmem
BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op
BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op
BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op
BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.385s
So, we're pretty close on the smaller datasets, and we scale better on the larger datasets.
We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup).
2016-08-25 21:47:07 +02:00
|
|
|
if i.Score < j.Score {
|
|
|
|
c = -1
|
|
|
|
} else if i.Score > j.Score {
|
|
|
|
c = 1
|
2016-08-25 01:02:22 +02:00
|
|
|
}
|
improved implementation to address perf regressions
primary change is going back to sort values be []string
and not []interface{}, this avoid allocatiosn converting
into the interface{}
that sounds obvious, so why didn't we just do that first?
because a common (default) sort is score, which is naturally
a number, not a string (like terms). converting into the
number was also expensive, and the common case.
so, this solution also makes the change to NOT put the score
into the sort value list. instead you see the dummy value
"_score". this is just a placeholder, the actual sort impl
knows that field of the sort is the score, and will sort
using the actual score.
also, several other aspets of the benchmark were cleaned up
so that unnecessary allocations do not pollute the cpu profiles
Here are the updated benchmarks:
$ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out
BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op
BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op
BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op
BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.188s
Prior to this PR, master reported:
$ go test -run=xxx -bench=. -benchmem
BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op
BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op
BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op
BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.385s
So, we're pretty close on the smaller datasets, and we scale better on the larger datasets.
We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup).
2016-08-25 21:47:07 +02:00
|
|
|
} else {
|
|
|
|
iVal := i.Sort[x]
|
|
|
|
jVal := j.Sort[x]
|
|
|
|
c = strings.Compare(iVal, jVal)
|
2016-08-25 01:02:22 +02:00
|
|
|
}
|
|
|
|
|
2016-08-13 01:16:24 +02:00
|
|
|
if c == 0 {
|
|
|
|
continue
|
|
|
|
}
|
2016-08-25 22:24:26 +02:00
|
|
|
if cachedDesc[x] {
|
2016-08-24 20:07:10 +02:00
|
|
|
c = -c
|
|
|
|
}
|
2016-08-13 01:16:24 +02:00
|
|
|
return c
|
|
|
|
}
|
|
|
|
// if they are the same at this point, impose order based on index natural sort order
|
|
|
|
if i.HitNumber == j.HitNumber {
|
|
|
|
return 0
|
|
|
|
} else if i.HitNumber > j.HitNumber {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
return -1
|
|
|
|
}
|
|
|
|
|
|
|
|
func (so SortOrder) RequiresScore() bool {
|
|
|
|
rv := false
|
|
|
|
for _, soi := range so {
|
|
|
|
if soi.RequiresScoring() {
|
|
|
|
rv = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
func (so SortOrder) RequiresDocID() bool {
|
|
|
|
rv := false
|
|
|
|
for _, soi := range so {
|
|
|
|
if soi.RequiresDocID() {
|
|
|
|
rv = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
2016-08-17 18:20:12 +02:00
|
|
|
func (so SortOrder) RequiredFields() []string {
|
2016-08-13 01:16:24 +02:00
|
|
|
var rv []string
|
|
|
|
for _, soi := range so {
|
2016-08-17 18:20:12 +02:00
|
|
|
rv = append(rv, soi.RequiresFields()...)
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
2016-08-12 20:23:55 +02:00
|
|
|
|
2016-08-25 22:24:26 +02:00
|
|
|
func (so SortOrder) CacheIsScore() []bool {
|
|
|
|
var rv []bool
|
|
|
|
for _, soi := range so {
|
|
|
|
rv = append(rv, soi.RequiresScoring())
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
func (so SortOrder) CacheDescending() []bool {
|
|
|
|
var rv []bool
|
|
|
|
for _, soi := range so {
|
|
|
|
rv = append(rv, soi.Descending())
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
2016-08-17 18:20:12 +02:00
|
|
|
// SortFieldType lets you control some internal sort behavior.
// Normally leaving this to the zero value of SortFieldAuto is fine.
type SortFieldType int

const (
	// SortFieldAuto applies heuristics that attempt to automatically sort
	// correctly. It is the zero value.
	SortFieldAuto SortFieldType = iota
	// SortFieldAsString forces sort as string (no prefix coded terms removed).
	SortFieldAsString
	// SortFieldAsNumber forces sort as number (prefix coded terms with
	// shift > 0 removed).
	SortFieldAsNumber
	// SortFieldAsDate forces sort as date (prefix coded terms with
	// shift > 0 removed).
	SortFieldAsDate
)
|
|
|
|
|
|
|
|
// SortFieldMode selects which value is used for sorting when a field has
// more than one value.
type SortFieldMode int

const (
	// SortFieldDefault picks the first (or only) value. It is the zero value.
	// FIXME name is confusing
	SortFieldDefault SortFieldMode = iota
	// SortFieldMin picks the minimum value.
	SortFieldMin
	// SortFieldMax picks the maximum value.
	SortFieldMax
)
|
|
|
|
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already). also, this simple syntax doesn't allow us
to specify more complex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
// SortFieldMissing selects where documents that have no value for the sort
// field are placed in the results.
type SortFieldMissing int

const (
	// SortFieldMissingLast places documents missing the field at the end.
	// It is the zero value.
	SortFieldMissingLast SortFieldMissing = iota

	// SortFieldMissingFirst places documents missing the field at the
	// beginning.
	SortFieldMissingFirst
)
|
2016-08-17 18:20:12 +02:00
|
|
|
|
|
|
|
// SortField will sort results by the value of a stored field
|
|
|
|
// Field is the name of the field
|
|
|
|
// Descending reverse the sort order (default false)
|
|
|
|
// Type allows forcing of string/number/date behavior (default auto)
|
|
|
|
// Mode controls behavior for multi-values fields (default first)
|
|
|
|
// Missing controls behavior of missing values (default last)
|
|
|
|
type SortField struct {
|
2016-08-24 20:07:10 +02:00
|
|
|
Field string
|
|
|
|
Desc bool
|
|
|
|
Type SortFieldType
|
|
|
|
Mode SortFieldMode
|
|
|
|
Missing SortFieldMissing
|
2016-08-12 20:23:55 +02:00
|
|
|
}
|
|
|
|
|
2016-08-24 20:07:10 +02:00
|
|
|
// Value returns the sort value of the DocumentMatch
|
improved implementation to address perf regressions
primary change is going back to sort values be []string
and not []interface{}, this avoid allocatiosn converting
into the interface{}
that sounds obvious, so why didn't we just do that first?
because a common (default) sort is score, which is naturally
a number, not a string (like terms). converting into the
number was also expensive, and the common case.
so, this solution also makes the change to NOT put the score
into the sort value list. instead you see the dummy value
"_score". this is just a placeholder, the actual sort impl
knows that field of the sort is the score, and will sort
using the actual score.
also, several other aspets of the benchmark were cleaned up
so that unnecessary allocations do not pollute the cpu profiles
Here are the updated benchmarks:
$ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out
BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op
BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op
BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op
BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.188s
Prior to this PR, master reported:
$ go test -run=xxx -bench=. -benchmem
BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op
BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op
BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op
BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.385s
So, we're pretty close on the smaller datasets, and we scale better on the larger datasets.
We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup).
2016-08-25 21:47:07 +02:00
|
|
|
func (s *SortField) Value(i *DocumentMatch) string {
|
2016-08-17 18:20:12 +02:00
|
|
|
iTerms := i.CachedFieldTerms[s.Field]
|
|
|
|
iTerms = s.filterTermsByType(iTerms)
|
|
|
|
iTerm := s.filterTermsByMode(iTerms)
|
2016-08-24 20:07:10 +02:00
|
|
|
return iTerm
|
|
|
|
}
|
|
|
|
|
|
|
|
// Descending determines the order of the sort
|
|
|
|
func (s *SortField) Descending() bool {
|
|
|
|
return s.Desc
|
2016-08-17 18:20:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (s *SortField) filterTermsByMode(terms []string) string {
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldDefault) {
|
2016-08-17 18:20:12 +02:00
|
|
|
return terms[0]
|
|
|
|
} else if len(terms) > 1 {
|
|
|
|
switch s.Mode {
|
|
|
|
case SortFieldMin:
|
|
|
|
sort.Strings(terms)
|
|
|
|
return terms[0]
|
|
|
|
case SortFieldMax:
|
|
|
|
sort.Strings(terms)
|
|
|
|
return terms[len(terms)-1]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// handle missing terms
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
if s.Missing == SortFieldMissingLast {
|
2016-08-24 20:07:10 +02:00
|
|
|
if s.Desc {
|
2016-08-17 18:20:12 +02:00
|
|
|
return LowTerm
|
|
|
|
}
|
|
|
|
return HighTerm
|
|
|
|
}
|
2016-08-24 20:07:10 +02:00
|
|
|
if s.Desc {
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
return HighTerm
|
|
|
|
}
|
|
|
|
return LowTerm
|
2016-08-17 18:20:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// filterTermsByType attempts to make one pass on the terms
|
|
|
|
// if we are in auto-mode AND all the terms look like prefix-coded numbers
|
|
|
|
// return only the terms which had shift of 0
|
|
|
|
// if we are in explicit number or date mode, return only valid
|
|
|
|
// prefix coded numbers with shift of 0
|
|
|
|
func (s *SortField) filterTermsByType(terms []string) []string {
|
|
|
|
stype := s.Type
|
|
|
|
if stype == SortFieldAuto {
|
|
|
|
allTermsPrefixCoded := true
|
|
|
|
var termsWithShiftZero []string
|
|
|
|
for _, term := range terms {
|
2016-09-30 17:35:22 +02:00
|
|
|
valid, shift := numeric.ValidPrefixCodedTerm(term)
|
2016-08-17 18:20:12 +02:00
|
|
|
if valid && shift == 0 {
|
|
|
|
termsWithShiftZero = append(termsWithShiftZero, term)
|
|
|
|
} else if !valid {
|
|
|
|
allTermsPrefixCoded = false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if allTermsPrefixCoded {
|
|
|
|
terms = termsWithShiftZero
|
|
|
|
}
|
|
|
|
} else if stype == SortFieldAsNumber || stype == SortFieldAsDate {
|
|
|
|
var termsWithShiftZero []string
|
|
|
|
for _, term := range terms {
|
2016-09-30 17:35:22 +02:00
|
|
|
valid, shift := numeric.ValidPrefixCodedTerm(term)
|
2016-08-17 18:20:12 +02:00
|
|
|
if valid && shift == 0 {
|
|
|
|
termsWithShiftZero = append(termsWithShiftZero)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
terms = termsWithShiftZero
|
|
|
|
}
|
|
|
|
return terms
|
2016-08-12 20:23:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// RequiresDocID says this SearchSort does not require the DocID be loaded
|
2016-08-17 18:20:12 +02:00
|
|
|
func (s *SortField) RequiresDocID() bool {
	// A field sort never asks for the document ID to be loaded.
	return false
}
|
2016-08-12 20:23:55 +02:00
|
|
|
|
|
|
|
// RequiresScoring says this SearchSort does not require scoring
|
2016-08-17 18:20:12 +02:00
|
|
|
func (s *SortField) RequiresScoring() bool {
	// A field sort never asks for scores to be computed.
	return false
}
|
2016-08-12 20:23:55 +02:00
|
|
|
|
2016-08-17 18:20:12 +02:00
|
|
|
// RequiresFields says this SearchSort requires the specified stored field
|
|
|
|
func (s *SortField) RequiresFields() []string {
	// The only field needed is the one being sorted on.
	needed := []string{s.Field}
	return needed
}
|
2016-08-12 20:23:55 +02:00
|
|
|
|
2016-08-17 18:20:12 +02:00
|
|
|
func (s *SortField) MarshalJSON() ([]byte, error) {
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
// see if simple format can be used
|
|
|
|
if s.Missing == SortFieldMissingLast &&
|
|
|
|
s.Mode == SortFieldDefault &&
|
|
|
|
s.Type == SortFieldAuto {
|
2016-08-24 20:07:10 +02:00
|
|
|
if s.Desc {
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
return json.Marshal("-" + s.Field)
|
|
|
|
}
|
|
|
|
return json.Marshal(s.Field)
|
|
|
|
}
|
|
|
|
sfm := map[string]interface{}{
|
|
|
|
"by": "field",
|
|
|
|
"field": s.Field,
|
|
|
|
}
|
2016-08-24 20:07:10 +02:00
|
|
|
if s.Desc {
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
sfm["desc"] = true
|
|
|
|
}
|
|
|
|
if s.Missing > SortFieldMissingLast {
|
|
|
|
switch s.Missing {
|
|
|
|
case SortFieldMissingFirst:
|
|
|
|
sfm["missing"] = "first"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if s.Mode > SortFieldDefault {
|
|
|
|
switch s.Mode {
|
|
|
|
case SortFieldMin:
|
|
|
|
sfm["mode"] = "min"
|
|
|
|
case SortFieldMax:
|
|
|
|
sfm["mode"] = "max"
|
|
|
|
}
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
adds support for more complex field sorts with object (not string)
previously from JSON we would just deserialize strings like
"-abv" or "city" or "_id" or "_score" as simple sorts
on fields, ids or scores respectively
while this is simple and compact, it can be ambiguous (for
example if you have a field starting with - or if you have a field
named "_id" already. also, this simple syntax doesnt allow us
to specify more cmoplex options to deal with type/mode/missing
we keep support for the simple string syntax, but now also
recognize a more expressive syntax like:
{
"by": "field",
"field": "abv",
"desc": true,
"type": "string",
"mode": "min",
"missing": "first"
}
type, mode and missing are optional and default to
"auto", "default", and "last" respectively
2016-08-17 23:33:51 +02:00
|
|
|
if s.Type > SortFieldAuto {
|
|
|
|
switch s.Type {
|
|
|
|
case SortFieldAsString:
|
|
|
|
sfm["type"] = "string"
|
|
|
|
case SortFieldAsNumber:
|
|
|
|
sfm["type"] = "number"
|
|
|
|
case SortFieldAsDate:
|
|
|
|
sfm["type"] = "date"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return json.Marshal(sfm)
|
2016-08-13 01:16:24 +02:00
|
|
|
}
|
|
|
|
|
2016-08-12 20:23:55 +02:00
|
|
|
// SortDocID orders search results by their document identifier.
type SortDocID struct {
	// Desc is the value reported by Descending and selects the "-_id"
	// form when the sort is marshaled to JSON.
	Desc bool
}
|
|
|
|
|
2016-08-24 20:07:10 +02:00
|
|
|
// Value returns the sort value of the DocumentMatch
|
improved implementation to address perf regressions
primary change is going back to sort values be []string
and not []interface{}, this avoids allocations converting
into the interface{}
that sounds obvious, so why didn't we just do that first?
because a common (default) sort is score, which is naturally
a number, not a string (like terms). converting into the
number was also expensive, and the common case.
so, this solution also makes the change to NOT put the score
into the sort value list. instead you see the dummy value
"_score". this is just a placeholder, the actual sort impl
knows that field of the sort is the score, and will sort
using the actual score.
also, several other aspects of the benchmark were cleaned up
so that unnecessary allocations do not pollute the cpu profiles
Here are the updated benchmarks:
$ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out
BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op
BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op
BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op
BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.188s
Prior to this PR, master reported:
$ go test -run=xxx -bench=. -benchmem
BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op
BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op
BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op
BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.385s
So, we're pretty close on the smaller datasets, and we scale better on the larger datasets.
We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup).
2016-08-25 21:47:07 +02:00
|
|
|
func (s *SortDocID) Value(i *DocumentMatch) string {
|
2016-08-24 20:07:10 +02:00
|
|
|
return i.ID
|
|
|
|
}
|
|
|
|
|
|
|
|
// Descending determines the order of the sort
|
|
|
|
func (s *SortDocID) Descending() bool {
|
|
|
|
return s.Desc
|
2016-08-12 20:23:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// RequiresDocID says this SearchSort does require the DocID be loaded
|
|
|
|
func (s *SortDocID) RequiresDocID() bool {
	// Value returns the match's ID, so the document ID must be loaded.
	return true
}
|
|
|
|
|
|
|
|
// RequiresScoring says this SearchSort does not require scoring
|
|
|
|
func (s *SortDocID) RequiresScoring() bool {
	// A document ID sort never asks for scores to be computed.
	return false
}
|
|
|
|
|
2016-08-17 18:20:12 +02:00
|
|
|
// RequiresFields says this SearchSort does not require any stored fields
|
|
|
|
func (s *SortDocID) RequiresFields() []string {
	// No fields are needed; a nil slice is returned.
	var none []string
	return none
}
|
2016-08-12 20:23:55 +02:00
|
|
|
|
2016-08-13 01:16:24 +02:00
|
|
|
func (s *SortDocID) MarshalJSON() ([]byte, error) {
|
2016-08-24 20:07:10 +02:00
|
|
|
if s.Desc {
|
2016-08-13 01:16:24 +02:00
|
|
|
return json.Marshal("-_id")
|
|
|
|
}
|
|
|
|
return json.Marshal("_id")
|
|
|
|
}
|
|
|
|
|
2016-08-12 20:23:55 +02:00
|
|
|
// SortScore orders search results by the document match score.
type SortScore struct {
	// Desc is the value reported by Descending and selects the "-_score"
	// form when the sort is marshaled to JSON.
	Desc bool
}
|
|
|
|
|
2016-08-24 20:07:10 +02:00
|
|
|
// Value returns the sort value of the DocumentMatch
|
improved implementation to address perf regressions
primary change is going back to sort values be []string
and not []interface{}, this avoids allocations converting
into the interface{}
that sounds obvious, so why didn't we just do that first?
because a common (default) sort is score, which is naturally
a number, not a string (like terms). converting into the
number was also expensive, and the common case.
so, this solution also makes the change to NOT put the score
into the sort value list. instead you see the dummy value
"_score". this is just a placeholder, the actual sort impl
knows that field of the sort is the score, and will sort
using the actual score.
also, several other aspects of the benchmark were cleaned up
so that unnecessary allocations do not pollute the cpu profiles
Here are the updated benchmarks:
$ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out
BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op
BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op
BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op
BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.188s
Prior to this PR, master reported:
$ go test -run=xxx -bench=. -benchmem
BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op
BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op
BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op
BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op
PASS
ok github.com/blevesearch/bleve/search/collectors 7.385s
So, we're pretty close on the smaller datasets, and we scale better on the larger datasets.
We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup).
2016-08-25 21:47:07 +02:00
|
|
|
func (s *SortScore) Value(i *DocumentMatch) string {
|
|
|
|
return "_score"
|
2016-08-24 20:07:10 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Descending determines the order of the sort
|
|
|
|
func (s *SortScore) Descending() bool {
|
|
|
|
return s.Desc
|
2016-08-12 20:23:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// RequiresDocID says this SearchSort does not require the DocID be loaded
|
|
|
|
func (s *SortScore) RequiresDocID() bool {
	// A score sort never asks for the document ID to be loaded.
	return false
}
|
|
|
|
|
|
|
|
// RequiresScoring says this SearchSort does require scoring
|
|
|
|
func (s *SortScore) RequiresScoring() bool {
	// Sorting by score is the one case that needs scores computed.
	return true
}
|
|
|
|
|
2016-08-17 18:20:12 +02:00
|
|
|
// RequiresFields says this SearchSort does not require any stored fields
|
|
|
|
func (s *SortScore) RequiresFields() []string {
	// No fields are needed; a nil slice is returned.
	var none []string
	return none
}
|
2016-08-13 01:16:24 +02:00
|
|
|
|
|
|
|
func (s *SortScore) MarshalJSON() ([]byte, error) {
|
2016-08-24 20:07:10 +02:00
|
|
|
if s.Desc {
|
2016-08-13 01:16:24 +02:00
|
|
|
return json.Marshal("-_score")
|
|
|
|
}
|
|
|
|
return json.Marshal("_score")
|
|
|
|
}
|