60750c1614
primary change is going back to sort values being []string and not []interface{}, this avoids allocations converting into the interface{}. that sounds obvious, so why didn't we just do that first? because a common (default) sort is score, which is naturally a number, not a string (like terms). converting into the number was also expensive, and the common case. so, this solution also makes the change to NOT put the score into the sort value list. instead you see the dummy value "_score". this is just a placeholder, the actual sort impl knows that field of the sort is the score, and will sort using the actual score. also, several other aspects of the benchmark were cleaned up so that unnecessary allocations do not pollute the cpu profiles. Here are the updated benchmarks: $ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op PASS ok github.com/blevesearch/bleve/search/collectors 7.188s Prior to this PR, master reported: $ go test -run=xxx -bench=. -benchmem BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op PASS ok github.com/blevesearch/bleve/search/collectors 7.385s So, we're pretty close on the smaller datasets, and we scale better on the larger datasets. We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup).
473 lines
12 KiB
Go
473 lines
12 KiB
Go
// Copyright (c) 2014 Couchbase, Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
// except in compliance with the License. You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
// either express or implied. See the License for the specific language governing permissions
|
|
// and limitations under the License.
|
|
|
|
package search
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
|
|
"github.com/blevesearch/bleve/numeric_util"
|
|
)
|
|
|
|
// Sentinel sort values substituted for documents that have no usable term
// for a sorted field.
var (
	// HighTerm is a run of ten 0xff bytes.
	HighTerm = strings.Repeat("\xff", 10)
	// LowTerm is a single 0x00 byte.
	LowTerm = "\x00"
)
|
|
|
|
type SearchSort interface {
|
|
Value(a *DocumentMatch) string
|
|
Descending() bool
|
|
|
|
RequiresDocID() bool
|
|
RequiresScoring() bool
|
|
RequiresFields() []string
|
|
}
|
|
|
|
func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) {
|
|
descending, ok := input["desc"].(bool)
|
|
by, ok := input["by"].(string)
|
|
if !ok {
|
|
return nil, fmt.Errorf("search sort must specify by")
|
|
}
|
|
switch by {
|
|
case "id":
|
|
return &SortDocID{
|
|
Desc: descending,
|
|
}, nil
|
|
case "score":
|
|
return &SortScore{
|
|
Desc: descending,
|
|
}, nil
|
|
case "field":
|
|
field, ok := input["field"].(string)
|
|
if !ok {
|
|
return nil, fmt.Errorf("search sort mode field must specify field")
|
|
}
|
|
rv := &SortField{
|
|
Field: field,
|
|
Desc: descending,
|
|
}
|
|
typ, ok := input["type"].(string)
|
|
if ok {
|
|
switch typ {
|
|
case "auto":
|
|
rv.Type = SortFieldAuto
|
|
case "string":
|
|
rv.Type = SortFieldAsString
|
|
case "number":
|
|
rv.Type = SortFieldAsNumber
|
|
case "date":
|
|
rv.Type = SortFieldAsDate
|
|
default:
|
|
return nil, fmt.Errorf("unkown sort field type: %s", typ)
|
|
}
|
|
}
|
|
mode, ok := input["mode"].(string)
|
|
if ok {
|
|
switch mode {
|
|
case "default":
|
|
rv.Mode = SortFieldDefault
|
|
case "min":
|
|
rv.Mode = SortFieldMin
|
|
case "max":
|
|
rv.Mode = SortFieldMax
|
|
default:
|
|
return nil, fmt.Errorf("unknown sort field mode: %s", mode)
|
|
}
|
|
}
|
|
missing, ok := input["missing"].(string)
|
|
if ok {
|
|
switch missing {
|
|
case "first":
|
|
rv.Missing = SortFieldMissingFirst
|
|
case "last":
|
|
rv.Missing = SortFieldMissingLast
|
|
default:
|
|
return nil, fmt.Errorf("unknown sort field missing: %s", missing)
|
|
}
|
|
}
|
|
return rv, nil
|
|
}
|
|
|
|
return nil, fmt.Errorf("unknown search sort by: %s", by)
|
|
}
|
|
|
|
func ParseSearchSortString(input string) SearchSort {
|
|
descending := false
|
|
if strings.HasPrefix(input, "-") {
|
|
descending = true
|
|
input = input[1:]
|
|
} else if strings.HasPrefix(input, "+") {
|
|
input = input[1:]
|
|
}
|
|
if input == "_id" {
|
|
return &SortDocID{
|
|
Desc: descending,
|
|
}
|
|
} else if input == "_score" {
|
|
return &SortScore{
|
|
Desc: descending,
|
|
}
|
|
}
|
|
return &SortField{
|
|
Field: input,
|
|
Desc: descending,
|
|
}
|
|
}
|
|
|
|
func ParseSearchSortJSON(input json.RawMessage) (SearchSort, error) {
|
|
// first try to parse it as string
|
|
var sortString string
|
|
err := json.Unmarshal(input, &sortString)
|
|
if err != nil {
|
|
var sortObj map[string]interface{}
|
|
err = json.Unmarshal(input, &sortObj)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return ParseSearchSortObj(sortObj)
|
|
}
|
|
return ParseSearchSortString(sortString), nil
|
|
}
|
|
|
|
func ParseSortOrderStrings(in []string) SortOrder {
|
|
rv := make(SortOrder, 0, len(in))
|
|
for _, i := range in {
|
|
ss := ParseSearchSortString(i)
|
|
rv = append(rv, ss)
|
|
}
|
|
return rv
|
|
}
|
|
|
|
func ParseSortOrderJSON(in []json.RawMessage) (SortOrder, error) {
|
|
rv := make(SortOrder, 0, len(in))
|
|
for _, i := range in {
|
|
ss, err := ParseSearchSortJSON(i)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rv = append(rv, ss)
|
|
}
|
|
return rv, nil
|
|
}
|
|
|
|
// SortOrder is an ordered list of sort criteria; earlier entries take
// precedence over later ones when two matches are compared.
type SortOrder []SearchSort
|
|
|
|
func (so SortOrder) Value(doc *DocumentMatch) {
|
|
for _, soi := range so {
|
|
doc.Sort = append(doc.Sort, soi.Value(doc))
|
|
}
|
|
}
|
|
|
|
// Compare will compare two document matches using the specified sort order
|
|
// if both are numbers, we avoid converting back to term
|
|
func (so SortOrder) Compare(i, j *DocumentMatch) int {
|
|
// compare the documents on all search sorts until a differences is found
|
|
for x, soi := range so {
|
|
c := 0
|
|
if soi.RequiresScoring() {
|
|
if i.Score < j.Score {
|
|
c = -1
|
|
} else if i.Score > j.Score {
|
|
c = 1
|
|
}
|
|
} else {
|
|
iVal := i.Sort[x]
|
|
jVal := j.Sort[x]
|
|
c = strings.Compare(iVal, jVal)
|
|
}
|
|
|
|
if c == 0 {
|
|
continue
|
|
}
|
|
if soi.Descending() {
|
|
c = -c
|
|
}
|
|
return c
|
|
}
|
|
// if they are the same at this point, impose order based on index natural sort order
|
|
if i.HitNumber == j.HitNumber {
|
|
return 0
|
|
} else if i.HitNumber > j.HitNumber {
|
|
return 1
|
|
}
|
|
return -1
|
|
}
|
|
|
|
func (so SortOrder) RequiresScore() bool {
|
|
rv := false
|
|
for _, soi := range so {
|
|
if soi.RequiresScoring() {
|
|
rv = true
|
|
}
|
|
}
|
|
return rv
|
|
}
|
|
|
|
func (so SortOrder) RequiresDocID() bool {
|
|
rv := false
|
|
for _, soi := range so {
|
|
if soi.RequiresDocID() {
|
|
rv = true
|
|
}
|
|
}
|
|
return rv
|
|
}
|
|
|
|
func (so SortOrder) RequiredFields() []string {
|
|
var rv []string
|
|
for _, soi := range so {
|
|
rv = append(rv, soi.RequiresFields()...)
|
|
}
|
|
return rv
|
|
}
|
|
|
|
// SortFieldType selects how the terms of a field are interpreted when
// sorting. The zero value, SortFieldAuto, is normally the right choice.
type SortFieldType int

const (
	// SortFieldAuto applies a heuristic: when every term looks like a
	// prefix coded number, only the terms with shift 0 are kept
	SortFieldAuto SortFieldType = iota
	// SortFieldAsString forces sort as string (no prefix coded terms removed)
	SortFieldAsString
	// SortFieldAsNumber forces sort as number (only valid prefix coded
	// terms with shift 0 are kept)
	SortFieldAsNumber
	// SortFieldAsDate forces sort as date (only valid prefix coded
	// terms with shift 0 are kept)
	SortFieldAsDate
)
|
|
|
|
// SortFieldMode selects which value is used for sorting when a field has
// more than one value.
type SortFieldMode int

const (
	// SortFieldDefault picks the first (or only) value; it is the zero-value
	SortFieldDefault SortFieldMode = iota // FIXME name is confusing
	// SortFieldMin picks the smallest value
	SortFieldMin
	// SortFieldMax picks the largest value
	SortFieldMax
)
|
|
|
|
// SortFieldMissing selects where documents without a value for the sorted
// field are placed in the results.
type SortFieldMissing int

const (
	// SortFieldMissingLast places documents missing the field at the end;
	// it is the zero-value
	SortFieldMissingLast SortFieldMissing = iota

	// SortFieldMissingFirst places documents missing the field at the beginning
	SortFieldMissingFirst
)
|
|
|
|
// SortField will sort results by the value of a stored field
|
|
// Field is the name of the field
|
|
// Descending reverse the sort order (default false)
|
|
// Type allows forcing of string/number/date behavior (default auto)
|
|
// Mode controls behavior for multi-values fields (default first)
|
|
// Missing controls behavior of missing values (default last)
|
|
type SortField struct {
|
|
Field string
|
|
Desc bool
|
|
Type SortFieldType
|
|
Mode SortFieldMode
|
|
Missing SortFieldMissing
|
|
}
|
|
|
|
// Value returns the sort value of the DocumentMatch
|
|
func (s *SortField) Value(i *DocumentMatch) string {
|
|
iTerms := i.CachedFieldTerms[s.Field]
|
|
iTerms = s.filterTermsByType(iTerms)
|
|
iTerm := s.filterTermsByMode(iTerms)
|
|
return iTerm
|
|
}
|
|
|
|
// Descending determines the order of the sort
|
|
func (s *SortField) Descending() bool {
|
|
return s.Desc
|
|
}
|
|
|
|
func (s *SortField) filterTermsByMode(terms []string) string {
|
|
if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldDefault) {
|
|
return terms[0]
|
|
} else if len(terms) > 1 {
|
|
switch s.Mode {
|
|
case SortFieldMin:
|
|
sort.Strings(terms)
|
|
return terms[0]
|
|
case SortFieldMax:
|
|
sort.Strings(terms)
|
|
return terms[len(terms)-1]
|
|
}
|
|
}
|
|
|
|
// handle missing terms
|
|
if s.Missing == SortFieldMissingLast {
|
|
if s.Desc {
|
|
return LowTerm
|
|
}
|
|
return HighTerm
|
|
}
|
|
if s.Desc {
|
|
return HighTerm
|
|
}
|
|
return LowTerm
|
|
}
|
|
|
|
// filterTermsByType attempts to make one pass on the terms
|
|
// if we are in auto-mode AND all the terms look like prefix-coded numbers
|
|
// return only the terms which had shift of 0
|
|
// if we are in explicit number or date mode, return only valid
|
|
// prefix coded numbers with shift of 0
|
|
func (s *SortField) filterTermsByType(terms []string) []string {
|
|
stype := s.Type
|
|
if stype == SortFieldAuto {
|
|
allTermsPrefixCoded := true
|
|
var termsWithShiftZero []string
|
|
for _, term := range terms {
|
|
valid, shift := numeric_util.ValidPrefixCodedTerm(term)
|
|
if valid && shift == 0 {
|
|
termsWithShiftZero = append(termsWithShiftZero, term)
|
|
} else if !valid {
|
|
allTermsPrefixCoded = false
|
|
}
|
|
}
|
|
if allTermsPrefixCoded {
|
|
terms = termsWithShiftZero
|
|
}
|
|
} else if stype == SortFieldAsNumber || stype == SortFieldAsDate {
|
|
var termsWithShiftZero []string
|
|
for _, term := range terms {
|
|
valid, shift := numeric_util.ValidPrefixCodedTerm(term)
|
|
if valid && shift == 0 {
|
|
termsWithShiftZero = append(termsWithShiftZero)
|
|
}
|
|
}
|
|
terms = termsWithShiftZero
|
|
}
|
|
return terms
|
|
}
|
|
|
|
// RequiresDocID says this SearchSort does not require the DocID be loaded
|
|
func (s *SortField) RequiresDocID() bool { return false }
|
|
|
|
// RequiresScoring says this SearchStore does not require scoring
|
|
func (s *SortField) RequiresScoring() bool { return false }
|
|
|
|
// RequiresFields says this SearchStore requires the specified stored field
|
|
func (s *SortField) RequiresFields() []string { return []string{s.Field} }
|
|
|
|
func (s *SortField) MarshalJSON() ([]byte, error) {
|
|
// see if simple format can be used
|
|
if s.Missing == SortFieldMissingLast &&
|
|
s.Mode == SortFieldDefault &&
|
|
s.Type == SortFieldAuto {
|
|
if s.Desc {
|
|
return json.Marshal("-" + s.Field)
|
|
}
|
|
return json.Marshal(s.Field)
|
|
}
|
|
sfm := map[string]interface{}{
|
|
"by": "field",
|
|
"field": s.Field,
|
|
}
|
|
if s.Desc {
|
|
sfm["desc"] = true
|
|
}
|
|
if s.Missing > SortFieldMissingLast {
|
|
switch s.Missing {
|
|
case SortFieldMissingFirst:
|
|
sfm["missing"] = "first"
|
|
}
|
|
}
|
|
if s.Mode > SortFieldDefault {
|
|
switch s.Mode {
|
|
case SortFieldMin:
|
|
sfm["mode"] = "min"
|
|
case SortFieldMax:
|
|
sfm["mode"] = "max"
|
|
}
|
|
}
|
|
if s.Type > SortFieldAuto {
|
|
switch s.Type {
|
|
case SortFieldAsString:
|
|
sfm["type"] = "string"
|
|
case SortFieldAsNumber:
|
|
sfm["type"] = "number"
|
|
case SortFieldAsDate:
|
|
sfm["type"] = "date"
|
|
}
|
|
}
|
|
|
|
return json.Marshal(sfm)
|
|
}
|
|
|
|
// SortDocID will sort results by the document identifier.
type SortDocID struct {
	// Desc reverses the sort order when true.
	Desc bool
}
|
|
|
|
// Value returns the sort value of the DocumentMatch
|
|
func (s *SortDocID) Value(i *DocumentMatch) string {
|
|
return i.ID
|
|
}
|
|
|
|
// Descending determines the order of the sort
|
|
func (s *SortDocID) Descending() bool {
|
|
return s.Desc
|
|
}
|
|
|
|
// RequiresDocID says this SearchSort does require the DocID be loaded
|
|
func (s *SortDocID) RequiresDocID() bool { return true }
|
|
|
|
// RequiresScoring says this SearchStore does not require scoring
|
|
func (s *SortDocID) RequiresScoring() bool { return false }
|
|
|
|
// RequiresFields says this SearchStore does not require any stored fields
|
|
func (s *SortDocID) RequiresFields() []string { return nil }
|
|
|
|
func (s *SortDocID) MarshalJSON() ([]byte, error) {
|
|
if s.Desc {
|
|
return json.Marshal("-_id")
|
|
}
|
|
return json.Marshal("_id")
|
|
}
|
|
|
|
// SortScore will sort results by the document match score.
type SortScore struct {
	// Desc reverses the sort order when true.
	Desc bool
}
|
|
|
|
// Value returns the sort value of the DocumentMatch
|
|
func (s *SortScore) Value(i *DocumentMatch) string {
|
|
return "_score"
|
|
}
|
|
|
|
// Descending determines the order of the sort
|
|
func (s *SortScore) Descending() bool {
|
|
return s.Desc
|
|
}
|
|
|
|
// RequiresDocID says this SearchSort does not require the DocID be loaded
|
|
func (s *SortScore) RequiresDocID() bool { return false }
|
|
|
|
// RequiresScoring says this SearchStore does require scoring
|
|
func (s *SortScore) RequiresScoring() bool { return true }
|
|
|
|
// RequiresFields says this SearchStore does not require any store fields
|
|
func (s *SortScore) RequiresFields() []string { return nil }
|
|
|
|
func (s *SortScore) MarshalJSON() ([]byte, error) {
|
|
if s.Desc {
|
|
return json.Marshal("-_score")
|
|
}
|
|
return json.Marshal("_score")
|
|
}
|