0
0
bleve/search/sort.go
Marty Schoch 0eba2a3f0c reduce garbage created while processing facets
previously we parsed/returned large sections of the documents
back index row in order to compute facet information.  this
would require parsing the protobuf of the entire back index row.
unfortunately this creates considerable garbage.

this new version introduces a visitor/callback approach to
working with data inside the back index row.  the benefit
of this approach is that we can let the higher-level code
see values, prior to any copies of data being made or
intermediate garbage being created.  implementations of
the callback must copy any value which they would like to
retain beyond the callback.

NOTE: this approach is duplicates code from the
automatically generated protobuf code

NOTE: this approach assumes that the "field" field be serialized
before the "terms" field.  This is guaranteed by our currently
generated protobuf encoder, and is recommended by the protobuf
spec.  But, decoders SHOULD support them occuring in any order,
which we do not.
2017-03-02 17:00:46 -05:00

524 lines
13 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package search
import (
"encoding/json"
"fmt"
"sort"
"strings"
"github.com/blevesearch/bleve/numeric"
)
var HighTerm = strings.Repeat(string([]byte{0xff}), 10)
var LowTerm = string([]byte{0x00})
type SearchSort interface {
UpdateVisitor(field string, term []byte)
Value(a *DocumentMatch) string
Descending() bool
RequiresDocID() bool
RequiresScoring() bool
RequiresFields() []string
}
func ParseSearchSortObj(input map[string]interface{}) (SearchSort, error) {
descending, ok := input["desc"].(bool)
by, ok := input["by"].(string)
if !ok {
return nil, fmt.Errorf("search sort must specify by")
}
switch by {
case "id":
return &SortDocID{
Desc: descending,
}, nil
case "score":
return &SortScore{
Desc: descending,
}, nil
case "field":
field, ok := input["field"].(string)
if !ok {
return nil, fmt.Errorf("search sort mode field must specify field")
}
rv := &SortField{
Field: field,
Desc: descending,
}
typ, ok := input["type"].(string)
if ok {
switch typ {
case "auto":
rv.Type = SortFieldAuto
case "string":
rv.Type = SortFieldAsString
case "number":
rv.Type = SortFieldAsNumber
case "date":
rv.Type = SortFieldAsDate
default:
return nil, fmt.Errorf("unknown sort field type: %s", typ)
}
}
mode, ok := input["mode"].(string)
if ok {
switch mode {
case "default":
rv.Mode = SortFieldDefault
case "min":
rv.Mode = SortFieldMin
case "max":
rv.Mode = SortFieldMax
default:
return nil, fmt.Errorf("unknown sort field mode: %s", mode)
}
}
missing, ok := input["missing"].(string)
if ok {
switch missing {
case "first":
rv.Missing = SortFieldMissingFirst
case "last":
rv.Missing = SortFieldMissingLast
default:
return nil, fmt.Errorf("unknown sort field missing: %s", missing)
}
}
return rv, nil
}
return nil, fmt.Errorf("unknown search sort by: %s", by)
}
func ParseSearchSortString(input string) SearchSort {
descending := false
if strings.HasPrefix(input, "-") {
descending = true
input = input[1:]
} else if strings.HasPrefix(input, "+") {
input = input[1:]
}
if input == "_id" {
return &SortDocID{
Desc: descending,
}
} else if input == "_score" {
return &SortScore{
Desc: descending,
}
}
return &SortField{
Field: input,
Desc: descending,
}
}
func ParseSearchSortJSON(input json.RawMessage) (SearchSort, error) {
// first try to parse it as string
var sortString string
err := json.Unmarshal(input, &sortString)
if err != nil {
var sortObj map[string]interface{}
err = json.Unmarshal(input, &sortObj)
if err != nil {
return nil, err
}
return ParseSearchSortObj(sortObj)
}
return ParseSearchSortString(sortString), nil
}
func ParseSortOrderStrings(in []string) SortOrder {
rv := make(SortOrder, 0, len(in))
for _, i := range in {
ss := ParseSearchSortString(i)
rv = append(rv, ss)
}
return rv
}
func ParseSortOrderJSON(in []json.RawMessage) (SortOrder, error) {
rv := make(SortOrder, 0, len(in))
for _, i := range in {
ss, err := ParseSearchSortJSON(i)
if err != nil {
return nil, err
}
rv = append(rv, ss)
}
return rv, nil
}
type SortOrder []SearchSort
func (so SortOrder) Value(doc *DocumentMatch) {
for _, soi := range so {
doc.Sort = append(doc.Sort, soi.Value(doc))
}
}
func (so SortOrder) UpdateVisitor(field string, term []byte) {
for _, soi := range so {
soi.UpdateVisitor(field, term)
}
}
// Compare will compare two document matches using the specified sort order
// if both are numbers, we avoid converting back to term
func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatch) int {
// compare the documents on all search sorts until a differences is found
for x := range so {
c := 0
if cachedScoring[x] {
if i.Score < j.Score {
c = -1
} else if i.Score > j.Score {
c = 1
}
} else {
iVal := i.Sort[x]
jVal := j.Sort[x]
c = strings.Compare(iVal, jVal)
}
if c == 0 {
continue
}
if cachedDesc[x] {
c = -c
}
return c
}
// if they are the same at this point, impose order based on index natural sort order
if i.HitNumber == j.HitNumber {
return 0
} else if i.HitNumber > j.HitNumber {
return 1
}
return -1
}
func (so SortOrder) RequiresScore() bool {
rv := false
for _, soi := range so {
if soi.RequiresScoring() {
rv = true
}
}
return rv
}
func (so SortOrder) RequiresDocID() bool {
rv := false
for _, soi := range so {
if soi.RequiresDocID() {
rv = true
}
}
return rv
}
func (so SortOrder) RequiredFields() []string {
var rv []string
for _, soi := range so {
rv = append(rv, soi.RequiresFields()...)
}
return rv
}
func (so SortOrder) CacheIsScore() []bool {
var rv []bool
for _, soi := range so {
rv = append(rv, soi.RequiresScoring())
}
return rv
}
func (so SortOrder) CacheDescending() []bool {
var rv []bool
for _, soi := range so {
rv = append(rv, soi.Descending())
}
return rv
}
// SortFieldType lets you control some internal sort behavior
// normally leaving this to the zero-value of SortFieldAuto is fine
type SortFieldType int
const (
// SortFieldAuto applies heuristics attempt to automatically sort correctly
SortFieldAuto SortFieldType = iota
// SortFieldAsString forces sort as string (no prefix coded terms removed)
SortFieldAsString
// SortFieldAsNumber forces sort as string (prefix coded terms with shift > 0 removed)
SortFieldAsNumber
// SortFieldAsDate forces sort as string (prefix coded terms with shift > 0 removed)
SortFieldAsDate
)
// SortFieldMode describes the behavior if the field has multiple values
type SortFieldMode int
const (
// SortFieldDefault uses the first (or only) value, this is the default zero-value
SortFieldDefault SortFieldMode = iota // FIXME name is confusing
// SortFieldMin uses the minimum value
SortFieldMin
// SortFieldMax uses the maximum value
SortFieldMax
)
// SortFieldMissing controls where documents missing a field value should be sorted
type SortFieldMissing int
const (
// SortFieldMissingLast sorts documents missing a field at the end
SortFieldMissingLast SortFieldMissing = iota
// SortFieldMissingFirst sorts documents missing a field at the beginning
SortFieldMissingFirst
)
// SortField will sort results by the value of a stored field
// Field is the name of the field
// Descending reverse the sort order (default false)
// Type allows forcing of string/number/date behavior (default auto)
// Mode controls behavior for multi-values fields (default first)
// Missing controls behavior of missing values (default last)
type SortField struct {
Field string
Desc bool
Type SortFieldType
Mode SortFieldMode
Missing SortFieldMissing
values []string
}
// UpdateVisitor notifies this sort field that in this document
// this field has the specified term
func (s *SortField) UpdateVisitor(field string, term []byte) {
if field == s.Field {
s.values = append(s.values, string(term))
}
}
// Value returns the sort value of the DocumentMatch
// it also resets the state of this SortField for
// processing the next document
func (s *SortField) Value(i *DocumentMatch) string {
iTerms := s.filterTermsByType(s.values)
iTerm := s.filterTermsByMode(iTerms)
s.values = nil
return iTerm
}
// Descending determines the order of the sort
func (s *SortField) Descending() bool {
return s.Desc
}
func (s *SortField) filterTermsByMode(terms []string) string {
if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldDefault) {
return terms[0]
} else if len(terms) > 1 {
switch s.Mode {
case SortFieldMin:
sort.Strings(terms)
return terms[0]
case SortFieldMax:
sort.Strings(terms)
return terms[len(terms)-1]
}
}
// handle missing terms
if s.Missing == SortFieldMissingLast {
if s.Desc {
return LowTerm
}
return HighTerm
}
if s.Desc {
return HighTerm
}
return LowTerm
}
// filterTermsByType attempts to make one pass on the terms
// if we are in auto-mode AND all the terms look like prefix-coded numbers
// return only the terms which had shift of 0
// if we are in explicit number or date mode, return only valid
// prefix coded numbers with shift of 0
func (s *SortField) filterTermsByType(terms []string) []string {
stype := s.Type
if stype == SortFieldAuto {
allTermsPrefixCoded := true
var termsWithShiftZero []string
for _, term := range terms {
valid, shift := numeric.ValidPrefixCodedTerm(term)
if valid && shift == 0 {
termsWithShiftZero = append(termsWithShiftZero, term)
} else if !valid {
allTermsPrefixCoded = false
}
}
if allTermsPrefixCoded {
terms = termsWithShiftZero
}
} else if stype == SortFieldAsNumber || stype == SortFieldAsDate {
var termsWithShiftZero []string
for _, term := range terms {
valid, shift := numeric.ValidPrefixCodedTerm(term)
if valid && shift == 0 {
termsWithShiftZero = append(termsWithShiftZero)
}
}
terms = termsWithShiftZero
}
return terms
}
// RequiresDocID says this SearchSort does not require the DocID be loaded
func (s *SortField) RequiresDocID() bool { return false }
// RequiresScoring says this SearchStore does not require scoring
func (s *SortField) RequiresScoring() bool { return false }
// RequiresFields says this SearchStore requires the specified stored field
func (s *SortField) RequiresFields() []string { return []string{s.Field} }
func (s *SortField) MarshalJSON() ([]byte, error) {
// see if simple format can be used
if s.Missing == SortFieldMissingLast &&
s.Mode == SortFieldDefault &&
s.Type == SortFieldAuto {
if s.Desc {
return json.Marshal("-" + s.Field)
}
return json.Marshal(s.Field)
}
sfm := map[string]interface{}{
"by": "field",
"field": s.Field,
}
if s.Desc {
sfm["desc"] = true
}
if s.Missing > SortFieldMissingLast {
switch s.Missing {
case SortFieldMissingFirst:
sfm["missing"] = "first"
}
}
if s.Mode > SortFieldDefault {
switch s.Mode {
case SortFieldMin:
sfm["mode"] = "min"
case SortFieldMax:
sfm["mode"] = "max"
}
}
if s.Type > SortFieldAuto {
switch s.Type {
case SortFieldAsString:
sfm["type"] = "string"
case SortFieldAsNumber:
sfm["type"] = "number"
case SortFieldAsDate:
sfm["type"] = "date"
}
}
return json.Marshal(sfm)
}
// SortDocID will sort results by the document identifier
type SortDocID struct {
Desc bool
}
// UpdateVisitor is a no-op for SortDocID as it's value
// is not dependent on any field terms
func (s *SortDocID) UpdateVisitor(field string, term []byte) {
}
// Value returns the sort value of the DocumentMatch
func (s *SortDocID) Value(i *DocumentMatch) string {
return i.ID
}
// Descending determines the order of the sort
func (s *SortDocID) Descending() bool {
return s.Desc
}
// RequiresDocID says this SearchSort does require the DocID be loaded
func (s *SortDocID) RequiresDocID() bool { return true }
// RequiresScoring says this SearchStore does not require scoring
func (s *SortDocID) RequiresScoring() bool { return false }
// RequiresFields says this SearchStore does not require any stored fields
func (s *SortDocID) RequiresFields() []string { return nil }
func (s *SortDocID) MarshalJSON() ([]byte, error) {
if s.Desc {
return json.Marshal("-_id")
}
return json.Marshal("_id")
}
// SortScore will sort results by the document match score
type SortScore struct {
Desc bool
}
// UpdateVisitor is a no-op for SortScore as it's value
// is not dependent on any field terms
func (s *SortScore) UpdateVisitor(field string, term []byte) {
}
// Value returns the sort value of the DocumentMatch
func (s *SortScore) Value(i *DocumentMatch) string {
return "_score"
}
// Descending determines the order of the sort
func (s *SortScore) Descending() bool {
return s.Desc
}
// RequiresDocID says this SearchSort does not require the DocID be loaded
func (s *SortScore) RequiresDocID() bool { return false }
// RequiresScoring says this SearchStore does require scoring
func (s *SortScore) RequiresScoring() bool { return true }
// RequiresFields says this SearchStore does not require any store fields
func (s *SortScore) RequiresFields() []string { return nil }
func (s *SortScore) MarshalJSON() ([]byte, error) {
if s.Desc {
return json.Marshal("-_score")
}
return json.Marshal("_score")
}