bleve/search/search.go

//  Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package search

import (
	"fmt"

	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
)

// ArrayPositions is a list of float64 position values. Location carries one
// in its ArrayPositions field.
type ArrayPositions []float64

// Equals reports whether ap and other have the same length and hold equal
// values at every index. Two zero-length lists compare equal, whether nil or
// empty.
func (ap ArrayPositions) Equals(other ArrayPositions) bool {
	if len(other) != len(ap) {
		return false
	}
	// lengths match, so every index of ap is also valid for other
	for idx, pos := range ap {
		if other[idx] != pos {
			return false
		}
	}
	return true
}

// Location is one entry of a Locations list. It is serialized to JSON with
// the keys "pos", "start", "end" and "array_positions".
// NOTE(review): presumably Pos is the position of a term occurrence within a
// field, Start/End delimit the occurrence, and ArrayPositions identifies the
// array element the occurrence came from — confirm against the code that
// populates these values.
type Location struct {
	Pos            float64        `json:"pos"`
	Start          float64        `json:"start"`
	End            float64        `json:"end"`
	ArrayPositions ArrayPositions `json:"array_positions"`
}

// Locations is a list of pointers to Location values.
type Locations []*Location

// TermLocationMap associates a term with the Locations collected for it.
type TermLocationMap map[string]Locations

// AddLocation appends location to the list stored under term. A term that is
// not yet present starts from an empty list. The receiver must be a non-nil
// map because the method writes to it.
func (t TermLocationMap) AddLocation(term string, location *Location) {
	extended := append(t[term], location)
	t[term] = extended
}

// FieldTermLocationMap maps a string key to a TermLocationMap. DocumentMatch
// exposes one as its Locations field.
// NOTE(review): the name suggests the key is a field name — confirm against
// callers.
type FieldTermLocationMap map[string]TermLocationMap

// FieldFragmentMap maps a string key to a list of strings. DocumentMatch
// exposes one as its Fragments field.
// NOTE(review): the name suggests field name -> fragments for that field —
// confirm against callers.
type FieldFragmentMap map[string][]string

// DocumentMatch holds the data kept for a single search hit.
//
// Index, ID, Score, Expl, Locations, Fragments, Sort and Fields are encoded
// to JSON under the names given in their struct tags. IndexInternalID,
// CachedFieldTerms, Document and HitNumber are tagged `json:"-"` and are
// therefore left out of the JSON encoding.
type DocumentMatch struct {
	Index           string                `json:"index,omitempty"`
	ID              string                `json:"id"`
	IndexInternalID index.IndexInternalID `json:"-"`
	Score           float64               `json:"score"`
	Expl            *Explanation          `json:"explanation,omitempty"`
	Locations       FieldTermLocationMap  `json:"locations,omitempty"`
	Fragments       FieldFragmentMap      `json:"fragments,omitempty"`
	Sort            []string              `json:"sort,omitempty"`

	// Fields contains the values for document fields listed in
	// SearchRequest.Fields. Text fields are returned as strings, numeric
	// fields as float64s and date fields as time.RFC3339 formatted strings.
	// AddFieldValue stores a []interface{} here when the same field name is
	// added more than once.
	Fields map[string]interface{} `json:"fields,omitempty"`

	// As we learn field terms, we can cache important ones for later use;
	// for example, sorting and building facets need these values.
	CachedFieldTerms index.FieldTerms `json:"-"`

	// If we load the document for this hit, remember it so we don't load it
	// again.
	Document *document.Document `json:"-"`

	// HitNumber is used to maintain natural index order.
	HitNumber uint64 `json:"-"`
}

// AddFieldValue records value under name in dm.Fields, creating the map on
// first use.
//
// The first value added for a name is stored as-is. When the name already
// holds a value, the stored entry becomes a []interface{}: an existing
// []interface{} is appended to, and any other existing value is combined with
// the new one into a two-element slice.
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
	if dm.Fields == nil {
		dm.Fields = make(map[string]interface{})
	}

	prior, seen := dm.Fields[name]
	if !seen {
		// first value for this field: keep it unwrapped
		dm.Fields[name] = value
		return
	}

	switch held := prior.(type) {
	case []interface{}:
		// already multi-valued, extend the existing slice
		dm.Fields[name] = append(held, value)
	default:
		// second value for this field: promote to a slice
		dm.Fields[name] = []interface{}{prior, value}
	}
}

// Reset clears dm so that an already allocated DocumentMatch can be reused,
// and returns dm. Every field is set back to its zero value except
// IndexInternalID and Sort, which keep their existing backing storage
// truncated to length 0.
func (dm *DocumentMatch) Reset() *DocumentMatch {
	// hold on to the slices already allocated for IndexInternalID and Sort,
	// cut down to length 0 so their capacity can be reused
	keptID, keptSort := dm.IndexInternalID[:0], dm.Sort[:0]
	// overwrite with an empty DocumentMatch (idiom, 0 allocations)
	*dm = DocumentMatch{}
	// hand the retained slices back to the cleared struct
	dm.IndexInternalID, dm.Sort = keptID, keptSort
	return dm
}

// String formats the match as "[<IndexInternalID>-<Score>]", with the
// internal ID converted to a string and the score printed using %f.
func (dm *DocumentMatch) String() string {
	internalID := string(dm.IndexInternalID)
	return fmt.Sprintf("[%s-%f]", internalID, dm.Score)
}

// DocumentMatchCollection is a list of DocumentMatch pointers. Its Len, Swap
// and Less methods have the shape required by sort.Interface, and Less orders
// matches with higher scores first.
type DocumentMatchCollection []*DocumentMatch

// Len returns the number of matches in the collection.
func (c DocumentMatchCollection) Len() int {
	return len(c)
}

// Swap exchanges the matches at indexes i and j.
func (c DocumentMatchCollection) Swap(i, j int) {
	held := c[i]
	c[i] = c[j]
	c[j] = held
}

// Less reports whether the match at index i has a strictly higher score than
// the match at index j.
func (c DocumentMatchCollection) Less(i, j int) bool {
	return c[j].Score < c[i].Score
}

// Searcher is the interface through which matches are pulled from a search.
// Next and Advance both receive the SearchContext of the running search and
// return a *DocumentMatch together with an error.
//
// NOTE(review): the per-method notes below are inferred from names and
// signatures only — confirm against the concrete implementations.
type Searcher interface {
	// Next presumably returns the following match, if any.
	Next(ctx *SearchContext) (*DocumentMatch, error)
	// Advance presumably moves forward to the match identified by the given
	// index-internal ID (or the first one after it).
	Advance(ctx *SearchContext, ID index.IndexInternalID) (*DocumentMatch, error)
	// Close presumably releases whatever the searcher holds; it can fail.
	Close() error
	// Weight and SetQueryNorm presumably take part in score computation.
	Weight() float64
	SetQueryNorm(float64)
	// Count presumably reports how many matches the searcher can produce.
	Count() uint64
	// Min: meaning not evident from this file — TODO document.
	Min() int

	// DocumentMatchPoolSize presumably reports how many DocumentMatch
	// instances this searcher needs from the SearchContext's pool.
	DocumentMatchPoolSize() int
}

// SearcherOptions groups two boolean flags, Explain and IncludeTermVectors.
// NOTE(review): presumably Explain asks for score explanations (see
// DocumentMatch.Expl) and IncludeTermVectors asks for location information
// (see DocumentMatch.Locations) — confirm against the searchers that consume
// these options.
type SearcherOptions struct {
	Explain            bool
	IncludeTermVectors bool
}

// SearchContext represents the context around a single search. It is passed
// to Searcher.Next and Searcher.Advance.
type SearchContext struct {
	// DocumentMatchPool is the pool carried along with the search.
	// NOTE(review): presumably it supplies reusable DocumentMatch instances
	// (see DocumentMatch.Reset) — confirm against DocumentMatchPool.
	DocumentMatchPool *DocumentMatchPool
}
initial commit 2014-04-17 22:55:53 +02:00			`// Copyright (c) 2014 Couchbase, Inc.`
nicer formatting of license header 2016-10-02 16:13:14 +02:00			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
add newline between license and package this avoids cluttering godocs with the license 2014-09-02 16:54:50 +02:00
initial commit 2014-04-17 22:55:53 +02:00			`package search`

Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00			`import (`
			`"fmt"`

			`"github.com/blevesearch/bleve/document"`
			`"github.com/blevesearch/bleve/index"`
			`)`
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion) 2016-07-31 19:46:18 +02:00
add support for phrase slop to internals of phrase searcher phrase slop is not yet supported on the frontend added lots of tests around slop 2017-02-09 21:59:51 +01:00			`type ArrayPositions []float64`
initial commit 2014-04-17 22:55:53 +02:00
add support for phrase slop to internals of phrase searcher phrase slop is not yet supported on the frontend added lots of tests around slop 2017-02-09 21:59:51 +01:00			`func (ap ArrayPositions) Equals(other ArrayPositions) bool {`
			`if len(ap) != len(other) {`
fix incorrect results returned by phrase search previously phrase searcher would not validate that consecutive terms were actually occurring in the same array position fixes #292 2015-12-06 21:55:00 +01:00			`return false`
			`}`
add support for phrase slop to internals of phrase searcher phrase slop is not yet supported on the frontend added lots of tests around slop 2017-02-09 21:59:51 +01:00			`for i := range ap {`
			`if ap[i] != other[i] {`
fix incorrect results returned by phrase search previously phrase searcher would not validate that consecutive terms were actually occurring in the same array position fixes #292 2015-12-06 21:55:00 +01:00			`return false`
			`}`
			`}`
			`return true`
			`}`

add support for phrase slop to internals of phrase searcher phrase slop is not yet supported on the frontend added lots of tests around slop 2017-02-09 21:59:51 +01:00			`type Location struct {`
			Pos float64 `json:"pos"`
			Start float64 `json:"start"`
			End float64 `json:"end"`
			ArrayPositions ArrayPositions `json:"array_positions"`
			`}`

initial commit 2014-04-17 22:55:53 +02:00			`type Locations []*Location`

			`type TermLocationMap map[string]Locations`

change higlight api to store in document match 2014-07-03 20:53:44 +02:00			`func (t TermLocationMap) AddLocation(term string, location *Location) {`
simplify TermLocationMap.AddLocation() 2016-10-11 21:14:48 +02:00			`t[term] = append(t[term], location)`
change higlight api to store in document match 2014-07-03 20:53:44 +02:00			`}`

initial commit 2014-04-17 22:55:53 +02:00			`type FieldTermLocationMap map[string]TermLocationMap`

change higlight api to store in document match 2014-07-03 20:53:44 +02:00			`type FieldFragmentMap map[string][]string`

initial commit 2014-04-17 22:55:53 +02:00			`type DocumentMatch struct {`
switch back to single DocumentMatch struct instead of separate DocumentMatch/DocumentMatchInternal rules are simple, everything operates on the IndexInternalID field until the results are returned, then ID is set correctly the IndexInternalID field is not exported to JSON 2016-08-01 20:58:02 +02:00			Index string `json:"index,omitempty"`
			ID string `json:"id"`
			IndexInternalID index.IndexInternalID `json:"-"`
			Score float64 `json:"score"`
			Expl *Explanation `json:"explanation,omitempty"`
			Locations FieldTermLocationMap `json:"locations,omitempty"`
			Fragments FieldFragmentMap `json:"fragments,omitempty"`
improved implementation to address perf regressions primary change is going back to sort values be []string and not []interface{}, this avoid allocatiosn converting into the interface{} that sounds obvious, so why didn't we just do that first? because a common (default) sort is score, which is naturally a number, not a string (like terms). converting into the number was also expensive, and the common case. so, this solution also makes the change to NOT put the score into the sort value list. instead you see the dummy value "_score". this is just a placeholder, the actual sort impl knows that field of the sort is the score, and will sort using the actual score. also, several other aspets of the benchmark were cleaned up so that unnecessary allocations do not pollute the cpu profiles Here are the updated benchmarks: $ go test -run=xxx -bench=. -benchmem -cpuprofile=cpu.out BenchmarkTop10of100000Scores-4 3000 465809 ns/op 2548 B/op 33 allocs/op BenchmarkTop100of100000Scores-4 2000 626488 ns/op 21484 B/op 213 allocs/op BenchmarkTop10of1000000Scores-4 300 5107658 ns/op 2560 B/op 33 allocs/op BenchmarkTop100of1000000Scores-4 300 5275403 ns/op 21624 B/op 213 allocs/op PASS ok github.com/blevesearch/bleve/search/collectors 7.188s Prior to this PR, master reported: $ go test -run=xxx -bench=. -benchmem BenchmarkTop10of100000Scores-4 3000 453269 ns/op 360161 B/op 42 allocs/op BenchmarkTop100of100000Scores-4 2000 519131 ns/op 388275 B/op 219 allocs/op BenchmarkTop10of1000000Scores-4 200 7459004 ns/op 4628236 B/op 52 allocs/op BenchmarkTop100of1000000Scores-4 200 8064864 ns/op 4656596 B/op 232 allocs/op PASS ok github.com/blevesearch/bleve/search/collectors 7.385s So, we're pretty close on the smaller datasets, and we scale better on the larger datasets. We also show fewer allocations and bytes in all cases (some of this is artificial due to test cleanup). 2016-08-25 21:47:07 +02:00			Sort []string `json:"sort,omitempty"`
doc: document field values storage and retrieval 2015-10-04 11:25:58 +02:00
			`// Fields contains the values for document fields listed in`
			`// SearchRequest.Fields. Text fields are returned as strings, numeric`
			`// fields as float64s and date fields as time.RFC3339 formatted strings.`
			Fields map[string]interface{} `json:"fields,omitempty"`
Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00
change sort field impl to use indexed values not stored values 2016-08-17 18:20:12 +02:00			`// as we learn field terms, we can cache important ones for later use`
			`// for example, sorting and building facets need these values`
			CachedFieldTerms index.FieldTerms `json:"-"`

Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00			`// if we load the document for this hit, remember it so we dont load again`
			Document *document.Document `json:"-"`

			`// used to maintain natural index order`
			HitNumber uint64 `json:"-"`
fix storing/retrieving numeric and date fields also includes new ability to request stored fields be returned with results closes #55 and closes #56 and closes #58 2014-08-06 19:52:20 +02:00			`}`

			`func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {`
			`if dm.Fields == nil {`
			`dm.Fields = make(map[string]interface{})`
			`}`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`existingVal, ok := dm.Fields[name]`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`if !ok {`
			`dm.Fields[name] = value`
			`return`
			`}`

			`valSlice, ok := existingVal.([]interface{})`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`if ok {`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`// already a slice, append to it`
			`valSlice = append(valSlice, value)`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`} else {`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`// create a slice`
			`valSlice = []interface{}{existingVal, value}`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`}`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`dm.Fields[name] = valSlice`
initial commit 2014-04-17 22:55:53 +02:00			`}`

document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// Reset allows an already allocated DocumentMatch to be reused`
optimize upside_down reader Next() with doc match reuse This optimization changes the search.Search.Next() interface API, adding an optional, pre-allocated DocumentMatch parameter. When it's non-nil, the TermSearcher and TermQueryScorer will use that pre-allocated DocumentMatch, instead of allocating a brand new DocumentMatch instance. 2016-07-21 01:29:20 +02:00			`func (dm DocumentMatch) Reset() DocumentMatch {`
document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// remember the []byte used for the IndexInternalID`
switch sort impl to use interface this improves perf in the case where we're not doing any sorting as we avoid allocating memory and converting scores into numeric terms 2016-08-25 01:02:22 +02:00			`indexInternalID := dm.IndexInternalID`
			`// remember the []interface{} used for sort`
			`sort := dm.Sort`
document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// idiom to copy over from empty DocumentMatch (0 allocations)`
optimize upside_down reader Next() with doc match reuse This optimization changes the search.Search.Next() interface API, adding an optional, pre-allocated DocumentMatch parameter. When it's non-nil, the TermSearcher and TermQueryScorer will use that pre-allocated DocumentMatch, instead of allocating a brand new DocumentMatch instance. 2016-07-21 01:29:20 +02:00			`*dm = DocumentMatch{}`
document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// reuse the []byte already allocated (and reset len to 0)`
switch sort impl to use interface this improves perf in the case where we're not doing any sorting as we avoid allocating memory and converting scores into numeric terms 2016-08-25 01:02:22 +02:00			`dm.IndexInternalID = indexInternalID[:0]`
			`// reuse the []interface{} already allocated (and reset len to 0)`
			`dm.Sort = sort[:0]`
optimize upside_down reader Next() with doc match reuse This optimization changes the search.Search.Next() interface API, adding an optional, pre-allocated DocumentMatch parameter. When it's non-nil, the TermSearcher and TermQueryScorer will use that pre-allocated DocumentMatch, instead of allocating a brand new DocumentMatch instance. 2016-07-21 01:29:20 +02:00			`return dm`
			`}`

Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00			`func (dm *DocumentMatch) String() string {`
			`return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)`
			`}`

initial commit 2014-04-17 22:55:53 +02:00			`type DocumentMatchCollection []*DocumentMatch`

initial impl of Index Aliases an IndexAlias allows you easily work with one logical Index while changing the actual Index its pointing to behind the scenes Changing which actual Index is backing an IndexAlias can be done atomically so that your application smoothly transitions from one Index to another. A separate use of IndexAlias is allowed when the IndexAlias is defined to point to multiple Indexes. In this case only the Search() operation is supported, but the Search will be run on each of the underlying indexes in parallel, and the results will be merged. 2014-10-29 14:22:11 +01:00			`func (c DocumentMatchCollection) Len() int { return len(c) }`
			`func (c DocumentMatchCollection) Swap(i, j int) { c[i], c[j] = c[j], c[i] }`
			`func (c DocumentMatchCollection) Less(i, j int) bool { return c[i].Score > c[j].Score }`

initial commit 2014-04-17 22:55:53 +02:00			`type Searcher interface {`
refactor search package to reuse DocumentMatch and ID []byte's the motivation for this commit is long and detailed and has been documented externally here: https://gist.github.com/mschoch/5cc5c9cf4669a5fe8512cb7770d3c1a2 the core of the changes are: 1. recognize that collector/searcher need only a fixed number of DocumentMatch instances, and this number can be determined from the structure of the query, not the size of the data 2. knowing this, instances can be allocated in bulk, up front and they can be reused without locking (since all search operations take place in a single goroutine 3. combined with previous commits which enabled reuse of the IndexInternalID []byte, this allows for no allocation/copy of these bytes as well (by using DocumentMatch Reset() method when returning entries to the pool 2016-08-09 04:21:47 +02:00			`Next(ctx SearchContext) (DocumentMatch, error)`
			`Advance(ctx SearchContext, ID index.IndexInternalID) (DocumentMatch, error)`
first pass at checking errors that were ignored part of #169 2015-03-06 20:46:29 +01:00			`Close() error`
initial commit 2014-04-17 22:55:53 +02:00			`Weight() float64`
			`SetQueryNorm(float64)`
			`Count() uint64`
refactor to make all the query classes private 2014-08-30 00:14:12 +02:00			`Min() int`
refactor search package to reuse DocumentMatch and ID []byte's the motivation for this commit is long and detailed and has been documented externally here: https://gist.github.com/mschoch/5cc5c9cf4669a5fe8512cb7770d3c1a2 the core of the changes are: 1. recognize that collector/searcher need only a fixed number of DocumentMatch instances, and this number can be determined from the structure of the query, not the size of the data 2. knowing this, instances can be allocated in bulk, up front and they can be reused without locking (since all search operations take place in a single goroutine 3. combined with previous commits which enabled reuse of the IndexInternalID []byte, this allows for no allocation/copy of these bytes as well (by using DocumentMatch Reset() method when returning entries to the pool 2016-08-09 04:21:47 +02:00
			`DocumentMatchPoolSize() int`
			`}`

API change: optional SearchRequest.IncludeLocations flag This is a change in search result behavior in that location information is no longer provided by default with search results. Although this looks like a wide-ranging change, it's mostly a mechanical replacement of the explain bool flag with a new search.SearcherOptions struct, which holds both the Explain bool flag and the IncludeTermVectors bool flag. 2017-01-06 02:49:45 +01:00			`type SearcherOptions struct {`
			`Explain bool`
			`IncludeTermVectors bool`
			`}`

refactor search package to reuse DocumentMatch and ID []byte's the motivation for this commit is long and detailed and has been documented externally here: https://gist.github.com/mschoch/5cc5c9cf4669a5fe8512cb7770d3c1a2 the core of the changes are: 1. recognize that collector/searcher need only a fixed number of DocumentMatch instances, and this number can be determined from the structure of the query, not the size of the data 2. knowing this, instances can be allocated in bulk, up front and they can be reused without locking (since all search operations take place in a single goroutine 3. combined with previous commits which enabled reuse of the IndexInternalID []byte, this allows for no allocation/copy of these bytes as well (by using DocumentMatch Reset() method when returning entries to the pool 2016-08-09 04:21:47 +02:00			`// SearchContext represents the context around a single search`
			`type SearchContext struct {`
			`DocumentMatchPool *DocumentMatchPool`
initial commit 2014-04-17 22:55:53 +02:00			`}`