bleve/search/search.go

//  Copyright (c) 2014 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.

package search

import (
	"fmt"

	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
)

// Location describes one location that can be recorded for a term in a
// TermLocationMap. Every numeric member is a float64 and is serialized to
// JSON under the lower-case key given in its struct tag.
//
// NOTE(review): Pos is presumably the ordinal position of the term and
// Start/End its offsets within the field value — confirm against the code
// that populates these members. ArrayPositions is what SameArrayElement
// compares; presumably it holds the index path of the array element the
// term was found in — confirm.
type Location struct {
	Pos            float64   `json:"pos"`
	Start          float64   `json:"start"`
	End            float64   `json:"end"`
	ArrayPositions []float64 `json:"array_positions"`
}

// SameArrayElement reports whether l and other point to the same array
// element, that is, whether their ArrayPositions hold equal values in the
// same order. Two locations whose ArrayPositions are both empty are
// considered to point to the same element.
func (l *Location) SameArrayElement(other *Location) bool {
	mine, theirs := l.ArrayPositions, other.ArrayPositions
	// differing depth can never be the same element
	if len(mine) != len(theirs) {
		return false
	}
	for idx := 0; idx < len(mine); idx++ {
		if mine[idx] != theirs[idx] {
			return false
		}
	}
	return true
}

// Locations is an ordered list of Location pointers.
type Locations []*Location

// TermLocationMap groups Locations by term.
type TermLocationMap map[string]Locations

// AddLocation appends location to the list recorded for term. A term that
// has no entry yet gets a new single-element list.
func (t TermLocationMap) AddLocation(term string, location *Location) {
	if known, found := t[term]; found {
		t[term] = append(known, location)
		return
	}
	// first location seen for this term
	t[term] = Locations{location}
}

// FieldTermLocationMap maps a string key to a TermLocationMap. It is the
// type of DocumentMatch.Locations.
// NOTE(review): the key is presumably a field name — confirm.
type FieldTermLocationMap map[string]TermLocationMap

// FieldFragmentMap maps a string key to a list of strings. It is the type
// of DocumentMatch.Fragments.
// NOTE(review): presumably field name -> highlighted fragments — confirm.
type FieldFragmentMap map[string][]string

// DocumentMatch holds the data kept for one search hit. Members tagged
// `json:"-"` (IndexInternalID, CachedFieldTerms, Document and HitNumber)
// are left out of the JSON encoding; the remaining members are serialized
// under the keys given in their struct tags.
type DocumentMatch struct {
	Index           string                `json:"index,omitempty"`
	ID              string                `json:"id"`
	IndexInternalID index.IndexInternalID `json:"-"`
	Score           float64               `json:"score"`
	Expl            *Explanation          `json:"explanation,omitempty"`
	Locations       FieldTermLocationMap  `json:"locations,omitempty"`
	Fragments       FieldFragmentMap      `json:"fragments,omitempty"`
	Sort            []string              `json:"sort,omitempty"`

	// Fields contains the values for document fields listed in
	// SearchRequest.Fields. Text fields are returned as strings, numeric
	// fields as float64s and date fields as time.RFC3339 formatted strings.
	Fields map[string]interface{} `json:"fields,omitempty"`

	// as we learn field terms, we can cache important ones for later use
	// for example, sorting and building facets need these values
	CachedFieldTerms index.FieldTerms `json:"-"`

	// if we load the document for this hit, remember it so we don't load it again
	Document *document.Document `json:"-"`

	// used to maintain natural index order
	HitNumber uint64 `json:"-"`
}

// AddFieldValue records value under name in dm.Fields, allocating the map
// on first use. The first value seen for a name is stored as-is. When the
// name already has a value the entry becomes a []interface{}: an existing
// []interface{} is extended with value, anything else is paired with value
// in a new two-element slice.
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
	if dm.Fields == nil {
		dm.Fields = map[string]interface{}{}
	}

	current, present := dm.Fields[name]
	if !present {
		dm.Fields[name] = value
		return
	}

	switch prior := current.(type) {
	case []interface{}:
		// multi-valued already, grow the list
		dm.Fields[name] = append(prior, value)
	default:
		// second value for this name, promote to a list
		dm.Fields[name] = []interface{}{current, value}
	}
}

// Reset clears dm so that an already allocated DocumentMatch can be reused.
// Every member returns to its zero value except IndexInternalID, which
// keeps the slice it already had, truncated to length 0. The receiver is
// returned.
func (dm *DocumentMatch) Reset() *DocumentMatch {
	// hang on to the existing IndexInternalID slice, emptied
	keptID := dm.IndexInternalID[:0]
	// overwrite everything else with zero values in a single assignment
	*dm = DocumentMatch{IndexInternalID: keptID}
	return dm
}

// String formats the match as "[<IndexInternalID>-<Score>]", with the
// internal ID converted to a string and the score printed using %f.
func (dm *DocumentMatch) String() string {
	internalID := string(dm.IndexInternalID)
	return fmt.Sprintf("[%s-%f]", internalID, dm.Score)
}

// DocumentMatchCollection is a list of DocumentMatch pointers. Its Len,
// Swap and Less methods have the signatures required by sort.Interface,
// with Less ordering matches by descending Score.
type DocumentMatchCollection []*DocumentMatch

// Len returns the number of matches in the collection.
func (c DocumentMatchCollection) Len() int {
	return len(c)
}

// Swap exchanges the matches at positions i and j.
func (c DocumentMatchCollection) Swap(i, j int) {
	c[j], c[i] = c[i], c[j]
}

// Less reports whether the match at i has a strictly higher Score than the
// match at j, which puts the highest scores first when sorting.
func (c DocumentMatchCollection) Less(i, j int) bool {
	return c[j].Score < c[i].Score
}

// Searcher is the interface through which document matches are obtained.
// Next and Advance both take the SearchContext of the running search and
// return a *DocumentMatch together with an error; Advance additionally
// takes an index.IndexInternalID.
//
// NOTE(review): the iteration contract (what Next returns once exhausted,
// whether Advance seeks to the first match at or after ID) is not visible
// in this file — confirm against the implementations before relying on it.
type Searcher interface {
	Next(ctx *SearchContext) (*DocumentMatch, error)
	Advance(ctx *SearchContext, ID index.IndexInternalID) (*DocumentMatch, error)
	Close() error
	Weight() float64
	SetQueryNorm(float64)
	Count() uint64
	Min() int

	// NOTE(review): presumably the number of DocumentMatch instances this
	// searcher needs from the SearchContext's DocumentMatchPool — confirm.
	DocumentMatchPoolSize() int
}

// SearchContext represents the context around a single search
type SearchContext struct {
	// DocumentMatchPool is the pool carried along with this search.
	// NOTE(review): presumably where searchers obtain and return reusable
	// DocumentMatch instances — confirm against DocumentMatchPool.
	DocumentMatchPool *DocumentMatchPool
}
initial commit 2014-04-17 22:55:53 +02:00			`// Copyright (c) 2014 Couchbase, Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file`
			`// except in compliance with the License. You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software distributed under the`
			`// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,`
			`// either express or implied. See the License for the specific language governing permissions`
			`// and limitations under the License.`
add newline between license and package this avoids cluttering godocs with the license 2014-09-02 16:54:50 +02:00
initial commit 2014-04-17 22:55:53 +02:00			`package search`

Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00			`import (`
			`"fmt"`

			`"github.com/blevesearch/bleve/document"`
			`"github.com/blevesearch/bleve/index"`
			`)`
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion) 2016-07-31 19:46:18 +02:00
initial commit 2014-04-17 22:55:53 +02:00			`type Location struct {`
update to correctly work with composite fields also updated search results to return array positions 2015-07-31 17:16:11 +02:00			Pos float64 `json:"pos"`
			Start float64 `json:"start"`
			End float64 `json:"end"`
			ArrayPositions []float64 `json:"array_positions"`
initial commit 2014-04-17 22:55:53 +02:00			`}`

fix incorrect results returned by phrase search previously phrase searcher would not validate that consecutive terms were actually occurring in the same array position fixes #292 2015-12-06 21:55:00 +01:00			`// SameArrayElement returns true if two locations are point to`
			`// the same array element`
			`func (l Location) SameArrayElement(other Location) bool {`
			`if len(l.ArrayPositions) != len(other.ArrayPositions) {`
			`return false`
			`}`
			`for i, elem := range l.ArrayPositions {`
			`if other.ArrayPositions[i] != elem {`
			`return false`
			`}`
			`}`
			`return true`
			`}`

initial commit 2014-04-17 22:55:53 +02:00			`type Locations []*Location`

			`type TermLocationMap map[string]Locations`

change higlight api to store in document match 2014-07-03 20:53:44 +02:00			`func (t TermLocationMap) AddLocation(term string, location *Location) {`
			`existingLocations, exists := t[term]`
			`if exists {`
			`existingLocations = append(existingLocations, location)`
			`t[term] = existingLocations`
			`} else {`
			`locations := make(Locations, 1)`
			`locations[0] = location`
			`t[term] = locations`
			`}`
			`}`

initial commit 2014-04-17 22:55:53 +02:00			`type FieldTermLocationMap map[string]TermLocationMap`

change higlight api to store in document match 2014-07-03 20:53:44 +02:00			`type FieldFragmentMap map[string][]string`

initial commit 2014-04-17 22:55:53 +02:00			`type DocumentMatch struct {`
switch back to single DocumentMatch struct instead of separate DocumentMatch/DocumentMatchInternal rules are simple, everything operates on the IndexInternalID field until the results are returned, then ID is set correctly the IndexInternalID field is not exported to JSON 2016-08-01 20:58:02 +02:00			Index string `json:"index,omitempty"`
			ID string `json:"id"`
			IndexInternalID index.IndexInternalID `json:"-"`
			Score float64 `json:"score"`
			Expl *Explanation `json:"explanation,omitempty"`
			Locations FieldTermLocationMap `json:"locations,omitempty"`
			Fragments FieldFragmentMap `json:"fragments,omitempty"`
adjust new sort functionality to also work with MultiSearch 2016-08-24 20:07:10 +02:00			Sort []string `json:"sort,omitempty"`
doc: document field values storage and retrieval 2015-10-04 11:25:58 +02:00
			`// Fields contains the values for document fields listed in`
			`// SearchRequest.Fields. Text fields are returned as strings, numeric`
			`// fields as float64s and date fields as time.RFC3339 formatted strings.`
			Fields map[string]interface{} `json:"fields,omitempty"`
Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00
change sort field impl to use indexed values not stored values 2016-08-17 18:20:12 +02:00			`// as we learn field terms, we can cache important ones for later use`
			`// for example, sorting and building facets need these values`
			CachedFieldTerms index.FieldTerms `json:"-"`

Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00			`// if we load the document for this hit, remember it so we dont load again`
			Document *document.Document `json:"-"`

			`// used to maintain natural index order`
			HitNumber uint64 `json:"-"`
fix storing/retrieving numeric and date fields also includes new ability to request stored fields be returned with results closes #55 and closes #56 and closes #58 2014-08-06 19:52:20 +02:00			`}`

			`func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {`
			`if dm.Fields == nil {`
			`dm.Fields = make(map[string]interface{})`
			`}`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`existingVal, ok := dm.Fields[name]`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`if !ok {`
			`dm.Fields[name] = value`
			`return`
			`}`

			`valSlice, ok := existingVal.([]interface{})`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`if ok {`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`// already a slice, append to it`
			`valSlice = append(valSlice, value)`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`} else {`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`// create a slice`
			`valSlice = []interface{}{existingVal, value}`
properly return multi-value fields in an array 2014-11-19 21:55:09 +01:00			`}`
Refactor AddFieldValue method Removing one level of nesting makes the method easier to read. 2015-09-21 20:46:33 +02:00			`dm.Fields[name] = valSlice`
initial commit 2014-04-17 22:55:53 +02:00			`}`

document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// Reset allows an already allocated DocumentMatch to be reused`
optimize upside_down reader Next() with doc match reuse This optimization changes the search.Search.Next() interface API, adding an optional, pre-allocated DocumentMatch parameter. When it's non-nil, the TermSearcher and TermQueryScorer will use that pre-allocated DocumentMatch, instead of allocating a brand new DocumentMatch instance. 2016-07-21 01:29:20 +02:00			`func (dm DocumentMatch) Reset() DocumentMatch {`
document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// remember the []byte used for the IndexInternalID`
updated attempt to reuse []byte previous attempt was flawed (but maked by Reset() method) new approach is to do this work in the Reset() method itself, logically this is where it belongs. but further we acknowledge that IndexInternalID []byte lifetime lives beyond the TermFieldDoc, so another copy is made into the DocumentMatch. Although this introduces yet another copy the theory being tested is that it allows each of these structuress to reuse memory without additional allocation. 2016-08-03 23:01:27 +02:00			`indexInternalId := dm.IndexInternalID`
document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// idiom to copy over from empty DocumentMatch (0 allocations)`
optimize upside_down reader Next() with doc match reuse This optimization changes the search.Search.Next() interface API, adding an optional, pre-allocated DocumentMatch parameter. When it's non-nil, the TermSearcher and TermQueryScorer will use that pre-allocated DocumentMatch, instead of allocating a brand new DocumentMatch instance. 2016-07-21 01:29:20 +02:00			`*dm = DocumentMatch{}`
document Reset behavior as its non-obvious 2016-08-03 23:16:15 +02:00			`// reuse the []byte already allocated (and reset len to 0)`
updated attempt to reuse []byte previous attempt was flawed (but maked by Reset() method) new approach is to do this work in the Reset() method itself, logically this is where it belongs. but further we acknowledge that IndexInternalID []byte lifetime lives beyond the TermFieldDoc, so another copy is made into the DocumentMatch. Although this introduces yet another copy the theory being tested is that it allows each of these structuress to reuse memory without additional allocation. 2016-08-03 23:01:27 +02:00			`dm.IndexInternalID = indexInternalId[:0]`
optimize upside_down reader Next() with doc match reuse This optimization changes the search.Search.Next() interface API, adding an optional, pre-allocated DocumentMatch parameter. When it's non-nil, the TermSearcher and TermQueryScorer will use that pre-allocated DocumentMatch, instead of allocating a brand new DocumentMatch instance. 2016-07-21 01:29:20 +02:00			`return dm`
			`}`

Merge branch 'master' of https://github.com/dtylman/bleve into sort-by-field-try2 2016-08-12 20:23:55 +02:00			`func (dm *DocumentMatch) String() string {`
			`return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)`
			`}`

initial commit 2014-04-17 22:55:53 +02:00			`type DocumentMatchCollection []*DocumentMatch`

initial impl of Index Aliases an IndexAlias allows you easily work with one logical Index while changing the actual Index its pointing to behind the scenes Changing which actual Index is backing an IndexAlias can be done atomically so that your application smoothly transitions from one Index to another. A separate use of IndexAlias is allowed when the IndexAlias is defined to point to multiple Indexes. In this case only the Search() operation is supported, but the Search will be run on each of the underlying indexes in parallel, and the results will be merged. 2014-10-29 14:22:11 +01:00			`func (c DocumentMatchCollection) Len() int { return len(c) }`
			`func (c DocumentMatchCollection) Swap(i, j int) { c[i], c[j] = c[j], c[i] }`
			`func (c DocumentMatchCollection) Less(i, j int) bool { return c[i].Score > c[j].Score }`

initial commit 2014-04-17 22:55:53 +02:00			`type Searcher interface {`
refactor search package to reuse DocumentMatch and ID []byte's the motivation for this commit is long and detailed and has been documented externally here: https://gist.github.com/mschoch/5cc5c9cf4669a5fe8512cb7770d3c1a2 the core of the changes are: 1. recognize that collector/searcher need only a fixed number of DocumentMatch instances, and this number can be determined from the structure of the query, not the size of the data 2. knowing this, instances can be allocated in bulk, up front and they can be reused without locking (since all search operations take place in a single goroutine 3. combined with previous commits which enabled reuse of the IndexInternalID []byte, this allows for no allocation/copy of these bytes as well (by using DocumentMatch Reset() method when returning entries to the pool 2016-08-09 04:21:47 +02:00			`Next(ctx SearchContext) (DocumentMatch, error)`
			`Advance(ctx SearchContext, ID index.IndexInternalID) (DocumentMatch, error)`
first pass at checking errors that were ignored part of #169 2015-03-06 20:46:29 +01:00			`Close() error`
initial commit 2014-04-17 22:55:53 +02:00			`Weight() float64`
			`SetQueryNorm(float64)`
			`Count() uint64`
refactor to make all the query classes private 2014-08-30 00:14:12 +02:00			`Min() int`
refactor search package to reuse DocumentMatch and ID []byte's the motivation for this commit is long and detailed and has been documented externally here: https://gist.github.com/mschoch/5cc5c9cf4669a5fe8512cb7770d3c1a2 the core of the changes are: 1. recognize that collector/searcher need only a fixed number of DocumentMatch instances, and this number can be determined from the structure of the query, not the size of the data 2. knowing this, instances can be allocated in bulk, up front and they can be reused without locking (since all search operations take place in a single goroutine 3. combined with previous commits which enabled reuse of the IndexInternalID []byte, this allows for no allocation/copy of these bytes as well (by using DocumentMatch Reset() method when returning entries to the pool 2016-08-09 04:21:47 +02:00
			`DocumentMatchPoolSize() int`
			`}`

			`// SearchContext represents the context around a single search`
			`type SearchContext struct {`
			`DocumentMatchPool *DocumentMatchPool`
initial commit 2014-04-17 22:55:53 +02:00			`}`