0
0
Fork 0
bleve/search/searcher/search_disjunction.go

310 lines
7.1 KiB
Go
Raw Normal View History

2014-04-25 17:31:28 +02:00
// Copyright (c) 2014 Couchbase, Inc.
2016-10-02 16:13:14 +02:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package searcher
2014-04-25 17:31:28 +02:00
import (
"fmt"
2014-04-25 17:31:28 +02:00
"math"
"reflect"
2014-04-25 17:31:28 +02:00
"sort"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/scorer"
"github.com/blevesearch/bleve/size"
2014-04-25 17:31:28 +02:00
)
// reflectStaticSizeDisjunctionSearcher caches the shallow size in bytes of a
// DisjunctionSearcher value, as reported by the reflect package. It is
// computed once in init and read by Size.
var reflectStaticSizeDisjunctionSearcher int

func init() {
	staticSize := reflect.TypeOf(DisjunctionSearcher{}).Size()
	reflectStaticSizeDisjunctionSearcher = int(staticSize)
}
// DisjunctionMaxClauseCount is a package-level setting that applications can
// adjust to a non-zero value to cause construction of a DisjunctionSearcher
// to return an error, instead of executing the search, when the number of
// clauses exceeds this value. The default of 0 disables the check.
var DisjunctionMaxClauseCount = 0
type DisjunctionSearcher struct {
indexReader index.IndexReader
searchers OrderedSearcherList
numSearchers int
queryNorm float64
currs []*search.DocumentMatch
scorer *scorer.DisjunctionQueryScorer
min int
matching []*search.DocumentMatch
matchingIdxs []int
initialized bool
2014-04-25 17:31:28 +02:00
}
// tooManyClauses reports whether count exceeds DisjunctionMaxClauseCount.
// A DisjunctionMaxClauseCount of 0 disables the limit, so the result is
// always false in that case.
func tooManyClauses(count int) bool {
	limit := DisjunctionMaxClauseCount
	return limit != 0 && count > limit
}
// tooManyClausesErr builds the error returned when a disjunction has more
// clauses than DisjunctionMaxClauseCount allows; the message includes the
// currently configured limit.
func tooManyClausesErr() error {
	limit := DisjunctionMaxClauseCount
	return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]", limit)
}
// NewDisjunctionSearcher builds a DisjunctionSearcher over qsearchers with
// the DisjunctionMaxClauseCount limit enforced. It returns an error when the
// number of qsearchers exceeds a non-zero DisjunctionMaxClauseCount.
func NewDisjunctionSearcher(indexReader index.IndexReader,
	qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
	*DisjunctionSearcher, error) {
	// the exported constructor always applies the clause-count limit
	const enforceClauseLimit = true
	return newDisjunctionSearcher(indexReader, qsearchers, min, options, enforceClauseLimit)
}
func newDisjunctionSearcher(indexReader index.IndexReader,
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
limit bool) (
*DisjunctionSearcher, error) {
if limit && tooManyClauses(len(qsearchers)) {
return nil, tooManyClausesErr()
}
2014-12-18 18:43:12 +01:00
// build the downstream searchers
searchers := make(OrderedSearcherList, len(qsearchers))
for i, searcher := range qsearchers {
2014-04-25 17:31:28 +02:00
searchers[i] = searcher
}
// sort the searchers
sort.Sort(sort.Reverse(searchers))
// build our searcher
rv := DisjunctionSearcher{
indexReader: indexReader,
searchers: searchers,
numSearchers: len(searchers),
currs: make([]*search.DocumentMatch, len(searchers)),
scorer: scorer.NewDisjunctionQueryScorer(options),
min: int(min),
matching: make([]*search.DocumentMatch, len(searchers)),
matchingIdxs: make([]int, len(searchers)),
2014-04-25 17:31:28 +02:00
}
rv.computeQueryNorm()
return &rv, nil
}
// Size returns an estimate, in bytes, of the memory held by this searcher:
// the static struct size, one pointer, the index reader, the scorer, every
// child searcher, every non-nil entry of currs and matching, and the
// matchingIdxs slice contents.
func (s *DisjunctionSearcher) Size() int {
	total := reflectStaticSizeDisjunctionSearcher + size.SizeOfPtr
	total += s.indexReader.Size()
	total += s.scorer.Size()

	for i := range s.searchers {
		total += s.searchers[i].Size()
	}

	for _, dm := range s.currs {
		if dm == nil {
			continue
		}
		total += dm.Size()
	}

	for _, dm := range s.matching {
		if dm == nil {
			continue
		}
		total += dm.Size()
	}

	return total + len(s.matchingIdxs)*size.SizeOfInt
}
func (s *DisjunctionSearcher) computeQueryNorm() {
2014-04-25 17:31:28 +02:00
// first calculate sum of squared weights
sumOfSquaredWeights := 0.0
for _, searcher := range s.searchers {
sumOfSquaredWeights += searcher.Weight()
2014-04-25 17:31:28 +02:00
}
// now compute query norm from this
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
2014-12-18 18:43:12 +01:00
// finally tell all the downstream searchers the norm
for _, searcher := range s.searchers {
searcher.SetQueryNorm(s.queryNorm)
2014-04-25 17:31:28 +02:00
}
}
func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
2014-04-25 17:31:28 +02:00
var err error
// get all searchers pointing at their first match
for i, searcher := range s.searchers {
if s.currs[i] != nil {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Next(ctx)
2014-04-25 17:31:28 +02:00
if err != nil {
return err
}
}
err = s.updateMatches()
if err != nil {
return err
}
s.initialized = true
2014-04-25 17:31:28 +02:00
return nil
}
func (s *DisjunctionSearcher) updateMatches() error {
matching := s.matching[:0]
matchingIdxs := s.matchingIdxs[:0]
for i := 0; i < len(s.currs); i++ {
curr := s.currs[i]
if curr == nil {
continue
2014-04-25 17:31:28 +02:00
}
if len(matching) > 0 {
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
if cmp > 0 {
continue
}
if cmp < 0 {
matching = matching[:0]
matchingIdxs = matchingIdxs[:0]
}
}
matching = append(matching, curr)
matchingIdxs = append(matchingIdxs, i)
2014-04-25 17:31:28 +02:00
}
s.matching = matching
s.matchingIdxs = matchingIdxs
return nil
2014-04-25 17:31:28 +02:00
}
func (s *DisjunctionSearcher) Weight() float64 {
2014-04-25 17:31:28 +02:00
var rv float64
for _, searcher := range s.searchers {
rv += searcher.Weight()
}
return rv
}
func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) {
2014-04-25 17:31:28 +02:00
for _, searcher := range s.searchers {
searcher.SetQueryNorm(qnorm)
}
}
func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) (
*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
2014-04-25 17:31:28 +02:00
var err error
var rv *search.DocumentMatch
2014-04-25 17:31:28 +02:00
found := false
for !found && len(s.matching) > 0 {
if len(s.matching) >= s.min {
2014-04-25 17:31:28 +02:00
found = true
// score this match
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
2014-04-25 17:31:28 +02:00
}
// invoke next on all the matching searchers
for _, i := range s.matchingIdxs {
searcher := s.searchers[i]
if s.currs[i] != rv {
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Next(ctx)
if err != nil {
return nil, err
2014-04-25 17:31:28 +02:00
}
}
err = s.updateMatches()
if err != nil {
return nil, err
}
2014-04-25 17:31:28 +02:00
}
return rv, nil
}
func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext,
ID index.IndexInternalID) (*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)
if err != nil {
return nil, err
}
}
2014-04-25 17:31:28 +02:00
// get all searchers pointing at their first match
var err error
for i, searcher := range s.searchers {
if s.currs[i] != nil {
scorch fix disjunction searcher Advance() Found with "versus" test (TestScorchVersusUpsideDownBoltSmallMNSAM), which had a boolean query with a MustNot that was the same as the Must parameters. This replicates a situation found by Aruna/Mihir/testrunner/RQG (MB-27291). Example: "query": { "must_not": {"disjuncts": [ {"field": "body", "match": "hello"} ]}, "must": {"conjuncts": [ {"field": "body", "match": "hello"} ]} } The nested searchers along the MustNot pathway would end up looking roughly like... booleanSearcher MustNot => disjunctionSearcher => disjunctionSearcher => termSearcher On the first Next() call by the collector, the two disjunction searchers would run through their respective Next() method processing, which includes their initSearcher() processing on the first time. This has the effect of driving the leaf termSearcher through two Next() invocations. That is, if there were 3 docs (doc-1, doc-2, doc-3), the leaf termSearcher would at this point have moved to point to doc-3, while the topmost MustNot would have received doc-1. Next, the booleanSearcher's Must searcher would produce doc-2, so the booleanSearcher would try to Advance() the MustNot searcher to doc-2. But, in scorch, the leafmost termSearcher had already gotten past doc-2 and would return its doc-3. In upsidedown, in contrast, the leaf termSearcher would then drive the KVStore iterator with a Seek(doc-2), and the KVStore iterator would perform a backwards seek to reach doc-2. In scorch, however, backwards iteration seeking isn't supported. So, this fix checks the state of the disjunction searcher to see if we already have the necessary state so that we don't have to perform actual Advance()'es on the underlying searchers. This not only fixes the behavior w.r.t. scorch, but also can have an effect of potentially making upsidedown slightly faster as we're avoiding some backwards KVStore iterator seeks.
2017-12-22 02:49:55 +01:00
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
continue
}
ctx.DocumentMatchPool.Put(s.currs[i])
}
s.currs[i], err = searcher.Advance(ctx, ID)
2014-04-25 17:31:28 +02:00
if err != nil {
return nil, err
}
}
err = s.updateMatches()
if err != nil {
return nil, err
}
2014-04-25 17:31:28 +02:00
return s.Next(ctx)
2014-04-25 17:31:28 +02:00
}
func (s *DisjunctionSearcher) Count() uint64 {
2014-04-25 17:31:28 +02:00
// for now return a worst case
2014-09-04 00:47:02 +02:00
var sum uint64
2014-04-25 17:31:28 +02:00
for _, searcher := range s.searchers {
sum += searcher.Count()
}
return sum
}
func (s *DisjunctionSearcher) Close() (rv error) {
2014-04-25 17:31:28 +02:00
for _, searcher := range s.searchers {
err := searcher.Close()
if err != nil && rv == nil {
rv = err
}
2014-04-25 17:31:28 +02:00
}
return rv
2014-04-25 17:31:28 +02:00
}
// Min returns the minimum number of child searchers that must match a
// document for Next to return it.
func (s *DisjunctionSearcher) Min() int {
	return s.min
}
// DocumentMatchPoolSize returns the number of DocumentMatch slots this
// searcher needs: one per entry of currs, plus whatever each child searcher
// reports for itself.
func (s *DisjunctionSearcher) DocumentMatchPoolSize() int {
	total := len(s.currs)
	for i := range s.searchers {
		total += s.searchers[i].DocumentMatchPoolSize()
	}
	return total
}