7e36109b3c
This API (unexported) will estimate the amount of memory needed to execute a search query over an index before the collector begins data collection. Sample estimates for certain queries: {Size: 10, BenchmarkUpsidedownSearchOverhead} ESTIMATE BENCHMEM TermQuery 4616 4796 MatchQuery 5210 5405 DisjunctionQuery (Match queries) 7700 8447 DisjunctionQuery (Term queries) 6514 6591 ConjunctionQuery (Match queries) 7524 8175 Nested disjunction query (disjunction of disjunctions) 10306 10708 …
287 lines
7.4 KiB
Go
287 lines
7.4 KiB
Go
// Copyright (c) 2017 Couchbase, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package mem
|
|
|
|
import (
|
|
"fmt"
|
|
"reflect"
|
|
|
|
"github.com/RoaringBitmap/roaring"
|
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
|
"github.com/blevesearch/bleve/size"
|
|
)
|
|
|
|
var reflectStaticSizeSegment int
|
|
|
|
func init() {
|
|
var s Segment
|
|
reflectStaticSizeSegment = int(reflect.TypeOf(s).Size())
|
|
}
|
|
|
|
// idFieldID is the field id of the _id field, which is always guaranteed to
// be 0.
const idFieldID = uint16(0)
|
|
|
|
// KNOWN ISSUES
// - LIMITATION - we decide whether or not to store term vectors for a field
//                at the segment level, based on the first definition of a
//                field we see. In normal bleve usage this is fine: all
//                instances of a field definition will be the same. However,
//                advanced users may violate this and provide unique field
//                definitions with each document. This segment does not
//                support this usage.
|
|
|
|
// TODO
// - need better testing of multiple docs, iterating freqs and locations,
//   and verifying the correct results are returned
|
|
|
|
// Segment is an in memory implementation of scorch.Segment
|
|
type Segment struct {
|
|
|
|
// FieldsMap adds 1 to field id to avoid zero value issues
|
|
// name -> field id + 1
|
|
FieldsMap map[string]uint16
|
|
|
|
// FieldsInv is the inverse of FieldsMap
|
|
// field id -> name
|
|
FieldsInv []string
|
|
|
|
// Term dictionaries for each field
|
|
// field id -> term -> postings list id + 1
|
|
Dicts []map[string]uint64
|
|
|
|
// Terms for each field, where terms are sorted ascending
|
|
// field id -> []term
|
|
DictKeys [][]string
|
|
|
|
// Postings list
|
|
// postings list id -> bitmap by docNum
|
|
Postings []*roaring.Bitmap
|
|
|
|
// Postings list has locations
|
|
PostingsLocs []*roaring.Bitmap
|
|
|
|
// Term frequencies
|
|
// postings list id -> Freqs (one for each hit in bitmap)
|
|
Freqs [][]uint64
|
|
|
|
// Field norms
|
|
// postings list id -> Norms (one for each hit in bitmap)
|
|
Norms [][]float32
|
|
|
|
// Field/start/end/pos/locarraypos
|
|
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
|
Locfields [][]uint16
|
|
Locstarts [][]uint64
|
|
Locends [][]uint64
|
|
Locpos [][]uint64
|
|
Locarraypos [][][]uint64
|
|
|
|
// Stored field values
|
|
// docNum -> field id -> slice of values (each value []byte)
|
|
Stored []map[uint16][][]byte
|
|
|
|
// Stored field types
|
|
// docNum -> field id -> slice of types (each type byte)
|
|
StoredTypes []map[uint16][]byte
|
|
|
|
// Stored field array positions
|
|
// docNum -> field id -> slice of array positions (each is []uint64)
|
|
StoredPos []map[uint16][][]uint64
|
|
|
|
// For storing the docValue persisted fields
|
|
DocValueFields map[uint16]bool
|
|
|
|
// Footprint of the segment, updated when analyzed document mutations
|
|
// are added into the segment
|
|
sizeInBytes int
|
|
}
|
|
|
|
// New builds a new empty Segment
|
|
func New() *Segment {
|
|
return &Segment{
|
|
FieldsMap: map[string]uint16{},
|
|
DocValueFields: map[uint16]bool{},
|
|
}
|
|
}
|
|
|
|
func (s *Segment) updateSize() {
|
|
sizeInBytes := reflectStaticSizeSegment
|
|
|
|
// FieldsMap, FieldsInv
|
|
for k, _ := range s.FieldsMap {
|
|
sizeInBytes += (len(k)+size.SizeOfString)*2 +
|
|
size.SizeOfUint16
|
|
}
|
|
|
|
// Dicts, DictKeys
|
|
for _, entry := range s.Dicts {
|
|
for k, _ := range entry {
|
|
sizeInBytes += (len(k)+size.SizeOfString)*2 +
|
|
size.SizeOfUint64
|
|
}
|
|
// overhead from the data structures
|
|
sizeInBytes += (size.SizeOfMap + size.SizeOfSlice)
|
|
}
|
|
|
|
// Postings, PostingsLocs
|
|
for i := 0; i < len(s.Postings); i++ {
|
|
sizeInBytes += (int(s.Postings[i].GetSizeInBytes()) + size.SizeOfPtr) +
|
|
(int(s.PostingsLocs[i].GetSizeInBytes()) + size.SizeOfPtr)
|
|
}
|
|
|
|
// Freqs, Norms
|
|
for i := 0; i < len(s.Freqs); i++ {
|
|
sizeInBytes += (len(s.Freqs[i])*size.SizeOfUint64 +
|
|
len(s.Norms[i])*size.SizeOfFloat32) +
|
|
(size.SizeOfSlice * 2)
|
|
}
|
|
|
|
// Location data
|
|
for i := 0; i < len(s.Locfields); i++ {
|
|
sizeInBytes += len(s.Locfields[i])*size.SizeOfUint16 +
|
|
len(s.Locstarts[i])*size.SizeOfUint64 +
|
|
len(s.Locends[i])*size.SizeOfUint64 +
|
|
len(s.Locpos[i])*size.SizeOfUint64
|
|
|
|
for j := 0; j < len(s.Locarraypos[i]); j++ {
|
|
sizeInBytes += len(s.Locarraypos[i][j])*size.SizeOfUint64 +
|
|
size.SizeOfSlice
|
|
}
|
|
|
|
sizeInBytes += (size.SizeOfSlice * 5)
|
|
}
|
|
|
|
// Stored data
|
|
for i := 0; i < len(s.Stored); i++ {
|
|
for _, v := range s.Stored[i] {
|
|
sizeInBytes += size.SizeOfUint16
|
|
for _, arr := range v {
|
|
sizeInBytes += len(arr) + size.SizeOfSlice
|
|
}
|
|
sizeInBytes += size.SizeOfSlice
|
|
}
|
|
|
|
for _, v := range s.StoredTypes[i] {
|
|
sizeInBytes += size.SizeOfUint16 + len(v) + size.SizeOfSlice
|
|
}
|
|
|
|
for _, v := range s.StoredPos[i] {
|
|
sizeInBytes += size.SizeOfUint16
|
|
for _, arr := range v {
|
|
sizeInBytes += len(arr)*size.SizeOfUint64 +
|
|
size.SizeOfSlice
|
|
}
|
|
sizeInBytes += size.SizeOfSlice
|
|
}
|
|
|
|
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
|
sizeInBytes += (size.SizeOfMap * 3)
|
|
}
|
|
|
|
// DocValueFields
|
|
sizeInBytes += len(s.DocValueFields) * (size.SizeOfUint16 + size.SizeOfBool)
|
|
|
|
s.sizeInBytes = sizeInBytes
|
|
}
|
|
|
|
func (s *Segment) Size() int {
|
|
return s.sizeInBytes
|
|
}
|
|
|
|
func (s *Segment) AddRef() {
|
|
}
|
|
|
|
func (s *Segment) DecRef() error {
|
|
return nil
|
|
}
|
|
|
|
// Fields returns the field names used in this segment
|
|
func (s *Segment) Fields() []string {
|
|
return s.FieldsInv
|
|
}
|
|
|
|
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
|
// for the specified doc number
|
|
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
|
// ensure document number exists
|
|
if int(num) > len(s.Stored)-1 {
|
|
return nil
|
|
}
|
|
docFields := s.Stored[int(num)]
|
|
st := s.StoredTypes[int(num)]
|
|
sp := s.StoredPos[int(num)]
|
|
for field, values := range docFields {
|
|
for i, value := range values {
|
|
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
|
|
if !keepGoing {
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Segment) getField(name string) (int, error) {
|
|
fieldID, ok := s.FieldsMap[name]
|
|
if !ok {
|
|
return 0, fmt.Errorf("no field named %s", name)
|
|
}
|
|
return int(fieldID - 1), nil
|
|
}
|
|
|
|
// Dictionary returns the term dictionary for the specified field
|
|
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
|
|
fieldID, err := s.getField(field)
|
|
if err != nil {
|
|
// no such field, return empty dictionary
|
|
return &segment.EmptyDictionary{}, nil
|
|
}
|
|
return &Dictionary{
|
|
segment: s,
|
|
field: field,
|
|
fieldID: uint16(fieldID),
|
|
}, nil
|
|
}
|
|
|
|
// Count returns the number of documents in this segment
|
|
// (this has no notion of deleted docs)
|
|
func (s *Segment) Count() uint64 {
|
|
return uint64(len(s.Stored))
|
|
}
|
|
|
|
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
|
// provided _id strings
|
|
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
|
rv := roaring.New()
|
|
|
|
// guard against empty segment
|
|
if len(s.FieldsMap) > 0 {
|
|
idDictionary := s.Dicts[idFieldID]
|
|
|
|
for _, id := range ids {
|
|
postingID := idDictionary[id]
|
|
if postingID > 0 {
|
|
rv.Or(s.Postings[postingID-1])
|
|
}
|
|
}
|
|
}
|
|
return rv, nil
|
|
}
|
|
|
|
// Close releases all resources associated with this segment
|
|
func (s *Segment) Close() error {
|
|
return nil
|
|
}
|