2017-12-01 21:42:50 +01:00
|
|
|
// Copyright (c) 2017 Couchbase, Inc.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2017-09-29 18:42:37 +02:00
|
|
|
package mem
|
|
|
|
|
|
|
|
import (
|
2017-12-07 00:33:47 +01:00
|
|
|
"fmt"
|
|
|
|
|
2017-09-29 18:42:37 +02:00
|
|
|
"github.com/RoaringBitmap/roaring"
|
|
|
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
|
|
|
)
|
|
|
|
|
2017-12-01 15:30:07 +01:00
|
|
|
// _id field is always guaranteed to have fieldID of 0,
// so lookups against the _id dictionary (e.g. DocNumbers) can index
// Dicts directly with this constant.
const idFieldID uint16 = 0
|
|
|
|
|
2017-09-29 18:42:37 +02:00
|
|
|
// KNOWN ISSUES
|
|
|
|
// - LIMITATION - we decided whether or not to store term vectors for a field
|
|
|
|
// at the segment level, based on the first definition of a
|
|
|
|
// field we see. in normal bleve usage this is fine, all
|
|
|
|
// instances of a field definition will be the same. however,
|
|
|
|
// advanced users may violate this and provide unique field
|
|
|
|
// definitions with each document. this segment does not
|
|
|
|
// support this usage.
|
|
|
|
|
|
|
|
// TODO
|
|
|
|
// - need better testing of multiple docs, iterating freqs, locations
//   and verifying the correct results are returned
|
|
|
|
|
|
|
|
// Segment is an in memory implementation of scorch.Segment
type Segment struct {

	// FieldsMap adds 1 to field id to avoid zero value issues
	// name -> field id + 1
	FieldsMap map[string]uint16

	// FieldsInv is the inverse of FieldsMap
	// field id -> name
	FieldsInv []string

	// Term dictionaries for each field
	// field id -> term -> postings list id + 1
	Dicts []map[string]uint64

	// Terms for each field, where terms are sorted ascending
	// field id -> []term
	DictKeys [][]string

	// Postings list
	// postings list id -> bitmap by docNum
	Postings []*roaring.Bitmap

	// Postings list has locations
	// postings list id -> bitmap of hits that carry location data
	PostingsLocs []*roaring.Bitmap

	// Term frequencies
	// postings list id -> Freqs (one for each hit in bitmap)
	Freqs [][]uint64

	// Field norms
	// postings list id -> Norms (one for each hit in bitmap)
	Norms [][]float32

	// Field/start/end/pos/locarraypos
	// postings list id -> start/end/pos/locarraypos (one for each freq)
	Locfields   [][]uint16
	Locstarts   [][]uint64
	Locends     [][]uint64
	Locpos      [][]uint64
	Locarraypos [][][]uint64

	// Stored field values
	// docNum -> field id -> slice of values (each value []byte)
	Stored []map[uint16][][]byte

	// Stored field types
	// docNum -> field id -> slice of types (each type byte)
	StoredTypes []map[uint16][]byte

	// Stored field array positions
	// docNum -> field id -> slice of array positions (each is []uint64)
	StoredPos []map[uint16][][]uint64

	// For storing the docValue persisted fields
	// field id -> persisted as docValue (true)
	DocValueFields map[uint16]bool

	// Footprint of the segment, updated when analyzed document mutations
	// are added into the segment; read via SizeInBytes()
	sizeInBytes uint64
}
|
|
|
|
|
|
|
|
// New builds a new empty Segment
|
|
|
|
func New() *Segment {
|
|
|
|
return &Segment{
|
2018-01-04 11:04:55 +01:00
|
|
|
FieldsMap: map[string]uint16{},
|
|
|
|
DocValueFields: map[uint16]bool{},
|
2017-09-29 18:42:37 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-29 02:48:38 +01:00
|
|
|
func (s *Segment) updateSizeInBytes() {
|
|
|
|
var sizeInBytes uint64
|
|
|
|
|
2018-01-12 21:11:11 +01:00
|
|
|
// FieldsMap, FieldsInv
|
2017-12-29 02:48:38 +01:00
|
|
|
for k, _ := range s.FieldsMap {
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
2017-12-29 02:48:38 +01:00
|
|
|
2 /* size of uint16 */)
|
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
// overhead from the data structures
|
|
|
|
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
2017-12-29 02:48:38 +01:00
|
|
|
|
2018-01-12 21:11:11 +01:00
|
|
|
// Dicts, DictKeys
|
2017-12-29 02:48:38 +01:00
|
|
|
for _, entry := range s.Dicts {
|
|
|
|
for k, _ := range entry {
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
2017-12-29 02:48:38 +01:00
|
|
|
8 /* size of uint64 */)
|
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
// overhead from the data structures
|
|
|
|
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += (segment.SizeOfSlice * 2)
|
2017-12-29 02:48:38 +01:00
|
|
|
|
2018-01-12 21:11:11 +01:00
|
|
|
// Postings, PostingsLocs
|
2017-12-29 02:48:38 +01:00
|
|
|
for i := 0; i < len(s.Postings); i++ {
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) +
|
|
|
|
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer)
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += (segment.SizeOfSlice * 2)
|
2017-12-29 02:48:38 +01:00
|
|
|
|
2018-01-12 21:11:11 +01:00
|
|
|
// Freqs, Norms
|
2017-12-29 02:48:38 +01:00
|
|
|
for i := 0; i < len(s.Freqs); i++ {
|
|
|
|
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ +
|
2018-01-12 21:11:11 +01:00
|
|
|
len(s.Norms[i])*4 /* size of float32 */) +
|
|
|
|
(segment.SizeOfSlice * 2)
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += (segment.SizeOfSlice * 2)
|
2017-12-29 02:48:38 +01:00
|
|
|
|
2018-01-12 21:11:11 +01:00
|
|
|
// Location data
|
2017-12-29 02:48:38 +01:00
|
|
|
for i := 0; i < len(s.Locfields); i++ {
|
|
|
|
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ +
|
|
|
|
len(s.Locstarts[i])*8 /* size of uint64 */ +
|
|
|
|
len(s.Locends[i])*8 /* size of uint64 */ +
|
|
|
|
len(s.Locpos[i])*8 /* size of uint64 */)
|
|
|
|
|
|
|
|
for j := 0; j < len(s.Locarraypos[i]); j++ {
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) +
|
|
|
|
segment.SizeOfSlice
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
|
|
|
|
sizeInBytes += (segment.SizeOfSlice * 5)
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += (segment.SizeOfSlice * 5)
|
2017-12-29 02:48:38 +01:00
|
|
|
|
2018-01-12 21:11:11 +01:00
|
|
|
// Stored data
|
2017-12-29 02:48:38 +01:00
|
|
|
for i := 0; i < len(s.Stored); i++ {
|
|
|
|
for _, v := range s.Stored[i] {
|
|
|
|
sizeInBytes += uint64(2 /* size of uint16 */)
|
|
|
|
for _, arr := range v {
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += segment.SizeOfSlice
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, v := range s.StoredTypes[i] {
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, v := range s.StoredPos[i] {
|
|
|
|
sizeInBytes += uint64(2 /* size of uint16 */)
|
|
|
|
for _, arr := range v {
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) +
|
|
|
|
segment.SizeOfSlice
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
sizeInBytes += segment.SizeOfSlice
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
|
|
|
|
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
|
|
|
sizeInBytes += (segment.SizeOfMap * 3)
|
2017-12-29 02:48:38 +01:00
|
|
|
}
|
2018-01-12 21:11:11 +01:00
|
|
|
// overhead from data structures: Stored, StoredTypes, StoredPos
|
|
|
|
sizeInBytes += (segment.SizeOfSlice * 3)
|
|
|
|
|
|
|
|
// DocValueFields
|
|
|
|
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) +
|
|
|
|
segment.SizeOfMap
|
2017-12-29 02:48:38 +01:00
|
|
|
|
2018-01-12 21:11:11 +01:00
|
|
|
// SizeInBytes
|
|
|
|
sizeInBytes += uint64(8)
|
2017-12-29 02:48:38 +01:00
|
|
|
|
|
|
|
s.sizeInBytes = sizeInBytes
|
|
|
|
}
|
|
|
|
|
|
|
|
// SizeInBytes returns the estimated memory footprint most recently
// cached by updateSizeInBytes.
func (s *Segment) SizeInBytes() uint64 {
	return s.sizeInBytes
}
|
|
|
|
|
2017-12-13 22:10:44 +01:00
|
|
|
// AddRef satisfies the scorch segment interface; the in-memory
// segment does no reference counting, so this is a no-op.
func (s *Segment) AddRef() {
}
|
|
|
|
|
|
|
|
// DecRef satisfies the scorch segment interface; the in-memory
// segment does no reference counting, so this always succeeds.
func (s *Segment) DecRef() error {
	return nil
}
|
|
|
|
|
2017-09-29 18:42:37 +02:00
|
|
|
// Fields returns the field names used in this segment,
// ordered by their internal field id.
func (s *Segment) Fields() []string {
	return s.FieldsInv
}
|
|
|
|
|
|
|
|
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
|
|
|
// for the specified doc number
|
|
|
|
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
|
|
|
// ensure document number exists
|
2017-12-01 13:26:47 +01:00
|
|
|
if int(num) > len(s.Stored)-1 {
|
2017-09-29 18:42:37 +02:00
|
|
|
return nil
|
|
|
|
}
|
2017-12-01 13:26:47 +01:00
|
|
|
docFields := s.Stored[int(num)]
|
2018-01-15 20:54:46 +01:00
|
|
|
st := s.StoredTypes[int(num)]
|
|
|
|
sp := s.StoredPos[int(num)]
|
2017-09-29 18:42:37 +02:00
|
|
|
for field, values := range docFields {
|
|
|
|
for i, value := range values {
|
2018-01-15 20:54:46 +01:00
|
|
|
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
|
2017-09-29 18:42:37 +02:00
|
|
|
if !keepGoing {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-12-07 00:33:47 +01:00
|
|
|
func (s *Segment) getField(name string) (int, error) {
|
|
|
|
fieldID, ok := s.FieldsMap[name]
|
|
|
|
if !ok {
|
|
|
|
return 0, fmt.Errorf("no field named %s", name)
|
|
|
|
}
|
|
|
|
return int(fieldID - 1), nil
|
|
|
|
}
|
|
|
|
|
2017-09-29 18:42:37 +02:00
|
|
|
// Dictionary returns the term dictionary for the specified field
|
2017-12-05 00:06:06 +01:00
|
|
|
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
|
2017-12-07 00:33:47 +01:00
|
|
|
fieldID, err := s.getField(field)
|
|
|
|
if err != nil {
|
|
|
|
// no such field, return empty dictionary
|
|
|
|
return &segment.EmptyDictionary{}, nil
|
|
|
|
}
|
2017-09-29 18:42:37 +02:00
|
|
|
return &Dictionary{
|
|
|
|
segment: s,
|
|
|
|
field: field,
|
2017-12-07 00:33:47 +01:00
|
|
|
fieldID: uint16(fieldID),
|
2017-12-05 00:06:06 +01:00
|
|
|
}, nil
|
2017-09-29 18:42:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Count returns the number of documents in this segment
// (this has no notion of deleted docs); it is simply the number of
// entries in the Stored slice, which is indexed by docNum.
func (s *Segment) Count() uint64 {
	return uint64(len(s.Stored))
}
|
|
|
|
|
|
|
|
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
|
|
|
// provided _id strings
|
2017-12-05 00:06:06 +01:00
|
|
|
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
2017-09-29 18:42:37 +02:00
|
|
|
rv := roaring.New()
|
2017-12-01 15:50:27 +01:00
|
|
|
|
|
|
|
// guard against empty segment
|
|
|
|
if len(s.FieldsMap) > 0 {
|
|
|
|
idDictionary := s.Dicts[idFieldID]
|
|
|
|
|
|
|
|
for _, id := range ids {
|
|
|
|
postingID := idDictionary[id]
|
|
|
|
if postingID > 0 {
|
|
|
|
rv.Or(s.Postings[postingID-1])
|
|
|
|
}
|
2017-09-29 18:42:37 +02:00
|
|
|
}
|
|
|
|
}
|
2017-12-05 00:06:06 +01:00
|
|
|
return rv, nil
|
2017-09-29 18:42:37 +02:00
|
|
|
}
|
2017-12-05 15:31:02 +01:00
|
|
|
|
|
|
|
// Close releases all resources associated with this segment;
// the in-memory implementation holds nothing that needs releasing.
func (s *Segment) Close() error {
	return nil
}
|