0
0
Fork 0
bleve/index/scorch/segment/bolt/posting.go

324 lines
8.2 KiB
Go

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bolt
import (
"bytes"
"fmt"
"math"
"github.com/RoaringBitmap/roaring"
"github.com/Smerity/govarint"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/boltdb/bolt"
)
// PostingsList is an in-memory represenation of a postings list
type PostingsList struct {
dictionary *Dictionary
term string
postingsID uint64
postings *roaring.Bitmap
except *roaring.Bitmap
postingKey []byte
}
// Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator() segment.PostingsIterator {
rv := &PostingsIterator{
postings: p,
}
if p.postings != nil {
detailsBucket := p.dictionary.segment.tx.Bucket(postingDetailsBucket)
rv.detailBucket = detailsBucket.Bucket(p.postingKey)
rv.all = p.postings.Iterator()
if p.except != nil {
allExcept := p.postings.Clone()
allExcept.AndNot(p.except)
rv.actual = allExcept.Iterator()
} else {
rv.actual = p.postings.Iterator()
}
}
return rv
}
// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
var rv uint64
if p.postings != nil {
rv = p.postings.GetCardinality()
if p.except != nil {
except := p.except.GetCardinality()
if except > rv {
// avoid underflow
except = rv
}
rv -= except
}
}
return rv
}
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
postings *PostingsList
all roaring.IntIterable
offset int
locoffset int
actual roaring.IntIterable
detailBucket *bolt.Bucket
currChunk uint32
currChunkFreqNorm []byte
currChunkLoc []byte
freqNormDecoder *govarint.Base128Decoder
locDecoder *govarint.Base128Decoder
}
func (i *PostingsIterator) loadChunk(chunk int) error {
// load correct chunk bytes
chunkID := segment.EncodeUvarintAscending(nil, uint64(chunk))
chunkBucket := i.detailBucket.Bucket(chunkID)
if chunkBucket == nil {
return fmt.Errorf("chunk %d missing", chunkID)
}
i.currChunkFreqNorm = chunkBucket.Get(freqNormKey)
i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm))
i.currChunkLoc = chunkBucket.Get(locKey)
i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc))
i.currChunk = uint32(chunk)
return nil
}
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
freq, err := i.freqNormDecoder.GetU64()
if err != nil {
return 0, 0, fmt.Errorf("error reading frequency: %v", err)
}
normBits, err := i.freqNormDecoder.GetU64()
if err != nil {
return 0, 0, fmt.Errorf("error reading norm: %v", err)
}
return freq, normBits, err
}
// readLocation processes all the integers on the stream representing a single
// location. if you care about it, pass in a non-nil location struct, and we
// will fill it. if you don't care about it, pass in nil and we safely consume
// the contents.
func (i *PostingsIterator) readLocation(l *Location) error {
// read off field
fieldID, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location field: %v", err)
}
// read off pos
pos, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location pos: %v", err)
}
// read off start
start, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location start: %v", err)
}
// read off end
end, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location end: %v", err)
}
// read off num array pos
numArrayPos, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location num array pos: %v", err)
}
// group these together for less branching
if l != nil {
l.field = i.postings.dictionary.segment.fieldsInv[fieldID]
l.pos = pos
l.start = start
l.end = end
if numArrayPos > 0 {
l.ap = make([]uint64, int(numArrayPos))
}
}
// read off array positions
for k := 0; k < int(numArrayPos); k++ {
ap, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading array position: %v", err)
}
if l != nil {
l.ap[k] = ap
}
}
return nil
}
// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) {
if i.actual == nil || !i.actual.HasNext() {
return nil, nil
}
n := i.actual.Next()
nChunk := n / i.postings.dictionary.segment.chunkFactor
allN := i.all.Next()
allNChunk := allN / i.postings.dictionary.segment.chunkFactor
// n is the next actual hit (excluding some postings)
// allN is the next hit in the full postings
// if they don't match, adjust offsets to factor in item we're skipping over
// incr the all iterator, and check again
for allN != n {
// in different chunks, reset offsets
if allNChunk != nChunk {
i.locoffset = 0
i.offset = 0
} else {
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
err := i.loadChunk(int(nChunk))
if err != nil {
return nil, fmt.Errorf("error loading chunk: %v", err)
}
}
// read off freq/offsets even though we don't care about them
freq, _, err := i.readFreqNorm()
if err != nil {
return nil, err
}
if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
for j := 0; j < int(freq); j++ {
err := i.readLocation(nil)
if err != nil {
return nil, err
}
}
}
// in same chunk, need to account for offsets
i.offset++
}
allN = i.all.Next()
}
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
err := i.loadChunk(int(nChunk))
if err != nil {
return nil, fmt.Errorf("error loading chunk: %v", err)
}
}
rv := &Posting{
iterator: i,
docNum: uint64(n),
}
var err error
var normBits uint64
rv.freq, normBits, err = i.readFreqNorm()
if err != nil {
return nil, err
}
rv.norm = math.Float32frombits(uint32(normBits))
if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
// read off 'freq' locations
rv.locs = make([]segment.Location, rv.freq)
locs := make([]Location, rv.freq)
for j := 0; j < int(rv.freq); j++ {
err := i.readLocation(&locs[j])
if err != nil {
return nil, err
}
rv.locs[j] = &locs[j]
}
}
return rv, nil
}
// Posting is a single entry in a postings list
type Posting struct {
iterator *PostingsIterator
docNum uint64
freq uint64
norm float32
locs []segment.Location
}
// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
return p.docNum
}
// Frequency returns the frequence of occurance of this term in this doc/field
func (p *Posting) Frequency() uint64 {
return p.freq
}
// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
return float64(p.norm)
}
// Locations returns the location information for each occurance
func (p *Posting) Locations() []segment.Location {
return p.locs
}
// Location represents the location of a single occurance
type Location struct {
field string
pos uint64
start uint64
end uint64
ap []uint64
}
// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
return l.field
}
// Start returns the start byte offset of this occurance
func (l *Location) Start() uint64 {
return l.start
}
// End returns the end byte offset of this occurance
func (l *Location) End() uint64 {
return l.end
}
// Pos returns the 1-based phrase position of this occurance
func (l *Location) Pos() uint64 {
return l.pos
}
// ArrayPositions returns the array position vector associated with this occurance
func (l *Location) ArrayPositions() []uint64 {
return l.ap
}