2017-12-09 20:28:33 +01:00
|
|
|
// Copyright (c) 2017 Couchbase, Inc.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package zap
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"encoding/binary"
|
|
|
|
"fmt"
|
|
|
|
"math"
|
|
|
|
|
|
|
|
"github.com/RoaringBitmap/roaring"
|
|
|
|
"github.com/Smerity/govarint"
|
|
|
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
|
|
|
)
|
|
|
|
|
|
|
|
// PostingsList is an in-memory represenation of a postings list
|
|
|
|
type PostingsList struct {
|
2018-01-18 03:46:57 +01:00
|
|
|
sb *SegmentBase
|
2018-01-27 18:42:56 +01:00
|
|
|
term []byte
|
2017-12-11 21:59:36 +01:00
|
|
|
postingsOffset uint64
|
|
|
|
freqOffset uint64
|
|
|
|
locOffset uint64
|
|
|
|
locBitmap *roaring.Bitmap
|
|
|
|
postings *roaring.Bitmap
|
|
|
|
except *roaring.Bitmap
|
|
|
|
postingKey []byte
|
2017-12-09 20:28:33 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Iterator returns an iterator for this postings list
|
|
|
|
func (p *PostingsList) Iterator() segment.PostingsIterator {
|
|
|
|
rv := &PostingsIterator{
|
|
|
|
postings: p,
|
|
|
|
}
|
|
|
|
if p.postings != nil {
|
|
|
|
// prepare the freq chunk details
|
|
|
|
var n uint64
|
|
|
|
var read int
|
|
|
|
var numFreqChunks uint64
|
2018-01-18 03:46:57 +01:00
|
|
|
numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
|
2017-12-09 20:28:33 +01:00
|
|
|
n += uint64(read)
|
|
|
|
rv.freqChunkLens = make([]uint64, int(numFreqChunks))
|
|
|
|
for i := 0; i < int(numFreqChunks); i++ {
|
2018-01-18 03:46:57 +01:00
|
|
|
rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
|
2017-12-09 20:28:33 +01:00
|
|
|
n += uint64(read)
|
|
|
|
}
|
|
|
|
rv.freqChunkStart = p.freqOffset + n
|
|
|
|
|
|
|
|
// prepare the loc chunk details
|
|
|
|
n = 0
|
|
|
|
var numLocChunks uint64
|
2018-01-18 03:46:57 +01:00
|
|
|
numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
|
2017-12-09 20:28:33 +01:00
|
|
|
n += uint64(read)
|
|
|
|
rv.locChunkLens = make([]uint64, int(numLocChunks))
|
|
|
|
for i := 0; i < int(numLocChunks); i++ {
|
2018-01-18 03:46:57 +01:00
|
|
|
rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
|
2017-12-09 20:28:33 +01:00
|
|
|
n += uint64(read)
|
|
|
|
}
|
|
|
|
rv.locChunkStart = p.locOffset + n
|
2017-12-11 21:59:36 +01:00
|
|
|
rv.locBitmap = p.locBitmap
|
2017-12-11 21:47:41 +01:00
|
|
|
|
2017-12-09 20:28:33 +01:00
|
|
|
rv.all = p.postings.Iterator()
|
|
|
|
if p.except != nil {
|
2017-12-19 22:37:04 +01:00
|
|
|
allExcept := roaring.AndNot(p.postings, p.except)
|
2017-12-09 20:28:33 +01:00
|
|
|
rv.actual = allExcept.Iterator()
|
|
|
|
} else {
|
|
|
|
rv.actual = p.postings.Iterator()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
// Count returns the number of items on this postings list
|
|
|
|
func (p *PostingsList) Count() uint64 {
|
|
|
|
if p.postings != nil {
|
2017-12-20 02:44:25 +01:00
|
|
|
n := p.postings.GetCardinality()
|
2017-12-09 20:28:33 +01:00
|
|
|
if p.except != nil {
|
2017-12-20 02:44:25 +01:00
|
|
|
e := p.except.GetCardinality()
|
|
|
|
if e > n {
|
|
|
|
e = n
|
|
|
|
}
|
|
|
|
return n - e
|
2017-12-09 20:28:33 +01:00
|
|
|
}
|
2017-12-20 02:44:25 +01:00
|
|
|
return n
|
2017-12-09 20:28:33 +01:00
|
|
|
}
|
2017-12-12 14:42:13 +01:00
|
|
|
return 0
|
2017-12-09 20:28:33 +01:00
|
|
|
}
|
|
|
|
|
2018-01-30 19:27:07 +01:00
|
|
|
func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
|
|
|
|
rv.postingsOffset = postingsOffset
|
|
|
|
|
|
|
|
// read the location of the freq/norm details
|
|
|
|
var n uint64
|
|
|
|
var read int
|
|
|
|
|
|
|
|
rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64])
|
|
|
|
n += uint64(read)
|
|
|
|
|
|
|
|
rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
|
|
|
n += uint64(read)
|
|
|
|
|
|
|
|
var locBitmapOffset uint64
|
|
|
|
locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
|
|
|
n += uint64(read)
|
|
|
|
|
|
|
|
var locBitmapLen uint64
|
|
|
|
locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64])
|
|
|
|
|
|
|
|
locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
|
|
|
|
|
|
|
|
rv.locBitmap = roaring.NewBitmap()
|
|
|
|
_, err := rv.locBitmap.FromBuffer(locRoaringBytes)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
var postingsLen uint64
|
|
|
|
postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
|
|
|
n += uint64(read)
|
|
|
|
|
|
|
|
roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
|
|
|
|
|
|
|
|
rv.postings = roaring.NewBitmap()
|
|
|
|
_, err = rv.postings.FromBuffer(roaringBytes)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error loading roaring bitmap: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-12-09 20:28:33 +01:00
|
|
|
// PostingsIterator provides a way to iterate through the postings list
|
|
|
|
type PostingsIterator struct {
|
|
|
|
postings *PostingsList
|
|
|
|
all roaring.IntIterable
|
|
|
|
offset int
|
|
|
|
locoffset int
|
|
|
|
actual roaring.IntIterable
|
|
|
|
|
|
|
|
currChunk uint32
|
|
|
|
currChunkFreqNorm []byte
|
|
|
|
currChunkLoc []byte
|
|
|
|
freqNormDecoder *govarint.Base128Decoder
|
|
|
|
locDecoder *govarint.Base128Decoder
|
|
|
|
|
|
|
|
freqChunkLens []uint64
|
|
|
|
freqChunkStart uint64
|
|
|
|
|
|
|
|
locChunkLens []uint64
|
|
|
|
locChunkStart uint64
|
2017-12-11 21:47:41 +01:00
|
|
|
|
|
|
|
locBitmap *roaring.Bitmap
|
2017-12-15 21:32:37 +01:00
|
|
|
|
|
|
|
next Posting
|
2017-12-09 20:28:33 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func (i *PostingsIterator) loadChunk(chunk int) error {
|
|
|
|
if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
|
|
|
|
return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens))
|
|
|
|
}
|
|
|
|
// load correct chunk bytes
|
|
|
|
start := i.freqChunkStart
|
|
|
|
for j := 0; j < chunk; j++ {
|
|
|
|
start += i.freqChunkLens[j]
|
|
|
|
}
|
|
|
|
end := start + i.freqChunkLens[chunk]
|
2018-01-18 03:46:57 +01:00
|
|
|
i.currChunkFreqNorm = i.postings.sb.mem[start:end]
|
2017-12-09 20:28:33 +01:00
|
|
|
i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm))
|
|
|
|
|
|
|
|
start = i.locChunkStart
|
|
|
|
for j := 0; j < chunk; j++ {
|
|
|
|
start += i.locChunkLens[j]
|
|
|
|
}
|
|
|
|
end = start + i.locChunkLens[chunk]
|
2018-01-18 03:46:57 +01:00
|
|
|
i.currChunkLoc = i.postings.sb.mem[start:end]
|
2017-12-09 20:28:33 +01:00
|
|
|
i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc))
|
|
|
|
i.currChunk = uint32(chunk)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
|
|
|
|
freq, err := i.freqNormDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, fmt.Errorf("error reading frequency: %v", err)
|
|
|
|
}
|
|
|
|
normBits, err := i.freqNormDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, fmt.Errorf("error reading norm: %v", err)
|
|
|
|
}
|
|
|
|
return freq, normBits, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// readLocation processes all the integers on the stream representing a single
|
|
|
|
// location. if you care about it, pass in a non-nil location struct, and we
|
|
|
|
// will fill it. if you don't care about it, pass in nil and we safely consume
|
|
|
|
// the contents.
|
|
|
|
func (i *PostingsIterator) readLocation(l *Location) error {
|
|
|
|
// read off field
|
|
|
|
fieldID, err := i.locDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error reading location field: %v", err)
|
|
|
|
}
|
|
|
|
// read off pos
|
|
|
|
pos, err := i.locDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error reading location pos: %v", err)
|
|
|
|
}
|
|
|
|
// read off start
|
|
|
|
start, err := i.locDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error reading location start: %v", err)
|
|
|
|
}
|
|
|
|
// read off end
|
|
|
|
end, err := i.locDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error reading location end: %v", err)
|
|
|
|
}
|
|
|
|
// read off num array pos
|
|
|
|
numArrayPos, err := i.locDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error reading location num array pos: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// group these together for less branching
|
|
|
|
if l != nil {
|
2018-01-18 03:46:57 +01:00
|
|
|
l.field = i.postings.sb.fieldsInv[fieldID]
|
2017-12-09 20:28:33 +01:00
|
|
|
l.pos = pos
|
|
|
|
l.start = start
|
|
|
|
l.end = end
|
|
|
|
if numArrayPos > 0 {
|
|
|
|
l.ap = make([]uint64, int(numArrayPos))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// read off array positions
|
|
|
|
for k := 0; k < int(numArrayPos); k++ {
|
|
|
|
ap, err := i.locDecoder.GetU64()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("error reading array position: %v", err)
|
|
|
|
}
|
|
|
|
if l != nil {
|
|
|
|
l.ap[k] = ap
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Next returns the next posting on the postings list, or nil at the end
|
|
|
|
func (i *PostingsIterator) Next() (segment.Posting, error) {
|
|
|
|
if i.actual == nil || !i.actual.HasNext() {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
n := i.actual.Next()
|
2018-01-18 03:46:57 +01:00
|
|
|
nChunk := n / i.postings.sb.chunkFactor
|
2017-12-09 20:28:33 +01:00
|
|
|
allN := i.all.Next()
|
2018-01-18 03:46:57 +01:00
|
|
|
allNChunk := allN / i.postings.sb.chunkFactor
|
2017-12-09 20:28:33 +01:00
|
|
|
|
|
|
|
// n is the next actual hit (excluding some postings)
|
|
|
|
// allN is the next hit in the full postings
|
|
|
|
// if they don't match, adjust offsets to factor in item we're skipping over
|
|
|
|
// incr the all iterator, and check again
|
|
|
|
for allN != n {
|
|
|
|
|
|
|
|
// in different chunks, reset offsets
|
|
|
|
if allNChunk != nChunk {
|
|
|
|
i.locoffset = 0
|
|
|
|
i.offset = 0
|
|
|
|
} else {
|
|
|
|
|
|
|
|
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
|
|
|
|
err := i.loadChunk(int(nChunk))
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error loading chunk: %v", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// read off freq/offsets even though we don't care about them
|
|
|
|
freq, _, err := i.readFreqNorm()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-12-11 21:47:41 +01:00
|
|
|
if i.locBitmap.Contains(allN) {
|
2017-12-09 20:28:33 +01:00
|
|
|
for j := 0; j < int(freq); j++ {
|
|
|
|
err := i.readLocation(nil)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// in same chunk, need to account for offsets
|
|
|
|
i.offset++
|
|
|
|
}
|
|
|
|
|
|
|
|
allN = i.all.Next()
|
|
|
|
}
|
|
|
|
|
|
|
|
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
|
|
|
|
err := i.loadChunk(int(nChunk))
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error loading chunk: %v", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-15 21:32:37 +01:00
|
|
|
i.next = Posting{} // clear the struct.
|
|
|
|
rv := &i.next
|
|
|
|
rv.iterator = i
|
|
|
|
rv.docNum = uint64(n)
|
2017-12-09 20:28:33 +01:00
|
|
|
|
|
|
|
var err error
|
|
|
|
var normBits uint64
|
|
|
|
rv.freq, normBits, err = i.readFreqNorm()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
rv.norm = math.Float32frombits(uint32(normBits))
|
2017-12-11 21:47:41 +01:00
|
|
|
if i.locBitmap.Contains(n) {
|
2017-12-09 20:28:33 +01:00
|
|
|
// read off 'freq' locations
|
|
|
|
rv.locs = make([]segment.Location, rv.freq)
|
|
|
|
locs := make([]Location, rv.freq)
|
|
|
|
for j := 0; j < int(rv.freq); j++ {
|
|
|
|
err := i.readLocation(&locs[j])
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
rv.locs[j] = &locs[j]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Posting is a single entry in a postings list
|
|
|
|
type Posting struct {
|
|
|
|
iterator *PostingsIterator
|
|
|
|
docNum uint64
|
|
|
|
|
|
|
|
freq uint64
|
|
|
|
norm float32
|
|
|
|
locs []segment.Location
|
|
|
|
}
|
|
|
|
|
|
|
|
// Number returns the document number of this posting in this segment
|
|
|
|
func (p *Posting) Number() uint64 {
|
|
|
|
return p.docNum
|
|
|
|
}
|
|
|
|
|
|
|
|
// Frequency returns the frequence of occurance of this term in this doc/field
|
|
|
|
func (p *Posting) Frequency() uint64 {
|
|
|
|
return p.freq
|
|
|
|
}
|
|
|
|
|
|
|
|
// Norm returns the normalization factor for this posting
|
|
|
|
func (p *Posting) Norm() float64 {
|
|
|
|
return float64(p.norm)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Locations returns the location information for each occurance
|
|
|
|
func (p *Posting) Locations() []segment.Location {
|
|
|
|
return p.locs
|
|
|
|
}
|
|
|
|
|
|
|
|
// Location represents the location of a single occurance
|
|
|
|
type Location struct {
|
|
|
|
field string
|
|
|
|
pos uint64
|
|
|
|
start uint64
|
|
|
|
end uint64
|
|
|
|
ap []uint64
|
|
|
|
}
|
|
|
|
|
|
|
|
// Field returns the name of the field (useful in composite fields to know
|
|
|
|
// which original field the value came from)
|
|
|
|
func (l *Location) Field() string {
|
|
|
|
return l.field
|
|
|
|
}
|
|
|
|
|
|
|
|
// Start returns the start byte offset of this occurance
|
|
|
|
func (l *Location) Start() uint64 {
|
|
|
|
return l.start
|
|
|
|
}
|
|
|
|
|
|
|
|
// End returns the end byte offset of this occurance
|
|
|
|
func (l *Location) End() uint64 {
|
|
|
|
return l.end
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pos returns the 1-based phrase position of this occurance
|
|
|
|
func (l *Location) Pos() uint64 {
|
|
|
|
return l.pos
|
|
|
|
}
|
|
|
|
|
|
|
|
// ArrayPositions returns the array position vector associated with this occurance
|
|
|
|
func (l *Location) ArrayPositions() []uint64 {
|
|
|
|
return l.ap
|
|
|
|
}
|