scorch zap optimize FST val encoding for terms with 1 hit
NOTE: this is a scorch zap file format change (version bump to 4). With this optimization, the uint64 val stored in the vellum FST (term dictionary) may now be either a uint64 postingsOffset (same as before this change) or a uint64 encoding of the docNum + norm (in the case where a term appears in exactly one doc).
This commit is contained in:
parent
f04226d10b
commit
eac9808990
|
@ -18,7 +18,9 @@ import (
|
|||
"encoding/binary"
|
||||
"fmt"
|
||||
"log"
|
||||
"math"
|
||||
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
@ -57,7 +59,19 @@ var exploreCmd = &cobra.Command{
|
|||
return fmt.Errorf("error looking for term : %v", err)
|
||||
}
|
||||
if exists {
|
||||
fmt.Printf("postings list begins at %d (%x)\n", postingsAddr, postingsAddr)
|
||||
fmt.Printf("fst val is %d (%x)\n", postingsAddr, postingsAddr)
|
||||
|
||||
if postingsAddr&zap.FSTValEncodingMask == zap.FSTValEncoding1Hit {
|
||||
docNum, normBits := zap.FSTValDecode1Hit(postingsAddr)
|
||||
norm := math.Float32frombits(uint32(normBits))
|
||||
fmt.Printf("Posting List is 1-hit encoded, docNum: %d, norm: %f\n",
|
||||
docNum, norm)
|
||||
return nil
|
||||
}
|
||||
|
||||
if postingsAddr&zap.FSTValEncodingMask != zap.FSTValEncodingGeneral {
|
||||
return fmt.Errorf("unknown fst val encoding")
|
||||
}
|
||||
|
||||
var n uint64
|
||||
freqAddr, read := binary.Uvarint(data[postingsAddr : postingsAddr+binary.MaxVarintLen64])
|
||||
|
|
|
@ -28,7 +28,7 @@ import (
|
|||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
const version uint32 = 3
|
||||
const version uint32 = 4
|
||||
|
||||
const fieldNotUninverted = math.MaxUint64
|
||||
|
||||
|
|
|
@ -130,3 +130,7 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
|
|||
}
|
||||
return tw, nil
|
||||
}
|
||||
|
||||
// FinalSize returns the number of bytes of encoded output accumulated
// so far in c.final (i.e. the size that Write would emit for the
// already-finalized chunks).
func (c *chunkedIntCoder) FinalSize() int {
	return len(c.final)
}
|
||||
|
|
|
@ -225,6 +225,21 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
newRoaring := roaring.NewBitmap()
|
||||
newRoaringLocs := roaring.NewBitmap()
|
||||
|
||||
var lastDocNum, lastFreq, lastNorm uint64
|
||||
|
||||
// determines whether to use "1-hit" encoding optimization
|
||||
// when a term appears in only 1 doc, with no loc info,
|
||||
// has freq of 1, and the docNum fits into 31-bits
|
||||
use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) {
|
||||
if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 {
|
||||
docNum := uint64(newRoaring.Minimum())
|
||||
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 {
|
||||
return true, docNum, lastNorm
|
||||
}
|
||||
}
|
||||
return false, 0, 0
|
||||
}
|
||||
|
||||
finishTerm := func(term []byte) error {
|
||||
if term == nil {
|
||||
return nil
|
||||
|
@ -233,8 +248,16 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
if newRoaring.GetCardinality() > 0 {
|
||||
// this field/term actually has hits in the new segment, lets write it down
|
||||
termCardinality := newRoaring.GetCardinality()
|
||||
|
||||
encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality)
|
||||
if encodeAs1Hit {
|
||||
err = newVellum.Insert(term, FSTValEncode1Hit(docNum1Hit, normBits1Hit))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else if termCardinality > 0 {
|
||||
// this field/term has hits in the new segment
|
||||
freqOffset := uint64(w.Count())
|
||||
_, err := tfEncoder.Write(w)
|
||||
if err != nil {
|
||||
|
@ -251,7 +274,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
return err
|
||||
}
|
||||
postingOffset := uint64(w.Count())
|
||||
|
||||
// write out the start of the term info
|
||||
n := binary.PutUvarint(bufMaxVarintLen64, freqOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
|
@ -287,6 +309,10 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
tfEncoder.Reset()
|
||||
locEncoder.Reset()
|
||||
|
||||
lastDocNum = 0
|
||||
lastFreq = 0
|
||||
lastNorm = 0
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -315,7 +341,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
|
||||
postItr = postings.iterator(postItr)
|
||||
|
||||
nextDocNum, nextFreqNormBytes, nextLocBytes, err2 := postItr.nextBytes()
|
||||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err2 :=
|
||||
postItr.nextBytes()
|
||||
for err2 == nil && len(nextFreqNormBytes) > 0 {
|
||||
hitNewDocNum := newDocNumsI[nextDocNum]
|
||||
if hitNewDocNum == docDropped {
|
||||
|
@ -339,7 +366,12 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
docTermMap[hitNewDocNum] =
|
||||
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
|
||||
|
||||
nextDocNum, nextFreqNormBytes, nextLocBytes, err2 = postItr.nextBytes()
|
||||
lastDocNum = hitNewDocNum
|
||||
lastFreq = nextFreq
|
||||
lastNorm = nextNorm
|
||||
|
||||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err2 =
|
||||
postItr.nextBytes()
|
||||
}
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
|
|
|
@ -859,3 +859,12 @@ func TestMergeBytesWritten(t *testing.T) {
|
|||
|
||||
testMergeWithSelf(t, seg3, 4)
|
||||
}
|
||||
|
||||
// TestUnder32Bits exercises the 31-bit boundary of under32Bits:
// values at or below 0x7fffffff fit, values at or above 0x80000000
// do not (the docNum in a "1-hit" FST encoding must fit in 31 bits).
func TestUnder32Bits(t *testing.T) {
	if !under32Bits(0) || !under32Bits(uint64(0x7fffffff)) {
		t.Errorf("under32Bits bad")
	}
	if under32Bits(uint64(0x80000000)) || under32Bits(uint64(0x80000001)) {
		t.Errorf("under32Bits wrong")
	}
}
|
||||
|
|
|
@ -43,6 +43,55 @@ func init() {
|
|||
reflectStaticSizeLocation = int(reflect.TypeOf(l).Size())
|
||||
}
|
||||
|
||||
// FST or vellum value (uint64) encoding is determined by the top two
|
||||
// highest-order or most significant bits...
|
||||
//
|
||||
// encoding : MSB
|
||||
// name : 63 62 61...to...bit #0 (LSB)
|
||||
// ----------+---+---+---------------------------------------------------
|
||||
// general : 0 | 0 | 62-bits of postingsOffset.
|
||||
// ~ : 0 | 1 | reserved for future.
|
||||
// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum.
|
||||
// ~ : 1 | 1 | reserved for future.
|
||||
//
|
||||
// Encoding "general" is able to handle all cases, where the
|
||||
// postingsOffset points to more information about the postings for
|
||||
// the term.
|
||||
//
|
||||
// Encoding "1-hit" is used to optimize a commonly seen case when a
|
||||
// term has only a single hit. For example, a term in the _id field
|
||||
// will have only 1 hit. The "1-hit" encoding is used for a term
|
||||
// in a field when...
|
||||
//
|
||||
// - term vector info is disabled for that field;
|
||||
// - and, the term appears in only a single doc for that field;
|
||||
// - and, the term's freq is exactly 1 in that single doc for that field;
|
||||
// - and, the docNum must fit into 31-bits;
|
||||
//
|
||||
// Otherwise, the "general" encoding is used instead.
|
||||
//
|
||||
// In the "1-hit" encoding, the field in that single doc may have
|
||||
// other terms, which is supported in the "1-hit" encoding by the
|
||||
// positive float31 norm.
|
||||
|
||||
// The two most-significant bits of a vellum FST value select its
// encoding; the constants below name the encodings and the mask used
// to extract those bits.
const (
	FSTValEncodingMask    = uint64(0xc000000000000000)
	FSTValEncodingGeneral = uint64(0x0000000000000000)
	FSTValEncoding1Hit    = uint64(0x8000000000000000)

	// mask31Bits keeps only the low 31 bits of a uint64.
	mask31Bits = uint64(0x000000007fffffff)
)

// FSTValEncode1Hit packs a docNum and its norm bits into a single
// "1-hit" encoded FST value: bit 63 set, bit 62 clear, norm in bits
// 61..31, docNum in bits 30..0. Inputs are truncated to 31 bits.
func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 {
	norm := (normBits & mask31Bits) << 31
	doc := docNum & mask31Bits
	return FSTValEncoding1Hit | norm | doc
}

// FSTValDecode1Hit unpacks a "1-hit" encoded FST value into its
// docNum (low 31 bits) and norm bits (next 31 bits).
func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) {
	docNum = v & mask31Bits
	normBits = (v >> 31) & mask31Bits
	return docNum, normBits
}

// under32Bits reports whether x fits in the 31-bit docNum/norm fields
// of the 1-hit encoding.
func under32Bits(x uint64) bool {
	return x <= mask31Bits
}
|
||||
|
||||
// PostingsList is an in-memory represenation of a postings list
|
||||
type PostingsList struct {
|
||||
sb *SegmentBase
|
||||
|
@ -52,6 +101,10 @@ type PostingsList struct {
|
|||
locBitmap *roaring.Bitmap
|
||||
postings *roaring.Bitmap
|
||||
except *roaring.Bitmap
|
||||
|
||||
// when postingsOffset == freqOffset == 0, then the postings list
|
||||
// represents a "1-hit" encoding, and has the following norm
|
||||
normBits1Hit uint64
|
||||
}
|
||||
|
||||
func (p *PostingsList) Size() int {
|
||||
|
@ -85,6 +138,8 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
|
|||
}
|
||||
locDecoder := rv.locDecoder
|
||||
|
||||
buf := rv.buf
|
||||
|
||||
*rv = PostingsIterator{} // clear the struct
|
||||
|
||||
rv.freqNormReader = freqNormReader
|
||||
|
@ -92,11 +147,17 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
|
|||
|
||||
rv.locReader = locReader
|
||||
rv.locDecoder = locDecoder
|
||||
|
||||
rv.buf = buf
|
||||
}
|
||||
rv.postings = p
|
||||
|
||||
if p.postings != nil {
|
||||
// prepare the freq chunk details
|
||||
if p.postings == nil {
|
||||
return rv
|
||||
}
|
||||
|
||||
if p.freqOffset > 0 && p.locOffset > 0 {
|
||||
// "general" encoding, so prepare the freq chunk details
|
||||
var n uint64
|
||||
var read int
|
||||
var numFreqChunks uint64
|
||||
|
@ -120,15 +181,19 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
|
|||
n += uint64(read)
|
||||
}
|
||||
rv.locChunkStart = p.locOffset + n
|
||||
rv.locBitmap = p.locBitmap
|
||||
} else {
|
||||
// "1-hit" encoding
|
||||
rv.normBits1Hit = p.normBits1Hit
|
||||
}
|
||||
|
||||
rv.all = p.postings.Iterator()
|
||||
if p.except != nil {
|
||||
allExcept := roaring.AndNot(p.postings, p.except)
|
||||
rv.actual = allExcept.Iterator()
|
||||
} else {
|
||||
rv.actual = p.postings.Iterator()
|
||||
}
|
||||
rv.locBitmap = p.locBitmap
|
||||
|
||||
rv.all = p.postings.Iterator()
|
||||
if p.except != nil {
|
||||
allExcept := roaring.AndNot(p.postings, p.except)
|
||||
rv.actual = allExcept.Iterator()
|
||||
} else {
|
||||
rv.actual = p.postings.Iterator()
|
||||
}
|
||||
|
||||
return rv
|
||||
|
@ -153,6 +218,11 @@ func (p *PostingsList) Count() uint64 {
|
|||
func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
|
||||
rv.postingsOffset = postingsOffset
|
||||
|
||||
// handle "1-hit" encoding special case
|
||||
if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit {
|
||||
return rv.init1Hit(postingsOffset)
|
||||
}
|
||||
|
||||
// read the location of the freq/norm details
|
||||
var n uint64
|
||||
var read int
|
||||
|
@ -193,6 +263,24 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// emptyRoaring is a shared, always-empty bitmap used as the locBitmap
// for 1-hit postings lists (which never carry location info).
// NOTE(review): it is shared across postings lists and must never be
// mutated.
var emptyRoaring = roaring.NewBitmap()

// init1Hit initializes this postings list from a "1-hit" encoded FST
// value: a single docNum plus the norm bits for that one hit. It
// always returns nil; the error result matches the signature of the
// general read path.
func (rv *PostingsList) init1Hit(fstVal uint64) error {
	docNum, normBits := FSTValDecode1Hit(fstVal)

	// no location info in the 1-hit encoding
	rv.locBitmap = emptyRoaring

	rv.postings = roaring.NewBitmap()
	rv.postings.Add(uint32(docNum))

	// TODO: we can likely do better than allocating a roaring bitmap
	// with just 1 entry, but for now reuse existing machinery

	rv.normBits1Hit = normBits

	return nil
}
|
||||
|
||||
// PostingsIterator provides a way to iterate through the postings list
|
||||
type PostingsIterator struct {
|
||||
postings *PostingsList
|
||||
|
@ -219,6 +307,10 @@ type PostingsIterator struct {
|
|||
|
||||
next Posting // reused across Next() calls
|
||||
nextLocs []Location // reused across Next() calls
|
||||
|
||||
normBits1Hit uint64
|
||||
|
||||
buf []byte
|
||||
}
|
||||
|
||||
func (i *PostingsIterator) Size() int {
|
||||
|
@ -244,7 +336,8 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
|
|||
if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
|
||||
return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens))
|
||||
}
|
||||
// load correct chunk bytes
|
||||
|
||||
// load freq chunk bytes
|
||||
start := i.freqChunkStart
|
||||
for j := 0; j < chunk; j++ {
|
||||
start += i.freqChunkLens[j]
|
||||
|
@ -258,6 +351,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
|
|||
i.freqNormReader.Reset(i.currChunkFreqNorm)
|
||||
}
|
||||
|
||||
// load loc chunk bytes
|
||||
start = i.locChunkStart
|
||||
for j := 0; j < chunk; j++ {
|
||||
start += i.locChunkLens[j]
|
||||
|
@ -270,11 +364,16 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
|
|||
} else {
|
||||
i.locReader.Reset(i.currChunkLoc)
|
||||
}
|
||||
|
||||
i.currChunk = uint32(chunk)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
|
||||
if i.normBits1Hit != 0 {
|
||||
return 1, i.normBits1Hit, nil
|
||||
}
|
||||
|
||||
freq, err := i.freqNormDecoder.GetU64()
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("error reading frequency: %v", err)
|
||||
|
@ -360,6 +459,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
|
|||
return nil, err
|
||||
}
|
||||
rv.norm = math.Float32frombits(uint32(normBits))
|
||||
|
||||
if i.locBitmap.Contains(uint32(docNum)) {
|
||||
// read off 'freq' locations, into reused slices
|
||||
if cap(i.nextLocs) >= int(rv.freq) {
|
||||
|
@ -386,33 +486,40 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
|
|||
|
||||
// nextBytes returns the docNum and the encoded freq & loc bytes for
|
||||
// the next posting
|
||||
func (i *PostingsIterator) nextBytes() (uint64, []byte, []byte, error) {
|
||||
func (i *PostingsIterator) nextBytes() (
|
||||
docNumOut uint64, freq uint64, normBits uint64,
|
||||
bytesFreqNorm []byte, bytesLoc []byte, err error) {
|
||||
docNum, exists, err := i.nextDocNum()
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
if err != nil || !exists {
|
||||
return 0, 0, 0, nil, nil, err
|
||||
}
|
||||
if !exists {
|
||||
return 0, nil, nil, nil
|
||||
|
||||
if i.normBits1Hit != 0 {
|
||||
if i.buf == nil {
|
||||
i.buf = make([]byte, binary.MaxVarintLen64*2)
|
||||
}
|
||||
n := binary.PutUvarint(i.buf, uint64(1))
|
||||
n += binary.PutUvarint(i.buf, i.normBits1Hit)
|
||||
return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil
|
||||
}
|
||||
|
||||
startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
|
||||
|
||||
freq, _, err := i.readFreqNorm()
|
||||
freq, normBits, err = i.readFreqNorm()
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
return 0, 0, 0, nil, nil, err
|
||||
}
|
||||
|
||||
endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
|
||||
bytesFreqNorm := i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
|
||||
bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
|
||||
|
||||
var bytesLoc []byte
|
||||
if i.locBitmap.Contains(uint32(docNum)) {
|
||||
startLoc := len(i.currChunkLoc) - i.locReader.Len()
|
||||
|
||||
for j := uint64(0); j < freq; j++ {
|
||||
err := i.readLocation(nil)
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
return 0, 0, 0, nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -420,7 +527,7 @@ func (i *PostingsIterator) nextBytes() (uint64, []byte, []byte, error) {
|
|||
bytesLoc = i.currChunkLoc[startLoc:endLoc]
|
||||
}
|
||||
|
||||
return docNum, bytesFreqNorm, bytesLoc, nil
|
||||
return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil
|
||||
}
|
||||
|
||||
// nextDocNum returns the next docNum on the postings list, and also
|
||||
|
@ -431,8 +538,13 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) {
|
|||
}
|
||||
|
||||
n := i.actual.Next()
|
||||
nChunk := n / i.postings.sb.chunkFactor
|
||||
allN := i.all.Next()
|
||||
|
||||
if i.normBits1Hit != 0 {
|
||||
return uint64(n), true, nil
|
||||
}
|
||||
|
||||
nChunk := n / i.postings.sb.chunkFactor
|
||||
allNChunk := allN / i.postings.sb.chunkFactor
|
||||
|
||||
// n is the next actual hit (excluding some postings)
|
||||
|
|
Loading…
Reference in New Issue