scorch zap replace locsBitmap w/ 1 bit from freq-norm varint encoding
NOTE: this is a zap file format change. The separate "postings locations" roaring bitmap, which recorded whether each posting has location info, is replaced by the least significant bit of the freq varint written by the freq-norm chunkedIntCoder. encodeFreqHasLocs() and decodeFreqHasLocs() are added as helper functions for packing and unpacking that bit, and the zap version constant is bumped from 6 to 7.
This commit is contained in:
parent
a7c4237d00
commit
621b58dd83
|
@ -81,10 +81,6 @@ var exploreCmd = &cobra.Command{
|
|||
locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
||||
var locBitmapAddr uint64
|
||||
locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
||||
var postingListLen uint64
|
||||
postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
@ -131,8 +127,6 @@ var exploreCmd = &cobra.Command{
|
|||
running2 += offset
|
||||
}
|
||||
|
||||
fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr)
|
||||
|
||||
} else {
|
||||
fmt.Printf("dictionary does not contain term '%s'\n", args[2])
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ import (
|
|||
"github.com/Smerity/govarint"
|
||||
)
|
||||
|
||||
const version uint32 = 6
|
||||
const version uint32 = 7
|
||||
|
||||
const fieldNotUninverted = math.MaxUint64
|
||||
|
||||
|
|
|
@ -72,15 +72,10 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap)
|
|||
if postings != nil {
|
||||
postings.Clear()
|
||||
}
|
||||
locBitmap := rv.locBitmap
|
||||
if locBitmap != nil {
|
||||
locBitmap.Clear()
|
||||
}
|
||||
|
||||
*rv = PostingsList{} // clear the struct
|
||||
|
||||
rv.postings = postings
|
||||
rv.locBitmap = locBitmap
|
||||
}
|
||||
rv.sb = d.sb
|
||||
rv.except = except
|
||||
|
|
|
@ -259,9 +259,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
postingsOffset, err := writePostings(
|
||||
newRoaring, newRoaringLocs, tfEncoder, locEncoder,
|
||||
use1HitEncoding, w, bufMaxVarintLen64)
|
||||
postingsOffset, err := writePostings(newRoaring,
|
||||
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -423,12 +422,14 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po
|
|||
nextFreq := next.Frequency()
|
||||
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
|
||||
|
||||
err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm)
|
||||
locs := next.Locations()
|
||||
|
||||
err = tfEncoder.Add(hitNewDocNum,
|
||||
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
locs := next.Locations()
|
||||
if len(locs) > 0 {
|
||||
newRoaringLocs.Add(uint32(hitNewDocNum))
|
||||
|
||||
|
@ -503,8 +504,7 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
|
|||
return lastDocNum, lastFreq, lastNorm, err
|
||||
}
|
||||
|
||||
func writePostings(postings, postingLocs *roaring.Bitmap,
|
||||
tfEncoder, locEncoder *chunkedIntCoder,
|
||||
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
|
||||
use1HitEncoding func(uint64) (bool, uint64, uint64),
|
||||
w *CountHashWriter, bufMaxVarintLen64 []byte) (
|
||||
offset uint64, err error) {
|
||||
|
@ -532,12 +532,6 @@ func writePostings(postings, postingLocs *roaring.Bitmap,
|
|||
return 0, err
|
||||
}
|
||||
|
||||
postingLocsOffset := uint64(w.Count())
|
||||
_, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
postingsOffset := uint64(w.Count())
|
||||
|
||||
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset)
|
||||
|
@ -552,12 +546,6 @@ func writePostings(postings, postingLocs *roaring.Bitmap,
|
|||
return 0, err
|
||||
}
|
||||
|
||||
n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
|
|
|
@ -103,9 +103,6 @@ type interim struct {
|
|||
// postings id -> bitmap of docNums
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// postings id -> bitmap of docNums that have locations
|
||||
PostingsLocs []*roaring.Bitmap
|
||||
|
||||
// postings id -> freq/norm's, one for each docNum in postings
|
||||
FreqNorms [][]interimFreqNorm
|
||||
freqNormsBacking []interimFreqNorm
|
||||
|
@ -151,10 +148,6 @@ func (s *interim) reset() (err error) {
|
|||
idn.Clear()
|
||||
}
|
||||
s.Postings = s.Postings[:0]
|
||||
for _, idn := range s.PostingsLocs {
|
||||
idn.Clear()
|
||||
}
|
||||
s.PostingsLocs = s.PostingsLocs[:0]
|
||||
s.FreqNorms = s.FreqNorms[:0]
|
||||
for i := range s.freqNormsBacking {
|
||||
s.freqNormsBacking[i] = interimFreqNorm{}
|
||||
|
@ -198,6 +191,7 @@ type interimStoredField struct {
|
|||
// interimFreqNorm holds the per-posting values that writeDicts later
// passes to the freq/norm encoder for one docNum of a postings list.
type interimFreqNorm struct {
|
||||
// freq is the term frequency, taken from tf.Frequency().
freq uint64
|
||||
// norm is the norm value; it is written as its float32 bit pattern.
norm float32
|
||||
// hasLocs is true when the term occurrence carried at least one
// location (len(tf.Locations) > 0); it is folded into the encoded
// freq value via encodeFreqHasLocs.
hasLocs bool
|
||||
}
|
||||
|
||||
type interimLoc struct {
|
||||
|
@ -356,19 +350,6 @@ func (s *interim) prepareDicts() {
|
|||
s.Postings = postings
|
||||
}
|
||||
|
||||
if cap(s.PostingsLocs) >= numPostingsLists {
|
||||
s.PostingsLocs = s.PostingsLocs[:numPostingsLists]
|
||||
} else {
|
||||
postingsLocs := make([]*roaring.Bitmap, numPostingsLists)
|
||||
copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)])
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
if postingsLocs[i] == nil {
|
||||
postingsLocs[i] = roaring.New()
|
||||
}
|
||||
}
|
||||
s.PostingsLocs = postingsLocs
|
||||
}
|
||||
|
||||
if cap(s.FreqNorms) >= numPostingsLists {
|
||||
s.FreqNorms = s.FreqNorms[:numPostingsLists]
|
||||
} else {
|
||||
|
@ -466,12 +447,10 @@ func (s *interim) processDocument(docNum uint64,
|
|||
interimFreqNorm{
|
||||
freq: uint64(tf.Frequency()),
|
||||
norm: norm,
|
||||
hasLocs: len(tf.Locations) > 0,
|
||||
})
|
||||
|
||||
if len(tf.Locations) > 0 {
|
||||
locBS := s.PostingsLocs[pid]
|
||||
locBS.Add(uint32(docNum))
|
||||
|
||||
locs := s.Locs[pid]
|
||||
|
||||
for _, loc := range tf.Locations {
|
||||
|
@ -625,7 +604,6 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
|
|||
pid := dict[term] - 1
|
||||
|
||||
postingsBS := s.Postings[pid]
|
||||
postingsLocsBS := s.PostingsLocs[pid]
|
||||
|
||||
freqNorms := s.FreqNorms[pid]
|
||||
freqNormOffset := 0
|
||||
|
@ -639,7 +617,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
|
|||
|
||||
freqNorm := freqNorms[freqNormOffset]
|
||||
|
||||
err = tfEncoder.Add(docNum, freqNorm.freq,
|
||||
err = tfEncoder.Add(docNum,
|
||||
encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs),
|
||||
uint64(math.Float32bits(freqNorm.norm)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
|
@ -675,9 +654,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
|
|||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
postingsOffset, err := writePostings(
|
||||
postingsBS, postingsLocsBS, tfEncoder, locEncoder,
|
||||
nil, s.w, buf)
|
||||
postingsOffset, err :=
|
||||
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
|
|
@ -100,7 +100,6 @@ type PostingsList struct {
|
|||
postingsOffset uint64
|
||||
freqOffset uint64
|
||||
locOffset uint64
|
||||
locBitmap *roaring.Bitmap
|
||||
postings *roaring.Bitmap
|
||||
except *roaring.Bitmap
|
||||
|
||||
|
@ -222,8 +221,6 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
|
|||
}
|
||||
rv.locChunkStart = p.locOffset + n
|
||||
|
||||
rv.locBitmap = p.locBitmap
|
||||
|
||||
rv.all = p.postings.Iterator()
|
||||
if p.except != nil {
|
||||
allExcept := roaring.AndNot(p.postings, p.except)
|
||||
|
@ -271,23 +268,6 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
|
|||
rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
||||
var locBitmapOffset uint64
|
||||
locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
||||
var locBitmapLen uint64
|
||||
locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64])
|
||||
|
||||
locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
|
||||
|
||||
if rv.locBitmap == nil {
|
||||
rv.locBitmap = roaring.NewBitmap()
|
||||
}
|
||||
_, err := rv.locBitmap.FromBuffer(locRoaringBytes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err)
|
||||
}
|
||||
|
||||
var postingsLen uint64
|
||||
postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
@ -297,7 +277,7 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
|
|||
if rv.postings == nil {
|
||||
rv.postings = roaring.NewBitmap()
|
||||
}
|
||||
_, err = rv.postings.FromBuffer(roaringBytes)
|
||||
_, err := rv.postings.FromBuffer(roaringBytes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error loading roaring bitmap: %v", err)
|
||||
}
|
||||
|
@ -334,8 +314,6 @@ type PostingsIterator struct {
|
|||
locChunkOffsets []uint64
|
||||
locChunkStart uint64
|
||||
|
||||
locBitmap *roaring.Bitmap
|
||||
|
||||
next Posting // reused across Next() calls
|
||||
nextLocs []Location // reused across Next() calls
|
||||
|
||||
|
@ -353,10 +331,6 @@ func (i *PostingsIterator) Size() int {
|
|||
len(i.locChunkOffsets)*size.SizeOfUint64 +
|
||||
i.next.Size()
|
||||
|
||||
if i.locBitmap != nil {
|
||||
sizeInBytes += int(i.locBitmap.GetSizeInBytes())
|
||||
}
|
||||
|
||||
for _, entry := range i.nextLocs {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
@ -397,20 +371,37 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
|
||||
func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) {
|
||||
if i.normBits1Hit != 0 {
|
||||
return 1, i.normBits1Hit, nil
|
||||
return 1, i.normBits1Hit, false, nil
|
||||
}
|
||||
|
||||
freq, err := i.freqNormDecoder.GetU64()
|
||||
freqHasLocs, err := i.freqNormDecoder.GetU64()
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("error reading frequency: %v", err)
|
||||
return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
|
||||
}
|
||||
freq, hasLocs := decodeFreqHasLocs(freqHasLocs)
|
||||
|
||||
normBits, err := i.freqNormDecoder.GetU64()
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("error reading norm: %v", err)
|
||||
return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
|
||||
}
|
||||
return freq, normBits, err
|
||||
|
||||
return freq, normBits, hasLocs, err
|
||||
}
|
||||
|
||||
// encodeFreqHasLocs packs a term frequency and a has-locations flag
// into a single uint64: freq is shifted left by one bit and the least
// significant bit is set when hasLocs is true. decodeFreqHasLocs
// reverses the packing. The top bit of freq is shifted out, so freq
// must fit in 63 bits to round-trip.
func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
|
||||
rv := freq << 1
|
||||
if hasLocs {
|
||||
rv = rv | 0x01 // 0'th LSB encodes whether there are locations
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// decodeFreqHasLocs splits a value produced by encodeFreqHasLocs back
// into its parts: the frequency (all bits above the lowest one) and
// the has-locations flag (the lowest bit).
func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) {
|
||||
freq := freqHasLocs >> 1
|
||||
hasLocs := freqHasLocs&0x01 != 0
|
||||
return freq, hasLocs
|
||||
}
|
||||
|
||||
// readLocation processes all the integers on the stream representing a single
|
||||
|
@ -484,13 +475,16 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
|
|||
rv.docNum = docNum
|
||||
|
||||
var normBits uint64
|
||||
rv.freq, normBits, err = i.readFreqNorm()
|
||||
var hasLocs bool
|
||||
|
||||
rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
rv.norm = math.Float32frombits(uint32(normBits))
|
||||
|
||||
if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) {
|
||||
if hasLocs {
|
||||
// read off 'freq' locations, into reused slices
|
||||
if cap(i.nextLocs) >= int(rv.freq) {
|
||||
i.nextLocs = i.nextLocs[0:rv.freq]
|
||||
|
@ -514,6 +508,8 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
|
|||
return rv, nil
|
||||
}
|
||||
|
||||
// freqHasLocs1Hit is the encoded freq/has-locations value for a
// posting with frequency 1 and no locations; nextBytes writes it for
// the 1-hit case.
var freqHasLocs1Hit = encodeFreqHasLocs(1, false)
|
||||
|
||||
// nextBytes returns the docNum and the encoded freq & loc bytes for
|
||||
// the next posting
|
||||
func (i *PostingsIterator) nextBytes() (
|
||||
|
@ -528,14 +524,16 @@ func (i *PostingsIterator) nextBytes() (
|
|||
if i.buf == nil {
|
||||
i.buf = make([]byte, binary.MaxVarintLen64*2)
|
||||
}
|
||||
n := binary.PutUvarint(i.buf, uint64(1))
|
||||
n := binary.PutUvarint(i.buf, freqHasLocs1Hit)
|
||||
n += binary.PutUvarint(i.buf, i.normBits1Hit)
|
||||
return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil
|
||||
}
|
||||
|
||||
startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
|
||||
|
||||
freq, normBits, err = i.readFreqNorm()
|
||||
var hasLocs bool
|
||||
|
||||
freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, nil, err
|
||||
}
|
||||
|
@ -543,7 +541,7 @@ func (i *PostingsIterator) nextBytes() (
|
|||
endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
|
||||
bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
|
||||
|
||||
if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) {
|
||||
if hasLocs {
|
||||
startLoc := len(i.currChunkLoc) - i.locReader.Len()
|
||||
|
||||
for j := uint64(0); j < freq; j++ {
|
||||
|
@ -596,11 +594,12 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) {
|
|||
}
|
||||
|
||||
// read off freq/offsets even though we don't care about them
|
||||
freq, _, err := i.readFreqNorm()
|
||||
freq, _, hasLocs, err := i.readFreqNormHasLocs()
|
||||
if err != nil {
|
||||
return 0, false, err
|
||||
}
|
||||
if i.locBitmap.Contains(allN) {
|
||||
|
||||
if hasLocs {
|
||||
for j := 0; j < int(freq); j++ {
|
||||
err := i.readLocation(nil)
|
||||
if err != nil {
|
||||
|
|
Loading…
Reference in New Issue