From 7a19e6fd7e807171abbc2f823f3a12b037fefb89 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 23 Mar 2018 12:40:02 -0700 Subject: [PATCH] scorch zap replace locsBitmap w/ 1 bit from freq-norm varint encoding This is attempt #2 of the optimization that replaces the locsBitmap, without any changes from the original commit attempt. A commit that follows this one contains the actual fix. See also... - commit 621b58dd834123 (the 1st attempt) - commit 49a4ee60ba1d95 (the revert) ------------- The original commit message body from 621b58 was... NOTE: this is a zap file format change. The separate "postings locations" roaring Bitmap that encoded whether a posting has locations info is now replaced by the least significant bit in the freq varint encoded in the freq-norm chunkedIntCoder. encode/decodeFreqHasLocs() are added as helper functions. --- cmd/bleve/cmd/zap/explore.go | 6 --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/dict.go | 5 -- index/scorch/segment/zap/merge.go | 26 +++------- index/scorch/segment/zap/new.go | 42 ++++----------- index/scorch/segment/zap/posting.go | 79 ++++++++++++++--------------- 6 files changed, 57 insertions(+), 103 deletions(-) diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index 225b7373..0c2471ed 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -81,10 +81,6 @@ var exploreCmd = &cobra.Command{ locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) - var locBitmapAddr uint64 - locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) - n += uint64(read) - var postingListLen uint64 postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) @@ -131,8 +127,6 @@ var exploreCmd = &cobra.Command{ running2 += offset } - fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr) - } else { fmt.Printf("dictionary does not contain term '%s'\n", args[2]) } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 20b892ca..9e9d787b 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 6 +const version uint32 = 7 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3b8132f2..38b4faca 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -72,15 +72,10 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) if postings != nil { postings.Clear() } - locBitmap := rv.locBitmap - if locBitmap != nil { - locBitmap.Clear() - } *rv = PostingsList{} // clear the struct rv.postings = postings - rv.locBitmap = locBitmap } rv.sb = d.sb rv.except = except diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1da5e526..51dd7420 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -259,9 +259,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings( - newRoaring, newRoaringLocs, tfEncoder, locEncoder, - use1HitEncoding, w, bufMaxVarintLen64) + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) if err != nil { return err } @@ -423,12 +422,14 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po nextFreq := next.Frequency() nextNorm := uint64(math.Float32bits(float32(next.Norm()))) - err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm) + locs := next.Locations() + + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) if err != nil { return 0, 0, 0, nil, err } - locs := next.Locations() if len(locs) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) @@ -503,8 +504,7 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, return lastDocNum, lastFreq, lastNorm, err } -func writePostings(postings, postingLocs *roaring.Bitmap, - tfEncoder, locEncoder *chunkedIntCoder, +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, use1HitEncoding func(uint64) (bool, uint64, uint64), w *CountHashWriter, bufMaxVarintLen64 []byte) ( offset uint64, err error) { @@ -532,12 +532,6 @@ func writePostings(postings, postingLocs *roaring.Bitmap, return 0, err } - postingLocsOffset := uint64(w.Count()) - _, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64) - if err != nil { - return 0, err - } - postingsOffset := uint64(w.Count()) n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) @@ -552,12 +546,6 @@ func writePostings(postings, postingLocs *roaring.Bitmap, return 0, err } - n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return 0, err - } - _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) if err != nil { return 0, err diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 7d098349..5837436f 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -103,9 +103,6 @@ type interim struct { // postings id -> bitmap of docNums Postings []*roaring.Bitmap - // postings id -> bitmap of docNums that have locations - PostingsLocs []*roaring.Bitmap - // postings id -> freq/norm's, one for each docNum in postings FreqNorms [][]interimFreqNorm freqNormsBacking []interimFreqNorm @@ -151,10 +148,6 @@ func (s *interim) reset() (err error) { idn.Clear() } s.Postings = s.Postings[:0] - for _, idn := range s.PostingsLocs { - idn.Clear() - } - s.PostingsLocs = s.PostingsLocs[:0] s.FreqNorms = s.FreqNorms[:0] for i := range s.freqNormsBacking { s.freqNormsBacking[i] = interimFreqNorm{} @@ -196,8 +189,9 @@ type interimStoredField struct { } type interimFreqNorm struct { - freq uint64 - norm float32 + freq uint64 + norm float32 + hasLocs bool } type interimLoc struct { @@ -356,19 +350,6 @@ func (s *interim) prepareDicts() { s.Postings = postings } - if cap(s.PostingsLocs) >= numPostingsLists { - s.PostingsLocs = s.PostingsLocs[:numPostingsLists] - } else { - postingsLocs := make([]*roaring.Bitmap, numPostingsLists) - copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)]) - for i := 0; i < numPostingsLists; i++ { - if postingsLocs[i] == nil { - postingsLocs[i] = roaring.New() - } - } - s.PostingsLocs = postingsLocs - } - if cap(s.FreqNorms) >= numPostingsLists { s.FreqNorms = s.FreqNorms[:numPostingsLists] } else { @@ -464,14 +445,12 @@ func (s *interim) processDocument(docNum uint64, s.FreqNorms[pid] = append(s.FreqNorms[pid], interimFreqNorm{ - freq: uint64(tf.Frequency()), - norm: norm, + freq: uint64(tf.Frequency()), + norm: norm, + hasLocs: len(tf.Locations) > 0, }) if len(tf.Locations) > 0 { - locBS := s.PostingsLocs[pid] - locBS.Add(uint32(docNum)) - locs := s.Locs[pid] for _, loc := range tf.Locations { @@ -625,7 +604,6 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err pid := dict[term] - 1 postingsBS := s.Postings[pid] - postingsLocsBS := s.PostingsLocs[pid] freqNorms := s.FreqNorms[pid] freqNormOffset := 0 @@ -639,7 +617,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err freqNorm := freqNorms[freqNormOffset] - err = tfEncoder.Add(docNum, freqNorm.freq, + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs), uint64(math.Float32bits(freqNorm.norm))) if err != nil { return 0, nil, err @@ -675,9 +654,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings( - postingsBS, postingsLocsBS, tfEncoder, locEncoder, - nil, s.w, buf) + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) if err != nil { return 0, nil, err } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index f5ccad1a..004b8031 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -100,7 +100,6 @@ type PostingsList struct { postingsOffset uint64 freqOffset uint64 locOffset uint64 - locBitmap *roaring.Bitmap postings *roaring.Bitmap except *roaring.Bitmap @@ -222,8 +221,6 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } rv.locChunkStart = p.locOffset + n - rv.locBitmap = p.locBitmap - rv.all = p.postings.Iterator() if p.except != nil { allExcept := roaring.AndNot(p.postings, p.except) @@ -271,23 +268,6 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) - var locBitmapOffset uint64 - locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - var locBitmapLen uint64 - locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) - - locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] - - if rv.locBitmap == nil { - rv.locBitmap = roaring.NewBitmap() - } - _, err := rv.locBitmap.FromBuffer(locRoaringBytes) - if err != nil { - return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) - } - var postingsLen uint64 postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) @@ -297,7 +277,7 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { if rv.postings == nil { rv.postings = roaring.NewBitmap() } - _, err = rv.postings.FromBuffer(roaringBytes) + _, err := rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) } @@ -334,8 +314,6 @@ type PostingsIterator struct { locChunkOffsets []uint64 locChunkStart uint64 - locBitmap *roaring.Bitmap - next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls @@ -353,10 +331,6 @@ func (i *PostingsIterator) Size() int { len(i.locChunkOffsets)*size.SizeOfUint64 + i.next.Size() - if i.locBitmap != nil { - sizeInBytes += int(i.locBitmap.GetSizeInBytes()) - } - for _, entry := range i.nextLocs { sizeInBytes += entry.Size() } @@ -397,20 +371,37 @@ func (i *PostingsIterator) loadChunk(chunk int) error { return nil } -func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { if i.normBits1Hit != 0 { - return 1, i.normBits1Hit, nil + return 1, i.normBits1Hit, false, nil } - freq, err := i.freqNormDecoder.GetU64() + freqHasLocs, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, fmt.Errorf("error reading frequency: %v", err) + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) } + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + normBits, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, fmt.Errorf("error reading norm: %v", err) + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) } - return freq, normBits, err + + return freq, normBits, hasLocs, err +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs } // readLocation processes all the integers on the stream representing a single @@ -484,13 +475,16 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { rv.docNum = docNum var normBits uint64 - rv.freq, normBits, err = i.readFreqNorm() + var hasLocs bool + + rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return nil, err } + rv.norm = math.Float32frombits(uint32(normBits)) - if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { + if hasLocs { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ -514,6 +508,8 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return rv, nil } +var freqHasLocs1Hit = encodeFreqHasLocs(1, false) + // nextBytes returns the docNum and the encoded freq & loc bytes for // the next posting func (i *PostingsIterator) nextBytes() ( @@ -528,14 +524,16 @@ func (i *PostingsIterator) nextBytes() ( if i.buf == nil { i.buf = make([]byte, binary.MaxVarintLen64*2) } - n := binary.PutUvarint(i.buf, uint64(1)) + n := binary.PutUvarint(i.buf, freqHasLocs1Hit) n += binary.PutUvarint(i.buf, i.normBits1Hit) return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - freq, normBits, err = i.readFreqNorm() + var hasLocs bool + + freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return 0, 0, 0, nil, nil, err } @@ -543,7 +541,7 @@ func (i *PostingsIterator) nextBytes() ( endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { + if hasLocs { startLoc := len(i.currChunkLoc) - i.locReader.Len() for j := uint64(0); j < freq; j++ { @@ -596,11 +594,12 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } // read off freq/offsets even though we don't care about them - freq, _, err := i.readFreqNorm() + freq, _, hasLocs, err := i.readFreqNormHasLocs() if err != nil { return 0, false, err } - if i.locBitmap.Contains(allN) { + + if hasLocs { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil {