From 49a4ee60ba1d9569403a9c980aa87c5131048bf3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 23 Mar 2018 10:01:30 -0700 Subject: [PATCH] Revert "scorch zap replace locsBitmap w/ 1 bit from freq-norm varint encoding" Testing with the cbft application led to cbft process exits... AsyncError exit()... error reading location field: EOF -- main.initBleveOptions.func1() at init_bleve.go:85 This reverts commit 621b58dd8341232c01653db36372f0e0361bc90f. --- cmd/bleve/cmd/zap/explore.go | 6 +++ index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/dict.go | 5 ++ index/scorch/segment/zap/merge.go | 26 +++++++--- index/scorch/segment/zap/new.go | 42 +++++++++++---- index/scorch/segment/zap/posting.go | 79 +++++++++++++++-------------- 6 files changed, 103 insertions(+), 57 deletions(-) diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index 0c2471ed..225b7373 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -81,6 +81,10 @@ var exploreCmd = &cobra.Command{ locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) + var locBitmapAddr uint64 + locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) + n += uint64(read) + var postingListLen uint64 postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) @@ -127,6 +131,8 @@ var exploreCmd = &cobra.Command{ running2 += offset } + fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr) + } else { fmt.Printf("dictionary does not contain term '%s'\n", args[2]) } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 9e9d787b..20b892ca 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 7 +const version uint32 = 6 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 38b4faca..3b8132f2 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -72,10 +72,15 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) if postings != nil { postings.Clear() } + locBitmap := rv.locBitmap + if locBitmap != nil { + locBitmap.Clear() + } *rv = PostingsList{} // clear the struct rv.postings = postings + rv.locBitmap = locBitmap } rv.sb = d.sb rv.except = except diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 51dd7420..1da5e526 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -259,8 +259,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings(newRoaring, - tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) + postingsOffset, err := writePostings( + newRoaring, newRoaringLocs, tfEncoder, locEncoder, + use1HitEncoding, w, bufMaxVarintLen64) if err != nil { return err } @@ -422,14 +423,12 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po nextFreq := next.Frequency() nextNorm := uint64(math.Float32bits(float32(next.Norm()))) - locs := next.Locations() - - err = tfEncoder.Add(hitNewDocNum, - encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) + err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm) if err != nil { return 0, 0, 0, nil, err } + locs := next.Locations() if len(locs) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) @@ -504,7 +503,8 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, return lastDocNum, lastFreq, lastNorm, err } -func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, +func writePostings(postings, postingLocs *roaring.Bitmap, + tfEncoder, locEncoder *chunkedIntCoder, use1HitEncoding func(uint64) (bool, uint64, uint64), w *CountHashWriter, bufMaxVarintLen64 []byte) ( offset uint64, err error) { @@ -532,6 +532,12 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo return 0, err } + postingLocsOffset := uint64(w.Count()) + _, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + postingsOffset := uint64(w.Count()) n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) @@ -546,6 +552,12 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo return 0, err } + n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) if err != nil { return 0, err diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 5837436f..7d098349 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -103,6 +103,9 @@ type interim struct { // postings id -> bitmap of docNums Postings []*roaring.Bitmap + // postings id -> bitmap of docNums that have locations + PostingsLocs []*roaring.Bitmap + // postings id -> freq/norm's, one for each docNum in postings FreqNorms [][]interimFreqNorm freqNormsBacking []interimFreqNorm @@ -148,6 +151,10 @@ func (s *interim) reset() (err error) { idn.Clear() } s.Postings = s.Postings[:0] + for _, idn := range s.PostingsLocs { + idn.Clear() + } + s.PostingsLocs = s.PostingsLocs[:0] s.FreqNorms = s.FreqNorms[:0] for i := range s.freqNormsBacking { s.freqNormsBacking[i] = interimFreqNorm{} @@ -189,9 +196,8 @@ type interimStoredField struct { } type interimFreqNorm struct { - freq uint64 - norm float32 - hasLocs bool + freq uint64 + norm float32 } type interimLoc struct { @@ -350,6 +356,19 @@ func (s *interim) prepareDicts() { s.Postings = postings } + if cap(s.PostingsLocs) >= numPostingsLists { + s.PostingsLocs = s.PostingsLocs[:numPostingsLists] + } else { + postingsLocs := make([]*roaring.Bitmap, numPostingsLists) + copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)]) + for i := 0; i < numPostingsLists; i++ { + if postingsLocs[i] == nil { + postingsLocs[i] = roaring.New() + } + } + s.PostingsLocs = postingsLocs + } + if cap(s.FreqNorms) >= numPostingsLists { s.FreqNorms = s.FreqNorms[:numPostingsLists] } else { @@ -445,12 +464,14 @@ func (s *interim) processDocument(docNum uint64, s.FreqNorms[pid] = append(s.FreqNorms[pid], interimFreqNorm{ - freq: uint64(tf.Frequency()), - norm: norm, - hasLocs: len(tf.Locations) > 0, + freq: uint64(tf.Frequency()), + norm: norm, }) if len(tf.Locations) > 0 { + locBS := s.PostingsLocs[pid] + locBS.Add(uint32(docNum)) + locs := s.Locs[pid] for _, loc := range tf.Locations { @@ -604,6 +625,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err pid := dict[term] - 1 postingsBS := s.Postings[pid] + postingsLocsBS := s.PostingsLocs[pid] freqNorms := s.FreqNorms[pid] freqNormOffset := 0 @@ -617,8 +639,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err freqNorm := freqNorms[freqNormOffset] - err = tfEncoder.Add(docNum, - encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs), + err = tfEncoder.Add(docNum, freqNorm.freq, uint64(math.Float32bits(freqNorm.norm))) if err != nil { return 0, nil, err @@ -654,8 +675,9 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err tfEncoder.Close() locEncoder.Close() - postingsOffset, err := - writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) + postingsOffset, err := writePostings( + postingsBS, postingsLocsBS, tfEncoder, locEncoder, + nil, s.w, buf) if err != nil { return 0, nil, err } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 004b8031..f5ccad1a 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -100,6 +100,7 @@ type PostingsList struct { postingsOffset uint64 freqOffset uint64 locOffset uint64 + locBitmap *roaring.Bitmap postings *roaring.Bitmap except *roaring.Bitmap @@ -221,6 +222,8 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } rv.locChunkStart = p.locOffset + n + rv.locBitmap = p.locBitmap + rv.all = p.postings.Iterator() if p.except != nil { allExcept := roaring.AndNot(p.postings, p.except) @@ -268,6 +271,23 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) + var locBitmapOffset uint64 + locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var locBitmapLen uint64 + locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) + + locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] + + if rv.locBitmap == nil { + rv.locBitmap = roaring.NewBitmap() + } + _, err := rv.locBitmap.FromBuffer(locRoaringBytes) + if err != nil { + return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) + } + var postingsLen uint64 postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) @@ -277,7 +297,7 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { if rv.postings == nil { rv.postings = roaring.NewBitmap() } - _, err := rv.postings.FromBuffer(roaringBytes) + _, err = rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) } @@ -314,6 +334,8 @@ type PostingsIterator struct { locChunkOffsets []uint64 locChunkStart uint64 + locBitmap *roaring.Bitmap + next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls @@ -331,6 +353,10 @@ func (i *PostingsIterator) Size() int { len(i.locChunkOffsets)*size.SizeOfUint64 + i.next.Size() + if i.locBitmap != nil { + sizeInBytes += int(i.locBitmap.GetSizeInBytes()) + } + for _, entry := range i.nextLocs { sizeInBytes += entry.Size() } @@ -371,37 +397,20 @@ func (i *PostingsIterator) loadChunk(chunk int) error { return nil } -func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { +func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { if i.normBits1Hit != 0 { - return 1, i.normBits1Hit, false, nil + return 1, i.normBits1Hit, nil } - freqHasLocs, err := i.freqNormDecoder.GetU64() + freq, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) + return 0, 0, fmt.Errorf("error reading frequency: %v", err) } - freq, hasLocs := decodeFreqHasLocs(freqHasLocs) - normBits, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, false, fmt.Errorf("error reading norm: %v", err) + return 0, 0, fmt.Errorf("error reading norm: %v", err) } - - return freq, normBits, hasLocs, err -} - -func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { - rv := freq << 1 - if hasLocs { - rv = rv | 0x01 // 0'th LSB encodes whether there are locations - } - return rv -} - -func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { - freq := freqHasLocs >> 1 - hasLocs := freqHasLocs&0x01 != 0 - return freq, hasLocs + return freq, normBits, err } // readLocation processes all the integers on the stream representing a single @@ -475,16 +484,13 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { rv.docNum = docNum var normBits uint64 - var hasLocs bool - - rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + rv.freq, normBits, err = i.readFreqNorm() if err != nil { return nil, err } - rv.norm = math.Float32frombits(uint32(normBits)) - if hasLocs { + if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ -508,8 +514,6 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return rv, nil } -var freqHasLocs1Hit = encodeFreqHasLocs(1, false) - // nextBytes returns the docNum and the encoded freq & loc bytes for // the next posting func (i *PostingsIterator) nextBytes() ( @@ -524,16 +528,14 @@ func (i *PostingsIterator) nextBytes() ( if i.buf == nil { i.buf = make([]byte, binary.MaxVarintLen64*2) } - n := binary.PutUvarint(i.buf, freqHasLocs1Hit) + n := binary.PutUvarint(i.buf, uint64(1)) n += binary.PutUvarint(i.buf, i.normBits1Hit) return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - var hasLocs bool - - freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + freq, normBits, err = i.readFreqNorm() if err != nil { return 0, 0, 0, nil, nil, err } @@ -541,7 +543,7 @@ func (i *PostingsIterator) nextBytes() ( endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - if hasLocs { + if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { startLoc := len(i.currChunkLoc) - i.locReader.Len() for j := uint64(0); j < freq; j++ { @@ -594,12 +596,11 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } // read off freq/offsets even though we don't care about them - freq, _, hasLocs, err := i.readFreqNormHasLocs() + freq, _, err := i.readFreqNorm() if err != nil { return 0, false, err } - - if hasLocs { + if i.locBitmap.Contains(allN) { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil {