0
0
Fork 0

Merge pull request #856 from steveyen/revert-locsBitmap-replacement

Revert "scorch zap replace locsBitmap w/ 1 bit from freq-norm varint …
This commit is contained in:
Steve Yen 2018-03-23 11:20:45 -07:00 committed by GitHub
commit 1f7faf7e01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 103 additions and 57 deletions

View File

@ -81,6 +81,10 @@ var exploreCmd = &cobra.Command{
locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
n += uint64(read)
var locBitmapAddr uint64
locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
n += uint64(read)
var postingListLen uint64
postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
n += uint64(read)
@ -127,6 +131,8 @@ var exploreCmd = &cobra.Command{
running2 += offset
}
fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr)
} else {
fmt.Printf("dictionary does not contain term '%s'\n", args[2])
}

View File

@ -22,7 +22,7 @@ import (
"github.com/Smerity/govarint"
)
const version uint32 = 7
const version uint32 = 6
const fieldNotUninverted = math.MaxUint64

View File

@ -72,10 +72,15 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap)
if postings != nil {
postings.Clear()
}
locBitmap := rv.locBitmap
if locBitmap != nil {
locBitmap.Clear()
}
*rv = PostingsList{} // clear the struct
rv.postings = postings
rv.locBitmap = locBitmap
}
rv.sb = d.sb
rv.except = except

View File

@ -259,8 +259,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
tfEncoder.Close()
locEncoder.Close()
postingsOffset, err := writePostings(newRoaring,
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
postingsOffset, err := writePostings(
newRoaring, newRoaringLocs, tfEncoder, locEncoder,
use1HitEncoding, w, bufMaxVarintLen64)
if err != nil {
return err
}
@ -422,14 +423,12 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po
nextFreq := next.Frequency()
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
locs := next.Locations()
err = tfEncoder.Add(hitNewDocNum,
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm)
err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm)
if err != nil {
return 0, 0, 0, nil, err
}
locs := next.Locations()
if len(locs) > 0 {
newRoaringLocs.Add(uint32(hitNewDocNum))
@ -504,7 +503,8 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
return lastDocNum, lastFreq, lastNorm, err
}
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
func writePostings(postings, postingLocs *roaring.Bitmap,
tfEncoder, locEncoder *chunkedIntCoder,
use1HitEncoding func(uint64) (bool, uint64, uint64),
w *CountHashWriter, bufMaxVarintLen64 []byte) (
offset uint64, err error) {
@ -532,6 +532,12 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo
return 0, err
}
postingLocsOffset := uint64(w.Count())
_, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64)
if err != nil {
return 0, err
}
postingsOffset := uint64(w.Count())
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset)
@ -546,6 +552,12 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo
return 0, err
}
n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset)
_, err = w.Write(bufMaxVarintLen64[:n])
if err != nil {
return 0, err
}
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64)
if err != nil {
return 0, err

View File

@ -103,6 +103,9 @@ type interim struct {
// postings id -> bitmap of docNums
Postings []*roaring.Bitmap
// postings id -> bitmap of docNums that have locations
PostingsLocs []*roaring.Bitmap
// postings id -> freq/norm's, one for each docNum in postings
FreqNorms [][]interimFreqNorm
freqNormsBacking []interimFreqNorm
@ -148,6 +151,10 @@ func (s *interim) reset() (err error) {
idn.Clear()
}
s.Postings = s.Postings[:0]
for _, idn := range s.PostingsLocs {
idn.Clear()
}
s.PostingsLocs = s.PostingsLocs[:0]
s.FreqNorms = s.FreqNorms[:0]
for i := range s.freqNormsBacking {
s.freqNormsBacking[i] = interimFreqNorm{}
@ -189,9 +196,8 @@ type interimStoredField struct {
}
type interimFreqNorm struct {
freq uint64
norm float32
hasLocs bool
freq uint64
norm float32
}
type interimLoc struct {
@ -350,6 +356,19 @@ func (s *interim) prepareDicts() {
s.Postings = postings
}
if cap(s.PostingsLocs) >= numPostingsLists {
s.PostingsLocs = s.PostingsLocs[:numPostingsLists]
} else {
postingsLocs := make([]*roaring.Bitmap, numPostingsLists)
copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)])
for i := 0; i < numPostingsLists; i++ {
if postingsLocs[i] == nil {
postingsLocs[i] = roaring.New()
}
}
s.PostingsLocs = postingsLocs
}
if cap(s.FreqNorms) >= numPostingsLists {
s.FreqNorms = s.FreqNorms[:numPostingsLists]
} else {
@ -445,12 +464,14 @@ func (s *interim) processDocument(docNum uint64,
s.FreqNorms[pid] = append(s.FreqNorms[pid],
interimFreqNorm{
freq: uint64(tf.Frequency()),
norm: norm,
hasLocs: len(tf.Locations) > 0,
freq: uint64(tf.Frequency()),
norm: norm,
})
if len(tf.Locations) > 0 {
locBS := s.PostingsLocs[pid]
locBS.Add(uint32(docNum))
locs := s.Locs[pid]
for _, loc := range tf.Locations {
@ -604,6 +625,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
pid := dict[term] - 1
postingsBS := s.Postings[pid]
postingsLocsBS := s.PostingsLocs[pid]
freqNorms := s.FreqNorms[pid]
freqNormOffset := 0
@ -617,8 +639,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
freqNorm := freqNorms[freqNormOffset]
err = tfEncoder.Add(docNum,
encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs),
err = tfEncoder.Add(docNum, freqNorm.freq,
uint64(math.Float32bits(freqNorm.norm)))
if err != nil {
return 0, nil, err
@ -654,8 +675,9 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err
tfEncoder.Close()
locEncoder.Close()
postingsOffset, err :=
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
postingsOffset, err := writePostings(
postingsBS, postingsLocsBS, tfEncoder, locEncoder,
nil, s.w, buf)
if err != nil {
return 0, nil, err
}

View File

@ -100,6 +100,7 @@ type PostingsList struct {
postingsOffset uint64
freqOffset uint64
locOffset uint64
locBitmap *roaring.Bitmap
postings *roaring.Bitmap
except *roaring.Bitmap
@ -221,6 +222,8 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
}
rv.locChunkStart = p.locOffset + n
rv.locBitmap = p.locBitmap
rv.all = p.postings.Iterator()
if p.except != nil {
allExcept := roaring.AndNot(p.postings, p.except)
@ -268,6 +271,23 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
var locBitmapOffset uint64
locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
var locBitmapLen uint64
locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64])
locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
if rv.locBitmap == nil {
rv.locBitmap = roaring.NewBitmap()
}
_, err := rv.locBitmap.FromBuffer(locRoaringBytes)
if err != nil {
return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err)
}
var postingsLen uint64
postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
@ -277,7 +297,7 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
if rv.postings == nil {
rv.postings = roaring.NewBitmap()
}
_, err := rv.postings.FromBuffer(roaringBytes)
_, err = rv.postings.FromBuffer(roaringBytes)
if err != nil {
return fmt.Errorf("error loading roaring bitmap: %v", err)
}
@ -314,6 +334,8 @@ type PostingsIterator struct {
locChunkOffsets []uint64
locChunkStart uint64
locBitmap *roaring.Bitmap
next Posting // reused across Next() calls
nextLocs []Location // reused across Next() calls
@ -331,6 +353,10 @@ func (i *PostingsIterator) Size() int {
len(i.locChunkOffsets)*size.SizeOfUint64 +
i.next.Size()
if i.locBitmap != nil {
sizeInBytes += int(i.locBitmap.GetSizeInBytes())
}
for _, entry := range i.nextLocs {
sizeInBytes += entry.Size()
}
@ -371,37 +397,20 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
return nil
}
func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) {
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
if i.normBits1Hit != 0 {
return 1, i.normBits1Hit, false, nil
return 1, i.normBits1Hit, nil
}
freqHasLocs, err := i.freqNormDecoder.GetU64()
freq, err := i.freqNormDecoder.GetU64()
if err != nil {
return 0, 0, false, fmt.Errorf("error reading frequency: %v", err)
return 0, 0, fmt.Errorf("error reading frequency: %v", err)
}
freq, hasLocs := decodeFreqHasLocs(freqHasLocs)
normBits, err := i.freqNormDecoder.GetU64()
if err != nil {
return 0, 0, false, fmt.Errorf("error reading norm: %v", err)
return 0, 0, fmt.Errorf("error reading norm: %v", err)
}
return freq, normBits, hasLocs, err
}
func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 {
rv := freq << 1
if hasLocs {
rv = rv | 0x01 // 0'th LSB encodes whether there are locations
}
return rv
}
func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) {
freq := freqHasLocs >> 1
hasLocs := freqHasLocs&0x01 != 0
return freq, hasLocs
return freq, normBits, err
}
// readLocation processes all the integers on the stream representing a single
@ -475,16 +484,13 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
rv.docNum = docNum
var normBits uint64
var hasLocs bool
rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
rv.freq, normBits, err = i.readFreqNorm()
if err != nil {
return nil, err
}
rv.norm = math.Float32frombits(uint32(normBits))
if hasLocs {
if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) {
// read off 'freq' locations, into reused slices
if cap(i.nextLocs) >= int(rv.freq) {
i.nextLocs = i.nextLocs[0:rv.freq]
@ -508,8 +514,6 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
return rv, nil
}
var freqHasLocs1Hit = encodeFreqHasLocs(1, false)
// nextBytes returns the docNum and the encoded freq & loc bytes for
// the next posting
func (i *PostingsIterator) nextBytes() (
@ -524,16 +528,14 @@ func (i *PostingsIterator) nextBytes() (
if i.buf == nil {
i.buf = make([]byte, binary.MaxVarintLen64*2)
}
n := binary.PutUvarint(i.buf, freqHasLocs1Hit)
n := binary.PutUvarint(i.buf, uint64(1))
n += binary.PutUvarint(i.buf, i.normBits1Hit)
return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil
}
startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
var hasLocs bool
freq, normBits, hasLocs, err = i.readFreqNormHasLocs()
freq, normBits, err = i.readFreqNorm()
if err != nil {
return 0, 0, 0, nil, nil, err
}
@ -541,7 +543,7 @@ func (i *PostingsIterator) nextBytes() (
endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
if hasLocs {
if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) {
startLoc := len(i.currChunkLoc) - i.locReader.Len()
for j := uint64(0); j < freq; j++ {
@ -594,12 +596,11 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) {
}
// read off freq/offsets even though we don't care about them
freq, _, hasLocs, err := i.readFreqNormHasLocs()
freq, _, err := i.readFreqNorm()
if err != nil {
return 0, false, err
}
if hasLocs {
if i.locBitmap.Contains(allN) {
for j := 0; j < int(freq); j++ {
err := i.readLocation(nil)
if err != nil {