
scorch zap optimize to avoid bitmaps for 1-hit posting lists

This commit avoids creating a roaring.Bitmap (which would hold just a
single entry) when a postings list/iterator represents a "1-hit"
encoding.
Steve Yen 2018-03-09 23:19:47 -08:00
parent b38a61d4cf
commit 2a20a36e15
2 changed files with 80 additions and 61 deletions
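
For orientation before the diff itself: the change swaps a heap-allocated roaring bitmap that would hold exactly one entry for two plain uint64 fields on the postings list. A minimal standalone sketch of that trade, using made-up struct names rather than the zap types:

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

// before: even a single-document postings list paid for a heap-allocated bitmap.
type oneDocWithBitmap struct {
	postings *roaring.Bitmap
}

// after: a "1-hit" postings list carries its doc number and norm bits inline,
// with a non-zero normBits1Hit acting as the marker for this case.
type oneDocInline struct {
	docNum1Hit   uint64
	normBits1Hit uint64
}

func main() {
	old := oneDocWithBitmap{postings: roaring.NewBitmap()}
	old.postings.Add(42)

	hit := oneDocInline{docNum1Hit: 42, normBits1Hit: 1}

	fmt.Println(old.postings.GetCardinality(), hit.docNum1Hit) // 1 42
}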


@@ -92,6 +92,8 @@ func under32Bits(x uint64) bool {
return x <= mask31Bits
}
const docNum1HitFinished = math.MaxUint64
// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
sb *SegmentBase
@@ -102,8 +104,9 @@ type PostingsList struct {
postings *roaring.Bitmap
except *roaring.Bitmap
// when postingsOffset == freqOffset == 0, then the postings list
// represents a "1-hit" encoding, and has the following norm
// when normBits1Hit != 0, then this postings list came from a
// 1-hit encoding, and only the docNum1Hit & normBits1Hit apply
docNum1Hit uint64
normBits1Hit uint64
}
@@ -117,6 +120,17 @@ func (p *PostingsList) Size() int {
return sizeInBytes
}
func (p *PostingsList) OrInto(receiver *roaring.Bitmap) {
if p.normBits1Hit != 0 {
receiver.Add(uint32(p.docNum1Hit))
return
}
if p.postings != nil {
receiver.Or(p.postings)
}
}
// Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator() segment.PostingsIterator {
return p.iterator(nil)
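
The OrInto helper added above is what lets callers stay bitmap-free for 1-hit terms; the DocNumbers change in the second file switches to it. A hedged, self-contained usage sketch (orIntoer, oneHit, and unionAll are illustrative names, not part of the package):

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

// orIntoer is a hypothetical stand-in for the *PostingsList in this diff.
type orIntoer interface {
	OrInto(receiver *roaring.Bitmap)
}

// oneHit mimics the 1-hit branch of OrInto: add a single doc number directly.
type oneHit uint32

func (d oneHit) OrInto(receiver *roaring.Bitmap) { receiver.Add(uint32(d)) }

// unionAll gathers the doc numbers of several postings lists into one bitmap,
// letting each list decide whether to Add a single docNum or Or a whole bitmap.
func unionAll(lists []orIntoer) *roaring.Bitmap {
	rv := roaring.NewBitmap()
	for _, pl := range lists {
		pl.OrInto(rv)
	}
	return rv
}

func main() {
	fmt.Println(unionAll([]orIntoer{oneHit(3), oneHit(7)}).ToArray()) // [3 7]
}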
@@ -152,39 +166,47 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
}
rv.postings = p
if p.normBits1Hit != 0 {
// "1-hit" encoding
rv.docNum1Hit = p.docNum1Hit
rv.normBits1Hit = p.normBits1Hit
if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) {
rv.docNum1Hit = docNum1HitFinished
}
return rv
}
// "general" encoding, check if empty
if p.postings == nil {
return rv
}
if p.freqOffset > 0 && p.locOffset > 0 {
// "general" encoding, so prepare the freq chunk details
var n uint64
var read int
var numFreqChunks uint64
numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
// prepare the freq chunk details
var n uint64
var read int
var numFreqChunks uint64
numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.freqChunkLens = make([]uint64, int(numFreqChunks))
for i := 0; i < int(numFreqChunks); i++ {
rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.freqChunkLens = make([]uint64, int(numFreqChunks))
for i := 0; i < int(numFreqChunks); i++ {
rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.freqChunkStart = p.freqOffset + n
// prepare the loc chunk details
n = 0
var numLocChunks uint64
numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.locChunkLens = make([]uint64, int(numLocChunks))
for i := 0; i < int(numLocChunks); i++ {
rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.locChunkStart = p.locOffset + n
} else {
// "1-hit" encoding
rv.normBits1Hit = p.normBits1Hit
}
rv.freqChunkStart = p.freqOffset + n
// prepare the loc chunk details
n = 0
var numLocChunks uint64
numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.locChunkLens = make([]uint64, int(numLocChunks))
for i := 0; i < int(numLocChunks); i++ {
rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.locChunkStart = p.locOffset + n
rv.locBitmap = p.locBitmap
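
The early return for the 1-hit case is what lets the iterator skip the two uvarint-encoded chunk-length tables the general path still decodes above. For readers new to that layout, a self-contained sketch of the table shape (a uvarint count followed by that many uvarint lengths), mirroring the Uvarint loops in this hunk:

package main

import (
	"encoding/binary"
	"fmt"
)

// readChunkLens decodes a uvarint count followed by that many uvarint lengths,
// the same shape as the freq/loc chunk tables parsed in iterator() above.
func readChunkLens(mem []byte, offset uint64) (lens []uint64, next uint64) {
	n := offset
	count, read := binary.Uvarint(mem[n:])
	n += uint64(read)
	lens = make([]uint64, count)
	for i := range lens {
		lens[i], read = binary.Uvarint(mem[n:])
		n += uint64(read)
	}
	return lens, n
}

func main() {
	// encode a tiny table of 3 chunk lengths: 10, 200, 7
	buf := binary.AppendUvarint(nil, 3)
	for _, v := range []uint64{10, 200, 7} {
		buf = binary.AppendUvarint(buf, v)
	}
	lens, _ := readChunkLens(buf, 0)
	fmt.Println(lens) // [10 200 7]
}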
@@ -201,18 +223,20 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
if p.postings != nil {
n := p.postings.GetCardinality()
if p.except != nil {
e := p.except.GetCardinality()
if e > n {
e = n
}
return n - e
}
return n
var n uint64
if p.normBits1Hit != 0 {
n = 1
} else if p.postings != nil {
n = p.postings.GetCardinality()
}
return 0
var e uint64
if p.except != nil {
e = p.except.GetCardinality()
}
if n <= e {
return 0
}
return n - e
}
func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
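
The rewritten Count treats a 1-hit list as cardinality one and clamps against the except cardinality so the unsigned subtraction cannot wrap. A tiny standalone sketch of just that arithmetic (clampedCount is a made-up name):

package main

import "fmt"

// clampedCount mirrors the arithmetic in the rewritten Count above: n is 1 for
// a 1-hit list (or the bitmap cardinality otherwise), e is the except
// cardinality, and the subtraction never underflows.
func clampedCount(n, e uint64) uint64 {
	if n <= e {
		return 0
	}
	return n - e
}

func main() {
	fmt.Println(clampedCount(1, 0)) // 1-hit doc, nothing excluded -> 1
	fmt.Println(clampedCount(1, 1)) // 1-hit doc that is also excluded -> 0
	fmt.Println(clampedCount(5, 2)) // general list with two exclusions -> 3
}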
@@ -263,19 +287,10 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
return nil
}
var emptyRoaring = roaring.NewBitmap()
func (rv *PostingsList) init1Hit(fstVal uint64) error {
docNum, normBits := FSTValDecode1Hit(fstVal)
rv.locBitmap = emptyRoaring
rv.postings = roaring.NewBitmap()
rv.postings.Add(uint32(docNum))
// TODO: we can likely do better than allocating a roaring bitmap
// with just 1 entry, but for now reuse existing machinery
rv.docNum1Hit = docNum
rv.normBits1Hit = normBits
return nil
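
init1Hit now records the decoded docNum/normBits pair instead of allocating a bitmap. That pair arrives packed in a single uint64 FST value via FSTValDecode1Hit; the sketch below assumes a 31-bit split (in the spirit of the mask31Bits constant seen in the first hunk) purely to illustrate the idea, not the actual zap layout:

package main

import "fmt"

// An assumed 31-bit docNum / 31-bit normBits packing into one uint64, only to
// show the kind of value FSTValDecode1Hit unpacks; the real bit layout is
// defined elsewhere in the zap package.
const mask31 = (uint64(1) << 31) - 1

func pack1Hit(docNum, normBits uint64) uint64 {
	return (normBits&mask31)<<31 | (docNum & mask31)
}

func unpack1Hit(v uint64) (docNum, normBits uint64) {
	return v & mask31, (v >> 31) & mask31
}

func main() {
	v := pack1Hit(42, 7)
	d, n := unpack1Hit(v)
	fmt.Println(d, n) // 42 7
}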
@@ -308,6 +323,7 @@ type PostingsIterator struct {
next Posting // reused across Next() calls
nextLocs []Location // reused across Next() calls
docNum1Hit uint64
normBits1Hit uint64
buf []byte
@@ -460,7 +476,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
}
rv.norm = math.Float32frombits(uint32(normBits))
if i.locBitmap.Contains(uint32(docNum)) {
if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) {
// read off 'freq' locations, into reused slices
if cap(i.nextLocs) >= int(rv.freq) {
i.nextLocs = i.nextLocs[0:rv.freq]
@@ -513,7 +529,7 @@ func (i *PostingsIterator) nextBytes() (
endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm]
if i.locBitmap.Contains(uint32(docNum)) {
if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) {
startLoc := len(i.currChunkLoc) - i.locReader.Len()
for j := uint64(0); j < freq; j++ {
@@ -533,6 +549,15 @@ func (i *PostingsIterator) nextBytes() (
// nextDocNum returns the next docNum on the postings list, and also
// sets up the currChunk / loc related fields of the iterator.
func (i *PostingsIterator) nextDocNum() (uint64, bool, error) {
if i.normBits1Hit != 0 {
if i.docNum1Hit == docNum1HitFinished {
return 0, false, nil
}
docNum := i.docNum1Hit
i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum
return docNum, true, nil
}
if i.actual == nil || !i.actual.HasNext() {
return 0, false, nil
}
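
The 1-hit branch added to nextDocNum turns the iterator into a one-shot source: the first call hands out docNum1Hit and overwrites it with the docNum1HitFinished sentinel, so later calls report exhaustion without touching a bitmap iterator. A self-contained sketch of that consume-once pattern, with hypothetical names:

package main

import (
	"fmt"
	"math"
)

// finished plays the role of docNum1HitFinished above: a sentinel meaning the
// single hit has already been handed out.
const finished = math.MaxUint64

// oneHitIter is a hypothetical one-shot iterator in the style of the 1-hit
// branch of nextDocNum: it yields its doc number once, then reports exhaustion.
type oneHitIter struct {
	docNum uint64
}

func (it *oneHitIter) next() (uint64, bool) {
	if it.docNum == finished {
		return 0, false
	}
	d := it.docNum
	it.docNum = finished // consume the single hit
	return d, true
}

func main() {
	it := &oneHitIter{docNum: 42}
	fmt.Println(it.next()) // 42 true
	fmt.Println(it.next()) // 0 false
}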
@@ -540,10 +565,6 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) {
n := i.actual.Next()
allN := i.all.Next()
if i.normBits1Hit != 0 {
return uint64(n), true, nil
}
nChunk := n / i.postings.sb.chunkFactor
allNChunk := allN / i.postings.sb.chunkFactor
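
The nil checks added to Next and nextBytes exist because a 1-hit postings list never populates locBitmap, so "no bitmap" has to read as "no stored locations" rather than a nil dereference. A small standalone sketch of that guard (hasLocations is a made-up helper):

package main

import (
	"fmt"

	"github.com/RoaringBitmap/roaring"
)

// hasLocations mirrors the guard added in Next and nextBytes: a nil locBitmap
// (the 1-hit case never sets one) simply means there are no stored locations,
// so iterating a 1-hit term does not dereference a nil bitmap.
func hasLocations(locBitmap *roaring.Bitmap, docNum uint64) bool {
	return locBitmap != nil && locBitmap.Contains(uint32(docNum))
}

func main() {
	fmt.Println(hasLocations(nil, 42)) // false: 1-hit lists carry no loc bitmap

	b := roaring.NewBitmap()
	b.Add(42)
	fmt.Println(hasLocations(b, 42)) // true
}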


@@ -341,15 +341,13 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
return nil, err
}
var postings *PostingsList
var postingsList *PostingsList
for _, id := range ids {
postings, err = idDict.postingsList([]byte(id), nil, postings)
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList)
if err != nil {
return nil, err
}
if postings.postings != nil {
rv.Or(postings.postings)
}
postingsList.OrInto(rv)
}
}