Merge pull request #801 from steveyen/scorch-postings-itr-byte-copy
scorch merge optimizations via tf/loc byte copy & reader/decoder reuse
This commit is contained in:
commit
2b005f1e23
|
@ -82,6 +82,19 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error {
|
||||||
|
chunk := docNum / c.chunkSize
|
||||||
|
if chunk != c.currChunk {
|
||||||
|
// starting a new chunk
|
||||||
|
c.Close()
|
||||||
|
c.chunkBuf.Reset()
|
||||||
|
c.currChunk = chunk
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := c.chunkBuf.Write(buf)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// Close indicates you are done calling Add() this allows the final chunk
|
// Close indicates you are done calling Add() this allows the final chunk
|
||||||
// to be encoded.
|
// to be encoded.
|
||||||
func (c *chunkedIntCoder) Close() {
|
func (c *chunkedIntCoder) Close() {
|
||||||
|
|
|
@ -162,7 +162,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
|
|
||||||
var bufReuse bytes.Buffer
|
var bufReuse bytes.Buffer
|
||||||
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
|
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
|
||||||
var bufLoc []uint64
|
|
||||||
|
|
||||||
var postings *PostingsList
|
var postings *PostingsList
|
||||||
var postItr *PostingsIterator
|
var postItr *PostingsIterator
|
||||||
|
@ -316,45 +315,32 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
newDocNumsI := newDocNums[itrI]
|
newDocNumsI := newDocNums[itrI]
|
||||||
|
|
||||||
postItr = postings.iterator(postItr)
|
postItr = postings.iterator(postItr)
|
||||||
next, err2 := postItr.Next()
|
|
||||||
for next != nil && err2 == nil {
|
nextDocNum, nextFreqNormBytes, nextLocBytes, err2 := postItr.nextBytes()
|
||||||
hitNewDocNum := newDocNumsI[next.Number()]
|
for err2 == nil && len(nextFreqNormBytes) > 0 {
|
||||||
|
hitNewDocNum := newDocNumsI[nextDocNum]
|
||||||
if hitNewDocNum == docDropped {
|
if hitNewDocNum == docDropped {
|
||||||
return nil, 0, fmt.Errorf("see hit with dropped doc num")
|
return nil, 0, fmt.Errorf("see hit with dropped doc num")
|
||||||
}
|
}
|
||||||
|
|
||||||
newRoaring.Add(uint32(hitNewDocNum))
|
newRoaring.Add(uint32(hitNewDocNum))
|
||||||
// encode norm bits
|
err2 = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes)
|
||||||
norm := next.Norm()
|
if err2 != nil {
|
||||||
normBits := math.Float32bits(float32(norm))
|
return nil, 0, err2
|
||||||
err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
|
|
||||||
if err != nil {
|
|
||||||
return nil, 0, err
|
|
||||||
}
|
}
|
||||||
locs := next.Locations()
|
|
||||||
if len(locs) > 0 {
|
if len(nextLocBytes) > 0 {
|
||||||
newRoaringLocs.Add(uint32(hitNewDocNum))
|
newRoaringLocs.Add(uint32(hitNewDocNum))
|
||||||
for _, loc := range locs {
|
err2 = locEncoder.AddBytes(hitNewDocNum, nextLocBytes)
|
||||||
if cap(bufLoc) < 5+len(loc.ArrayPositions()) {
|
if err2 != nil {
|
||||||
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions()))
|
return nil, 0, err2
|
||||||
}
|
|
||||||
args := bufLoc[0:5]
|
|
||||||
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
|
||||||
args[1] = loc.Pos()
|
|
||||||
args[2] = loc.Start()
|
|
||||||
args[3] = loc.End()
|
|
||||||
args[4] = uint64(len(loc.ArrayPositions()))
|
|
||||||
args = append(args, loc.ArrayPositions()...)
|
|
||||||
err = locEncoder.Add(hitNewDocNum, args...)
|
|
||||||
if err != nil {
|
|
||||||
return nil, 0, err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
docTermMap[hitNewDocNum] =
|
docTermMap[hitNewDocNum] =
|
||||||
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
|
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
|
||||||
|
|
||||||
next, err2 = postItr.Next()
|
nextDocNum, nextFreqNormBytes, nextLocBytes, err2 = postItr.nextBytes()
|
||||||
}
|
}
|
||||||
if err2 != nil {
|
if err2 != nil {
|
||||||
return nil, 0, err2
|
return nil, 0, err2
|
||||||
|
|
|
@ -45,7 +45,25 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
|
||||||
if rv == nil {
|
if rv == nil {
|
||||||
rv = &PostingsIterator{}
|
rv = &PostingsIterator{}
|
||||||
} else {
|
} else {
|
||||||
|
freqNormReader := rv.freqNormReader
|
||||||
|
if freqNormReader != nil {
|
||||||
|
freqNormReader.Reset([]byte(nil))
|
||||||
|
}
|
||||||
|
freqNormDecoder := rv.freqNormDecoder
|
||||||
|
|
||||||
|
locReader := rv.locReader
|
||||||
|
if locReader != nil {
|
||||||
|
locReader.Reset([]byte(nil))
|
||||||
|
}
|
||||||
|
locDecoder := rv.locDecoder
|
||||||
|
|
||||||
*rv = PostingsIterator{} // clear the struct
|
*rv = PostingsIterator{} // clear the struct
|
||||||
|
|
||||||
|
rv.freqNormReader = freqNormReader
|
||||||
|
rv.freqNormDecoder = freqNormDecoder
|
||||||
|
|
||||||
|
rv.locReader = locReader
|
||||||
|
rv.locDecoder = locDecoder
|
||||||
}
|
}
|
||||||
rv.postings = p
|
rv.postings = p
|
||||||
|
|
||||||
|
@ -279,75 +297,23 @@ func (i *PostingsIterator) readLocation(l *Location) error {
|
||||||
|
|
||||||
// Next returns the next posting on the postings list, or nil at the end
|
// Next returns the next posting on the postings list, or nil at the end
|
||||||
func (i *PostingsIterator) Next() (segment.Posting, error) {
|
func (i *PostingsIterator) Next() (segment.Posting, error) {
|
||||||
if i.actual == nil || !i.actual.HasNext() {
|
docNum, exists, err := i.nextDocNum()
|
||||||
return nil, nil
|
if err != nil || !exists {
|
||||||
}
|
return nil, err
|
||||||
n := i.actual.Next()
|
|
||||||
nChunk := n / i.postings.sb.chunkFactor
|
|
||||||
allN := i.all.Next()
|
|
||||||
allNChunk := allN / i.postings.sb.chunkFactor
|
|
||||||
|
|
||||||
// n is the next actual hit (excluding some postings)
|
|
||||||
// allN is the next hit in the full postings
|
|
||||||
// if they don't match, adjust offsets to factor in item we're skipping over
|
|
||||||
// incr the all iterator, and check again
|
|
||||||
for allN != n {
|
|
||||||
|
|
||||||
// in different chunks, reset offsets
|
|
||||||
if allNChunk != nChunk {
|
|
||||||
i.locoffset = 0
|
|
||||||
i.offset = 0
|
|
||||||
} else {
|
|
||||||
|
|
||||||
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
|
|
||||||
err := i.loadChunk(int(nChunk))
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("error loading chunk: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// read off freq/offsets even though we don't care about them
|
|
||||||
freq, _, err := i.readFreqNorm()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
if i.locBitmap.Contains(allN) {
|
|
||||||
for j := 0; j < int(freq); j++ {
|
|
||||||
err := i.readLocation(nil)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// in same chunk, need to account for offsets
|
|
||||||
i.offset++
|
|
||||||
}
|
|
||||||
|
|
||||||
allN = i.all.Next()
|
|
||||||
}
|
|
||||||
|
|
||||||
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
|
|
||||||
err := i.loadChunk(int(nChunk))
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("error loading chunk: %v", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
reuseLocs := i.next.locs // hold for reuse before struct clearing
|
reuseLocs := i.next.locs // hold for reuse before struct clearing
|
||||||
i.next = Posting{} // clear the struct
|
i.next = Posting{} // clear the struct
|
||||||
rv := &i.next
|
rv := &i.next
|
||||||
rv.iterator = i
|
rv.docNum = docNum
|
||||||
rv.docNum = uint64(n)
|
|
||||||
|
|
||||||
var err error
|
|
||||||
var normBits uint64
|
var normBits uint64
|
||||||
rv.freq, normBits, err = i.readFreqNorm()
|
rv.freq, normBits, err = i.readFreqNorm()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
rv.norm = math.Float32frombits(uint32(normBits))
|
rv.norm = math.Float32frombits(uint32(normBits))
|
||||||
if i.locBitmap.Contains(n) {
|
if i.locBitmap.Contains(uint32(docNum)) {
|
||||||
// read off 'freq' locations, into reused slices
|
// read off 'freq' locations, into reused slices
|
||||||
if cap(i.nextLocs) >= int(rv.freq) {
|
if cap(i.nextLocs) >= int(rv.freq) {
|
||||||
i.nextLocs = i.nextLocs[0:rv.freq]
|
i.nextLocs = i.nextLocs[0:rv.freq]
|
||||||
|
@ -371,14 +337,111 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nextBytes returns the docNum and the encoded freq & loc bytes for
// the next posting.
//
// Rather than decoding into a Posting, it captures the raw encoded
// byte ranges out of the current chunk buffers so a merger can copy
// them verbatim (via AddBytes) without a decode/re-encode round trip.
func (i *PostingsIterator) nextBytes() (uint64, []byte, []byte, error) {
	docNum, exists, err := i.nextDocNum()
	if err != nil {
		return 0, nil, nil, err
	}
	if !exists {
		// iterator exhausted; signaled by zero-length freqNorm bytes
		return 0, nil, nil, nil
	}

	// position of the reader before decoding marks the start of this
	// posting's freq/norm bytes within the current chunk
	startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()

	freq, _, err := i.readFreqNorm()
	if err != nil {
		return 0, nil, nil, err
	}

	// bytes consumed by readFreqNorm are exactly this posting's slice
	endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len()
	bytesFreqNorm := i.currChunkFreqNorm[startFreqNorm:endFreqNorm]

	var bytesLoc []byte
	if i.locBitmap.Contains(uint32(docNum)) {
		// same start/end capture technique for the location bytes
		startLoc := len(i.currChunkLoc) - i.locReader.Len()

		// advance the loc reader past 'freq' locations; decoded
		// values are discarded (nil), only the consumed span matters
		for j := uint64(0); j < freq; j++ {
			err := i.readLocation(nil)
			if err != nil {
				return 0, nil, nil, err
			}
		}

		endLoc := len(i.currChunkLoc) - i.locReader.Len()
		bytesLoc = i.currChunkLoc[startLoc:endLoc]
	}

	// NOTE(review): returned slices alias the chunk buffers; they are
	// only valid until the iterator loads another chunk — confirm
	// callers copy or consume them before advancing further
	return docNum, bytesFreqNorm, bytesLoc, nil
}
|
||||||
|
|
||||||
|
// nextDocNum returns the next docNum on the postings list, and also
// sets up the currChunk / loc related fields of the iterator.
// The bool result is false (with nil error) when the iterator is
// exhausted.
func (i *PostingsIterator) nextDocNum() (uint64, bool, error) {
	if i.actual == nil || !i.actual.HasNext() {
		return 0, false, nil
	}

	n := i.actual.Next()
	nChunk := n / i.postings.sb.chunkFactor
	allN := i.all.Next()
	allNChunk := allN / i.postings.sb.chunkFactor

	// n is the next actual hit (excluding some postings)
	// allN is the next hit in the full postings
	// if they don't match, adjust offsets to factor in item we're skipping over
	// incr the all iterator, and check again
	for allN != n {
		// in different chunks, reset offsets
		// NOTE(review): allNChunk is computed once before the loop and
		// never refreshed as allN advances — confirm this staleness is
		// intended (or harmless) when skipping across chunk boundaries
		if allNChunk != nChunk {
			i.locoffset = 0
			i.offset = 0
		} else {
			// lazily load the chunk containing the skipped posting
			if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
				err := i.loadChunk(int(nChunk))
				if err != nil {
					return 0, false, fmt.Errorf("error loading chunk: %v", err)
				}
			}

			// read off freq/offsets even though we don't care about them
			freq, _, err := i.readFreqNorm()
			if err != nil {
				return 0, false, err
			}
			// skipped posting may also carry locations; consume them
			// so the loc reader stays aligned
			if i.locBitmap.Contains(allN) {
				for j := 0; j < int(freq); j++ {
					err := i.readLocation(nil)
					if err != nil {
						return 0, false, err
					}
				}
			}

			// in same chunk, need to account for offsets
			i.offset++
		}

		allN = i.all.Next()
	}

	// ensure the chunk for the actual hit is loaded before returning,
	// so callers can immediately read its freq/norm/loc data
	if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
		err := i.loadChunk(int(nChunk))
		if err != nil {
			return 0, false, fmt.Errorf("error loading chunk: %v", err)
		}
	}

	return uint64(n), true, nil
}
|
||||||
|
|
||||||
// Posting is a single entry in a postings list
|
// Posting is a single entry in a postings list
|
||||||
// Posting is a single entry in a postings list
type Posting struct {
	docNum uint64  // document number of this posting within its segment
	freq   uint64  // term frequency decoded from the freq/norm chunk
	norm   float32 // norm value decoded alongside freq (via Float32frombits)
	locs   []segment.Location // term locations; empty when none were indexed
}
|
||||||
|
|
||||||
// Number returns the document number of this posting in this segment
|
// Number returns the document number of this posting in this segment
|
||||||
|
|
Loading…
Reference in New Issue