MB-28781 - scorch zap merge freq/loc copying only when fieldsSame
The optimization recently introduced in commit 530a3d24cf
,
("scorch zap optimize merge by byte copying freq/norm/loc's") was to
byte-copy freq/norm/loc data directly during merging. But, it was
incorrect if the fields were different across segments.
This change now performs that byte-copying merging optimization only
when the fields are the same across segments, and if not, leverages
the old approach of deserializing & re-serializing the freq/norm/loc
information, which has the important step of remapping fieldID's.
See also: https://issues.couchbase.com/browse/MB-28781
This commit is contained in:
parent
c881146270
commit
f65ba5c0f4
|
@ -117,7 +117,8 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||
}
|
||||
|
||||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
|
||||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
|
||||
fieldsInv, fieldsMap, fieldsSame,
|
||||
newDocNums, numDocs, chunkFactor, cr)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||
|
@ -158,11 +159,12 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64
|
|||
}
|
||||
|
||||
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||
fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64,
|
||||
newSegDocCount uint64, chunkFactor uint32,
|
||||
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
|
||||
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32,
|
||||
w *CountHashWriter) ([]uint64, uint64, error) {
|
||||
|
||||
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
|
||||
var bufLoc []uint64
|
||||
|
||||
var postings *PostingsList
|
||||
var postItr *PostingsIterator
|
||||
|
@ -307,9 +309,16 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
|
||||
postItr = postings.iterator(postItr)
|
||||
|
||||
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
|
||||
term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs,
|
||||
tfEncoder, locEncoder, docTermMap)
|
||||
if fieldsSame {
|
||||
// can optimize by copying freq/norm/loc bytes directly
|
||||
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
|
||||
term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs,
|
||||
tfEncoder, locEncoder, docTermMap)
|
||||
} else {
|
||||
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
|
||||
fieldsMap, term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs,
|
||||
tfEncoder, locEncoder, docTermMap, bufLoc)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
@ -397,6 +406,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
return rv, fieldDvLocsOffset, nil
|
||||
}
|
||||
|
||||
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
|
||||
newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap,
|
||||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte,
|
||||
bufLoc []uint64) (
|
||||
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
|
||||
next, err := postItr.Next()
|
||||
for next != nil && err == nil {
|
||||
hitNewDocNum := newDocNums[next.Number()]
|
||||
if hitNewDocNum == docDropped {
|
||||
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
|
||||
}
|
||||
|
||||
newRoaring.Add(uint32(hitNewDocNum))
|
||||
|
||||
nextFreq := next.Frequency()
|
||||
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
|
||||
|
||||
err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
locs := next.Locations()
|
||||
if len(locs) > 0 {
|
||||
newRoaringLocs.Add(uint32(hitNewDocNum))
|
||||
|
||||
for _, loc := range locs {
|
||||
if cap(bufLoc) < 5+len(loc.ArrayPositions()) {
|
||||
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions()))
|
||||
}
|
||||
args := bufLoc[0:5]
|
||||
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
||||
args[1] = loc.Pos()
|
||||
args[2] = loc.Start()
|
||||
args[3] = loc.End()
|
||||
args[4] = uint64(len(loc.ArrayPositions()))
|
||||
args = append(args, loc.ArrayPositions()...)
|
||||
err = locEncoder.Add(hitNewDocNum, args...)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docTermMap[hitNewDocNum] =
|
||||
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
|
||||
|
||||
lastDocNum = hitNewDocNum
|
||||
lastFreq = nextFreq
|
||||
lastNorm = nextNorm
|
||||
|
||||
next, err = postItr.Next()
|
||||
}
|
||||
|
||||
return lastDocNum, lastFreq, lastNorm, bufLoc, err
|
||||
}
|
||||
|
||||
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
|
||||
newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap,
|
||||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) (
|
||||
|
|
Loading…
Reference in New Issue