0
0
Fork 0

MB-28781 - scorch zap merge freq/loc copying only when fieldsSame

The optimization recently introduced in commit 530a3d24cf,
("scorch zap optimize merge by byte copying freq/norm/loc's") was to
byte-copy freq/norm/loc data directly during merging.  But, it was
incorrect if the fields were different across segments.

This change now performs that byte-copying merging optimization only
when the fields are the same across segments, and if not, leverages
the old approach of deserializing & re-serializing the freq/norm/loc
information, which has the important step of remapping fieldID's.

See also: https://issues.couchbase.com/browse/MB-28781
This commit is contained in:
Steve Yen 2018-03-19 10:28:01 -07:00
parent c881146270
commit f65ba5c0f4
1 changed files with 72 additions and 6 deletions

View File

@ -117,7 +117,8 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
return nil, 0, 0, 0, 0, nil, nil, nil, err
}
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
fieldsInv, fieldsMap, fieldsSame,
newDocNums, numDocs, chunkFactor, cr)
if err != nil {
return nil, 0, 0, 0, 0, nil, nil, nil, err
@ -158,11 +159,12 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64
}
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64,
newSegDocCount uint64, chunkFactor uint32,
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32,
w *CountHashWriter) ([]uint64, uint64, error) {
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
var bufLoc []uint64
var postings *PostingsList
var postItr *PostingsIterator
@ -307,9 +309,16 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
postItr = postings.iterator(postItr)
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs,
tfEncoder, locEncoder, docTermMap)
if fieldsSame {
// can optimize by copying freq/norm/loc bytes directly
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs,
tfEncoder, locEncoder, docTermMap)
} else {
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
fieldsMap, term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs,
tfEncoder, locEncoder, docTermMap, bufLoc)
}
if err != nil {
return nil, 0, err
}
@ -397,6 +406,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
return rv, fieldDvLocsOffset, nil
}
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap,
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte,
bufLoc []uint64) (
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
next, err := postItr.Next()
for next != nil && err == nil {
hitNewDocNum := newDocNums[next.Number()]
if hitNewDocNum == docDropped {
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
}
newRoaring.Add(uint32(hitNewDocNum))
nextFreq := next.Frequency()
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm)
if err != nil {
return 0, 0, 0, nil, err
}
locs := next.Locations()
if len(locs) > 0 {
newRoaringLocs.Add(uint32(hitNewDocNum))
for _, loc := range locs {
if cap(bufLoc) < 5+len(loc.ArrayPositions()) {
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions()))
}
args := bufLoc[0:5]
args[0] = uint64(fieldsMap[loc.Field()] - 1)
args[1] = loc.Pos()
args[2] = loc.Start()
args[3] = loc.End()
args[4] = uint64(len(loc.ArrayPositions()))
args = append(args, loc.ArrayPositions()...)
err = locEncoder.Add(hitNewDocNum, args...)
if err != nil {
return 0, 0, 0, nil, err
}
}
}
docTermMap[hitNewDocNum] =
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
lastDocNum = hitNewDocNum
lastFreq = nextFreq
lastNorm = nextNorm
next, err = postItr.Next()
}
return lastDocNum, lastFreq, lastNorm, bufLoc, err
}
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap,
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) (