From f65ba5c0f4df639ab1a8f4ddb5502eb9e269f6a8 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 19 Mar 2018 10:28:01 -0700 Subject: [PATCH] MB-28781 - scorch zap merge freq/loc copying only when fieldsSame The optimization recently introduced in commit 530a3d24cf0768f4c7a ("scorch zap optimize merge by byte copying freq/norm/loc's") was to byte-copy freq/norm/loc data directly during merging. However, that byte-copying is incorrect when the fields differ across segments. This change now performs that byte-copying merging optimization only when the fields are the same across segments, and otherwise falls back to the old approach of deserializing & re-serializing the freq/norm/loc information, which has the important step of remapping fieldIDs. See also: https://issues.couchbase.com/browse/MB-28781 --- index/scorch/segment/zap/merge.go | 78 ++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 4fe10edd..1da5e526 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -117,7 +117,8 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, return nil, 0, 0, 0, 0, nil, nil, nil, err } - dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, + dictLocs, docValueOffset, err = persistMergedRest(segments, drops, + fieldsInv, fieldsMap, fieldsSame, newDocNums, numDocs, chunkFactor, cr) if err != nil { return nil, 0, 0, 0, 0, nil, nil, nil, err @@ -158,11 +159,12 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 } func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, - fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64, - newSegDocCount uint64, chunkFactor uint32, + fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, + newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, w *CountHashWriter) 
([]uint64, uint64, error) { var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + var bufLoc []uint64 var postings *PostingsList var postItr *PostingsIterator @@ -307,9 +309,16 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, postItr = postings.iterator(postItr) - lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( - term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, - tfEncoder, locEncoder, docTermMap) + if fieldsSame { + // can optimize by copying freq/norm/loc bytes directly + lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( + term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, + tfEncoder, locEncoder, docTermMap) + } else { + lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( + fieldsMap, term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, + tfEncoder, locEncoder, docTermMap, bufLoc) + } if err != nil { return nil, 0, err } @@ -397,6 +406,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return rv, fieldDvLocsOffset, nil } +func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte, + bufLoc []uint64) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { + next, err := postItr.Next() + for next != nil && err == nil { + hitNewDocNum := newDocNums[next.Number()] + if hitNewDocNum == docDropped { + return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + nextFreq := next.Frequency() + nextNorm := uint64(math.Float32bits(float32(next.Norm()))) + + err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm) + if err != nil { + return 0, 0, 0, nil, err + } + + locs := next.Locations() + if len(locs) > 0 { + 
newRoaringLocs.Add(uint32(hitNewDocNum)) + + for _, loc := range locs { + if cap(bufLoc) < 5+len(loc.ArrayPositions()) { + bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(loc.ArrayPositions())) + args = append(args, loc.ArrayPositions()...) + err = locEncoder.Add(hitNewDocNum, args...) + if err != nil { + return 0, 0, 0, nil, err + } + } + } + + docTermMap[hitNewDocNum] = + append(append(docTermMap[hitNewDocNum], term...), termSeparator) + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + next, err = postItr.Next() + } + + return lastDocNum, lastFreq, lastNorm, bufLoc, err +} + func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap, tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) (