scorch zap merge uses array for docTermMap with no sorting
Instead of sorting docNum keys from a hashmap, this change instead iterates from docNum 0 to N and uses an array instead of hashmap. The array is also reused across outer loop iterations. This optimizes for when there's a lot of structural similarity between docs, where many/most docs have the same fields. i.e., beers, breweries. If every doc has completely different fields, then this change might produce worse behavior compared to the previous sparse hashmap approach.
This commit is contained in:
parent
5d1a2b0ad7
commit
a444c25ddf
|
@ -21,7 +21,6 @@ import (
|
|||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"sort"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/Smerity/govarint"
|
||||
|
@ -149,7 +148,11 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
|
|||
fieldDvLocs := make([]uint64, len(fieldsInv))
|
||||
fieldDvLocsOffset := uint64(fieldNotUninverted)
|
||||
|
||||
var docNumbers docIDRange
|
||||
// docTermMap is keyed by docNum, where the array impl provides
|
||||
// better memory usage behavior than a sparse-friendlier hashmap
|
||||
// for when docs have much structural similarity (i.e., every doc
|
||||
// has a given field)
|
||||
var docTermMap [][]byte
|
||||
|
||||
var vellumBuf bytes.Buffer
|
||||
|
||||
|
@ -193,7 +196,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
|
|||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
|
||||
docTermMap := make(map[uint64][]byte, newSegDocCount)
|
||||
if uint64(cap(docTermMap)) < newSegDocCount {
|
||||
docTermMap = make([][]byte, newSegDocCount)
|
||||
} else {
|
||||
docTermMap = docTermMap[0:newSegDocCount]
|
||||
for docNum := range docTermMap { // reset the docTermMap
|
||||
docTermMap[docNum] = docTermMap[docNum][:0]
|
||||
}
|
||||
}
|
||||
|
||||
for err == nil {
|
||||
term, _ := mergeItr.Current()
|
||||
|
@ -343,21 +353,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
|
|||
|
||||
rv[fieldID] = dictOffset
|
||||
|
||||
// update the doc nums
|
||||
if cap(docNumbers) < len(docTermMap) {
|
||||
docNumbers = make(docIDRange, 0, len(docTermMap))
|
||||
}
|
||||
docNumbers = docNumbers[:0]
|
||||
for k := range docTermMap {
|
||||
docNumbers = append(docNumbers, k)
|
||||
}
|
||||
sort.Sort(docNumbers)
|
||||
|
||||
// update the field doc values
|
||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
for _, docNum := range docNumbers {
|
||||
err = fdvEncoder.Add(docNum, docTermMap[docNum])
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
for docNum, docTerms := range docTermMap {
|
||||
if len(docTerms) > 0 {
|
||||
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
err = fdvEncoder.Close()
|
||||
|
|
Loading…
Reference in New Issue