
scorch zap merge uses array for docTermMap with no sorting

Instead of collecting and sorting docNum keys from a hashmap, this
change iterates from docNum 0 to N and uses an array instead of a
hashmap. The array is also reused across outer-loop iterations.

This optimizes for when there's a lot of structural similarity
between docs, i.e., many/most docs have the same fields (e.g., beers,
breweries). If every doc has completely different fields, this change
might behave worse than the previous sparse-friendly hashmap
approach.
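
For illustration, here's a minimal standalone sketch of the pattern
(the variable names mirror the merge code, but the surrounding
program is hypothetical):

package main

import "fmt"

func main() {
	const newSegDocCount = 4 // docs in the merged segment

	var docTermMap [][]byte // dense: the index is the docNum

	for fieldID := 0; fieldID < 2; fieldID++ {
		// Reuse the backing array across outer-loop iterations:
		// reallocate only if capacity is too small; otherwise
		// re-slice and truncate each entry to length zero.
		if cap(docTermMap) < newSegDocCount {
			docTermMap = make([][]byte, newSegDocCount)
		} else {
			docTermMap = docTermMap[:newSegDocCount]
			for docNum := range docTermMap {
				docTermMap[docNum] = docTermMap[docNum][:0]
			}
		}

		// Pretend docs 0 and 2 carry terms for this field; docs
		// lacking the field simply leave their slot empty.
		docTermMap[0] = append(docTermMap[0], 'a')
		docTermMap[2] = append(docTermMap[2], 'b')

		// Ranging over the slice visits docNums 0..N-1 already in
		// order, so the collect-keys-and-sort.Sort step disappears;
		// empty slots are skipped.
		for docNum, docTerms := range docTermMap {
			if len(docTerms) > 0 {
				fmt.Printf("field %d, doc %d: %s\n", fieldID, docNum, docTerms)
			}
		}
	}
}

As the commit message notes, the win depends on density: the slice
pays one slot per doc on every field pass, so wholly dissimilar docs
would have favored the old sparse hashmap.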
Steve Yen 2018-01-27 13:23:33 -08:00
parent 5d1a2b0ad7
commit a444c25ddf
1 changed file with 20 additions and 17 deletions


@@ -21,7 +21,6 @@ import (
 	"fmt"
 	"math"
 	"os"
-	"sort"
 
 	"github.com/RoaringBitmap/roaring"
 	"github.com/Smerity/govarint"
@@ -149,7 +148,11 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
 	fieldDvLocs := make([]uint64, len(fieldsInv))
 	fieldDvLocsOffset := uint64(fieldNotUninverted)
 
-	var docNumbers docIDRange
+	// docTermMap is keyed by docNum, where the array impl provides
+	// better memory usage behavior than a sparse-friendlier hashmap
+	// for when docs have much structural similarity (i.e., every doc
+	// has a given field)
+	var docTermMap [][]byte
 
 	var vellumBuf bytes.Buffer
@@ -193,7 +196,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
 	tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
 	locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
 
-	docTermMap := make(map[uint64][]byte, newSegDocCount)
+	if uint64(cap(docTermMap)) < newSegDocCount {
+		docTermMap = make([][]byte, newSegDocCount)
+	} else {
+		docTermMap = docTermMap[0:newSegDocCount]
+		for docNum := range docTermMap { // reset the docTermMap
+			docTermMap[docNum] = docTermMap[docNum][:0]
+		}
+	}
 
 	for err == nil {
 		term, _ := mergeItr.Current()
@@ -343,21 +353,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
 		rv[fieldID] = dictOffset
 
-		// update the doc nums
-		if cap(docNumbers) < len(docTermMap) {
-			docNumbers = make(docIDRange, 0, len(docTermMap))
-		}
-		docNumbers = docNumbers[:0]
-		for k := range docTermMap {
-			docNumbers = append(docNumbers, k)
-		}
-		sort.Sort(docNumbers)
-
 		// update the field doc values
 		fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1)
-		for _, docNum := range docNumbers {
-			err = fdvEncoder.Add(docNum, docTermMap[docNum])
-			if err != nil {
-				return nil, 0, err
+		for docNum, docTerms := range docTermMap {
+			if len(docTerms) > 0 {
+				err = fdvEncoder.Add(uint64(docNum), docTerms)
+				if err != nil {
+					return nil, 0, err
+				}
 			}
 		}
 		err = fdvEncoder.Close()