
Merge pull request #822 from steveyen/scorch-reuse-roaring-and-sync-Pool

reuse roaring Bitmaps and also use sync.Pool for interim data
Steve Yen 2018-03-12 13:20:51 -07:00 committed by GitHub
commit a526fe70f3
4 changed files with 158 additions and 38 deletions
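
The change pairs a reusable roaring.Bitmap strategy with a sync.Pool that recycles the "interim" conversion state between calls. As a rough orientation before the diffs, here is a minimal, self-contained sketch of that Get -> reset -> Put pattern; the scratch type, its cleanse method, and the build helper are illustrative stand-ins, not the actual bleve/zap types.

// Sketch of the pooled-scratch pattern this commit applies to the zap
// interim struct. All names here are hypothetical.
package main

import (
    "bytes"
    "fmt"
    "sync"
)

// scratch stands in for reusable per-call working state.
type scratch struct {
    buf  bytes.Buffer
    keys []string
}

// cleanse zeroes the contents but keeps the allocated backing storage,
// so the next user of the pooled object avoids fresh allocations.
func (s *scratch) cleanse() *scratch {
    s.buf.Reset()
    for i := range s.keys {
        s.keys[i] = ""
    }
    s.keys = s.keys[:0]
    return s
}

var scratchPool = sync.Pool{New: func() interface{} { return &scratch{} }}

func build(words []string) string {
    s := scratchPool.Get().(*scratch)
    // Return the cleared object to the pool once we are done with it.
    defer scratchPool.Put(s.cleanse())

    s.keys = append(s.keys, words...)
    for _, k := range s.keys {
        s.buf.WriteString(k)
    }
    return s.buf.String()
}

func main() {
    fmt.Println(build([]string{"a", "b", "c"}))
}

The commit below does the same thing with the much larger interim struct, clearing its maps, slices, and bitmaps in cleanse() so their backing storage survives the round trip through the pool.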

View File

@@ -68,7 +68,19 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap)
 	if rv == nil {
 		rv = &PostingsList{}
 	} else {
+		postings := rv.postings
+		if postings != nil {
+			postings.Clear()
+		}
+
+		locBitmap := rv.locBitmap
+		if locBitmap != nil {
+			locBitmap.Clear()
+		}
+
 		*rv = PostingsList{} // clear the struct
+		rv.postings = postings
+		rv.locBitmap = locBitmap
 	}
 	rv.sb = d.sb
 	rv.except = except
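
The hunk above keeps the postings and locBitmap bitmaps attached to a recycled PostingsList and clears them rather than allocating new ones. A minimal sketch of that Clear-and-reuse idea using the public roaring API (the fill helper is hypothetical):

// Instead of building a fresh *roaring.Bitmap on every call, keep the old
// one and Clear() it; Clear logically empties the bitmap and may retain
// allocations that can be reused.
package main

import (
    "fmt"

    "github.com/RoaringBitmap/roaring"
)

// fill reuses bm when it is non-nil, allocating only on first use.
func fill(bm *roaring.Bitmap, docNums []uint32) *roaring.Bitmap {
    if bm == nil {
        bm = roaring.New()
    } else {
        bm.Clear() // drop old contents before refilling
    }
    for _, docNum := range docNums {
        bm.Add(docNum)
    }
    return bm
}

func main() {
    var bm *roaring.Bitmap
    bm = fill(bm, []uint32{1, 2, 3})
    bm = fill(bm, []uint32{7, 8}) // second call reuses the same bitmap
    fmt.Println(bm.GetCardinality())
}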

View File

@@ -183,6 +183,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
 		return nil, 0, err
 	}
 
+	newRoaring := roaring.NewBitmap()
+	newRoaringLocs := roaring.NewBitmap()
+
 	// for each field
 	for fieldID, fieldName := range fieldsInv {
@@ -222,8 +225,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
 		var prevTerm []byte
 
-		newRoaring := roaring.NewBitmap()
-		newRoaringLocs := roaring.NewBitmap()
+		newRoaring.Clear()
+		newRoaringLocs.Clear()
 
 		var lastDocNum, lastFreq, lastNorm uint64
@@ -262,8 +265,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
 				}
 			}
 
-			newRoaring = roaring.NewBitmap()
-			newRoaringLocs = roaring.NewBitmap()
+			newRoaring.Clear()
+			newRoaringLocs.Clear()
 
 			tfEncoder.Reset()
 			locEncoder.Reset()

View File

@@ -19,6 +19,7 @@ import (
 	"encoding/binary"
 	"math"
 	"sort"
+	"sync"
 
 	"github.com/RoaringBitmap/roaring"
 	"github.com/Smerity/govarint"
@@ -35,12 +36,11 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
 	chunkFactor uint32) (*SegmentBase, error) {
 	var br bytes.Buffer
 
-	s := interim{
-		results:     results,
-		chunkFactor: chunkFactor,
-		w:           NewCountHashWriter(&br),
-		FieldsMap:   map[string]uint16{},
-	}
+	s := interimPool.Get().(*interim)
+
+	s.results = results
+	s.chunkFactor = chunkFactor
+	s.w = NewCountHashWriter(&br)
 
 	storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
 		err := s.convert()
@@ -52,9 +52,13 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
 		s.FieldsMap, s.FieldsInv, uint64(len(results)),
 		storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
 
+	interimPool.Put(s.cleanse())
+
 	return sb, err
 }
 
+var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
+
 // interim holds temporary working data used while converting from
 // analysis results to a zap-encoded segment
 type interim struct {
@@ -91,16 +95,66 @@ type interim struct {
 	PostingsLocs []*roaring.Bitmap
 
 	// postings id -> freq/norm's, one for each docNum in postings
 	FreqNorms [][]interimFreqNorm
+	freqNormsBacking []interimFreqNorm
 
 	// postings id -> locs, one for each freq
 	Locs [][]interimLoc
+	locsBacking []interimLoc
+
+	numTermsPerPostingsList []int // key is postings list id
+	numLocsPerPostingsList []int // key is postings list id
 
 	buf0 bytes.Buffer
 	tmp0 []byte
 	tmp1 []byte
 }
 
+func (s *interim) cleanse() *interim {
+	s.results = nil
+	s.chunkFactor = 0
+	s.w = nil
+	s.FieldsMap = nil
+	s.FieldsInv = s.FieldsInv[:0]
+	for i := range s.Dicts {
+		s.Dicts[i] = nil
+	}
+	s.Dicts = s.Dicts[:0]
+	for i := range s.DictKeys {
+		s.DictKeys[i] = s.DictKeys[i][:0]
+	}
+	s.DictKeys = s.DictKeys[:0]
+	for i := range s.IncludeDocValues {
+		s.IncludeDocValues[i] = false
+	}
+	s.IncludeDocValues = s.IncludeDocValues[:0]
+	for _, idn := range s.Postings {
+		idn.Clear()
+	}
+	s.Postings = s.Postings[:0]
+	for _, idn := range s.PostingsLocs {
+		idn.Clear()
+	}
+	s.PostingsLocs = s.PostingsLocs[:0]
+	s.FreqNorms = s.FreqNorms[:0]
+	for i := range s.freqNormsBacking {
+		s.freqNormsBacking[i] = interimFreqNorm{}
+	}
+	s.freqNormsBacking = s.freqNormsBacking[:0]
+	s.Locs = s.Locs[:0]
+	for i := range s.locsBacking {
+		s.locsBacking[i] = interimLoc{}
+	}
+	s.locsBacking = s.locsBacking[:0]
+	s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
+	s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
+	s.buf0.Reset()
+	s.tmp0 = s.tmp0[:0]
+	s.tmp1 = s.tmp1[:0]
+	return s
+}
+
 func (s *interim) grabBuf(size int) []byte {
 	buf := s.tmp0
 	if cap(buf) < size {
@@ -130,6 +184,8 @@ type interimLoc struct {
 }
 
 func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
+	s.FieldsMap = map[string]uint16{}
+
 	s.getOrDefineField("_id") // _id field is fieldID 0
 
 	for _, result := range s.results {
@@ -143,12 +199,15 @@ func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
 	sort.Strings(s.FieldsInv[1:]) // keep _id as first field
 
+	s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
 	for fieldID, fieldName := range s.FieldsInv {
 		s.FieldsMap[fieldName] = uint16(fieldID + 1)
 	}
 
-	s.IncludeDocValues = make([]bool, len(s.FieldsInv))
+	if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
+		s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
+	} else {
+		s.IncludeDocValues = make([]bool, len(s.FieldsInv))
+	}
 
 	s.prepareDicts()
@@ -189,9 +248,18 @@ func (s *interim) getOrDefineField(fieldName string) int {
 		fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
 		s.FieldsMap[fieldName] = fieldIDPlus1
 		s.FieldsInv = append(s.FieldsInv, fieldName)
+
 		s.Dicts = append(s.Dicts, make(map[string]uint64))
-		s.DictKeys = append(s.DictKeys, make([]string, 0))
+
+		n := len(s.DictKeys)
+		if n < cap(s.DictKeys) {
+			s.DictKeys = s.DictKeys[:n+1]
+			s.DictKeys[n] = s.DictKeys[n][:0]
+		} else {
+			s.DictKeys = append(s.DictKeys, []string(nil))
+		}
 	}
 
 	return int(fieldIDPlus1 - 1)
 }
@@ -199,9 +267,6 @@ func (s *interim) getOrDefineField(fieldName string) int {
 func (s *interim) prepareDicts() {
 	var pidNext int
 
-	numTermsPerPostingsList := make([]int, 0, 64) // key is postings list id
-	numLocsPerPostingsList := make([]int, 0, 64) // key is postings list id
-
 	var totTFs int
 	var totLocs int
@@ -218,14 +283,14 @@ func (s *interim) prepareDicts() {
 				dict[term] = pidPlus1
 				dictKeys = append(dictKeys, term)
 
-				numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
-				numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
+				s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
+				s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
 			}
 
 			pid := pidPlus1 - 1
 
-			numTermsPerPostingsList[pid] += 1
-			numLocsPerPostingsList[pid] += len(tf.Locations)
+			s.numTermsPerPostingsList[pid] += 1
+			s.numLocsPerPostingsList[pid] += len(tf.Locations)
 			totLocs += len(tf.Locations)
 		}
@@ -253,28 +318,64 @@ func (s *interim) prepareDicts() {
 	numPostingsLists := pidNext
 
-	s.Postings = make([]*roaring.Bitmap, numPostingsLists)
-	for i := 0; i < numPostingsLists; i++ {
-		s.Postings[i] = roaring.New()
+	if cap(s.Postings) >= numPostingsLists {
+		s.Postings = s.Postings[:numPostingsLists]
+	} else {
+		postings := make([]*roaring.Bitmap, numPostingsLists)
+		copy(postings, s.Postings[:cap(s.Postings)])
+		for i := 0; i < numPostingsLists; i++ {
+			if postings[i] == nil {
+				postings[i] = roaring.New()
+			}
+		}
+		s.Postings = postings
 	}
 
-	s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
-	for i := 0; i < numPostingsLists; i++ {
-		s.PostingsLocs[i] = roaring.New()
+	if cap(s.PostingsLocs) >= numPostingsLists {
+		s.PostingsLocs = s.PostingsLocs[:numPostingsLists]
+	} else {
+		postingsLocs := make([]*roaring.Bitmap, numPostingsLists)
+		copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)])
+		for i := 0; i < numPostingsLists; i++ {
+			if postingsLocs[i] == nil {
+				postingsLocs[i] = roaring.New()
+			}
+		}
+		s.PostingsLocs = postingsLocs
 	}
 
-	s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
+	if cap(s.FreqNorms) >= numPostingsLists {
+		s.FreqNorms = s.FreqNorms[:numPostingsLists]
+	} else {
+		s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
+	}
 
-	freqNormsBacking := make([]interimFreqNorm, totTFs)
-	for pid, numTerms := range numTermsPerPostingsList {
+	if cap(s.freqNormsBacking) >= totTFs {
+		s.freqNormsBacking = s.freqNormsBacking[:totTFs]
+	} else {
+		s.freqNormsBacking = make([]interimFreqNorm, totTFs)
+	}
+
+	freqNormsBacking := s.freqNormsBacking
+	for pid, numTerms := range s.numTermsPerPostingsList {
 		s.FreqNorms[pid] = freqNormsBacking[0:0]
 		freqNormsBacking = freqNormsBacking[numTerms:]
 	}
 
-	s.Locs = make([][]interimLoc, numPostingsLists)
+	if cap(s.Locs) >= numPostingsLists {
+		s.Locs = s.Locs[:numPostingsLists]
+	} else {
+		s.Locs = make([][]interimLoc, numPostingsLists)
+	}
 
-	locsBacking := make([]interimLoc, totLocs)
-	for pid, numLocs := range numLocsPerPostingsList {
+	if cap(s.locsBacking) >= totLocs {
+		s.locsBacking = s.locsBacking[:totLocs]
+	} else {
+		s.locsBacking = make([]interimLoc, totLocs)
+	}
+
+	locsBacking := s.locsBacking
+	for pid, numLocs := range s.numLocsPerPostingsList {
 		s.Locs[pid] = locsBacking[0:0]
 		locsBacking = locsBacking[numLocs:]
 	}
@@ -334,7 +435,7 @@ func (s *interim) processDocument(docNum uint64,
 	for term, tf := range tfs {
 		pid := dict[term] - 1
 		bs := s.Postings[pid]
-		bs.AddInt(int(docNum))
+		bs.Add(uint32(docNum))
 
 		s.FreqNorms[pid] = append(s.FreqNorms[pid],
 			interimFreqNorm{
@@ -344,7 +445,7 @@ func (s *interim) processDocument(docNum uint64,
 		if len(tf.Locations) > 0 {
 			locBS := s.PostingsLocs[pid]
-			locBS.AddInt(int(docNum))
+			locBS.Add(uint32(docNum))
 
 			locs := s.Locs[pid]
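
The prepareDicts changes above apply the same cap-check idiom repeatedly: reslice the existing backing slice when its capacity is large enough, and only fall back to make when it is not. A minimal sketch of that idiom (growInts is an illustrative helper, not part of the zap code):

// Grow a scratch slice only when its capacity is too small; otherwise
// reuse the existing backing array by reslicing.
package main

import "fmt"

// growInts returns a slice of length n, reusing s's backing array when possible.
func growInts(s []int, n int) []int {
    if cap(s) >= n {
        return s[:n] // reuse the existing allocation
    }
    return make([]int, n) // fall back to a fresh allocation
}

func main() {
    scratch := make([]int, 0, 8)
    a := growInts(scratch, 4)  // reslices, no allocation
    b := growInts(scratch, 16) // exceeds cap 8, allocates
    fmt.Println(len(a), cap(a), len(b), cap(b))
}

Reslicing keeps whatever values were already in the backing array, which is why cleanse() zeroes freqNormsBacking and locsBacking before the interim goes back into the pool.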

View File

@@ -266,7 +266,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
 	locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
 
-	rv.locBitmap = roaring.NewBitmap()
+	if rv.locBitmap == nil {
+		rv.locBitmap = roaring.NewBitmap()
+	}
 	_, err := rv.locBitmap.FromBuffer(locRoaringBytes)
 	if err != nil {
 		return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err)
@@ -278,7 +280,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
 	roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
 
-	rv.postings = roaring.NewBitmap()
+	if rv.postings == nil {
+		rv.postings = roaring.NewBitmap()
+	}
 	_, err = rv.postings.FromBuffer(roaringBytes)
 	if err != nil {
 		return fmt.Errorf("error loading roaring bitmap: %v", err)