From 531800c479915f58ab823f00fe1f5fa112695702 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 16:30:52 -0700 Subject: [PATCH 1/6] scorch zap use roaring Add() instead of AddInt() This change invokes Add() directly as AddInt() is a convenience wrapper around Add(). --- index/scorch/segment/zap/new.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 3a8b2012..f29711c0 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -334,7 +334,7 @@ func (s *interim) processDocument(docNum uint64, for term, tf := range tfs { pid := dict[term] - 1 bs := s.Postings[pid] - bs.AddInt(int(docNum)) + bs.Add(uint32(docNum)) s.FreqNorms[pid] = append(s.FreqNorms[pid], interimFreqNorm{ @@ -344,7 +344,7 @@ func (s *interim) processDocument(docNum uint64, if len(tf.Locations) > 0 { locBS := s.PostingsLocs[pid] - locBS.AddInt(int(docNum)) + locBS.Add(uint32(docNum)) locs := s.Locs[pid] From c4ceffe58430d4f4103d5cca7a0777dceebd8af6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 12:09:21 -0700 Subject: [PATCH 2/6] scorch zap sync Pool for interim data --- index/scorch/segment/zap/new.go | 93 +++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 15 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index f29711c0..c7a6b0ce 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "math" "sort" + "sync" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" @@ -35,12 +36,11 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, chunkFactor uint32) (*SegmentBase, error) { var br bytes.Buffer - s := interim{ - results: results, - chunkFactor: chunkFactor, - w: NewCountHashWriter(&br), - FieldsMap: map[string]uint16{}, - } + s := interimPool.Get().(*interim) + + s.results = results + s.chunkFactor = chunkFactor + s.w = NewCountHashWriter(&br) storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, err := s.convert() @@ -52,9 +52,13 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, s.FieldsMap, s.FieldsInv, uint64(len(results)), storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + interimPool.Put(s.cleanse()) + return sb, err } +var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} + // interim holds temporary working data used while converting from // analysis results to a zap-encoded segment type interim struct { @@ -101,6 +105,41 @@ type interim struct { tmp1 []byte } +func (s *interim) cleanse() *interim { + s.results = nil + s.chunkFactor = 0 + s.w = nil + s.FieldsMap = nil + s.FieldsInv = s.FieldsInv[:0] + for i := range s.Dicts { + s.Dicts[i] = nil + } + s.Dicts = s.Dicts[:0] + for i := range s.DictKeys { + s.DictKeys[i] = s.DictKeys[i][:0] + } + s.DictKeys = s.DictKeys[:0] + for i := range s.IncludeDocValues { + s.IncludeDocValues[i] = false + } + s.IncludeDocValues = s.IncludeDocValues[:0] + for _, idn := range s.Postings { + idn.Clear() + } + s.Postings = s.Postings[:0] + for _, idn := range s.PostingsLocs { + idn.Clear() + } + s.PostingsLocs = s.PostingsLocs[:0] + s.FreqNorms = nil + s.Locs = nil + s.buf0.Reset() + s.tmp0 = s.tmp0[:0] + s.tmp1 = s.tmp1[:0] + + return s +} + func (s *interim) grabBuf(size int) []byte { buf := s.tmp0 if cap(buf) < size { @@ -130,6 +169,8 @@ type interimLoc struct { } func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { + s.FieldsMap = map[string]uint16{} + s.getOrDefineField("_id") // _id field is fieldID 0 for _, result := range s.results { @@ -143,12 +184,15 @@ func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { sort.Strings(s.FieldsInv[1:]) // keep _id as first field - s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) for fieldID, fieldName := range s.FieldsInv { s.FieldsMap[fieldName] = uint16(fieldID + 1) } - s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + if cap(s.IncludeDocValues) >= len(s.FieldsInv) { + s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] + } else { + s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + } s.prepareDicts() @@ -189,9 +233,18 @@ func (s *interim) getOrDefineField(fieldName string) int { fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) s.FieldsMap[fieldName] = fieldIDPlus1 s.FieldsInv = append(s.FieldsInv, fieldName) + s.Dicts = append(s.Dicts, make(map[string]uint64)) - s.DictKeys = append(s.DictKeys, make([]string, 0)) + + n := len(s.DictKeys) + if n < cap(s.DictKeys) { + s.DictKeys = s.DictKeys[:n+1] + s.DictKeys[n] = s.DictKeys[n][:0] + } else { + s.DictKeys = append(s.DictKeys, []string(nil)) + } } + return int(fieldIDPlus1 - 1) } @@ -253,16 +306,25 @@ func (s *interim) prepareDicts() { numPostingsLists := pidNext - s.Postings = make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.Postings[i] = roaring.New() + if cap(s.Postings) >= numPostingsLists { + s.Postings = s.Postings[:numPostingsLists] + } else { + s.Postings = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.Postings[i] = roaring.New() + } } - s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.PostingsLocs[i] = roaring.New() + if cap(s.PostingsLocs) >= numPostingsLists { + s.PostingsLocs = s.PostingsLocs[:numPostingsLists] + } else { + s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.PostingsLocs[i] = roaring.New() + } } + // TODO: reuse this. s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) freqNormsBacking := make([]interimFreqNorm, totTFs) @@ -271,6 +333,7 @@ func (s *interim) prepareDicts() { freqNormsBacking = freqNormsBacking[numTerms:] } + // TODO: reuse this. s.Locs = make([][]interimLoc, numPostingsLists) locsBacking := make([]interimLoc, totLocs) From cad88096cacab844536d322312c0a59f150926c3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 16:30:34 -0700 Subject: [PATCH 3/6] scorch zap reuse roaring Bitmap during merge --- index/scorch/segment/zap/merge.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index a934dfc3..07de0943 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -183,6 +183,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err } + newRoaring := roaring.NewBitmap() + newRoaringLocs := roaring.NewBitmap() + // for each field for fieldID, fieldName := range fieldsInv { @@ -222,8 +225,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var prevTerm []byte - newRoaring := roaring.NewBitmap() - newRoaringLocs := roaring.NewBitmap() + newRoaring.Clear() + newRoaringLocs.Clear() var lastDocNum, lastFreq, lastNorm uint64 @@ -262,8 +265,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } } - newRoaring = roaring.NewBitmap() - newRoaringLocs = roaring.NewBitmap() + newRoaring.Clear() + newRoaringLocs.Clear() tfEncoder.Reset() locEncoder.Reset() From b1f39695217295421681d509744e55c0027b5f5d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 20:13:31 -0700 Subject: [PATCH 4/6] scorch zap reuse roaring Bitmap in postings lists --- index/scorch/segment/zap/dict.go | 12 ++++++++++++ index/scorch/segment/zap/posting.go | 8 ++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index e5d71268..3b8132f2 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -68,7 +68,19 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) if rv == nil { rv = &PostingsList{} } else { + postings := rv.postings + if postings != nil { + postings.Clear() + } + locBitmap := rv.locBitmap + if locBitmap != nil { + locBitmap.Clear() + } + *rv = PostingsList{} // clear the struct + + rv.postings = postings + rv.locBitmap = locBitmap } rv.sb = d.sb rv.except = except diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index f2df32bf..bc533ad1 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -266,7 +266,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] - rv.locBitmap = roaring.NewBitmap() + if rv.locBitmap == nil { + rv.locBitmap = roaring.NewBitmap() + } _, err := rv.locBitmap.FromBuffer(locRoaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) @@ -278,7 +280,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] - rv.postings = roaring.NewBitmap() + if rv.postings == nil { + rv.postings = roaring.NewBitmap() + } _, err = rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) From 07901910e24dabd67f9e26c88a03a8675e0fd87d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 21:07:14 -0700 Subject: [PATCH 5/6] scorch zap reuse roaring Bitmap in prepareDicts() slice growth In this change, if the postings/postingsLocs slices need to be grown, then copy over and reuse any of the preallocated roaring Bitmap's from the old slice. --- index/scorch/segment/zap/new.go | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index c7a6b0ce..5b625b99 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -309,19 +309,27 @@ func (s *interim) prepareDicts() { if cap(s.Postings) >= numPostingsLists { s.Postings = s.Postings[:numPostingsLists] } else { - s.Postings = make([]*roaring.Bitmap, numPostingsLists) + postings := make([]*roaring.Bitmap, numPostingsLists) + copy(postings, s.Postings[:cap(s.Postings)]) for i := 0; i < numPostingsLists; i++ { - s.Postings[i] = roaring.New() + if postings[i] == nil { + postings[i] = roaring.New() + } } + s.Postings = postings } if cap(s.PostingsLocs) >= numPostingsLists { s.PostingsLocs = s.PostingsLocs[:numPostingsLists] } else { - s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) + postingsLocs := make([]*roaring.Bitmap, numPostingsLists) + copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)]) for i := 0; i < numPostingsLists; i++ { - s.PostingsLocs[i] = roaring.New() + if postingsLocs[i] == nil { + postingsLocs[i] = roaring.New() + } } + s.PostingsLocs = postingsLocs } // TODO: reuse this. From dbfc5e913027c7dc5a85f9a9f91476babf8322da Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 12 Mar 2018 10:04:11 -0700 Subject: [PATCH 6/6] scorch zap reuse interim freq/norm/loc slices --- index/scorch/segment/zap/new.go | 68 ++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 5b625b99..51971ba5 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -95,10 +95,15 @@ type interim struct { PostingsLocs []*roaring.Bitmap // postings id -> freq/norm's, one for each docNum in postings - FreqNorms [][]interimFreqNorm + FreqNorms [][]interimFreqNorm + freqNormsBacking []interimFreqNorm // postings id -> locs, one for each freq - Locs [][]interimLoc + Locs [][]interimLoc + locsBacking []interimLoc + + numTermsPerPostingsList []int // key is postings list id + numLocsPerPostingsList []int // key is postings list id buf0 bytes.Buffer tmp0 []byte @@ -131,8 +136,18 @@ func (s *interim) cleanse() *interim { idn.Clear() } s.PostingsLocs = s.PostingsLocs[:0] - s.FreqNorms = nil - s.Locs = nil + s.FreqNorms = s.FreqNorms[:0] + for i := range s.freqNormsBacking { + s.freqNormsBacking[i] = interimFreqNorm{} + } + s.freqNormsBacking = s.freqNormsBacking[:0] + s.Locs = s.Locs[:0] + for i := range s.locsBacking { + s.locsBacking[i] = interimLoc{} + } + s.locsBacking = s.locsBacking[:0] + s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] + s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] s.buf0.Reset() s.tmp0 = s.tmp0[:0] s.tmp1 = s.tmp1[:0] @@ -252,9 +267,6 @@ func (s *interim) getOrDefineField(fieldName string) int { func (s *interim) prepareDicts() { var pidNext int - numTermsPerPostingsList := make([]int, 0, 64) // key is postings list id - numLocsPerPostingsList := make([]int, 0, 64) // key is postings list id - var totTFs int var totLocs int @@ -271,14 +283,14 @@ func (s *interim) prepareDicts() { dict[term] = pidPlus1 dictKeys = append(dictKeys, term) - numTermsPerPostingsList = append(numTermsPerPostingsList, 0) - numLocsPerPostingsList = append(numLocsPerPostingsList, 0) + s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) + s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) } pid := pidPlus1 - 1 - numTermsPerPostingsList[pid] += 1 - numLocsPerPostingsList[pid] += len(tf.Locations) + s.numTermsPerPostingsList[pid] += 1 + s.numLocsPerPostingsList[pid] += len(tf.Locations) totLocs += len(tf.Locations) } @@ -332,20 +344,38 @@ func (s *interim) prepareDicts() { s.PostingsLocs = postingsLocs } - // TODO: reuse this. - s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + if cap(s.FreqNorms) >= numPostingsLists { + s.FreqNorms = s.FreqNorms[:numPostingsLists] + } else { + s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + } - freqNormsBacking := make([]interimFreqNorm, totTFs) - for pid, numTerms := range numTermsPerPostingsList { + if cap(s.freqNormsBacking) >= totTFs { + s.freqNormsBacking = s.freqNormsBacking[:totTFs] + } else { + s.freqNormsBacking = make([]interimFreqNorm, totTFs) + } + + freqNormsBacking := s.freqNormsBacking + for pid, numTerms := range s.numTermsPerPostingsList { s.FreqNorms[pid] = freqNormsBacking[0:0] freqNormsBacking = freqNormsBacking[numTerms:] } - // TODO: reuse this. - s.Locs = make([][]interimLoc, numPostingsLists) + if cap(s.Locs) >= numPostingsLists { + s.Locs = s.Locs[:numPostingsLists] + } else { + s.Locs = make([][]interimLoc, numPostingsLists) + } - locsBacking := make([]interimLoc, totLocs) - for pid, numLocs := range numLocsPerPostingsList { + if cap(s.locsBacking) >= totLocs { + s.locsBacking = s.locsBacking[:totLocs] + } else { + s.locsBacking = make([]interimLoc, totLocs) + } + + locsBacking := s.locsBacking + for pid, numLocs := range s.numLocsPerPostingsList { s.Locs[pid] = locsBacking[0:0] locsBacking = locsBacking[numLocs:] }