From 88c740095b7a5224c98e7538a0ee3c7bca574ee2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 10:59:53 -0800 Subject: [PATCH 01/18] scorch optimizations for mem.PostingsIterator.Next() & docTermMap Due to the usage rules of iterators, mem.PostingsIterator.Next() can reuse its returned Postings instance. Also, there's a micro optimization in persistDocValues() for one fewer access to the docTermMap in the inner-loop. --- index/scorch/segment/mem/posting.go | 6 +++--- index/scorch/segment/zap/build.go | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index d91a0056..25cbeb45 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -73,6 +73,7 @@ type PostingsIterator struct { offset int locoffset int actual roaring.IntIterable + reuse Posting } // Next returns the next posting on the postings list, or nil at the end @@ -92,17 +93,16 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { i.offset++ allN = i.all.Next() } - rv := &Posting{ + i.reuse = Posting{ iterator: i, docNum: uint64(n), offset: i.offset, locoffset: i.locoffset, hasLoc: i.locations.Contains(n), } - i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) i.offset++ - return rv, nil + return &i.reuse, nil } // Posting is a single entry in a postings list diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 77f18b05..b075496c 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -552,8 +552,7 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() - docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...) - docTermMap[docNum] = append(docTermMap[docNum], termSeparator) + docTermMap[docNum] = append(append(docTermMap[docNum], []byte(next.Term)...), termSeparator) nextPosting, err2 = postingsItr.Next() } if err2 != nil { @@ -562,10 +561,10 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, next, err = dictItr.Next() } - if err != nil { return nil, err } + // sort wrt to docIDs var docNumbers docIDRange for k := range docTermMap { From b7cfef81c9a86d590b2a9ccdaed184c3e1bbcdef Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 11:43:22 -0800 Subject: [PATCH 02/18] scorch optimize mem processDocument() dict access This change moves the dict lookup to outside of the loop. 
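For reference, a minimal runnable sketch of the hoisting pattern this patch
applies; the types and data below are illustrative stand-ins, not the actual
mem.Segment fields:

    package main

    import "fmt"

    // dicts mirrors the shape of a per-field term dictionary,
    // fieldID -> (term -> postingsID+1); hypothetical data.
    var dicts = []map[string]uint64{
        {"apple": 1, "banana": 2},
    }

    func main() {
        docMap := []map[string]int{
            {"apple": 3, "banana": 1},
        }
        for fieldID, tokenFrequencies := range docMap {
            dict := dicts[fieldID] // hoisted: one per-field lookup
            for term, freq := range tokenFrequencies {
                pid := dict[term] - 1 // inner loop no longer re-indexes by fieldID
                fmt.Println(term, pid, freq)
            }
        }
    }

The dictionary for a field does not change while the term loop runs, so the
single lookup per field is behavior-preserving.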
--- index/scorch/segment/mem/build.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 57d60dc8..2a2683dc 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -252,8 +252,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { + dict := s.Dicts[fieldID] for term, tokenFreq := range tokenFrequencies { - pid := s.Dicts[fieldID][term] - 1 + pid := dict[term] - 1 bs := s.Postings[pid] bs.AddInt(int(docNum)) s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) From 6ae799052a8eaf945a769f526c209bd1385d3fbf Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 11:52:27 -0800 Subject: [PATCH 03/18] scorch mem optimize processDocument() stored field --- index/scorch/segment/mem/build.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 2a2683dc..3a892f9a 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -222,12 +222,6 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { } } - storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) { - s.Stored[docNum][field] = append(s.Stored[docNum][field], val) - s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ) - s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos) - } - // walk each composite field for _, field := range result.Document.CompositeFields { fieldID := uint16(s.getOrDefineField(field.Name())) @@ -235,6 +229,10 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { processField(fieldID, field.Name(), l, tf) } + docStored := s.Stored[docNum] + docStoredTypes := s.StoredTypes[docNum] + docStoredPos := s.StoredPos[docNum] + // walk each field for i, field := range result.Document.Fields { fieldID := uint16(s.getOrDefineField(field.Name())) @@ -242,7 +240,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { tf := result.Analyzed[i] processField(fieldID, field.Name(), l, tf) if field.Options().IsStored() { - storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions()) + docStored[fieldID] = append(docStored[fieldID], field.Value()) + docStoredTypes[fieldID] = append(docStoredTypes[fieldID], encodeFieldType(field)) + docStoredPos[fieldID] = append(docStoredPos[fieldID], field.ArrayPositions()) } if field.Options().IncludeDocValues() { From 884da6f93a3c0b9a4567501ad4ddf96eea739227 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 11:58:28 -0800 Subject: [PATCH 04/18] scorch optimize mem processDocument() norm calculation This change moves the norm calculation outside of the inner loop. 
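A small sketch of why the hoist is safe (hypothetical field lengths, not the
actual segment data): the norm depends only on the field's length, so it is
identical for every term in that field and only needs computing once.

    package main

    import (
        "fmt"
        "math"
    )

    func main() {
        fieldLens := []int{4, 9}
        for fieldID, fieldLen := range fieldLens {
            // computed once per field; every term in the field shares it
            norm := float32(1.0 / math.Sqrt(float64(fieldLen)))
            fmt.Printf("field %d norm %v\n", fieldID, norm)
        }
    }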
--- index/scorch/segment/mem/build.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 3a892f9a..643ae36e 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -253,12 +253,13 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { dict := s.Dicts[fieldID] + norm := float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))) for term, tokenFreq := range tokenFrequencies { pid := dict[term] - 1 bs := s.Postings[pid] bs.AddInt(int(docNum)) s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) - s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) + s.Norms[pid] = append(s.Norms[pid], norm) locationBS := s.PostingsLocs[pid] if len(tokenFreq.Locations) > 0 { locationBS.AddInt(int(docNum)) From d44c5ad5682450baff48ead3345d69f9ec5399cc Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 19:32:39 -0800 Subject: [PATCH 05/18] scorch stats MaxBatchIntroTime bug fix and more timing stats Added timing stats for in-mem zap merging and file-based zap merging. --- index/scorch/merge.go | 19 +++++++++++++++++++ index/scorch/scorch.go | 2 +- index/scorch/stats.go | 8 ++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 005d4f41..ee3ec46c 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -179,9 +179,19 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, filename := zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename + + fileMergeZapStartTime := time.Now() + atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) + + fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) + atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) + if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime { + atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime) + } + if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -258,11 +268,20 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, cr := zap.NewCountHashWriter(&br) + memMergeZapStartTime := time.Now() + atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, err := zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) + + memMergeZapTime := uint64(time.Since(memMergeZapStartTime)) + atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime) + if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime { + atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime) + } + if err != nil { atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return 0, nil, 0, err diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 87372a32..a40f374a 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -365,7 +365,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, introTime := uint64(time.Since(introStartTime)) atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime) if 
atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime { - atomic.AddUint64(&s.stats.MaxBatchIntroTime, introTime) + atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime) } return err diff --git a/index/scorch/stats.go b/index/scorch/stats.go index cd416a7c..3c978af7 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -88,8 +88,10 @@ type Stats struct { TotFileMergeSegmentsEmpty uint64 TotFileMergeSegments uint64 - TotFileMergeZapBeg uint64 - TotFileMergeZapEnd uint64 + TotFileMergeZapBeg uint64 + TotFileMergeZapEnd uint64 + TotFileMergeZapTime uint64 + MaxFileMergeZapTime uint64 TotFileMergeIntroductions uint64 TotFileMergeIntroductionsDone uint64 @@ -99,6 +101,8 @@ type Stats struct { TotMemMergeDone uint64 TotMemMergeZapBeg uint64 TotMemMergeZapEnd uint64 + TotMemMergeZapTime uint64 + MaxMemMergeZapTime uint64 TotMemMergeSegments uint64 } From 85761c6a57987ece0b193042ddf953f3f360bad1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 19:39:21 -0800 Subject: [PATCH 06/18] go fmt --- index/scorch/segment/mem/build.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 643ae36e..57971aae 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -253,7 +253,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { dict := s.Dicts[fieldID] - norm := float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))) + norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) for term, tokenFreq := range tokenFrequencies { pid := dict[term] - 1 bs := s.Postings[pid] From 8c0881eab2caef301835eb9b0660e0b727d06687 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 12:03:02 -0800 Subject: [PATCH 07/18] scorch zap build reuses mem postingsList/Iterator structs --- index/scorch/segment/mem/dict.go | 20 ++++++++++++++------ index/scorch/segment/mem/posting.go | 11 +++++++++-- index/scorch/segment/zap/build.go | 8 ++++++-- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index cf92ef71..b564ed1f 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -33,12 +33,20 @@ type Dictionary struct { // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - return &PostingsList{ - dictionary: d, - term: term, - postingsID: d.segment.Dicts[d.fieldID][term], - except: except, - }, nil + return d.InitPostingsList(term, except, nil) +} + +func (d *Dictionary) InitPostingsList(term string, except *roaring.Bitmap, + prealloc *PostingsList) (*PostingsList, error) { + rv := prealloc + if rv == nil { + rv = &PostingsList{} + } + rv.dictionary = d + rv.term = term + rv.postingsID = d.segment.Dicts[d.fieldID][term] + rv.except = except + return rv, nil } // Iterator returns an iterator for this dictionary diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 25cbeb45..2554333a 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -46,9 +46,16 @@ func (p *PostingsList) Count() uint64 { // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator() segment.PostingsIterator { - rv := &PostingsIterator{ - postings: p, + return 
p.InitIterator(nil) +} +func (p *PostingsList) InitIterator(prealloc *PostingsIterator) *PostingsIterator { + rv := prealloc + if rv == nil { + rv = &PostingsIterator{postings: p} + } else { + *rv = PostingsIterator{postings: p} } + if p.postingsID > 0 { allbits := p.dictionary.segment.Postings[p.postingsID-1] rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1] diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index b075496c..7fbc995f 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -532,6 +532,9 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) + var postings *mem.PostingsList + var postingsItr *mem.PostingsIterator + for fieldID := range memSegment.DocValueFields { field := memSegment.FieldsInv[fieldID] docTermMap := make(map[uint64][]byte, 0) @@ -543,12 +546,13 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, dictItr := dict.Iterator() next, err := dictItr.Next() for err == nil && next != nil { - postings, err1 := dict.PostingsList(next.Term, nil) + var err1 error + postings, err1 = dict.(*mem.Dictionary).InitPostingsList(next.Term, nil, postings) if err1 != nil { return nil, err } - postingsItr := postings.Iterator() + postingsItr = postings.InitIterator(postingsItr) nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() From 856778ad7bb4d4c8433a504de1302360ac4a5dbc Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 12:06:45 -0800 Subject: [PATCH 08/18] scorch zap build prealloc docNumbers capacity --- index/scorch/segment/zap/build.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 7fbc995f..4edd277d 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -570,7 +570,7 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, } // sort wrt to docIDs - var docNumbers docIDRange + docNumbers := make(docIDRange, 0, len(docTermMap)) for k := range docTermMap { docNumbers = append(docNumbers, k) } From a338386a038594f37d5b7901c4d3a73e53424787 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 12:56:33 -0800 Subject: [PATCH 09/18] scorch build optimize freq/loc slice capacity --- index/scorch/segment/zap/build.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 4edd277d..eec9998b 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -308,7 +308,7 @@ func persistStoredFieldValues(fieldID int, } func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { - var freqOffsets, locOfffsets []uint64 + freqOffsets := make([]uint64, 0, len(memSegment.Postings)) tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) for postingID := range memSegment.Postings { if postingID != 0 { @@ -351,6 +351,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } // now do it again for the locations + locOffsets := make([]uint64, 0, len(memSegment.Postings)) locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) for 
postingID := range memSegment.Postings { if postingID != 0 { @@ -414,14 +415,15 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } // record where this postings loc info starts - locOfffsets = append(locOfffsets, uint64(w.Count())) + locOffsets = append(locOffsets, uint64(w.Count())) locEncoder.Close() _, err := locEncoder.Write(w) if err != nil { return nil, nil, err } } - return freqOffsets, locOfffsets, nil + + return freqOffsets, locOffsets, nil } func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { From 8f8fd511b7d2a6f221a5cc51641017b4be9c2a40 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 13:01:22 -0800 Subject: [PATCH 10/18] scorch zap access freqs[offset] outside loop --- index/scorch/segment/zap/build.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index eec9998b..237cc5f3 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -368,7 +368,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac var locOffset int for postingsListItr.HasNext() { docNum := uint64(postingsListItr.Next()) - for i := 0; i < int(freqs[offset]); i++ { + n := int(freqs[offset]) + for i := 0; i < n; i++ { if len(locfields) > 0 { // put field err := locEncoder.Add(docNum, uint64(locfields[locOffset])) From 502e64c2567c363d3cf0f2b1b542b9c1db74e973 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Mar 2018 16:33:13 -0800 Subject: [PATCH 11/18] scorch zap Posting doesn't use iterator field --- index/scorch/segment/zap/posting.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index ada39b43..27d90f2b 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -337,7 +337,6 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { reuseLocs := i.next.locs // hold for reuse before struct clearing i.next = Posting{} // clear the struct rv := &i.next - rv.iterator = i rv.docNum = uint64(n) var err error @@ -373,12 +372,10 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { // Posting is a single entry in a postings list type Posting struct { - iterator *PostingsIterator - docNum uint64 - - freq uint64 - norm float32 - locs []segment.Location + docNum uint64 + freq uint64 + norm float32 + locs []segment.Location } // Number returns the document number of this posting in this segment From 655268bec821e10be03d23734d60db6804f7cb82 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 07:55:26 -0800 Subject: [PATCH 12/18] scorch zap postings iterator nextDocNum() helper method Refactored out a nextDocNum() helper method from Next() that future optimizations can use. 
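A toy sketch of the shape of this refactor (simplified types, not the zap
API): once the positioning logic lives in nextDocNum(), alternate consumers,
such as the raw-bytes path added in PATCH 13, can reuse it without
duplicating the chunk/offset bookkeeping.

    package main

    import "fmt"

    type toyItr struct {
        docNums []uint64
        off     int
    }

    // nextDocNum owns the positioning logic, shared by all consumers.
    func (i *toyItr) nextDocNum() (uint64, bool) {
        if i.off >= len(i.docNums) {
            return 0, false
        }
        n := i.docNums[i.off]
        i.off++
        return n, true
    }

    // Next materializes a decoded hit; a nextBytes-style method would
    // instead hand back still-encoded bytes after the same positioning.
    func (i *toyItr) Next() (uint64, bool) {
        return i.nextDocNum()
    }

    func main() {
        it := &toyItr{docNums: []uint64{3, 7, 9}}
        for n, ok := it.Next(); ok; n, ok = it.Next() {
            fmt.Println(n)
        }
    }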
--- index/scorch/segment/zap/posting.go | 122 +++++++++++++++------------- 1 file changed, 67 insertions(+), 55 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 27d90f2b..adc399ea 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -279,74 +279,26 @@ func (i *PostingsIterator) readLocation(l *Location) error { // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { - if i.actual == nil || !i.actual.HasNext() { + docNum, exists, err := i.nextDocNum() + if err != nil { + return nil, err + } + if !exists { return nil, nil } - n := i.actual.Next() - nChunk := n / i.postings.sb.chunkFactor - allN := i.all.Next() - allNChunk := allN / i.postings.sb.chunkFactor - - // n is the next actual hit (excluding some postings) - // allN is the next hit in the full postings - // if they don't match, adjust offsets to factor in item we're skipping over - // incr the all iterator, and check again - for allN != n { - - // in different chunks, reset offsets - if allNChunk != nChunk { - i.locoffset = 0 - i.offset = 0 - } else { - - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return nil, fmt.Errorf("error loading chunk: %v", err) - } - } - - // read off freq/offsets even though we don't care about them - freq, _, err := i.readFreqNorm() - if err != nil { - return nil, err - } - if i.locBitmap.Contains(allN) { - for j := 0; j < int(freq); j++ { - err := i.readLocation(nil) - if err != nil { - return nil, err - } - } - } - - // in same chunk, need to account for offsets - i.offset++ - } - - allN = i.all.Next() - } - - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return nil, fmt.Errorf("error loading chunk: %v", err) - } - } reuseLocs := i.next.locs // hold for reuse before struct clearing i.next = Posting{} // clear the struct rv := &i.next - rv.docNum = uint64(n) + rv.docNum = docNum - var err error var normBits uint64 rv.freq, normBits, err = i.readFreqNorm() if err != nil { return nil, err } rv.norm = math.Float32frombits(uint32(normBits)) - if i.locBitmap.Contains(n) { + if i.locBitmap.Contains(uint32(docNum)) { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ -370,6 +322,66 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return rv, nil } +// nextDocNum returns the next docNum on the postings list, and also +// sets up the currChunk / loc related fields of the iterator. 
+func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { + if i.actual == nil || !i.actual.HasNext() { + return 0, false, nil + } + + n := i.actual.Next() + nChunk := n / i.postings.sb.chunkFactor + allN := i.all.Next() + allNChunk := allN / i.postings.sb.chunkFactor + + // n is the next actual hit (excluding some postings) + // allN is the next hit in the full postings + // if they don't match, adjust offsets to factor in item we're skipping over + // incr the all iterator, and check again + for allN != n { + // in different chunks, reset offsets + if allNChunk != nChunk { + i.locoffset = 0 + i.offset = 0 + } else { + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about them + freq, _, err := i.readFreqNorm() + if err != nil { + return 0, false, err + } + if i.locBitmap.Contains(allN) { + for j := 0; j < int(freq); j++ { + err := i.readLocation(nil) + if err != nil { + return 0, false, err + } + } + } + + // in same chunk, need to account for offsets + i.offset++ + } + + allN = i.all.Next() + } + + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + // Posting is a single entry in a postings list type Posting struct { docNum uint64 From 530a3d24cf0768f4c7a82e9a61dd9a0eff3ec8a2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 07:58:42 -0800 Subject: [PATCH 13/18] scorch zap optimize merge by byte copying freq/norm/loc's This change adds a zap PostingsIterator.nextBytes() method, which is similar to Next(), but instead of returning a Posting instance, nextBytes() returns the encoded freq/norm and location byte slices. The zap merge code then provides those byte slices directly to the intCoder's via a new method, intCoder.AddBytes(), thereby avoiding having to encode many uvarint's. --- index/scorch/segment/zap/intcoder.go | 13 ++++++++ index/scorch/segment/zap/merge.go | 42 +++++++++----------------- index/scorch/segment/zap/posting.go | 44 +++++++++++++++++++++++++--- 3 files changed, 67 insertions(+), 32 deletions(-) diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 247e36fb..8d1f9453 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -82,6 +82,19 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { return nil } +func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + _, err := c.chunkBuf.Write(buf) + return err +} + // Close indicates you are done calling Add() this allows the final chunk // to be encoded. 
func (c *chunkedIntCoder) Close() { diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 33ce16c5..5066dfb9 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -162,7 +162,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var bufReuse bytes.Buffer var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) - var bufLoc []uint64 var postings *PostingsList var postItr *PostingsIterator @@ -316,45 +315,32 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, newDocNumsI := newDocNums[itrI] postItr = postings.iterator(postItr) - next, err2 := postItr.Next() - for next != nil && err2 == nil { - hitNewDocNum := newDocNumsI[next.Number()] + + nextDocNum, nextFreqNormBytes, nextLocBytes, err2 := postItr.nextBytes() + for err2 == nil && len(nextFreqNormBytes) > 0 { + hitNewDocNum := newDocNumsI[nextDocNum] if hitNewDocNum == docDropped { return nil, 0, fmt.Errorf("see hit with dropped doc num") } + newRoaring.Add(uint32(hitNewDocNum)) - // encode norm bits - norm := next.Norm() - normBits := math.Float32bits(float32(norm)) - err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) - if err != nil { - return nil, 0, err + err2 = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) + if err2 != nil { + return nil, 0, err2 } - locs := next.Locations() - if len(locs) > 0 { + + if len(nextLocBytes) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) - for _, loc := range locs { - if cap(bufLoc) < 5+len(loc.ArrayPositions()) { - bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) - } - args := bufLoc[0:5] - args[0] = uint64(fieldsMap[loc.Field()] - 1) - args[1] = loc.Pos() - args[2] = loc.Start() - args[3] = loc.End() - args[4] = uint64(len(loc.ArrayPositions())) - args = append(args, loc.ArrayPositions()...) - err = locEncoder.Add(hitNewDocNum, args...) 
- if err != nil { - return nil, 0, err - } + err2 = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) + if err2 != nil { + return nil, 0, err2 } } docTermMap[hitNewDocNum] = append(append(docTermMap[hitNewDocNum], term...), termSeparator) - next, err2 = postItr.Next() + nextDocNum, nextFreqNormBytes, nextLocBytes, err2 = postItr.nextBytes() } if err2 != nil { return nil, 0, err2 diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index adc399ea..2dab4166 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -280,12 +280,9 @@ func (i *PostingsIterator) readLocation(l *Location) error { // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { docNum, exists, err := i.nextDocNum() - if err != nil { + if err != nil || !exists { return nil, err } - if !exists { - return nil, nil - } reuseLocs := i.next.locs // hold for reuse before struct clearing i.next = Posting{} // clear the struct @@ -322,6 +319,45 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return rv, nil } +// nextBytes returns the docNum and the encoded freq & loc bytes for +// the next posting +func (i *PostingsIterator) nextBytes() (uint64, []byte, []byte, error) { + docNum, exists, err := i.nextDocNum() + if err != nil { + return 0, nil, nil, err + } + if !exists { + return 0, nil, nil, nil + } + + startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() + + freq, _, err := i.readFreqNorm() + if err != nil { + return 0, nil, nil, err + } + + endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() + bytesFreqNorm := i.currChunkFreqNorm[startFreqNorm:endFreqNorm] + + var bytesLoc []byte + if i.locBitmap.Contains(uint32(docNum)) { + startLoc := len(i.currChunkLoc) - i.locReader.Len() + + for j := uint64(0); j < freq; j++ { + err := i.readLocation(nil) + if err != nil { + return 0, nil, nil, err + } + } + + endLoc := len(i.currChunkLoc) - i.locReader.Len() + bytesLoc = i.currChunkLoc[startLoc:endLoc] + } + + return docNum, bytesFreqNorm, bytesLoc, nil +} + // nextDocNum returns the next docNum on the postings list, and also // sets up the currChunk / loc related fields of the iterator. 
func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { From 5b86da85f358199d80436005d104607a7fed867d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 08:06:12 -0800 Subject: [PATCH 14/18] scorch zap optimize postings itr with tf/loc reader/decoder reuse --- index/scorch/segment/zap/posting.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 2dab4166..589c7cb8 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -45,7 +45,25 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { if rv == nil { rv = &PostingsIterator{} } else { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.Reset([]byte(nil)) + } + freqNormDecoder := rv.freqNormDecoder + + locReader := rv.locReader + if locReader != nil { + locReader.Reset([]byte(nil)) + } + locDecoder := rv.locDecoder + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.freqNormDecoder = freqNormDecoder + + rv.locReader = locReader + rv.locDecoder = locDecoder } rv.postings = p From 7e36109b3c83f2e454805e709bd850e859574244 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 1 Mar 2018 17:12:16 -0800 Subject: [PATCH 15/18] MB-28162: Provide API to estimate memory needed to run a search query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This API (unexported) will estimate the amount of memory needed to execute a search query over an index before the collector begins data collection. Sample estimates for certain queries: {Size: 10, BenchmarkUpsidedownSearchOverhead} ESTIMATE BENCHMEM TermQuery 4616 4796 MatchQuery 5210 5405 DisjunctionQuery (Match queries) 7700 8447 DisjunctionQuery (Term queries) 6514 6591 ConjunctionQuery (Match queries) 7524 8175 Nested disjunction query (disjunction of disjunctions) 10306 10708 … --- document/document.go | 21 +++++- document/field_composite.go | 9 +++ index/index.go | 35 ++++++++++ index/scorch/scorch.go | 10 +-- index/scorch/segment/empty.go | 12 ++++ index/scorch/segment/mem/build.go | 2 +- index/scorch/segment/mem/dict.go | 20 ++++++ index/scorch/segment/mem/posting.go | 62 ++++++++++++++++++ index/scorch/segment/mem/segment.go | 83 ++++++++++++------------ index/scorch/segment/mem/segment_test.go | 2 +- index/scorch/segment/segment.go | 15 +++-- index/scorch/segment/zap/contentcoder.go | 8 +++ index/scorch/segment/zap/docvalues.go | 31 +++++---- index/scorch/segment/zap/posting.go | 79 ++++++++++++++++++++++ index/scorch/segment/zap/segment.go | 78 +++++++++++----------- index/scorch/snapshot_index.go | 7 ++ index/scorch/snapshot_index_doc.go | 13 ++++ index/scorch/snapshot_index_tfr.go | 30 +++++++++ index/scorch/snapshot_segment.go | 4 +- index/upsidedown/index_reader.go | 14 ++++ index/upsidedown/reader.go | 39 ++++++++++- index/upsidedown/row.go | 29 +++++++++ index_impl.go | 53 ++++++++++++++- index_test.go | 55 ++++++++++++++++ search.go | 30 +++++++++ search/collector/search_test.go | 18 +++++ search/collector/topn.go | 25 +++++++ search/explanation.go | 21 ++++++ search/facet/facet_builder_datetime.go | 29 +++++++++ search/facet/facet_builder_numeric.go | 29 +++++++++ search/facet/facet_builder_terms.go | 21 ++++++ search/facets_builder.go | 47 ++++++++++++++ search/pool.go | 11 ++++ search/scorer/scorer_conjunction.go | 14 ++++ search/scorer/scorer_constant.go | 19 ++++++ 
search/scorer/scorer_disjunction.go | 13 ++++ search/scorer/scorer_term.go | 24 +++++++ search/search.go | 82 +++++++++++++++++++++++ search/searcher/search_boolean.go | 36 ++++++++++ search/searcher/search_conjunction.go | 26 ++++++++ search/searcher/search_disjunction.go | 35 ++++++++++ search/searcher/search_docid.go | 16 +++++ search/searcher/search_filter.go | 15 +++++ search/searcher/search_match_all.go | 17 +++++ search/searcher/search_match_none.go | 15 +++++ search/searcher/search_phrase.go | 31 +++++++++ search/searcher/search_term.go | 18 +++++ size/sizes.go | 57 ++++++++++++++++ 48 files changed, 1242 insertions(+), 118 deletions(-) create mode 100644 size/sizes.go diff --git a/document/document.go b/document/document.go index c37585c6..921098b0 100644 --- a/document/document.go +++ b/document/document.go @@ -14,7 +14,19 @@ package document -import "fmt" +import ( + "fmt" + "reflect" + + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeDocument int + +func init() { + var d Document + reflectStaticSizeDocument = int(reflect.TypeOf(d).Size()) +} type Document struct { ID string `json:"id"` @@ -30,6 +42,13 @@ func NewDocument(id string) *Document { } } +func (d *Document) Size() int { + return reflectStaticSizeDocument + size.SizeOfPtr + + len(d.ID) + + len(d.Fields)*size.SizeOfPtr + + len(d.CompositeFields)*(size.SizeOfPtr+reflectStaticSizeCompositeField) +} + func (d *Document) AddField(f Field) *Document { switch f := f.(type) { case *CompositeField: diff --git a/document/field_composite.go b/document/field_composite.go index b41b1b8e..e53cd456 100644 --- a/document/field_composite.go +++ b/document/field_composite.go @@ -15,9 +15,18 @@ package document import ( + "reflect" + "github.com/blevesearch/bleve/analysis" ) +var reflectStaticSizeCompositeField int + +func init() { + var cf CompositeField + reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size()) +} + const DefaultCompositeIndexingOptions = IndexField type CompositeField struct { diff --git a/index/index.go b/index/index.go index 9870b417..c25d7fa4 100644 --- a/index/index.go +++ b/index/index.go @@ -18,11 +18,23 @@ import ( "bytes" "encoding/json" "fmt" + "reflect" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermFieldDoc int +var reflectStaticSizeTermFieldVector int + +func init() { + var tfd TermFieldDoc + reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size()) + var tfv TermFieldVector + reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size()) +} + var ErrorUnknownStorageType = fmt.Errorf("unknown storage type") type Index interface { @@ -82,6 +94,8 @@ type IndexReader interface { DumpFields() chan interface{} Close() error + + Size() int } // FieldTerms contains the terms used by a document, keyed by field @@ -115,6 +129,11 @@ type TermFieldVector struct { End uint64 } +func (tfv *TermFieldVector) Size() int { + return reflectStaticSizeTermFieldVector + size.SizeOfPtr + + len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64 +} + // IndexInternalID is an opaque document identifier interal to the index impl type IndexInternalID []byte @@ -134,6 +153,17 @@ type TermFieldDoc struct { Vectors []*TermFieldVector } +func (tfd *TermFieldDoc) Size() int { + sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr + + len(tfd.Term) + len(tfd.ID) + + for _, entry := range tfd.Vectors { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + // Reset allows an already 
allocated TermFieldDoc to be reused func (tfd *TermFieldDoc) Reset() *TermFieldDoc { // remember the []byte used for the ID @@ -161,6 +191,8 @@ type TermFieldReader interface { // Count returns the number of documents contains the term in this field. Count() uint64 Close() error + + Size() int } type DictEntry struct { @@ -185,6 +217,9 @@ type DocIDReader interface { // will start there instead. If ID is greater than or equal to the end of // the range, Next() call will return io.EOF. Advance(ID IndexInternalID) (IndexInternalID, error) + + Size() int + Close() error } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index a40f374a..2a9eb634 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -472,20 +472,20 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { } func (s *Scorch) MemoryUsed() uint64 { - var memUsed uint64 + var memUsed int s.rootLock.RLock() if s.root != nil { for _, segmentSnapshot := range s.root.segment { memUsed += 8 /* size of id -> uint64 */ + - segmentSnapshot.segment.SizeInBytes() + segmentSnapshot.segment.Size() if segmentSnapshot.deleted != nil { - memUsed += segmentSnapshot.deleted.GetSizeInBytes() + memUsed += int(segmentSnapshot.deleted.GetSizeInBytes()) } - memUsed += segmentSnapshot.cachedDocs.sizeInBytes() + memUsed += segmentSnapshot.cachedDocs.size() } } s.rootLock.RUnlock() - return memUsed + return uint64(memUsed) } func (s *Scorch) markIneligibleForRemoval(filename string) { diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 83454644..6c19f60f 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -46,6 +46,10 @@ func (e *EmptySegment) Close() error { return nil } +func (e *EmptySegment) Size() uint64 { + return 0 +} + func (e *EmptySegment) AddRef() { } @@ -84,6 +88,10 @@ func (e *EmptyPostingsList) Iterator() PostingsIterator { return &EmptyPostingsIterator{} } +func (e *EmptyPostingsList) Size() int { + return 0 +} + func (e *EmptyPostingsList) Count() uint64 { return 0 } @@ -93,3 +101,7 @@ type EmptyPostingsIterator struct{} func (e *EmptyPostingsIterator) Next() (Posting, error) { return nil, nil } + +func (e *EmptyPostingsIterator) Size() int { + return 0 +} diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 57971aae..264c94d1 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -45,7 +45,7 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { } // compute memory usage of segment - s.updateSizeInBytes() + s.updateSize() // professional debugging // diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index b564ed1f..9f5a873a 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -15,14 +15,23 @@ package mem import ( + "reflect" "sort" "strings" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDictionary int + +func init() { + var d Dictionary + reflectStaticSizeDictionary = int(reflect.TypeOf(d).Size()) +} + // Dictionary is the in-memory representation of the term dictionary type Dictionary struct { segment *Segment @@ -30,6 +39,17 @@ type Dictionary struct { fieldID uint16 } +func (d *Dictionary) Size() int { + sizeInBytes := reflectStaticSizeDictionary + size.SizeOfPtr + + len(d.field) + + if d.segment != nil { + sizeInBytes += int(d.segment.Size()) + } + + return 
sizeInBytes +} + // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 2554333a..4203acbe 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -15,10 +15,29 @@ package mem import ( + "reflect" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + // PostingsList is an in-memory represenation of a postings list type PostingsList struct { dictionary *Dictionary @@ -27,6 +46,20 @@ type PostingsList struct { except *roaring.Bitmap } +func (p *PostingsList) Size() int { + sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr + + if p.dictionary != nil { + sizeInBytes += p.dictionary.Size() + } + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + // Count returns the number of items on this postings list func (p *PostingsList) Count() uint64 { var rv uint64 @@ -83,6 +116,16 @@ type PostingsIterator struct { reuse Posting } +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + + if i.locations != nil { + sizeInBytes += int(i.locations.GetSizeInBytes()) + } + + return sizeInBytes +} + // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { if i.actual == nil || !i.actual.HasNext() { @@ -121,6 +164,16 @@ type Posting struct { hasLoc bool } +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + size.SizeOfPtr + + if p.iterator != nil { + sizeInBytes += p.iterator.Size() + } + + return sizeInBytes +} + // Number returns the document number of this posting in this segment func (p *Posting) Number() uint64 { return p.docNum @@ -158,6 +211,15 @@ type Location struct { offset int } +func (l *Location) Size() int { + sizeInBytes := reflectStaticSizeLocation + if l.p != nil { + sizeInBytes += l.p.Size() + } + + return sizeInBytes +} + // Field returns the name of the field (useful in composite fields to know // which original field the value came from) func (l *Location) Field() string { diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 04bdb368..e9c4a273 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -16,11 +16,20 @@ package mem import ( "fmt" + "reflect" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeSegment int + +func init() { + var s Segment + reflectStaticSizeSegment = int(reflect.TypeOf(s).Size()) +} + // _id field is always guaranteed to have fieldID of 0 const idFieldID uint16 = 0 @@ -96,7 +105,7 @@ type Segment struct { // Footprint of the segment, updated when analyzed document 
mutations // are added into the segment - sizeInBytes uint64 + sizeInBytes int } // New builds a new empty Segment @@ -107,99 +116,87 @@ func New() *Segment { } } -func (s *Segment) updateSizeInBytes() { - var sizeInBytes uint64 +func (s *Segment) updateSize() { + sizeInBytes := reflectStaticSizeSegment // FieldsMap, FieldsInv for k, _ := range s.FieldsMap { - sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + - 2 /* size of uint16 */) + sizeInBytes += (len(k)+size.SizeOfString)*2 + + size.SizeOfUint16 } - // overhead from the data structures - sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) // Dicts, DictKeys for _, entry := range s.Dicts { for k, _ := range entry { - sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + - 8 /* size of uint64 */) + sizeInBytes += (len(k)+size.SizeOfString)*2 + + size.SizeOfUint64 } // overhead from the data structures - sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) + sizeInBytes += (size.SizeOfMap + size.SizeOfSlice) } - sizeInBytes += (segment.SizeOfSlice * 2) // Postings, PostingsLocs for i := 0; i < len(s.Postings); i++ { - sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + - (s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) + sizeInBytes += (int(s.Postings[i].GetSizeInBytes()) + size.SizeOfPtr) + + (int(s.PostingsLocs[i].GetSizeInBytes()) + size.SizeOfPtr) } - sizeInBytes += (segment.SizeOfSlice * 2) // Freqs, Norms for i := 0; i < len(s.Freqs); i++ { - sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + - len(s.Norms[i])*4 /* size of float32 */) + - (segment.SizeOfSlice * 2) + sizeInBytes += (len(s.Freqs[i])*size.SizeOfUint64 + + len(s.Norms[i])*size.SizeOfFloat32) + + (size.SizeOfSlice * 2) } - sizeInBytes += (segment.SizeOfSlice * 2) // Location data for i := 0; i < len(s.Locfields); i++ { - sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + - len(s.Locstarts[i])*8 /* size of uint64 */ + - len(s.Locends[i])*8 /* size of uint64 */ + - len(s.Locpos[i])*8 /* size of uint64 */) + sizeInBytes += len(s.Locfields[i])*size.SizeOfUint16 + + len(s.Locstarts[i])*size.SizeOfUint64 + + len(s.Locends[i])*size.SizeOfUint64 + + len(s.Locpos[i])*size.SizeOfUint64 for j := 0; j < len(s.Locarraypos[i]); j++ { - sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + - segment.SizeOfSlice + sizeInBytes += len(s.Locarraypos[i][j])*size.SizeOfUint64 + + size.SizeOfSlice } - sizeInBytes += (segment.SizeOfSlice * 5) + sizeInBytes += (size.SizeOfSlice * 5) } - sizeInBytes += (segment.SizeOfSlice * 5) // Stored data for i := 0; i < len(s.Stored); i++ { for _, v := range s.Stored[i] { - sizeInBytes += uint64(2 /* size of uint16 */) + sizeInBytes += size.SizeOfUint16 for _, arr := range v { - sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice + sizeInBytes += len(arr) + size.SizeOfSlice } - sizeInBytes += segment.SizeOfSlice + sizeInBytes += size.SizeOfSlice } for _, v := range s.StoredTypes[i] { - sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice + sizeInBytes += size.SizeOfUint16 + len(v) + size.SizeOfSlice } for _, v := range s.StoredPos[i] { - sizeInBytes += uint64(2 /* size of uint16 */) + sizeInBytes += size.SizeOfUint16 for _, arr := range v { - sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + - segment.SizeOfSlice + sizeInBytes += len(arr)*size.SizeOfUint64 + + size.SizeOfSlice } - sizeInBytes += segment.SizeOfSlice + sizeInBytes += size.SizeOfSlice } // overhead from map(s) within Stored, StoredTypes, StoredPos - 
sizeInBytes += (segment.SizeOfMap * 3) + sizeInBytes += (size.SizeOfMap * 3) } - // overhead from data structures: Stored, StoredTypes, StoredPos - sizeInBytes += (segment.SizeOfSlice * 3) // DocValueFields - sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + - segment.SizeOfMap - - // SizeInBytes - sizeInBytes += uint64(8) + sizeInBytes += len(s.DocValueFields) * (size.SizeOfUint16 + size.SizeOfBool) s.sizeInBytes = sizeInBytes } -func (s *Segment) SizeInBytes() uint64 { +func (s *Segment) Size() int { return s.sizeInBytes } diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 5e3818c2..6c5625d8 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -169,7 +169,7 @@ func TestSingle(t *testing.T) { t.Fatalf("segment nil, not expected") } - if segment.SizeInBytes() <= 0 { + if segment.Size() <= 0 { t.Fatalf("segment size not updated") } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index d5435ab9..8eee5f75 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -19,12 +19,6 @@ import ( "github.com/blevesearch/bleve/index" ) -// Overhead from go data structures when deployed on a 64-bit system. -const SizeOfMap uint64 = 8 -const SizeOfPointer uint64 = 8 -const SizeOfSlice uint64 = 24 -const SizeOfString uint64 = 16 - // DocumentFieldValueVisitor defines a callback to be visited for each // stored field value. The return value determines if the visitor // should keep going. Returning true continues visiting, false stops. @@ -42,7 +36,7 @@ type Segment interface { Close() error - SizeInBytes() uint64 + Size() int AddRef() DecRef() error @@ -63,6 +57,8 @@ type DictionaryIterator interface { type PostingsList interface { Iterator() PostingsIterator + Size() int + Count() uint64 // NOTE deferred for future work @@ -77,6 +73,8 @@ type PostingsIterator interface { // implementations may return a shared instance to reduce memory // allocations. 
Next() (Posting, error) + + Size() int } type Posting interface { @@ -86,6 +84,8 @@ type Posting interface { Norm() float64 Locations() []Location + + Size() int } type Location interface { @@ -94,6 +94,7 @@ type Location interface { End() uint64 Pos() uint64 ArrayPositions() []uint64 + Size() int } // DocumentFieldTermVisitable is implemented by various scorch segment diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 83457146..933f10a1 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -18,10 +18,18 @@ import ( "bytes" "encoding/binary" "io" + "reflect" "github.com/golang/snappy" ) +var reflectStaticSizeMetaData int + +func init() { + var md MetaData + reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) +} + var termSeparator byte = 0xff var termSeparatorSplitSlice = []byte{termSeparator} diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 0514bd30..13635c57 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -19,13 +19,21 @@ import ( "encoding/binary" "fmt" "math" + "reflect" "sort" "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" "github.com/golang/snappy" ) +var reflectStaticSizedocValueIterator int + +func init() { + var dvi docValueIterator + reflectStaticSizedocValueIterator = int(reflect.TypeOf(dvi).Size()) +} + type docValueIterator struct { field string curChunkNum uint64 @@ -36,21 +44,12 @@ type docValueIterator struct { curChunkData []byte // compressed data cache } -func (di *docValueIterator) sizeInBytes() uint64 { - // curChunkNum, numChunks, dvDataLoc --> uint64 - sizeInBytes := 24 - - // field - sizeInBytes += (len(di.field) + int(segment.SizeOfString)) - - // chunkLens, curChunkHeader - sizeInBytes += len(di.chunkLens)*8 + - len(di.curChunkHeader)*24 + - int(segment.SizeOfSlice*2) /* overhead from slices */ - - // curChunkData is mmap'ed, not included - - return uint64(sizeInBytes) +func (di *docValueIterator) size() int { + return reflectStaticSizedocValueIterator + size.SizeOfPtr + + len(di.field) + + len(di.chunkLens)*size.SizeOfUint64 + + len(di.curChunkHeader)*reflectStaticSizeMetaData + + len(di.curChunkData) } func (di *docValueIterator) fieldName() string { diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 589c7cb8..e9c68cba 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -19,12 +19,30 @@ import ( "encoding/binary" "fmt" "math" + "reflect" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + // PostingsList is an in-memory represenation of a postings list type PostingsList struct { sb *SegmentBase @@ -36,6 +54,28 @@ type PostingsList struct { except *roaring.Bitmap } +func (p *PostingsList) Size() 
int { + sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr + + if p.sb != nil { + sizeInBytes += (p.sb.Size() - len(p.sb.mem)) // do not include the mmap'ed part + } + + if p.locBitmap != nil { + sizeInBytes += int(p.locBitmap.GetSizeInBytes()) + } + + if p.postings != nil { + sizeInBytes += int(p.postings.GetSizeInBytes()) + } + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator() segment.PostingsIterator { return p.iterator(nil) @@ -193,6 +233,25 @@ type PostingsIterator struct { nextLocs []Location // reused across Next() calls } +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + + len(i.currChunkFreqNorm) + + len(i.currChunkLoc) + + len(i.freqChunkLens)*size.SizeOfUint64 + + len(i.locChunkLens)*size.SizeOfUint64 + + i.next.Size() + + if i.locBitmap != nil { + sizeInBytes += int(i.locBitmap.GetSizeInBytes()) + } + + for _, entry := range i.nextLocs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + func (i *PostingsIterator) loadChunk(chunk int) error { if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) { return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens)) @@ -444,6 +503,20 @@ type Posting struct { locs []segment.Location } +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + + if p.iterator != nil { + sizeInBytes += p.iterator.Size() + } + + for _, entry := range p.locs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + // Number returns the document number of this posting in this segment func (p *Posting) Number() uint64 { return p.docNum @@ -473,6 +546,12 @@ type Location struct { ap []uint64 } +func (l *Location) Size() int { + return reflectStaticSizeLocation + + len(l.field) + + len(l.ap)*size.SizeOfUint64 +} + // Field returns the name of the field (useful in composite fields to know // which original field the value came from) func (l *Location) Field() string { diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 40c0af27..972b7578 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -20,16 +20,25 @@ import ( "fmt" "io" "os" + "reflect" "sync" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" "github.com/couchbase/vellum" mmap "github.com/edsrzf/mmap-go" "github.com/golang/snappy" ) +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size()) +} + // Open returns a zap impl of a segment func Open(path string) (segment.Segment, error) { f, err := os.Open(path) @@ -92,6 +101,32 @@ type SegmentBase struct { fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field } +func (sb *SegmentBase) Size() int { + sizeInBytes := reflectStaticSizeSegmentBase + + len(sb.mem) + + // fieldsMap + for k, _ := range sb.fieldsMap { + sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 + } + + // fieldsInv, dictLocs + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + size.SizeOfString + } + sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 + + // fieldDvIterMap + for _, v := range sb.fieldDvIterMap { + sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr + if 
v != nil { + sizeInBytes += v.size() + } + } + + return sizeInBytes +} + func (sb *SegmentBase) AddRef() {} func (sb *SegmentBase) DecRef() (err error) { return nil } func (sb *SegmentBase) Close() (err error) { return nil } @@ -111,56 +146,19 @@ type Segment struct { refs int64 } -func (s *Segment) SizeInBytes() uint64 { +func (s *Segment) Size() int { // 8 /* size of file pointer */ // 4 /* size of version -> uint32 */ // 4 /* size of crc -> uint32 */ sizeOfUints := 16 - sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints + sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints // mutex, refs -> int64 sizeInBytes += 16 // do not include the mmap'ed part - return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem)) -} - -func (s *SegmentBase) SizeInBytes() uint64 { - // 4 /* size of memCRC -> uint32 */ - // 4 /* size of chunkFactor -> uint32 */ - // 8 /* size of numDocs -> uint64 */ - // 8 /* size of storedIndexOffset -> uint64 */ - // 8 /* size of fieldsIndexOffset -> uint64 */ - // 8 /* size of docValueOffset -> uint64 */ - sizeInBytes := 40 - - sizeInBytes += len(s.mem) + int(segment.SizeOfSlice) - - // fieldsMap - for k, _ := range s.fieldsMap { - sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */ - } - sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ - - // fieldsInv, dictLocs - for _, entry := range s.fieldsInv { - sizeInBytes += (len(entry) + int(segment.SizeOfString)) - } - sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */ - sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */ - - // fieldDvIterMap - sizeInBytes += len(s.fieldDvIterMap) * - int(segment.SizeOfPointer+2 /* size of uint16 */) - for _, entry := range s.fieldDvIterMap { - if entry != nil { - sizeInBytes += int(entry.sizeInBytes()) - } - } - sizeInBytes += int(segment.SizeOfMap) - - return uint64(sizeInBytes) + return sizeInBytes + s.SegmentBase.Size() - len(s.mem) } func (s *Segment) AddRef() { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 5289b143..9394f391 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -27,6 +27,7 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) type asynchSegmentResult struct { @@ -89,6 +90,12 @@ func (i *IndexSnapshot) Close() error { return i.DecRef() } +func (i *IndexSnapshot) Size() int { + // Just return the size of the pointer for estimating the overhead + // during Search, a reference of the IndexSnapshot serves as the reader. 
+ return size.SizeOfPtr +} + func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) diff --git a/index/scorch/snapshot_index_doc.go b/index/scorch/snapshot_index_doc.go index d1205ff8..27da2086 100644 --- a/index/scorch/snapshot_index_doc.go +++ b/index/scorch/snapshot_index_doc.go @@ -16,17 +16,30 @@ package scorch import ( "bytes" + "reflect" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeIndexSnapshotDocIDReader int + +func init() { + var isdr IndexSnapshotDocIDReader + reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size()) +} + type IndexSnapshotDocIDReader struct { snapshot *IndexSnapshot iterators []roaring.IntIterable segmentOffset int } +func (i *IndexSnapshotDocIDReader) Size() int { + return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr +} + func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { for i.segmentOffset < len(i.iterators) { if !i.iterators[i.segmentOffset].HasNext() { diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index d1f23b27..e1a0e9a5 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -16,12 +16,21 @@ package scorch import ( "bytes" + "reflect" "sync/atomic" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeIndexSnapshotTermFieldReader int + +func init() { + var istfr IndexSnapshotTermFieldReader + reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size()) +} + type IndexSnapshotTermFieldReader struct { term []byte field string @@ -36,6 +45,27 @@ type IndexSnapshotTermFieldReader struct { currID index.IndexInternalID } +func (i *IndexSnapshotTermFieldReader) Size() int { + sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr + + len(i.term) + + len(i.field) + + len(i.currID) + + for _, entry := range i.postings { + sizeInBytes += entry.Size() + } + + for _, entry := range i.iterators { + sizeInBytes += entry.Size() + } + + if i.currPosting != nil { + sizeInBytes += i.currPosting.Size() + } + + return sizeInBytes +} + func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { rv := preAlloced if rv == nil { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 5e64cb1f..cdfe317f 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -213,7 +213,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e return nil } -func (c *cachedDocs) sizeInBytes() uint64 { +func (c *cachedDocs) size() int { sizeInBytes := 0 c.m.Lock() for k, v := range c.cache { // cachedFieldDocs @@ -225,5 +225,5 @@ func (c *cachedDocs) sizeInBytes() uint64 { } } c.m.Unlock() - return uint64(sizeInBytes) + return sizeInBytes } diff --git a/index/upsidedown/index_reader.go b/index/upsidedown/index_reader.go index 77d523c3..4e575521 100644 --- a/index/upsidedown/index_reader.go +++ b/index/upsidedown/index_reader.go @@ -15,17 +15,31 @@ package upsidedown import ( + "reflect" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var 
reflectStaticSizeIndexReader int + +func init() { + var ir IndexReader + reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size()) +} + type IndexReader struct { index *UpsideDownCouch kvreader store.KVReader docCount uint64 } +func (i *IndexReader) Size() int { + return reflectStaticSizeIndexReader + size.SizeOfPtr +} + func (i *IndexReader) TermFieldReader(term []byte, fieldName string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { fieldIndex, fieldExists := i.index.fieldCache.FieldNamed(fieldName, false) if fieldExists { diff --git a/index/upsidedown/reader.go b/index/upsidedown/reader.go index 1f40c02d..646d4d8a 100644 --- a/index/upsidedown/reader.go +++ b/index/upsidedown/reader.go @@ -16,13 +16,27 @@ package upsidedown import ( "bytes" + "reflect" "sort" "sync/atomic" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeUpsideDownCouchTermFieldReader int +var reflectStaticSizeUpsideDownCouchDocIDReader int + +func init() { + var tfr UpsideDownCouchTermFieldReader + reflectStaticSizeUpsideDownCouchTermFieldReader = + int(reflect.TypeOf(tfr).Size()) + var cdr UpsideDownCouchDocIDReader + reflectStaticSizeUpsideDownCouchDocIDReader = + int(reflect.TypeOf(cdr).Size()) +} + type UpsideDownCouchTermFieldReader struct { count uint64 indexReader *IndexReader @@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct { includeTermVectors bool } +func (r *UpsideDownCouchTermFieldReader) Size() int { + sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr + + len(r.term) + + r.tfrPrealloc.Size() + + len(r.keyBuf) + + if r.tfrNext != nil { + sizeInBytes += r.tfrNext.Size() + } + + return sizeInBytes +} + func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) { bufNeeded := termFrequencyRowKeySize(term, nil) if bufNeeded < dictionaryRowKeySize(term) { @@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct { onlyMode bool } -func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { +func (r *UpsideDownCouchDocIDReader) Size() int { + sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader + + r.indexReader.Size() + for _, entry := range r.only { + sizeInBytes += size.SizeOfString + len(entry) + } + + return sizeInBytes +} + +func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { startBytes := []byte{0x0} endBytes := []byte{0xff} diff --git a/index/upsidedown/row.go b/index/upsidedown/row.go index 7e503ae0..ba50314c 100644 --- a/index/upsidedown/row.go +++ b/index/upsidedown/row.go @@ -20,10 +20,22 @@ import ( "fmt" "io" "math" + "reflect" + "github.com/blevesearch/bleve/size" "github.com/golang/protobuf/proto" ) +var reflectStaticSizeTermFrequencyRow int +var reflectStaticSizeTermVector int + +func init() { + var tfr TermFrequencyRow + reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size()) + var tv TermVector + reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size()) +} + const ByteSeparator byte = 0xff type UpsideDownCouchRowStream chan UpsideDownCouchRow @@ -358,6 +370,11 @@ type TermVector struct { end uint64 } +func (tv *TermVector) Size() int { + return reflectStaticSizeTermVector + size.SizeOfPtr + + len(tv.arrayPositions)*size.SizeOfUint64 +} + func (tv *TermVector) String() string { return 
fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) } @@ -371,6 +388,18 @@ type TermFrequencyRow struct { field uint16 } +func (tfr *TermFrequencyRow) Size() int { + sizeInBytes := reflectStaticSizeTermFrequencyRow + + len(tfr.term) + + len(tfr.doc) + + for _, entry := range tfr.vectors { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + func (tfr *TermFrequencyRow) Term() []byte { return tfr.term } diff --git a/index_impl.go b/index_impl.go index caea1b8e..df6e748d 100644 --- a/index_impl.go +++ b/index_impl.go @@ -362,8 +362,59 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) { return i.SearchInContext(context.Background(), req) } +// memNeededForSearch is a helper function that returns an estimate of RAM +// needed to execute a search request. +func memNeededForSearch(req *SearchRequest, + searcher search.Searcher, + topnCollector *collector.TopNCollector) uint64 { + + backingSize := req.Size + req.From + 1 + if req.Size+req.From > collector.PreAllocSizeSkipCap { + backingSize = collector.PreAllocSizeSkipCap + 1 + } + numDocMatches := backingSize + searcher.DocumentMatchPoolSize() + + estimate := 0 + + // overhead, size in bytes from collector + estimate += topnCollector.Size() + + var dm search.DocumentMatch + sizeOfDocumentMatch := dm.Size() + + // pre-allocing DocumentMatchPool + var sc search.SearchContext + estimate += sc.Size() + numDocMatches*sizeOfDocumentMatch + + // searcher overhead + estimate += searcher.Size() + + // overhead from results, lowestMatchOutsideResults + estimate += (numDocMatches + 1) * sizeOfDocumentMatch + + // additional overhead from SearchResult + var sr SearchResult + estimate += sr.Size() + + // overhead from facet results + if req.Facets != nil { + var fr search.FacetResult + estimate += len(req.Facets) * fr.Size() + } + + // highlighting, store + var d document.Document + if len(req.Fields) > 0 || req.Highlight != nil { + for i := 0; i < (req.Size + req.From); i++ { // size + from => number of hits + estimate += (req.Size + req.From) * d.Size() + } + } + + return uint64(estimate) +} + // SearchInContext executes a search request operation within the provided -// Context. Returns a SearchResult object or an error. +// Context. Returns a SearchResult object or an error. 
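+//
+// As a rough worked example of memNeededForSearch above (numbers are
+// assumed): with req.Size=10, req.From=0, and a searcher pool size of 10,
+// backingSize is 11 and numDocMatches is 21, so the estimate charges
+// 21 pooled DocumentMatches plus 21+1 more for the results and the
+// lowestMatchOutsideResults slot, on top of the fixed collector,
+// searcher, and SearchResult overheads.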
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) { i.mutex.RLock() defer i.mutex.RUnlock() diff --git a/index_test.go b/index_test.go index a69357bf..f1e53647 100644 --- a/index_test.go +++ b/index_test.go @@ -36,6 +36,9 @@ import ( "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/query" + + "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/index/upsidedown" ) func TestCrud(t *testing.T) { @@ -1815,3 +1818,55 @@ func TestIndexAdvancedCountMatchSearch(t *testing.T) { t.Fatal(err) } } + +func benchmarkSearchOverhead(indexType string, b *testing.B) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + b.Fatal(err) + } + }() + + index, err := NewUsing("testidx", NewIndexMapping(), + indexType, Config.DefaultKVStore, nil) + if err != nil { + b.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + b.Fatal(err) + } + }() + + elements := []string{"air", "water", "fire", "earth"} + for j := 0; j < 10000; j++ { + err = index.Index(fmt.Sprintf("%d", j), + map[string]interface{}{"name": elements[j%len(elements)]}) + if err != nil { + b.Fatal(err) + } + } + + query1 := NewTermQuery("water") + query2 := NewTermQuery("fire") + query := NewDisjunctionQuery(query1, query2) + req := NewSearchRequest(query) + + b.ResetTimer() + + for n := 0; n < b.N; n++ { + _, err = index.Search(req) + if err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkUpsidedownSearchOverhead(b *testing.B) { + benchmarkSearchOverhead(upsidedown.Name, b) +} + +func BenchmarkScorchSearchOverhead(b *testing.B) { + benchmarkSearchOverhead(scorch.Name, b) +} diff --git a/search.go b/search.go index 46d849c1..e324262e 100644 --- a/search.go +++ b/search.go @@ -17,6 +17,7 @@ package bleve import ( "encoding/json" "fmt" + "reflect" "time" "github.com/blevesearch/bleve/analysis" @@ -24,8 +25,19 @@ import ( "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/query" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeSearchResult int +var reflectStaticSizeSearchStatus int + +func init() { + var sr SearchResult + reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size()) + var ss SearchStatus + reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size()) +} + var cache = registry.NewCache() const defaultDateTimeParser = optional.Name @@ -432,6 +444,24 @@ type SearchResult struct { Facets search.FacetResults `json:"facets"` } +func (sr *SearchResult) Size() int { + sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr + + reflectStaticSizeSearchStatus + + for _, entry := range sr.Hits { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + for k, v := range sr.Facets { + sizeInBytes += size.SizeOfString + len(k) + + v.Size() + } + + return sizeInBytes +} + func (sr *SearchResult) String() string { rv := "" if sr.Total > 0 { diff --git a/search/collector/search_test.go b/search/collector/search_test.go index 8457fb98..3ba71c1d 100644 --- a/search/collector/search_test.go +++ b/search/collector/search_test.go @@ -15,6 +15,8 @@ package collector import ( + "reflect" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" @@ -25,6 +27,18 @@ type stubSearcher struct { matches []*search.DocumentMatch } +func (ss *stubSearcher) Size() int { + sizeInBytes := int(reflect.TypeOf(*ss).Size()) + + for _, entry := range 
ss.matches { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + func (ss *stubSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { if ss.index < len(ss.matches) { rv := ctx.DocumentMatchPool.Get() @@ -76,6 +90,10 @@ func (ss *stubSearcher) DocumentMatchPoolSize() int { type stubReader struct{} +func (sr *stubReader) Size() int { + return 0 +} + func (sr *stubReader) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { return nil, nil } diff --git a/search/collector/topn.go b/search/collector/topn.go index 388370e7..d684868c 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -16,12 +16,21 @@ package collector import ( "context" + "reflect" "time" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTopNCollector int + +func init() { + var coll TopNCollector + reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size()) +} + type collectorStore interface { // Add the document, and if the new store size exceeds the provided size // the last element is removed and returned. If the size has not been @@ -98,6 +107,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector return hc } +func (hc *TopNCollector) Size() int { + sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr + + if hc.facetsBuilder != nil { + sizeInBytes += hc.facetsBuilder.Size() + } + + for _, entry := range hc.neededFields { + sizeInBytes += len(entry) + size.SizeOfString + } + + sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc) + + return sizeInBytes +} + // Collect goes to the index to find the matching documents func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { startTime := time.Now() diff --git a/search/explanation.go b/search/explanation.go index 766367d7..3b81737b 100644 --- a/search/explanation.go +++ b/search/explanation.go @@ -17,8 +17,18 @@ package search import ( "encoding/json" "fmt" + "reflect" + + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeExplanation int + +func init() { + var e Explanation + reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) +} + type Explanation struct { Value float64 `json:"value"` Message string `json:"message"` @@ -32,3 +42,14 @@ func (expl *Explanation) String() string { } return string(js) } + +func (expl *Explanation) Size() int { + sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr + + len(expl.Message) + + for _, entry := range expl.Children { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} diff --git a/search/facet/facet_builder_datetime.go b/search/facet/facet_builder_datetime.go index 8657a553..c45442e4 100644 --- a/search/facet/facet_builder_datetime.go +++ b/search/facet/facet_builder_datetime.go @@ -15,13 +15,25 @@ package facet import ( + "reflect" "sort" "time" "github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDateTimeFacetBuilder int +var reflectStaticSizedateTimeRange int + +func init() { + var dtfb DateTimeFacetBuilder + reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size()) + var dtr dateTimeRange + reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size()) +} + type dateTimeRange struct { start time.Time end time.Time @@ -46,6 +58,23 @@ func 
NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder { } } +func (fb *DateTimeFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + for k, _ := range fb.ranges { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + reflectStaticSizedateTimeRange + } + + return sizeInBytes +} + func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) { r := dateTimeRange{ start: start, diff --git a/search/facet/facet_builder_numeric.go b/search/facet/facet_builder_numeric.go index 2ab5f278..c1692b54 100644 --- a/search/facet/facet_builder_numeric.go +++ b/search/facet/facet_builder_numeric.go @@ -15,12 +15,24 @@ package facet import ( + "reflect" "sort" "github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeNumericFacetBuilder int +var reflectStaticSizenumericRange int + +func init() { + var nfb NumericFacetBuilder + reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size()) + var nr numericRange + reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size()) +} + type numericRange struct { min *float64 max *float64 @@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder { } } +func (fb *NumericFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + for k, _ := range fb.ranges { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + reflectStaticSizenumericRange + } + + return sizeInBytes +} + func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) { r := numericRange{ min: min, diff --git a/search/facet/facet_builder_terms.go b/search/facet/facet_builder_terms.go index a41e475a..5b5901e0 100644 --- a/search/facet/facet_builder_terms.go +++ b/search/facet/facet_builder_terms.go @@ -15,11 +15,20 @@ package facet import ( + "reflect" "sort" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermsFacetBuilder int + +func init() { + var tfb TermsFacetBuilder + reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size()) +} + type TermsFacetBuilder struct { size int field string @@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder { } } +func (fb *TermsFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + return sizeInBytes +} + func (fb *TermsFacetBuilder) Field() string { return fb.field } diff --git a/search/facets_builder.go b/search/facets_builder.go index 05e27041..34e45af8 100644 --- a/search/facets_builder.go +++ b/search/facets_builder.go @@ -15,11 +15,32 @@ package search import ( + "reflect" "sort" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeFacetsBuilder int +var reflectStaticSizeFacetResult int +var reflectStaticSizeTermFacet int +var reflectStaticSizeNumericRangeFacet int +var reflectStaticSizeDateRangeFacet int + +func init() { + var fb FacetsBuilder + reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size()) + var fr FacetResult + 
reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size()) + var tf TermFacet + reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size()) + var nrf NumericRangeFacet + reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size()) + var drf DateRangeFacet + reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size()) +} + type FacetBuilder interface { StartDoc() UpdateVisitor(field string, term []byte) @@ -27,6 +48,8 @@ type FacetBuilder interface { Result() *FacetResult Field() string + + Size() int } type FacetsBuilder struct { @@ -42,6 +65,22 @@ func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { } } +func (fb *FacetsBuilder) Size() int { + sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr + + fb.indexReader.Size() + + for k, v := range fb.facets { + sizeInBytes += size.SizeOfString + len(k) + + v.Size() + } + + for _, entry := range fb.fields { + sizeInBytes += size.SizeOfString + len(entry) + } + + return sizeInBytes +} + func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) { fb.facets[name] = facetBuilder fb.fields = append(fb.fields, facetBuilder.Field()) @@ -213,6 +252,14 @@ type FacetResult struct { DateRanges DateRangeFacets `json:"date_ranges,omitempty"` } +func (fr *FacetResult) Size() int { + return reflectStaticSizeFacetResult + size.SizeOfPtr + + len(fr.Field) + + len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) + + len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) + + len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr) +} + func (fr *FacetResult) Merge(other *FacetResult) { fr.Total += other.Total fr.Missing += other.Missing diff --git a/search/pool.go b/search/pool.go index b9b52a61..ba8be8fc 100644 --- a/search/pool.go +++ b/search/pool.go @@ -14,6 +14,17 @@ package search +import ( + "reflect" +) + +var reflectStaticSizeDocumentMatchPool int + +func init() { + var dmp DocumentMatchPool + reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size()) +} + // DocumentMatchPoolTooSmall is a callback function that can be executed // when the DocumentMatchPool does not have sufficient capacity // By default we just perform just-in-time allocation, but you could log diff --git a/search/scorer/scorer_conjunction.go b/search/scorer/scorer_conjunction.go index aad6f9c1..b866293e 100644 --- a/search/scorer/scorer_conjunction.go +++ b/search/scorer/scorer_conjunction.go @@ -15,13 +15,27 @@ package scorer import ( + "reflect" + "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConjunctionQueryScorer int + +func init() { + var cqs ConjunctionQueryScorer + reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size()) +} + type ConjunctionQueryScorer struct { options search.SearcherOptions } +func (s *ConjunctionQueryScorer) Size() int { + return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr +} + func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer { return &ConjunctionQueryScorer{ options: options, diff --git a/search/scorer/scorer_constant.go b/search/scorer/scorer_constant.go index a65a826f..dc10fdaa 100644 --- a/search/scorer/scorer_constant.go +++ b/search/scorer/scorer_constant.go @@ -16,11 +16,20 @@ package scorer import ( "fmt" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConstantScorer int + +func init() { + var cs ConstantScorer + 
reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size()) +} + type ConstantScorer struct { constant float64 boost float64 @@ -30,6 +39,16 @@ type ConstantScorer struct { queryWeightExplanation *search.Explanation } +func (s *ConstantScorer) Size() int { + sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr + + if s.queryWeightExplanation != nil { + sizeInBytes += s.queryWeightExplanation.Size() + } + + return sizeInBytes +} + func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer { rv := ConstantScorer{ options: options, diff --git a/search/scorer/scorer_disjunction.go b/search/scorer/scorer_disjunction.go index 184a15d2..36a601c7 100644 --- a/search/scorer/scorer_disjunction.go +++ b/search/scorer/scorer_disjunction.go @@ -16,14 +16,27 @@ package scorer import ( "fmt" + "reflect" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDisjunctionQueryScorer int + +func init() { + var dqs DisjunctionQueryScorer + reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size()) +} + type DisjunctionQueryScorer struct { options search.SearcherOptions } +func (s *DisjunctionQueryScorer) Size() int { + return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr +} + func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer { return &DisjunctionQueryScorer{ options: options, diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index b5f46322..077e38e0 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -17,11 +17,20 @@ package scorer import ( "fmt" "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermQueryScorer int + +func init() { + var tqs TermQueryScorer + reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size()) +} + type TermQueryScorer struct { queryTerm []byte queryField string @@ -36,6 +45,21 @@ type TermQueryScorer struct { queryWeightExplanation *search.Explanation } +func (s *TermQueryScorer) Size() int { + sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr + + len(s.queryTerm) + len(s.queryField) + + if s.idfExplanation != nil { + sizeInBytes += s.idfExplanation.Size() + } + + if s.queryWeightExplanation != nil { + sizeInBytes += s.queryWeightExplanation.Size() + } + + return sizeInBytes +} + func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { rv := TermQueryScorer{ queryTerm: queryTerm, diff --git a/search/search.go b/search/search.go index f9a92783..ca030df4 100644 --- a/search/search.go +++ b/search/search.go @@ -16,11 +16,26 @@ package search import ( "fmt" + "reflect" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDocumentMatch int +var reflectStaticSizeSearchContext int +var reflectStaticSizeLocation int + +func init() { + var dm DocumentMatch + reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size()) + var sc SearchContext + reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + type ArrayPositions []uint64 func (ap ArrayPositions) Equals(other ArrayPositions) bool { @@ -47,6 +62,11 @@ type Location struct { ArrayPositions ArrayPositions 
`json:"array_positions"` } +func (l *Location) Size() int { + return reflectStaticSizeLocation + size.SizeOfPtr + + len(l.ArrayPositions)*size.SizeOfUint64 +} + type Locations []*Location type TermLocationMap map[string]Locations @@ -117,6 +137,52 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { return dm } +func (dm *DocumentMatch) Size() int { + sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr + + len(dm.Index) + + len(dm.ID) + + len(dm.IndexInternalID) + + if dm.Expl != nil { + sizeInBytes += dm.Expl.Size() + } + + for k, v := range dm.Locations { + sizeInBytes += size.SizeOfString + len(k) + for k1, v1 := range v { + sizeInBytes += size.SizeOfString + len(k1) + + size.SizeOfSlice + for _, entry := range v1 { + sizeInBytes += entry.Size() + } + } + } + + for k, v := range dm.Fragments { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfSlice + + for _, entry := range v { + sizeInBytes += size.SizeOfString + len(entry) + } + } + + for _, entry := range dm.Sort { + sizeInBytes += size.SizeOfString + len(entry) + } + + for k, _ := range dm.Fields { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + } + + if dm.Document != nil { + sizeInBytes += dm.Document.Size() + } + + return sizeInBytes +} + func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) } @@ -135,6 +201,7 @@ type Searcher interface { SetQueryNorm(float64) Count() uint64 Min() int + Size() int DocumentMatchPoolSize() int } @@ -148,3 +215,18 @@ type SearcherOptions struct { type SearchContext struct { DocumentMatchPool *DocumentMatchPool } + +func (sc *SearchContext) Size() int { + sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr + + reflectStaticSizeDocumentMatchPool + size.SizeOfPtr + + if sc.DocumentMatchPool != nil { + for _, entry := range sc.DocumentMatchPool.avail { + if entry != nil { + sizeInBytes += entry.Size() + } + } + } + + return sizeInBytes +} diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index a905c29e..b87337e1 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -16,12 +16,21 @@ package searcher import ( "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeBooleanSearcher int + +func init() { + var bs BooleanSearcher + reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size()) +} + type BooleanSearcher struct { indexReader index.IndexReader mustSearcher search.Searcher @@ -52,6 +61,33 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc return &rv, nil } +func (s *BooleanSearcher) Size() int { + sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr + + s.indexReader.Size() + + if s.mustSearcher != nil { + sizeInBytes += s.mustSearcher.Size() + } + + if s.shouldSearcher != nil { + sizeInBytes += s.shouldSearcher.Size() + } + + if s.mustNotSearcher != nil { + sizeInBytes += s.mustNotSearcher.Size() + } + + sizeInBytes += s.scorer.Size() + + for _, entry := range s.matches { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + func (s *BooleanSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index 73fba19c..da65f398 100644 --- a/search/searcher/search_conjunction.go 
+++ b/search/searcher/search_conjunction.go
@@ -16,13 +16,22 @@ package searcher
 
 import (
 	"math"
+	"reflect"
 	"sort"
 
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/search"
 	"github.com/blevesearch/bleve/search/scorer"
+	"github.com/blevesearch/bleve/size"
 )
 
+var reflectStaticSizeConjunctionSearcher int
+
+func init() {
+	var cs ConjunctionSearcher
+	reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size())
+}
+
 type ConjunctionSearcher struct {
 	indexReader index.IndexReader
 	searchers   OrderedSearcherList
@@ -54,6 +63,23 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
 	return &rv, nil
 }
 
+func (s *ConjunctionSearcher) Size() int {
+	sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr +
+		s.scorer.Size()
+
+	for _, entry := range s.searchers {
+		sizeInBytes += entry.Size()
+	}
+
+	for _, entry := range s.currs {
+		if entry != nil {
+			sizeInBytes += entry.Size()
+		}
+	}
+
+	return sizeInBytes
+}
+
 func (s *ConjunctionSearcher) computeQueryNorm() {
 	// first calculate sum of squared weights
 	sumOfSquaredWeights := 0.0
diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go
index b6910ddb..119bac97 100644
--- a/search/searcher/search_disjunction.go
+++ b/search/searcher/search_disjunction.go
@@ -17,13 +17,22 @@ package searcher
 import (
 	"fmt"
 	"math"
+	"reflect"
 	"sort"
 
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/search"
 	"github.com/blevesearch/bleve/search/scorer"
+	"github.com/blevesearch/bleve/size"
 )
 
+var reflectStaticSizeDisjunctionSearcher int
+
+func init() {
+	var ds DisjunctionSearcher
+	reflectStaticSizeDisjunctionSearcher = int(reflect.TypeOf(ds).Size())
+}
+
 // DisjunctionMaxClauseCount is a compile time setting that applications can
 // adjust to a non-zero value to cause the DisjunctionSearcher to return an
 // error instead of executing searches when the size exceeds this value.
@@ -90,6 +99,32 @@ func newDisjunctionSearcher(indexReader index.IndexReader,
 	return &rv, nil
 }
 
+func (s *DisjunctionSearcher) Size() int {
+	sizeInBytes := reflectStaticSizeDisjunctionSearcher + size.SizeOfPtr +
+		s.indexReader.Size() +
+		s.scorer.Size()
+
+	for _, entry := range s.searchers {
+		sizeInBytes += entry.Size()
+	}
+
+	for _, entry := range s.currs {
+		if entry != nil {
+			sizeInBytes += entry.Size()
+		}
+	}
+
+	for _, entry := range s.matching {
+		if entry != nil {
+			sizeInBytes += entry.Size()
+		}
+	}
+
+	sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt
+
+	return sizeInBytes
+}
+
 func (s *DisjunctionSearcher) computeQueryNorm() {
 	// first calculate sum of squared weights
 	sumOfSquaredWeights := 0.0
diff --git a/search/searcher/search_docid.go b/search/searcher/search_docid.go
index 06351b4a..3b258a58 100644
--- a/search/searcher/search_docid.go
+++ b/search/searcher/search_docid.go
@@ -15,11 +15,21 @@ package searcher
 
 import (
+	"reflect"
+
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/search"
 	"github.com/blevesearch/bleve/search/scorer"
+	"github.com/blevesearch/bleve/size"
 )
 
+var reflectStaticSizeDocIDSearcher int
+
+func init() {
+	var ds DocIDSearcher
+	reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size())
+}
+
 // DocIDSearcher returns documents matching a predefined set of identifiers.
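+// Like the other searchers in this patch, it gains a Size() method that
+// adds the reader and scorer estimates to its reflect-derived static
+// footprint (see below).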
type DocIDSearcher struct { reader index.DocIDReader @@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64 }, nil } +func (s *DocIDSearcher) Size() int { + return reflectStaticSizeDocIDSearcher + size.SizeOfPtr + + s.reader.Size() + + s.scorer.Size() +} + func (s *DocIDSearcher) Count() uint64 { return uint64(s.count) } diff --git a/search/searcher/search_filter.go b/search/searcher/search_filter.go index 219f2ee7..7c95fb41 100644 --- a/search/searcher/search_filter.go +++ b/search/searcher/search_filter.go @@ -15,10 +15,20 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeFilteringSearcher int + +func init() { + var fs FilteringSearcher + reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size()) +} + // FilterFunc defines a function which can filter documents // returning true means keep the document // returning false means do not keep the document @@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch } } +func (f *FilteringSearcher) Size() int { + return reflectStaticSizeFilteringSearcher + size.SizeOfPtr + + f.child.Size() +} + func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { next, err := f.child.Next(ctx) for next != nil && err == nil { diff --git a/search/searcher/search_match_all.go b/search/searcher/search_match_all.go index 822db2ea..3f34e591 100644 --- a/search/searcher/search_match_all.go +++ b/search/searcher/search_match_all.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeMatchAllSearcher int + +func init() { + var mas MatchAllSearcher + reflectStaticSizeMatchAllSearcher = int(reflect.TypeOf(mas).Size()) +} + type MatchAllSearcher struct { indexReader index.IndexReader reader index.DocIDReader @@ -46,6 +56,13 @@ func NewMatchAllSearcher(indexReader index.IndexReader, boost float64, options s }, nil } +func (s *MatchAllSearcher) Size() int { + return reflectStaticSizeMatchAllSearcher + size.SizeOfPtr + + s.indexReader.Size() + + s.reader.Size() + + s.scorer.Size() +} + func (s *MatchAllSearcher) Count() uint64 { return s.count } diff --git a/search/searcher/search_match_none.go b/search/searcher/search_match_none.go index 94759671..6b50b322 100644 --- a/search/searcher/search_match_none.go +++ b/search/searcher/search_match_none.go @@ -15,10 +15,20 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeMatchNoneSearcher int + +func init() { + var mns MatchNoneSearcher + reflectStaticSizeMatchNoneSearcher = int(reflect.TypeOf(mns).Size()) +} + type MatchNoneSearcher struct { indexReader index.IndexReader } @@ -29,6 +39,11 @@ func NewMatchNoneSearcher(indexReader index.IndexReader) (*MatchNoneSearcher, er }, nil } +func (s *MatchNoneSearcher) Size() int { + return reflectStaticSizeMatchNoneSearcher + size.SizeOfPtr + + s.indexReader.Size() +} + func (s *MatchNoneSearcher) Count() uint64 { return uint64(0) } diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 6237cecf..23a359bd 100644 --- a/search/searcher/search_phrase.go +++ 
b/search/searcher/search_phrase.go @@ -17,11 +17,20 @@ package searcher import ( "fmt" "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizePhraseSearcher int + +func init() { + var ps PhraseSearcher + reflectStaticSizePhraseSearcher = int(reflect.TypeOf(ps).Size()) +} + type PhraseSearcher struct { indexReader index.IndexReader mustSearcher *ConjunctionSearcher @@ -32,6 +41,28 @@ type PhraseSearcher struct { initialized bool } +func (s *PhraseSearcher) Size() int { + sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr + + s.indexReader.Size() + + if s.mustSearcher != nil { + sizeInBytes += s.mustSearcher.Size() + } + + if s.currMust != nil { + sizeInBytes += s.currMust.Size() + } + + for _, entry := range s.terms { + sizeInBytes += size.SizeOfSlice + for _, entry1 := range entry { + sizeInBytes += size.SizeOfString + len(entry1) + } + } + + return sizeInBytes +} + func NewPhraseSearcher(indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) { // turn flat terms []string into [][]string mterms := make([][]string, len(terms)) diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index 6fae6ae5..576d6643 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermSearcher int + +func init() { + var ts TermSearcher + reflectStaticSizeTermSearcher = int(reflect.TypeOf(ts).Size()) +} + type TermSearcher struct { indexReader index.IndexReader reader index.TermFieldReader @@ -63,6 +73,14 @@ func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field stri }, nil } +func (s *TermSearcher) Size() int { + return reflectStaticSizeTermSearcher + size.SizeOfPtr + + s.indexReader.Size() + + s.reader.Size() + + s.tfd.Size() + + s.scorer.Size() +} + func (s *TermSearcher) Count() uint64 { return s.reader.Count() } diff --git a/size/sizes.go b/size/sizes.go new file mode 100644 index 00000000..4ba544a7 --- /dev/null +++ b/size/sizes.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
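+
+// The size package computes the reflect-derived sizes of Go's basic types
+// once, at init time, so the Size() estimators above can sum struct
+// footprints without repeated reflection calls.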
+ +package size + +import ( + "reflect" +) + +func init() { + var a bool + SizeOfBool = int(reflect.TypeOf(a).Size()) + var b float32 + SizeOfFloat32 = int(reflect.TypeOf(b).Size()) + var c float64 + SizeOfFloat64 = int(reflect.TypeOf(c).Size()) + var d map[int]int + SizeOfMap = int(reflect.TypeOf(d).Size()) + var e *int + SizeOfPtr = int(reflect.TypeOf(e).Size()) + var f []int + SizeOfSlice = int(reflect.TypeOf(f).Size()) + var g string + SizeOfString = int(reflect.TypeOf(g).Size()) + var h uint8 + SizeOfUint8 = int(reflect.TypeOf(h).Size()) + var i uint16 + SizeOfUint16 = int(reflect.TypeOf(i).Size()) + var j uint32 + SizeOfUint32 = int(reflect.TypeOf(j).Size()) + var k uint64 + SizeOfUint64 = int(reflect.TypeOf(k).Size()) +} + +var SizeOfBool int +var SizeOfFloat32 int +var SizeOfFloat64 int +var SizeOfInt int +var SizeOfMap int +var SizeOfPtr int +var SizeOfSlice int +var SizeOfString int +var SizeOfUint8 int +var SizeOfUint16 int +var SizeOfUint32 int +var SizeOfUint64 int From 96071c085cbdf0beb7d7a74f483b1b2c53f4bd97 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 5 Mar 2018 16:49:55 -0800 Subject: [PATCH 16/18] MB-28163: Register a callback with context to estimate RAM for search This callback if registered with context will invoke the api to estimate the memory needed to execute a search query. The callback defined at the client side will be responsible for determining whether to continue with the search or abort based on the threshold settings. --- index_impl.go | 13 +++++++++++++ index_test.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/index_impl.go b/index_impl.go index df6e748d..1036aef2 100644 --- a/index_impl.go +++ b/index_impl.go @@ -50,6 +50,10 @@ const storePath = "store" var mappingInternalKey = []byte("_mapping") +const SearchMemCheckCallbackKey = "_search_mem_callback_key" + +type SearchMemCheckCallbackFn func(size uint64) error + func indexStorePath(path string) string { return path + string(os.PathSeparator) + storePath } @@ -479,6 +483,15 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr collector.SetFacetsBuilder(facetsBuilder) } + if memCb := ctx.Value(SearchMemCheckCallbackKey); memCb != nil { + if memCbFn, ok := memCb.(SearchMemCheckCallbackFn); ok { + err = memCbFn(memNeededForSearch(req, searcher, collector)) + } + } + if err != nil { + return nil, err + } + err = collector.Collect(ctx, searcher, indexReader) if err != nil { return nil, err diff --git a/index_test.go b/index_test.go index f1e53647..57429dcb 100644 --- a/index_test.go +++ b/index_test.go @@ -1870,3 +1870,50 @@ func BenchmarkUpsidedownSearchOverhead(b *testing.B) { func BenchmarkScorchSearchOverhead(b *testing.B) { benchmarkSearchOverhead(scorch.Name, b) } + +func TestSearchMemCheckCallback(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + index, err := New("testidx", NewIndexMapping()) + if err != nil { + t.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + t.Fatal(err) + } + }() + + elements := []string{"air", "water", "fire", "earth"} + for j := 0; j < 10000; j++ { + err = index.Index(fmt.Sprintf("%d", j), + map[string]interface{}{"name": elements[j%len(elements)]}) + if err != nil { + t.Fatal(err) + } + } + + query := NewTermQuery("water") + req := NewSearchRequest(query) + + expErr := fmt.Errorf("MEM_LIMIT_EXCEEDED") + f := func(size uint64) error { + if size > 1000 { + return expErr + } + return nil + } + 
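+	// The callback receives the memNeededForSearch estimate before any
+	// collection begins; a non-nil return aborts the search, so this
+	// request, whose estimate exceeds the 1000 byte threshold, must
+	// fail with expErr.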
+ ctx := context.WithValue(context.Background(), SearchMemCheckCallbackKey, + SearchMemCheckCallbackFn(f)) + _, err = index.SearchInContext(ctx, req) + if err != expErr { + t.Fatalf("Expected: %v, Got: %v", expErr, err) + } +} From 38b6c522b0a8a77f775b8f340909a7517766d67a Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 6 Mar 2018 14:00:54 -0800 Subject: [PATCH 17/18] Address build breakage after rebase Removed attribute: iterator of type Posting --- index/scorch/segment/zap/posting.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index e9c68cba..ef21df8d 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -506,10 +506,6 @@ type Posting struct { func (p *Posting) Size() int { sizeInBytes := reflectStaticSizePosting - if p.iterator != nil { - sizeInBytes += p.iterator.Size() - } - for _, entry := range p.locs { sizeInBytes += entry.Size() } From b62ca996f63e64dad76bdd95c7d8b4f31d4d6f0a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 13:30:44 -0800 Subject: [PATCH 18/18] scorch zap optimize chunkedIntCoder.Add() calls to use multiple vals This change leverages the ability for the chunkedIntCoder.Add() method to accept multiple input param values (via the '...' param signature), meaning there are fewer Add() invocations. --- index/scorch/segment/zap/build.go | 53 ++++++------------------------- 1 file changed, 9 insertions(+), 44 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 237cc5f3..361e56e5 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -319,19 +319,10 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac postingsListItr := memSegment.Postings[postingID].Iterator() var offset int for postingsListItr.HasNext() { - docNum := uint64(postingsListItr.Next()) - // put freq - err := tfEncoder.Add(docNum, freqs[offset]) - if err != nil { - return nil, nil, err - } - - // put norm - norm := norms[offset] - normBits := math.Float32bits(norm) - err = tfEncoder.Add(docNum, uint64(normBits)) + // put freq & norm + err := tfEncoder.Add(docNum, freqs[offset], uint64(math.Float32bits(norms[offset]))) if err != nil { return nil, nil, err } @@ -347,7 +338,6 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac if err != nil { return nil, nil, err } - } // now do it again for the locations @@ -371,43 +361,17 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac n := int(freqs[offset]) for i := 0; i < n; i++ { if len(locfields) > 0 { - // put field - err := locEncoder.Add(docNum, uint64(locfields[locOffset])) - if err != nil { - return nil, nil, err - } - - // put pos - err = locEncoder.Add(docNum, locpos[locOffset]) - if err != nil { - return nil, nil, err - } - - // put start - err = locEncoder.Add(docNum, locstarts[locOffset]) - if err != nil { - return nil, nil, err - } - - // put end - err = locEncoder.Add(docNum, locends[locOffset]) - if err != nil { - return nil, nil, err - } - - // put the number of array positions to follow - num := len(locarraypos[locOffset]) - err = locEncoder.Add(docNum, uint64(num)) + err := locEncoder.Add(docNum, uint64(locfields[locOffset]), + locpos[locOffset], locstarts[locOffset], locends[locOffset], + uint64(len(locarraypos[locOffset]))) if err != nil { return nil, nil, err } // put each array position - for _, pos := range 
locarraypos[locOffset] { - err = locEncoder.Add(docNum, pos) - if err != nil { - return nil, nil, err - } + err = locEncoder.Add(docNum, locarraypos[locOffset]...) + if err != nil { + return nil, nil, err } } locOffset++ @@ -417,6 +381,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac // record where this postings loc info starts locOffsets = append(locOffsets, uint64(w.Count())) + locEncoder.Close() _, err := locEncoder.Write(w) if err != nil {
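
To make the final optimization concrete, here is a minimal, self-contained
sketch of the variadic Add pattern this last patch leverages. The toy coder
type is an assumption for illustration only; zap's actual chunkedIntCoder
varint-encodes the values into per-chunk buffers rather than a flat slice,
but exposes the same Add(docNum, vals...) shape the call sites above rely on.

package main

import "fmt"

// coder stands in for zap's chunkedIntCoder (hypothetical simplification).
type coder struct{ vals []uint64 }

// Add accepts a document number plus any number of values, so call sites
// can batch freq/norm or field/pos/start/end/count into one invocation.
func (c *coder) Add(docNum uint64, vals ...uint64) error {
	c.vals = append(c.vals, vals...) // one call, many values
	return nil
}

func main() {
	var c coder
	// before this patch: five separate Add calls per location;
	// after: a single call carrying all five values
	_ = c.Add(42, 1, 9, 100, 104, 0)
	fmt.Println(c.vals) // [1 9 100 104 0]
}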