From d777d7c3652942a752147f747089d13151b4bd5a Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 11:06:44 -0800
Subject: [PATCH 1/8] scorch mem segment comments consistency

---
 index/scorch/segment/mem/build.go   | 2 +-
 index/scorch/segment/mem/segment.go | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go
index cd11fb40..1dc0a788 100644
--- a/index/scorch/segment/mem/build.go
+++ b/index/scorch/segment/mem/build.go
@@ -24,7 +24,7 @@ import (
 	"github.com/blevesearch/bleve/index"
 )
 
-// NewFromAnalyzedDocs places the analyzed document mutations into this segment
+// NewFromAnalyzedDocs places the analyzed document mutations into a new segment
 func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
 	s := New()
 
diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go
index 5ef3e1f3..40c071f6 100644
--- a/index/scorch/segment/mem/segment.go
+++ b/index/scorch/segment/mem/segment.go
@@ -46,7 +46,7 @@ type Segment struct {
 	FieldsInv []string
 
 	// term dictionary
-	//  field id -> term -> posting id + 1
+	//  field id -> term -> postings list id + 1
 	Dicts []map[string]uint64
 
 	// term dictionary keys
@@ -54,7 +54,7 @@ type Segment struct {
 	DictKeys [][]string
 
 	// Postings list
-	//  Postings list id -> Postings bitmap
+	//  postings list id -> Postings bitmap
 	Postings []*roaring.Bitmap
 
 	// Postings List has locations

From e7bd6026eb239f7e8b452b02958b8d37b68bc0ad Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 11:52:18 -0800
Subject: [PATCH 2/8] scorch mem segment preallocs docMap/fieldLens with capacity

The first time through, startNumFields will be 0, so the maps still
start out empty; a further optimization could assume that later docs
have a similar set of fields to the first doc's.
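As a standalone illustration of the idiom this patch applies (toy code,
not from the patch; the names and sizes are invented): make() with a
capacity hint sizes a map up-front so it does not rehash as entries are
added.

    package main

    import "fmt"

    // collate sizes its map from a prior estimate; an estimate of 0
    // (the first-document case) behaves just like an empty map literal.
    func collate(estimate int, fields []string) map[string]int {
        fieldLens := make(map[string]int, estimate)
        for _, f := range fields {
            fieldLens[f]++
        }
        return fieldLens
    }

    func main() {
        fmt.Println(collate(3, []string{"_id", "title", "body", "body"}))
    }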
---
 index/scorch/segment/mem/build.go | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go
index 1dc0a788..29c41d5d 100644
--- a/index/scorch/segment/mem/build.go
+++ b/index/scorch/segment/mem/build.go
@@ -83,9 +83,12 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
 }
 
 func (s *Segment) processDocument(result *index.AnalysisResult) {
+	startNumFields := len(s.FieldsMap)
+
 	// used to collate information across fields
-	docMap := map[uint16]analysis.TokenFrequencies{}
-	fieldLens := map[uint16]int{}
+	docMap := make(map[uint16]analysis.TokenFrequencies, startNumFields)
+	fieldLens := make(map[uint16]int, startNumFields)
+
 	docNum := uint64(s.addDocument())
 
 	processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {

From 917c47079122dc59c79ec5e131811ca9dfdd48c3 Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 11:54:46 -0800
Subject: [PATCH 3/8] scorch mem segment VisitDocument() accesses StoredTypes/Pos outside of loop

---
 index/scorch/segment/mem/segment.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go
index 40c071f6..3c400b53 100644
--- a/index/scorch/segment/mem/segment.go
+++ b/index/scorch/segment/mem/segment.go
@@ -188,9 +188,11 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi
 		return nil
 	}
 	docFields := s.Stored[int(num)]
+	st := s.StoredTypes[int(num)]
+	sp := s.StoredPos[int(num)]
 	for field, values := range docFields {
 		for i, value := range values {
-			keepGoing := visitor(s.FieldsInv[field], s.StoredTypes[int(num)][field][i], value, s.StoredPos[int(num)][field][i])
+			keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
 			if !keepGoing {
 				return nil
 			}

From a4110d325c2ff0b790d509893328f07be6234cf5 Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 16:37:06 -0800
Subject: [PATCH 4/8] scorch mem segment preallocates slices that are keyed by postingId

The scorch mem segment build phase uses the append() idiom to populate
various slices that are keyed by postings list ids. These slices
include...

* Postings
* PostingsLocs
* Freqs
* Norms
* Locfields
* Locstarts
* Locends
* Locpos
* Locarraypos

This change introduces an initialization step that preallocates those
slices up-front, by assigning postings list ids to terms in a first
pass.

This change also simplifies the processDocument() logic, which no
longer needs to handle a first-time initialization case, removing some
nearly duplicated code.
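A minimal sketch of that two-pass shape, separate from the patch (the
types and data here are hypothetical): the first pass assigns each
distinct term a postings list id, stored +1 so that 0 can mean
"absent"; the second pass can then allocate every id-keyed slice once,
at its final length.

    package main

    import "fmt"

    func main() {
        docs := [][]string{{"a", "b"}, {"b", "c"}}

        dict := map[string]uint64{} // term -> postings list id + 1
        numPostingsLists := 0
        for _, doc := range docs {
            for _, term := range doc {
                if _, exists := dict[term]; !exists {
                    numPostingsLists++
                    dict[term] = uint64(numPostingsLists)
                }
            }
        }

        // top-level slice allocated once at its final length; only the
        // small per-list sub-slices still grow via append (the follow-on
        // patches below address those too)
        freqs := make([][]uint64, numPostingsLists)
        for _, doc := range docs {
            for _, term := range doc {
                pid := dict[term] - 1 // undo the +1 offset
                freqs[pid] = append(freqs[pid], 1)
            }
        }
        fmt.Println(numPostingsLists, freqs)
    }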
---
 index/scorch/segment/mem/build.go | 138 +++++++++++++++---------------
 1 file changed, 67 insertions(+), 71 deletions(-)

diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go
index 29c41d5d..8f080338 100644
--- a/index/scorch/segment/mem/build.go
+++ b/index/scorch/segment/mem/build.go
@@ -31,6 +31,9 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
 	// ensure that _id field get fieldID 0
 	s.getOrDefineField("_id")
 
+	// fill Dicts/DictKeys and preallocate memory
+	s.initializeDict(results)
+
 	// walk each doc
 	for _, result := range results {
 		s.processDocument(result)
@@ -82,12 +85,58 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
 	return s
 }
 
-func (s *Segment) processDocument(result *index.AnalysisResult) {
-	startNumFields := len(s.FieldsMap)
+// fill Dicts/DictKeys and preallocate memory for postings
+func (s *Segment) initializeDict(results []*index.AnalysisResult) {
+	var numPostings int
 
+	processField := func(fieldID uint16, tf analysis.TokenFrequencies) {
+		for term, _ := range tf {
+			_, exists := s.Dicts[fieldID][term]
+			if !exists {
+				numPostings++
+				s.Dicts[fieldID][term] = uint64(numPostings)
+				s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
+			}
+		}
+	}
+
+	for _, result := range results {
+		// walk each composite field
+		for _, field := range result.Document.CompositeFields {
+			fieldID := uint16(s.getOrDefineField(field.Name()))
+			_, tf := field.Analyze()
+			processField(fieldID, tf)
+		}
+
+		// walk each field
+		for i, field := range result.Document.Fields {
+			fieldID := uint16(s.getOrDefineField(field.Name()))
+			tf := result.Analyzed[i]
+			processField(fieldID, tf)
+		}
+	}
+
+	s.Postings = make([]*roaring.Bitmap, numPostings)
+	for i := 0; i < numPostings; i++ {
+		s.Postings[i] = roaring.New()
+	}
+	s.PostingsLocs = make([]*roaring.Bitmap, numPostings)
+	for i := 0; i < numPostings; i++ {
+		s.PostingsLocs[i] = roaring.New()
+	}
+	s.Freqs = make([][]uint64, numPostings)
+	s.Norms = make([][]float32, numPostings)
+	s.Locfields = make([][]uint16, numPostings)
+	s.Locstarts = make([][]uint64, numPostings)
+	s.Locends = make([][]uint64, numPostings)
+	s.Locpos = make([][]uint64, numPostings)
+	s.Locarraypos = make([][][]uint64, numPostings)
+}
+
+func (s *Segment) processDocument(result *index.AnalysisResult) {
 	// used to collate information across fields
-	docMap := make(map[uint16]analysis.TokenFrequencies, startNumFields)
-	fieldLens := make(map[uint16]int, startNumFields)
+	docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
+	fieldLens := make(map[uint16]int, len(s.FieldsMap))
 
 	docNum := uint64(s.addDocument())
 
@@ -132,80 +181,27 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
 	for fieldID, tokenFrequencies := range docMap {
 		for term, tokenFreq := range tokenFrequencies {
 			fieldTermPostings := s.Dicts[fieldID][term]
-
-			// FIXME this if/else block has duplicate code that has resulted in
-			// bugs fixed/missed more than once, need to refactor
-			if fieldTermPostings == 0 {
-				// need to build new posting
-				bs := roaring.New()
-				bs.AddInt(int(docNum))
-
-				newPostingID := uint64(len(s.Postings) + 1)
-				// add this new bitset to the postings slice
-				s.Postings = append(s.Postings, bs)
-
-				locationBS := roaring.New()
-				s.PostingsLocs = append(s.PostingsLocs, locationBS)
-				// add this to the details slice
-				s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())})
-				s.Norms = append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))})
-				// add to locations
-				var locfields []uint16
-				var locstarts []uint64
-				var locends []uint64
-				var locpos []uint64
-				var locarraypos [][]uint64
-				if len(tokenFreq.Locations) > 0 {
-					locationBS.AddInt(int(docNum))
-				}
+			pid := fieldTermPostings-1
+			bs := s.Postings[pid]
+			bs.AddInt(int(docNum))
+			s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
+			s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
+			locationBS := s.PostingsLocs[pid]
+			if len(tokenFreq.Locations) > 0 {
+				locationBS.AddInt(int(docNum))
 				for _, loc := range tokenFreq.Locations {
 					var locf = fieldID
 					if loc.Field != "" {
 						locf = uint16(s.getOrDefineField(loc.Field))
 					}
-					locfields = append(locfields, locf)
-					locstarts = append(locstarts, uint64(loc.Start))
-					locends = append(locends, uint64(loc.End))
-					locpos = append(locpos, uint64(loc.Position))
+					s.Locfields[pid] = append(s.Locfields[pid], locf)
+					s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
+					s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
+					s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
 					if len(loc.ArrayPositions) > 0 {
-						locarraypos = append(locarraypos, loc.ArrayPositions)
+						s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
 					} else {
-						locarraypos = append(locarraypos, nil)
-					}
-				}
-				s.Locfields = append(s.Locfields, locfields)
-				s.Locstarts = append(s.Locstarts, locstarts)
-				s.Locends = append(s.Locends, locends)
-				s.Locpos = append(s.Locpos, locpos)
-				s.Locarraypos = append(s.Locarraypos, locarraypos)
-				// record it
-				s.Dicts[fieldID][term] = newPostingID
-				// this term was new for this field, add it to dictKeys
-				s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
-			} else {
-				// posting already started for this field/term
-				// the actual offset is - 1, because 0 is zero value
-				bs := s.Postings[fieldTermPostings-1]
-				bs.AddInt(int(docNum))
-				locationBS := s.PostingsLocs[fieldTermPostings-1]
-				s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency()))
-				s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
-				if len(tokenFreq.Locations) > 0 {
-					locationBS.AddInt(int(docNum))
-				}
-				for _, loc := range tokenFreq.Locations {
-					var locf = fieldID
-					if loc.Field != "" {
-						locf = uint16(s.getOrDefineField(loc.Field))
-					}
-					s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf)
-					s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start))
-					s.Locends[fieldTermPostings-1] = append(s.Locends[fieldTermPostings-1], uint64(loc.End))
-					s.Locpos[fieldTermPostings-1] = append(s.Locpos[fieldTermPostings-1], uint64(loc.Position))
-					if len(loc.ArrayPositions) > 0 {
-						s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], loc.ArrayPositions)
-					} else {
-						s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], nil)
+						s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
 					}
 				}
 			}

From a84bd122d2c6f18dc16dbd44a2ef8a5b0f184d34 Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 17:04:27 -0800
Subject: [PATCH 5/8] scorch mem segment preallocates sub-slices via # terms

This change tracks the number of terms per posting list to preallocate
the sub-slices for the Freqs & Norms.
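Sketched independently of the patch (the counts here are invented), the
sub-slice trick carves each per-list region out of one shared backing
array, so a single make() serves every postings list:

    package main

    import "fmt"

    func main() {
        numTermsPerPostingsList := []int{3, 1, 2} // from the counting pass

        total := 0
        for _, n := range numTermsPerPostingsList {
            total += n
        }

        backing := make([]uint64, total) // one allocation for all lists
        freqs := make([][]uint64, len(numTermsPerPostingsList))
        for pid, n := range numTermsPerPostingsList {
            // freqs[pid] starts empty but points at its reserved region,
            // so appends fill the backing array in place; appending more
            // than n times would spill into the next region (a three-index
            // slice backing[0:0:n] would enforce the boundary)
            freqs[pid] = backing[0:0]
            backing = backing[n:]
        }
        for pid := range freqs {
            freqs[pid] = append(freqs[pid], uint64(pid))
        }
        fmt.Println(freqs) // [[0] [1] [2]]
    }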
---
 index/scorch/segment/mem/build.go | 54 +++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go
index 8f080338..eaf36833 100644
--- a/index/scorch/segment/mem/build.go
+++ b/index/scorch/segment/mem/build.go
@@ -87,17 +87,26 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
 
 // fill Dicts/DictKeys and preallocate memory for postings
 func (s *Segment) initializeDict(results []*index.AnalysisResult) {
-	var numPostings int
+	var numPostingsLists int
+
+	numTermsPerPostingsList := make([]int, 0, 64)
+
+	var numTokenFrequencies int
 
 	processField := func(fieldID uint16, tf analysis.TokenFrequencies) {
 		for term, _ := range tf {
-			_, exists := s.Dicts[fieldID][term]
+			pidPlus1, exists := s.Dicts[fieldID][term]
 			if !exists {
-				numPostings++
-				s.Dicts[fieldID][term] = uint64(numPostings)
+				numPostingsLists++
+				pidPlus1 = uint64(numPostingsLists)
+				s.Dicts[fieldID][term] = pidPlus1
 				s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
+				numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
 			}
+			pid := pidPlus1 - 1
+			numTermsPerPostingsList[pid]++
 		}
+		numTokenFrequencies += len(tf)
 	}
 
 	for _, result := range results {
@@ -116,21 +125,33 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 		}
 	}
 
-	s.Postings = make([]*roaring.Bitmap, numPostings)
-	for i := 0; i < numPostings; i++ {
+	s.Postings = make([]*roaring.Bitmap, numPostingsLists)
+	for i := 0; i < numPostingsLists; i++ {
 		s.Postings[i] = roaring.New()
 	}
-	s.PostingsLocs = make([]*roaring.Bitmap, numPostings)
-	for i := 0; i < numPostings; i++ {
+	s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
+	for i := 0; i < numPostingsLists; i++ {
 		s.PostingsLocs[i] = roaring.New()
 	}
-	s.Freqs = make([][]uint64, numPostings)
-	s.Norms = make([][]float32, numPostings)
-	s.Locfields = make([][]uint16, numPostings)
-	s.Locstarts = make([][]uint64, numPostings)
-	s.Locends = make([][]uint64, numPostings)
-	s.Locpos = make([][]uint64, numPostings)
-	s.Locarraypos = make([][][]uint64, numPostings)
+
+	s.Freqs = make([][]uint64, numPostingsLists)
+	s.Norms = make([][]float32, numPostingsLists)
+	s.Locfields = make([][]uint16, numPostingsLists)
+	s.Locstarts = make([][]uint64, numPostingsLists)
+	s.Locends = make([][]uint64, numPostingsLists)
+	s.Locpos = make([][]uint64, numPostingsLists)
+	s.Locarraypos = make([][][]uint64, numPostingsLists)
+
+	uint64Backing := make([]uint64, numTokenFrequencies)
+	float32Backing := make([]float32, numTokenFrequencies)
+
+	for i, numTerms := range numTermsPerPostingsList {
+		s.Freqs[i] = uint64Backing[0:0]
+		uint64Backing = uint64Backing[numTerms:]
+
+		s.Norms[i] = float32Backing[0:0]
+		float32Backing = float32Backing[numTerms:]
+	}
 }
 
@@ -180,8 +201,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
 	// now that its been rolled up into docMap, walk that
 	for fieldID, tokenFrequencies := range docMap {
 		for term, tokenFreq := range tokenFrequencies {
-			fieldTermPostings := s.Dicts[fieldID][term]
-			pid := fieldTermPostings-1
+			pid := s.Dicts[fieldID][term]-1
 			bs := s.Postings[pid]
 			bs.AddInt(int(docNum))
 			s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))

From 0f19b542a3f91fe5f55b8f38b5e6f7af864be644 Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 18:40:28 -0800
Subject: [PATCH 6/8] scorch mem segment preallocs Locfields/starts/ends/pos/arraypos

This change preallocates more of the backing arrays for the Locfields,
Locstarts, Locends, Locpos, Locarraypos sub-slices of a scorch mem
segment.

On small bleve-blast tests (50K wiki docs) on a dev MacBook, scorch
indexing throughput seems to improve from 15MB/sec to 20MB/sec after
the recent series of preallocation changes.

---
 index/scorch/segment/mem/build.go | 53 +++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go
index eaf36833..14cb1cbc 100644
--- a/index/scorch/segment/mem/build.go
+++ b/index/scorch/segment/mem/build.go
@@ -89,12 +89,14 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
 func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 	var numPostingsLists int
 
-	numTermsPerPostingsList := make([]int, 0, 64)
+	numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
+	numLocsPerPostingsList := make([]int, 0, 64)  // Keyed by postings list id.
 
 	var numTokenFrequencies int
+	var numLocs int
 
-	processField := func(fieldID uint16, tf analysis.TokenFrequencies) {
-		for term, _ := range tf {
+	processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
+		for term, tf := range tfs {
 			pidPlus1, exists := s.Dicts[fieldID][term]
 			if !exists {
 				numPostingsLists++
@@ -102,11 +104,14 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 				s.Dicts[fieldID][term] = pidPlus1
 				s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
 				numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
+				numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
 			}
 			pid := pidPlus1 - 1
-			numTermsPerPostingsList[pid]++
+			numTermsPerPostingsList[pid] += 1
+			numLocsPerPostingsList[pid] += len(tf.Locations)
+			numLocs += len(tf.Locations)
 		}
-		numTokenFrequencies += len(tf)
+		numTokenFrequencies += len(tfs)
 	}
 
 	for _, result := range results {
@@ -136,21 +141,43 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 
 	s.Freqs = make([][]uint64, numPostingsLists)
 	s.Norms = make([][]float32, numPostingsLists)
+
+	uint64Backing := make([]uint64, numTokenFrequencies)
+	float32Backing := make([]float32, numTokenFrequencies)
+
+	for pid, numTerms := range numTermsPerPostingsList {
+		s.Freqs[pid] = uint64Backing[0:0]
+		uint64Backing = uint64Backing[numTerms:]
+
+		s.Norms[pid] = float32Backing[0:0]
+		float32Backing = float32Backing[numTerms:]
+	}
+
 	s.Locfields = make([][]uint16, numPostingsLists)
 	s.Locstarts = make([][]uint64, numPostingsLists)
 	s.Locends = make([][]uint64, numPostingsLists)
 	s.Locpos = make([][]uint64, numPostingsLists)
 	s.Locarraypos = make([][][]uint64, numPostingsLists)
 
-	uint64Backing := make([]uint64, numTokenFrequencies)
-	float32Backing := make([]float32, numTokenFrequencies)
+	uint16Backing := make([]uint16, numLocs)    // For Locfields.
+	uint64Backing = make([]uint64, numLocs*3)   // For Locstarts, Locends, Locpos.
+	auint64Backing := make([][]uint64, numLocs) // For Locarraypos.
 
-	for i, numTerms := range numTermsPerPostingsList {
-		s.Freqs[i] = uint64Backing[0:0]
-		uint64Backing = uint64Backing[numTerms:]
+	for pid, numLocs := range numLocsPerPostingsList {
+		s.Locfields[pid] = uint16Backing[0:0]
+		uint16Backing = uint16Backing[numLocs:]
 
-		s.Norms[i] = float32Backing[0:0]
-		float32Backing = float32Backing[numTerms:]
+		s.Locstarts[pid] = uint64Backing[0:0]
+		uint64Backing = uint64Backing[numLocs:]
+
+		s.Locends[pid] = uint64Backing[0:0]
+		uint64Backing = uint64Backing[numLocs:]
+
+		s.Locpos[pid] = uint64Backing[0:0]
+		uint64Backing = uint64Backing[numLocs:]
+
+		s.Locarraypos[pid] = auint64Backing[0:0]
+		auint64Backing = auint64Backing[numLocs:]
 	}
 }
 
@@ -201,7 +228,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
 	// now that its been rolled up into docMap, walk that
 	for fieldID, tokenFrequencies := range docMap {
 		for term, tokenFreq := range tokenFrequencies {
-			pid := s.Dicts[fieldID][term]-1
+			pid := s.Dicts[fieldID][term] - 1
 			bs := s.Postings[pid]
 			bs.AddInt(int(docNum))
 			s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))

From d682c85a7b599b3d8053caa4b2937860c1febf14 Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 19:17:39 -0800
Subject: [PATCH 7/8] scorch mem segment uses backing array trick even more

This change invokes make() only once per distinct type to allocate the
large, contiguous backing arrays for the mem segment.

---
 index/scorch/segment/mem/build.go | 40 +++++++++++++++++++------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go
index 14cb1cbc..554de890 100644
--- a/index/scorch/segment/mem/build.go
+++ b/index/scorch/segment/mem/build.go
@@ -93,7 +93,7 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 	numLocsPerPostingsList := make([]int, 0, 64)  // Keyed by postings list id.
 
 	var numTokenFrequencies int
-	var numLocs int
+	var totLocs int
 
 	processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
 		for term, tf := range tfs {
@@ -109,7 +109,7 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 			pid := pidPlus1 - 1
 			numTermsPerPostingsList[pid] += 1
 			numLocsPerPostingsList[pid] += len(tf.Locations)
-			numLocs += len(tf.Locations)
+			totLocs += len(tf.Locations)
 		}
 		numTokenFrequencies += len(tfs)
 	}
@@ -139,12 +139,32 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 		s.PostingsLocs[i] = roaring.New()
 	}
 
-	s.Freqs = make([][]uint64, numPostingsLists)
+	// Preallocate big, contiguous backing arrays.
+	auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
+	uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
+	float32Backing := make([]float32, numTokenFrequencies)         // For sub-Norms.
+	uint16Backing := make([]uint16, totLocs)                       // For sub-Locfields.
+
+	// Point top-level slices to the backing arrays.
+	s.Freqs = auint64Backing[0:numPostingsLists]
+	auint64Backing = auint64Backing[numPostingsLists:]
+
 	s.Norms = make([][]float32, numPostingsLists)
 
-	uint64Backing := make([]uint64, numTokenFrequencies)
-	float32Backing := make([]float32, numTokenFrequencies)
+	s.Locfields = make([][]uint16, numPostingsLists)
 
+	s.Locstarts = auint64Backing[0:numPostingsLists]
+	auint64Backing = auint64Backing[numPostingsLists:]
+
+	s.Locends = auint64Backing[0:numPostingsLists]
+	auint64Backing = auint64Backing[numPostingsLists:]
+
+	s.Locpos = auint64Backing[0:numPostingsLists]
+	auint64Backing = auint64Backing[numPostingsLists:]
+
+	s.Locarraypos = make([][][]uint64, numPostingsLists)
+
+	// Point sub-slices to the backing arrays.
 	for pid, numTerms := range numTermsPerPostingsList {
 		s.Freqs[pid] = uint64Backing[0:0]
 		uint64Backing = uint64Backing[numTerms:]
@@ -153,16 +173,6 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
 		float32Backing = float32Backing[numTerms:]
 	}
 
-	s.Locfields = make([][]uint16, numPostingsLists)
-	s.Locstarts = make([][]uint64, numPostingsLists)
-	s.Locends = make([][]uint64, numPostingsLists)
-	s.Locpos = make([][]uint64, numPostingsLists)
-	s.Locarraypos = make([][][]uint64, numPostingsLists)
-
-	uint16Backing := make([]uint16, numLocs)    // For Locfields.
-	uint64Backing = make([]uint64, numLocs*3)   // For Locstarts, Locends, Locpos.
-	auint64Backing := make([][]uint64, numLocs) // For Locarraypos.
-
 	for pid, numLocs := range numLocsPerPostingsList {
 		s.Locfields[pid] = uint16Backing[0:0]
 		uint16Backing = uint16Backing[numLocs:]

From 71d6d1691b992a25f4511f4ec86c25cf476e3ff8 Mon Sep 17 00:00:00 2001
From: Steve Yen
Date: Mon, 15 Jan 2018 22:43:08 -0800
Subject: [PATCH 8/8] scorch zap optimizations of inner loops and easy preallocs

---
 index/scorch/segment/zap/build.go | 54 ++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go
index c7f73769..1b16b5e3 100644
--- a/index/scorch/segment/zap/build.go
+++ b/index/scorch/segment/zap/build.go
@@ -140,12 +140,18 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error)
 
 		metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
 
+		st := memSegment.StoredTypes[docNum]
+		sp := memSegment.StoredPos[docNum]
+
 		// encode fields in order
 		for fieldID := range memSegment.FieldsInv {
 			if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
 				// has stored values for this field
 				num := len(storedFieldValues)
 
+				stf := st[uint16(fieldID)]
+				spf := sp[uint16(fieldID)]
+
 				// process each value
 				for i := 0; i < num; i++ {
 					// encode field
@@ -154,7 +160,7 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error)
 						return 0, err2
 					}
 					// encode type
-					_, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i]))
+					_, err2 = metaEncoder.PutU64(uint64(stf[i]))
 					if err2 != nil {
 						return 0, err2
 					}
@@ -169,13 +175,13 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error)
 						return 0, err2
 					}
 					// encode number of array pos
-					_, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i])))
+					_, err2 = metaEncoder.PutU64(uint64(len(spf[i])))
 					if err2 != nil {
 						return 0, err2
 					}
 					// encode all array positions
-					for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ {
-						_, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j])
+					for _, pos := range spf[i] {
+						_, err2 = metaEncoder.PutU64(pos)
 						if err2 != nil {
 							return 0, err2
 						}
@@ -235,6 +241,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 		if postingID != 0 {
 			tfEncoder.Reset()
 		}
+		freqs := memSegment.Freqs[postingID]
+		norms := memSegment.Norms[postingID]
 		postingsListItr := memSegment.Postings[postingID].Iterator()
 		var offset int
 		for postingsListItr.HasNext() {
@@ -242,13 +250,13 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 			docNum := uint64(postingsListItr.Next())
 
 			// put freq
-			err := tfEncoder.Add(docNum, memSegment.Freqs[postingID][offset])
+			err := tfEncoder.Add(docNum, freqs[offset])
 			if err != nil {
 				return nil, nil, err
 			}
 
 			// put norm
-			norm := memSegment.Norms[postingID][offset]
+			norm := norms[offset]
 			normBits := math.Float32bits(norm)
 			err = tfEncoder.Add(docNum, uint64(normBits))
 			if err != nil {
@@ -275,40 +283,46 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 		if postingID != 0 {
 			locEncoder.Reset()
 		}
+		freqs := memSegment.Freqs[postingID]
+		locfields := memSegment.Locfields[postingID]
+		locpos := memSegment.Locpos[postingID]
+		locstarts := memSegment.Locstarts[postingID]
+		locends := memSegment.Locends[postingID]
+		locarraypos := memSegment.Locarraypos[postingID]
 		postingsListItr := memSegment.Postings[postingID].Iterator()
 		var offset int
 		var locOffset int
 		for postingsListItr.HasNext() {
 			docNum := uint64(postingsListItr.Next())
-			for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ {
-				if len(memSegment.Locfields[postingID]) > 0 {
+			for i := 0; i < int(freqs[offset]); i++ {
+				if len(locfields) > 0 {
 					// put field
-					err := locEncoder.Add(docNum, uint64(memSegment.Locfields[postingID][locOffset]))
+					err := locEncoder.Add(docNum, uint64(locfields[locOffset]))
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put pos
-					err = locEncoder.Add(docNum, memSegment.Locpos[postingID][locOffset])
+					err = locEncoder.Add(docNum, locpos[locOffset])
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put start
-					err = locEncoder.Add(docNum, memSegment.Locstarts[postingID][locOffset])
+					err = locEncoder.Add(docNum, locstarts[locOffset])
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put end
-					err = locEncoder.Add(docNum, memSegment.Locends[postingID][locOffset])
+					err = locEncoder.Add(docNum, locends[locOffset])
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put array positions
-					num := len(memSegment.Locarraypos[postingID][locOffset])
+					num := len(locarraypos[locOffset])
 
 					// put the number of array positions to follow
 					err = locEncoder.Add(docNum, uint64(num))
@@ -317,8 +331,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 					}
 
 					// put each array position
-					for j := 0; j < num; j++ {
-						err = locEncoder.Add(docNum, memSegment.Locarraypos[postingID][locOffset][j])
+					for _, pos := range locarraypos[locOffset] {
+						err = locEncoder.Add(docNum, pos)
 						if err != nil {
 							return nil, nil, err
 						}
@@ -341,6 +355,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 }
 
 func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
+	rv = make([]uint64, 0, len(memSegment.PostingsLocs))
 	for postingID := range memSegment.PostingsLocs {
 		// record where we start this posting loc
 		rv = append(rv, uint64(w.Count()))
@@ -355,6 +370,7 @@ func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint
 
 func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
 	postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
+	rv = make([]uint64, 0, len(memSegment.Postings))
 	for postingID := range memSegment.Postings {
 		// record where we start this posting list
 		rv = append(rv, uint64(w.Count()))
@@ -376,7 +392,7 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
 }
 
 func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
-	var rv []uint64
+	rv := make([]uint64, 0, len(memSegment.DictKeys))
 
 	var buffer bytes.Buffer
 	for fieldID, fieldTerms := range memSegment.DictKeys {
@@ -392,10 +408,10 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs
 		dict := memSegment.Dicts[fieldID]
 		// now walk the dictionary in order of fieldTerms (already sorted)
-		for i := range fieldTerms {
-			postingID := dict[fieldTerms[i]] - 1
+		for _, fieldTerm := range fieldTerms {
+			postingID := dict[fieldTerm] - 1
 			postingsAddr := postingsLocs[postingID]
-			err = builder.Insert([]byte(fieldTerms[i]), postingsAddr)
+			err = builder.Insert([]byte(fieldTerm), postingsAddr)
 			if err != nil {
 				return nil, err
 			}
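For reference, the inner-loop pattern patch 8 applies throughout
persistStored() and persistPostingDetails(), reduced to a standalone
toy example (the data here is invented): chained index expressions such
as memSegment.Freqs[postingID] are hoisted into locals outside the
loop, and range replaces manual index arithmetic.

    package main

    import "fmt"

    func main() {
        freqs := [][]uint64{{2, 1}, {3}}

        var total uint64
        for postingID := range freqs {
            fs := freqs[postingID] // hoisted: indexed once per outer iteration
            for _, f := range fs { // range avoids re-indexing fs[j] each time
                total += f
            }
        }
        fmt.Println(total) // 6
    }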