diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index cd11fb40..554de890 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -24,13 +24,16 @@ import ( "github.com/blevesearch/bleve/index" ) -// NewFromAnalyzedDocs places the analyzed document mutations into this segment +// NewFromAnalyzedDocs places the analyzed document mutations into a new segment func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { s := New() // ensure that _id field get fieldID 0 s.getOrDefineField("_id") + // fill Dicts/DictKeys and preallocate memory + s.initializeDict(results) + // walk each doc for _, result := range results { s.processDocument(result) @@ -82,10 +85,117 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { return s } +// fill Dicts/DictKeys and preallocate memory for postings +func (s *Segment) initializeDict(results []*index.AnalysisResult) { + var numPostingsLists int + + numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. + numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. + + var numTokenFrequencies int + var totLocs int + + processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { + for term, tf := range tfs { + pidPlus1, exists := s.Dicts[fieldID][term] + if !exists { + numPostingsLists++ + pidPlus1 = uint64(numPostingsLists) + s.Dicts[fieldID][term] = pidPlus1 + s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) + numTermsPerPostingsList = append(numTermsPerPostingsList, 0) + numLocsPerPostingsList = append(numLocsPerPostingsList, 0) + } + pid := pidPlus1 - 1 + numTermsPerPostingsList[pid] += 1 + numLocsPerPostingsList[pid] += len(tf.Locations) + totLocs += len(tf.Locations) + } + numTokenFrequencies += len(tfs) + } + + for _, result := range results { + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + _, tf := field.Analyze() + processField(fieldID, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + tf := result.Analyzed[i] + processField(fieldID, tf) + } + } + + s.Postings = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.Postings[i] = roaring.New() + } + s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.PostingsLocs[i] = roaring.New() + } + + // Preallocate big, contiguous backing arrays. + auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos. + uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos. + float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms. + uint16Backing := make([]uint16, totLocs) // For sub-Locfields. + + // Point top-level slices to the backing arrays. 
+ s.Freqs = auint64Backing[0:numPostingsLists]
+ auint64Backing = auint64Backing[numPostingsLists:]
+
+ s.Norms = make([][]float32, numPostingsLists)
+
+ s.Locfields = make([][]uint16, numPostingsLists)
+
+ s.Locstarts = auint64Backing[0:numPostingsLists]
+ auint64Backing = auint64Backing[numPostingsLists:]
+
+ s.Locends = auint64Backing[0:numPostingsLists]
+ auint64Backing = auint64Backing[numPostingsLists:]
+
+ s.Locpos = auint64Backing[0:numPostingsLists]
+ auint64Backing = auint64Backing[numPostingsLists:]
+
+ s.Locarraypos = make([][][]uint64, numPostingsLists)
+
+ // Point sub-slices to the backing arrays.
+ for pid, numTerms := range numTermsPerPostingsList {
+ s.Freqs[pid] = uint64Backing[0:0]
+ uint64Backing = uint64Backing[numTerms:]
+
+ s.Norms[pid] = float32Backing[0:0]
+ float32Backing = float32Backing[numTerms:]
+ }
+
+ for pid, numLocs := range numLocsPerPostingsList {
+ s.Locfields[pid] = uint16Backing[0:0]
+ uint16Backing = uint16Backing[numLocs:]
+
+ s.Locstarts[pid] = uint64Backing[0:0]
+ uint64Backing = uint64Backing[numLocs:]
+
+ s.Locends[pid] = uint64Backing[0:0]
+ uint64Backing = uint64Backing[numLocs:]
+
+ s.Locpos[pid] = uint64Backing[0:0]
+ uint64Backing = uint64Backing[numLocs:]
+
+ s.Locarraypos[pid] = auint64Backing[0:0]
+ auint64Backing = auint64Backing[numLocs:]
+ }
+}
+
 func (s *Segment) processDocument(result *index.AnalysisResult) {
 // used to collate information across fields
- docMap := map[uint16]analysis.TokenFrequencies{}
- fieldLens := map[uint16]int{}
+ docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
+ fieldLens := make(map[uint16]int, len(s.FieldsMap))
+
 docNum := uint64(s.addDocument())

 processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
@@ -128,81 +238,27 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
 // now that its been rolled up into docMap, walk that
 for fieldID, tokenFrequencies := range docMap {
 for term, tokenFreq := range tokenFrequencies {
- fieldTermPostings := s.Dicts[fieldID][term]
-
- // FIXME this if/else block has duplicate code that has resulted in
- // bugs fixed/missed more than once, need to refactor
- if fieldTermPostings == 0 {
- // need to build new posting
- bs := roaring.New()
- bs.AddInt(int(docNum))
-
- newPostingID := uint64(len(s.Postings) + 1)
- // add this new bitset to the postings slice
- s.Postings = append(s.Postings, bs)
-
- locationBS := roaring.New()
- s.PostingsLocs = append(s.PostingsLocs, locationBS)
- // add this to the details slice
- s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())})
- s.Norms = append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))})
- // add to locations
- var locfields []uint16
- var locstarts []uint64
- var locends []uint64
- var locpos []uint64
- var locarraypos [][]uint64
- if len(tokenFreq.Locations) > 0 {
- locationBS.AddInt(int(docNum))
- }
+ pid := s.Dicts[fieldID][term] - 1
+ bs := s.Postings[pid]
+ bs.AddInt(int(docNum))
+ s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
+ s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
+ locationBS := s.PostingsLocs[pid]
+ if len(tokenFreq.Locations) > 0 {
+ locationBS.AddInt(int(docNum))
 for _, loc := range tokenFreq.Locations {
 var locf = fieldID
 if loc.Field != "" {
 locf = uint16(s.getOrDefineField(loc.Field))
 }
- locfields = append(locfields, locf)
- locstarts = append(locstarts, uint64(loc.Start))
- locends = append(locends, uint64(loc.End))
- locpos = append(locpos, uint64(loc.Position))
+ s.Locfields[pid] = append(s.Locfields[pid], locf)
+ s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
+ s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
+ s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
 if len(loc.ArrayPositions) > 0 {
- locarraypos = append(locarraypos, loc.ArrayPositions)
+ s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
 } else {
- locarraypos = append(locarraypos, nil)
- }
- }
- s.Locfields = append(s.Locfields, locfields)
- s.Locstarts = append(s.Locstarts, locstarts)
- s.Locends = append(s.Locends, locends)
- s.Locpos = append(s.Locpos, locpos)
- s.Locarraypos = append(s.Locarraypos, locarraypos)
- // record it
- s.Dicts[fieldID][term] = newPostingID
- // this term was new for this field, add it to dictKeys
- s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
- } else {
- // posting already started for this field/term
- // the actual offset is - 1, because 0 is zero value
- bs := s.Postings[fieldTermPostings-1]
- bs.AddInt(int(docNum))
- locationBS := s.PostingsLocs[fieldTermPostings-1]
- s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency()))
- s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
- if len(tokenFreq.Locations) > 0 {
- locationBS.AddInt(int(docNum))
- }
- for _, loc := range tokenFreq.Locations {
- var locf = fieldID
- if loc.Field != "" {
- locf = uint16(s.getOrDefineField(loc.Field))
- }
- s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf)
- s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start))
- s.Locends[fieldTermPostings-1] = append(s.Locends[fieldTermPostings-1], uint64(loc.End))
- s.Locpos[fieldTermPostings-1] = append(s.Locpos[fieldTermPostings-1], uint64(loc.Position))
- if len(loc.ArrayPositions) > 0 {
- s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], loc.ArrayPositions)
- } else {
- s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], nil)
+ s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
 }
 }
 }
diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go
index 5ef3e1f3..3c400b53 100644
--- a/index/scorch/segment/mem/segment.go
+++ b/index/scorch/segment/mem/segment.go
@@ -46,7 +46,7 @@ type Segment struct {
 FieldsInv []string

 // term dictionary
- // field id -> term -> posting id + 1
+ // field id -> term -> postings list id + 1
 Dicts []map[string]uint64

 // term dictionary keys
@@ -54,7 +54,7 @@ type Segment struct {
 DictKeys [][]string

 // Postings list
- // Postings list id -> Postings bitmap
+ // postings list id -> Postings bitmap
 Postings []*roaring.Bitmap

 // Postings List has locations
@@ -188,9 +188,11 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi
 return nil
 }
 docFields := s.Stored[int(num)]
+ st := s.StoredTypes[int(num)]
+ sp := s.StoredPos[int(num)]
 for field, values := range docFields {
 for i, value := range values {
- keepGoing := visitor(s.FieldsInv[field], s.StoredTypes[int(num)][field][i], value, s.StoredPos[int(num)][field][i])
+ keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
 if !keepGoing {
 return nil
 }
diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go
index c7f73769..1b16b5e3 100644
--- a/index/scorch/segment/zap/build.go
+++ b/index/scorch/segment/zap/build.go
@@ -140,12 +140,18 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error)
 metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)

+ st := memSegment.StoredTypes[docNum]
+ sp := memSegment.StoredPos[docNum]
+
 // encode fields in order
 for fieldID := range memSegment.FieldsInv {
 if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
 // has stored values for this field
 num := len(storedFieldValues)

+ stf := st[uint16(fieldID)]
+ spf := sp[uint16(fieldID)]
+
 // process each value
 for i := 0; i < num; i++ {
 // encode field
@@ -154,7 +160,7 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error)
 return 0, err2
 }
 // encode type
- _, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i]))
+ _, err2 = metaEncoder.PutU64(uint64(stf[i]))
 if err2 != nil {
 return 0, err2
 }
@@ -169,13 +175,13 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error)
 return 0, err2
 }
 // encode number of array pos
- _, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i])))
+ _, err2 = metaEncoder.PutU64(uint64(len(spf[i])))
 if err2 != nil {
 return 0, err2
 }
 // encode all array positions
- for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ {
- _, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j])
+ for _, pos := range spf[i] {
+ _, err2 = metaEncoder.PutU64(pos)
 if err2 != nil {
 return 0, err2
 }
@@ -235,6 +241,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 if postingID != 0 {
 tfEncoder.Reset()
 }
+ freqs := memSegment.Freqs[postingID]
+ norms := memSegment.Norms[postingID]
 postingsListItr := memSegment.Postings[postingID].Iterator()
 var offset int
 for postingsListItr.HasNext() {
 docNum := uint64(postingsListItr.Next())

 // put freq
- err := tfEncoder.Add(docNum, memSegment.Freqs[postingID][offset])
+ err := tfEncoder.Add(docNum, freqs[offset])
 if err != nil {
 return nil, nil, err
 }

 // put norm
- norm := memSegment.Norms[postingID][offset]
+ norm := norms[offset]
 normBits := math.Float32bits(norm)
 err = tfEncoder.Add(docNum, uint64(normBits))
 if err != nil {
@@ -275,40 +283,46 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 if postingID != 0 {
 locEncoder.Reset()
 }
+ freqs := memSegment.Freqs[postingID]
+ locfields := memSegment.Locfields[postingID]
+ locpos := memSegment.Locpos[postingID]
+ locstarts := memSegment.Locstarts[postingID]
+ locends := memSegment.Locends[postingID]
+ locarraypos := memSegment.Locarraypos[postingID]
 postingsListItr := memSegment.Postings[postingID].Iterator()
 var offset int
 var locOffset int
 for postingsListItr.HasNext() {
 docNum := uint64(postingsListItr.Next())
- for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ {
- if len(memSegment.Locfields[postingID]) > 0 {
+ for i := 0; i < int(freqs[offset]); i++ {
+ if len(locfields) > 0 {
 // put field
- err := locEncoder.Add(docNum, uint64(memSegment.Locfields[postingID][locOffset]))
+ err := locEncoder.Add(docNum, uint64(locfields[locOffset]))
 if err != nil {
 return nil, nil, err
 }

 // put pos
- err = locEncoder.Add(docNum, memSegment.Locpos[postingID][locOffset])
+ err = locEncoder.Add(docNum, locpos[locOffset])
 if err != nil {
 return nil, nil, err
 }

 // put start
- err = locEncoder.Add(docNum, memSegment.Locstarts[postingID][locOffset])
+ err = locEncoder.Add(docNum, locstarts[locOffset])
 if err != nil {
 return nil, nil, err
 }

 // put end
- err = locEncoder.Add(docNum, memSegment.Locends[postingID][locOffset])
+ err = locEncoder.Add(docNum, locends[locOffset])
 if err != nil {
 return nil, nil, err
 }

 // put array positions
- num := len(memSegment.Locarraypos[postingID][locOffset])
+ num := len(locarraypos[locOffset])

 // put the number of array positions to follow
 err = locEncoder.Add(docNum, uint64(num))
@@ -317,8 +331,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 }

 // put each array position
- for j := 0; j < num; j++ {
- err = locEncoder.Add(docNum, memSegment.Locarraypos[postingID][locOffset][j])
+ for _, pos := range locarraypos[locOffset] {
+ err = locEncoder.Add(docNum, pos)
 if err != nil {
 return nil, nil, err
 }
@@ -341,6 +355,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 }

 func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
+ rv = make([]uint64, 0, len(memSegment.PostingsLocs))
 for postingID := range memSegment.PostingsLocs {
 // record where we start this posting loc
 rv = append(rv, uint64(w.Count()))
@@ -355,6 +370,7 @@ func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint

 func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
 postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
+ rv = make([]uint64, 0, len(memSegment.Postings))
 for postingID := range memSegment.Postings {
 // record where we start this posting list
 rv = append(rv, uint64(w.Count()))
@@ -376,7 +392,7 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
 }

 func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
- var rv []uint64
+ rv := make([]uint64, 0, len(memSegment.DictKeys))
 var buffer bytes.Buffer

 for fieldID, fieldTerms := range memSegment.DictKeys {
@@ -392,10 +408,10 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs
 dict := memSegment.Dicts[fieldID]

 // now walk the dictionary in order of fieldTerms (already sorted)
- for i := range fieldTerms {
- postingID := dict[fieldTerms[i]] - 1
+ for _, fieldTerm := range fieldTerms {
+ postingID := dict[fieldTerm] - 1
 postingsAddr := postingsLocs[postingID]
- err = builder.Insert([]byte(fieldTerms[i]), postingsAddr)
+ err = builder.Insert([]byte(fieldTerm), postingsAddr)
 if err != nil {
 return nil, err
 }
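
Notes on the main techniques in this change:

The allocation strategy in initializeDict (mem/build.go) is count-then-carve: a first pass counts how many terms and locations each postings list will hold, then a few large backing arrays are allocated once and carved into per-list sub-slices, replacing many small append-driven allocations. Below is a minimal, self-contained Go sketch of the pattern, not the bleve code itself; carve, counts, and buckets are invented names. It uses a full slice expression (backing[0:0:n]) to cap each bucket's capacity explicitly, whereas the patch uses backing[0:0] and relies on the exact pre-counted sizes to keep appends inside each region.

    package main

    import "fmt"

    // carve allocates one contiguous backing array and hands out a
    // zero-length, capacity-capped sub-slice per bucket, so later
    // appends fill the reserved region without reallocating.
    func carve(counts []int) [][]uint64 {
        total := 0
        for _, n := range counts {
            total += n
        }
        backing := make([]uint64, total) // single allocation for everything
        buckets := make([][]uint64, len(counts))
        for i, n := range counts {
            buckets[i] = backing[0:0:n] // len 0, cap n
            backing = backing[n:]
        }
        return buckets
    }

    func main() {
        buckets := carve([]int{2, 3})
        buckets[0] = append(buckets[0], 10, 11)
        buckets[1] = append(buckets[1], 20, 21, 22)
        fmt.Println(buckets) // [[10 11] [20 21 22]]
    }

The payoff is fewer, larger allocations and better locality when processDocument later appends freqs, norms, and location details keyed by postings list id.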
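The "postings list id + 1" convention that the segment.go comment now spells out is what lets processDocument shed its old if/else: Dicts stores the id plus one, so the zero value a Go map returns for a missing term doubles as "not registered yet", and once initializeDict has registered every term up front, a single unconditional lookup (pid := s.Dicts[fieldID][term] - 1) is safe. A small hedged sketch of the convention, with invented names:

    package main

    import "fmt"

    func main() {
        dict := map[string]uint64{} // term -> postings list id + 1

        assign := func(term string) uint64 {
            idPlus1, exists := dict[term]
            if !exists {
                idPlus1 = uint64(len(dict) + 1) // next id, offset by one
                dict[term] = idPlus1
            }
            return idPlus1 - 1 // decode back to the real id
        }

        fmt.Println(assign("foo")) // 0
        fmt.Println(assign("bar")) // 1
        fmt.Println(assign("foo")) // 0 again, no duplicate entry
    }

Since 0 is reserved for "absent", valid ids start at 1 inside the map and are decoded with -1 on the way out, which is exactly the shape of the lookups in processDocument and persistDictionary.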
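The zap/build.go and VisitDocument hunks are a different, simpler optimization: hoisting loop-invariant indexing chains such as memSegment.StoredTypes[docNum] or memSegment.Freqs[postingID] into locals (st, sp, stf, spf, freqs, norms, and so on), and preallocating the rv slices with their known final capacity. A micro-example of the hoisting, with invented data shapes:

    package main

    import "fmt"

    func main() {
        storedPos := map[int]map[uint16][][]uint64{
            7: {3: {{0, 1}, {2}}},
        }
        docNum, fieldID := 7, uint16(3)

        // Before: storedPos[docNum][fieldID][i] redoes two map lookups
        // (including hashing) on every iteration. After: resolve the
        // invariant prefix once and index a plain slice in the loop.
        spf := storedPos[docNum][fieldID]
        for i := range spf {
            for _, pos := range spf[i] {
                fmt.Println(i, pos)
            }
        }
    }

Same observable behavior, but the hot encode loops do a plain slice index instead of re-walking a map and several slice headers per value.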