0
0
Fork 0

scorch sorts fields by name to assign fieldIDs

This is a stepping stone to allow easier future comparisons of field
maps and potential merge optimizations.

In bleve-blast tests on a 2015 MacBook (50K Wikipedia docs, 8
indexers, batch size 100, SSD), this does not seem to have a noticeable
effect on indexing throughput.
This commit is contained in:
Steve Yen 2018-02-05 15:42:47 -08:00
parent 1af90936c4
commit ffdeb8055e
3 changed files with 21 additions and 0 deletions

View File

@ -95,6 +95,21 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
var numTokenFrequencies int
var totLocs int
// initial scan for all fieldID's to sort them
for _, result := range results {
for _, field := range result.Document.CompositeFields {
s.getOrDefineField(field.Name())
}
for _, field := range result.Document.Fields {
s.getOrDefineField(field.Name())
}
}
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
for fieldID, fieldName := range s.FieldsInv {
s.FieldsMap[fieldName] = uint16(fieldID + 1)
}
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
for term, tf := range tfs {
pidPlus1, exists := s.Dicts[fieldID][term]

View File

@ -21,6 +21,7 @@ import (
"fmt"
"math"
"os"
"sort"
"github.com/RoaringBitmap/roaring"
"github.com/Smerity/govarint"
@ -545,5 +546,8 @@ func mergeFields(segments []*SegmentBase) []string {
rv = append(rv, k)
}
}
sort.Strings(rv[1:]) // leave _id as first
return rv
}

View File

@ -18,6 +18,7 @@ import (
"math"
"os"
"reflect"
"sort"
"testing"
"github.com/blevesearch/bleve/index"
@ -574,6 +575,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
t.Fatalf("segment VisitableDocValueFields err: %v", err)
}
sort.Strings(expectedFields[1:]) // keep _id as first field
if !reflect.DeepEqual(fields, expectedFields) {
t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields)
}