From ffdeb8055efd2e69167caf4aadd19e2cdd6e4e27 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Feb 2018 15:42:47 -0800 Subject: [PATCH] scorch sorts fields by name to assign fieldIDs This is a stepping stone to allow easier future comparisons of field maps and potential merge optimizations. In bleve-blast tests on a 2015 MacBook (50K Wikipedia docs, 8 indexers, batch size 100, SSD), this does not seem to have a noticeable effect on indexing throughput. --- index/scorch/segment/mem/build.go | 15 +++++++++++++++ index/scorch/segment/zap/merge.go | 4 ++++ index/scorch/segment/zap/segment_test.go | 2 ++ 3 files changed, 21 insertions(+) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index d3344ce3..57d60dc8 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -95,6 +95,21 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { var numTokenFrequencies int var totLocs int + // initial scan for all fieldID's to sort them + for _, result := range results { + for _, field := range result.Document.CompositeFields { + s.getOrDefineField(field.Name()) + } + for _, field := range result.Document.Fields { + s.getOrDefineField(field.Name()) + } + } + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { for term, tf := range tfs { pidPlus1, exists := s.Dicts[fieldID][term] diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index db03c998..53b1ffe5 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "os" + "sort" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" @@ -545,5 +546,8 @@ func mergeFields(segments []*SegmentBase) []string { rv = 
append(rv, k) } } + + sort.Strings(rv[1:]) // leave _id as first + return rv } diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 704f9e72..9ce354ce 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -18,6 +18,7 @@ import ( "math" "os" "reflect" + "sort" "testing" "github.com/blevesearch/bleve/index" @@ -574,6 +575,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { t.Fatalf("segment VisitableDocValueFields err: %v", err) } + sort.Strings(expectedFields[1:]) // keep _id as first field if !reflect.DeepEqual(fields, expectedFields) { t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields) }