From 85df86ba17c1dceaa69264e22238432a6fd495fd Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 19 Mar 2018 12:33:13 -0700 Subject: [PATCH] Unit tests for segments with docs with non-overlapping fields --- index/scorch/segment/mem/segment_test.go | 177 +++++++++++++++++++++++ index/scorch/segment/zap/build_test.go | 165 +++++++++++++++++++++ index/scorch/segment/zap/segment_test.go | 137 ++++++++++++++++++ 3 files changed, 479 insertions(+) diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 6c5625d8..56571927 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -697,3 +697,180 @@ func TestMultiple(t *testing.T) { } } + +func TestMultipleWithNonOverlappingFields(t *testing.T) { + doc1 := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextField("_id", []uint64{}, []byte("a")), + document.NewTextField("name", []uint64{}, []byte("ABC")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + document.NewTextField("manages.id", []uint64{}, []byte("XYZ")), + document.NewTextField("manages.count", []uint64{}, []byte("1")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + doc2 := &document.Document{ + ID: "b", + Fields: []document.Field{ + document.NewTextField("_id", []uint64{}, []byte("b")), + document.NewTextField("name", []uint64{}, []byte("XYZ")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + document.NewTextField("reportsTo.id", []uint64{}, []byte("ABC")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc1, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("1"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + &index.AnalysisResult{ + Document: doc2, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("b"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, []uint64{0}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + segment := NewFromAnalyzedDocs(results) + if segment == nil { + t.Fatalf("segment nil, not expected") + } + + if segment.Count() != 2 { + t.Errorf("expected count 2, got %d", segment.Count()) + } + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "dept": struct{}{}, + "manages.id": struct{}{}, + "manages.count": struct{}{}, + "reportsTo.id": struct{}{}, + } + + fields := segment.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } +} diff --git a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go index 65de7931..e8189f76 100644 --- a/index/scorch/segment/zap/build_test.go +++ b/index/scorch/segment/zap/build_test.go @@ -137,6 +137,12 @@ func buildTestSegmentMultiWithChunkFactor(chunkFactor uint32) (*SegmentBase, err return AnalysisResultsToSegmentBase(results, chunkFactor) } +func buildTestSegmentMultiWithDifferentFields(includeDocA, includeDocB bool) (*SegmentBase, error) { + results := buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB) + + return AnalysisResultsToSegmentBase(results, 1024) +} + func buildTestAnalysisResultsMulti() []*index.AnalysisResult { doc := &document.Document{ ID: "a", @@ -298,6 +304,165 @@ func buildTestAnalysisResultsMulti() []*index.AnalysisResult { return results } +func buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB bool) []*index.AnalysisResult { + results := []*index.AnalysisResult{} + + if includeDocA { + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextField("_id", []uint64{}, []byte("a")), + document.NewTextField("name", []uint64{}, []byte("ABC")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + document.NewTextField("manages.id", []uint64{}, []byte("XYZ")), + document.NewTextField("manages.count", []uint64{}, []byte("1")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + result := &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("1"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + } + + results = append(results, result) + } + + if includeDocB { + doc := &document.Document{ + ID: "b", + Fields: []document.Field{ + document.NewTextField("_id", []uint64{}, []byte("b")), + document.NewTextField("name", []uint64{}, []byte("XYZ")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + document.NewTextField("reportsTo.id", []uint64{}, []byte("ABC")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + result := &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("b"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, []uint64{0}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + }, + } + + results = append(results, result) + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + return results +} + func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) ( *SegmentBase, []string, error) { doc := &document.Document{ diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 50d5dbd7..339d24ce 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -21,6 +21,7 @@ import ( "sort" "testing" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" ) @@ -600,3 +601,139 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { } } + +func TestSegmentDocsWithNonOverlappingFields(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + + testSeg, err := buildTestSegmentMultiWithDifferentFields(true, true) + if err != nil { + t.Fatalf("error building segment: %v", err) + } + err = PersistSegmentBase(testSeg, "/tmp/scorch.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + if segment.Count() != 2 { + t.Errorf("expected 2, got %d", segment.Count()) + } + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "dept": struct{}{}, + "manages.id": struct{}{}, + "manages.count": struct{}{}, + "reportsTo.id": struct{}{}, + } + + fields := segment.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } +} + +func TestMergedSegmentDocsWithNonOverlappingFields(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch1.zap") + _ = os.RemoveAll("/tmp/scorch2.zap") + _ = os.RemoveAll("/tmp/scorch3.zap") + + testSeg1, _ := buildTestSegmentMultiWithDifferentFields(true, false) + err := PersistSegmentBase(testSeg1, "/tmp/scorch1.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + testSeg2, _ := buildTestSegmentMultiWithDifferentFields(false, true) + err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment1, err := Open("/tmp/scorch1.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment1.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + segment2, err := Open("/tmp/scorch2.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment2.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + segsToMerge := make([]*Segment, 2) + segsToMerge[0] = segment1.(*Segment) + segsToMerge[1] = segment2.(*Segment) + + _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + if err != nil { + t.Fatal(err) + } + + if nBytes == 0 { + t.Fatalf("expected a non zero total_compaction_written_bytes") + } + + segmentM, err := Open("/tmp/scorch3.zap") + if err != nil { + t.Fatalf("error opening merged segment: %v", err) + } + defer func() { + cerr := segmentM.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + if segmentM.Count() != 2 { + t.Errorf("expected 2, got %d", segmentM.Count()) + } + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "dept": struct{}{}, + "manages.id": struct{}{}, + "manages.count": struct{}{}, + "reportsTo.id": struct{}{}, + } + + fields := segmentM.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } +}