Merge pull request #689 from steveyen/scorch

MB-27291 - scorch compared to upsidedown/bolt using templated, generated searches
2017-12-21 18:36:02 -08:00 · 2017-12-21 18:36:02 -08:00 · 903e8797c7
parent 4c494216d6 d425a3be86
commit 903e8797c7
4 changed files with 460 additions and 17 deletions
--- a/index/scorch/snapshot_index.go
+++ b/index/scorch/snapshot_index.go
@ -381,7 +381,7 @@ func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
 	var res uint64
 	err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res)
 	if err != nil {
-		return res, err
+		return 0, err
 	}
 	return res, nil
 }
--- a/search/searcher/search_conjunction.go
+++ b/search/searcher/search_conjunction.go
@ -57,25 +57,25 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
 func (s *ConjunctionSearcher) computeQueryNorm() {
 	// first calculate sum of squared weights
 	sumOfSquaredWeights := 0.0
-	for _, termSearcher := range s.searchers {
-		sumOfSquaredWeights += termSearcher.Weight()
+	for _, searcher := range s.searchers {
+		sumOfSquaredWeights += searcher.Weight()
 	}
 	// now compute query norm from this
 	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
 	// finally tell all the downstream searchers the norm
-	for _, termSearcher := range s.searchers {
-		termSearcher.SetQueryNorm(s.queryNorm)
+	for _, searcher := range s.searchers {
+		searcher.SetQueryNorm(s.queryNorm)
 	}
 }

 func (s *ConjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
 	var err error
 	// get all searchers pointing at their first match
-	for i, termSearcher := range s.searchers {
+	for i, searcher := range s.searchers {
 		if s.currs[i] != nil {
 			ctx.DocumentMatchPool.Put(s.currs[i])
 		}
-		s.currs[i], err = termSearcher.Next(ctx)
+		s.currs[i], err = searcher.Next(ctx)
 		if err != nil {
 			return err
 		}
@ -160,11 +160,11 @@ OUTER:

 		// we know all the searchers are pointing at the same thing
 		// so they all need to be bumped
-		for i, termSearcher := range s.searchers {
+		for i, searcher := range s.searchers {
 			if s.currs[i] != rv {
 				ctx.DocumentMatchPool.Put(s.currs[i])
 			}
-			s.currs[i], err = termSearcher.Next(ctx)
+			s.currs[i], err = searcher.Next(ctx)
 			if err != nil {
 				return nil, err
 			}
--- a/search/searcher/search_disjunction.go
+++ b/search/searcher/search_disjunction.go
@ -93,25 +93,25 @@ func newDisjunctionSearcher(indexReader index.IndexReader,
 func (s *DisjunctionSearcher) computeQueryNorm() {
 	// first calculate sum of squared weights
 	sumOfSquaredWeights := 0.0
-	for _, termSearcher := range s.searchers {
-		sumOfSquaredWeights += termSearcher.Weight()
+	for _, searcher := range s.searchers {
+		sumOfSquaredWeights += searcher.Weight()
 	}
 	// now compute query norm from this
 	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
 	// finally tell all the downstream searchers the norm
-	for _, termSearcher := range s.searchers {
-		termSearcher.SetQueryNorm(s.queryNorm)
+	for _, searcher := range s.searchers {
+		searcher.SetQueryNorm(s.queryNorm)
 	}
 }

 func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
 	var err error
 	// get all searchers pointing at their first match
-	for i, termSearcher := range s.searchers {
+	for i, searcher := range s.searchers {
 		if s.currs[i] != nil {
 			ctx.DocumentMatchPool.Put(s.currs[i])
 		}
-		s.currs[i], err = termSearcher.Next(ctx)
+		s.currs[i], err = searcher.Next(ctx)
 		if err != nil {
 			return err
 		}
@ -221,11 +221,14 @@ func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext,
 	}
 	// get all searchers pointing at their first match
 	var err error
-	for i, termSearcher := range s.searchers {
+	for i, searcher := range s.searchers {
 		if s.currs[i] != nil {
+			if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
+				continue
+			}
 			ctx.DocumentMatchPool.Put(s.currs[i])
 		}
-		s.currs[i], err = termSearcher.Advance(ctx, ID)
+		s.currs[i], err = searcher.Advance(ctx, ID)
 		if err != nil {
 			return nil, err
 		}
--- a/test/versus_test.go
+++ b/test/versus_test.go
@ -0,0 +1,440 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package test
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"math"
+	"math/rand"
+	"os"
+	"reflect"
+	"strconv"
+	"strings"
+	"testing"
+	"text/template"
+
+	"github.com/blevesearch/bleve"
+	"github.com/blevesearch/bleve/index/scorch"
+	"github.com/blevesearch/bleve/index/store/boltdb"
+	"github.com/blevesearch/bleve/index/upsidedown"
+	"github.com/blevesearch/bleve/mapping"
+	"github.com/blevesearch/bleve/search"
+)
+
+// TestScorchVersusUpsideDownBolt compares the scorch indexer with the
+// upsidedown/bolt indexer by running the templated queries against both.
+// Example usage from the bleve top-level directory...
+//
+//     go test -v -run TestScorchVersusUpsideDownBolt ./test
+//     VERBOSE=1 FOCUS=Trista go test -v -run TestScorchVersusUpsideDownBolt ./test
+//
+func TestScorchVersusUpsideDownBolt(t *testing.T) {
+	vt := &VersusTest{
+		t:                    t,
+		NumDocs:              1000,
+		MaxWordsPerDoc:       20,
+		NumWords:             10,
+		BatchSize:            10,
+		NumAttemptsPerSearch: 100,
+	}
+
+	// A nil callback selects the default search comparison.
+	vt.run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil)
+}
+
+// TestScorchVersusUpsideDownBoltSmallMNSAM runs only the search templates
+// containing "must-not-same-as-must", against a tiny data set: five
+// docs, a one-word dictionary, one doc per batch, one attempt.
+func TestScorchVersusUpsideDownBoltSmallMNSAM(t *testing.T) {
+	vt := &VersusTest{
+		t:                    t,
+		Focus:                "must-not-same-as-must",
+		NumDocs:              5,
+		MaxWordsPerDoc:       2,
+		NumWords:             1,
+		BatchSize:            1,
+		NumAttemptsPerSearch: 1,
+	}
+
+	// A nil callback selects the default search comparison.
+	vt.run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil)
+}
+
+// -------------------------------------------------------
+
+// Templates used to compare search results in the "versus" tests.
+//
+// Each entry is the JSON text of a search request, parsed as a
+// text/template before use.  The {{word}} action is replaced by a
+// generated dictionary word on every attempt.  A template is selected
+// by the Focus filter when its text contains the Focus string, so the
+// "about" values double as handy focus keys.
+var searchTemplates = []string{
+	`{
+      "about": "expected to return zero hits",
+      "query": {
+       "query": "title:notARealTitle"
+      }
+     }`,
+	`{
+      "about": "try straight word()'s",
+      "query": {
+       "query": "body:{{word}}"
+      }
+     }`,
+	`{
+      "about": "conjuncts on same term",
+      "query": {
+        "conjuncts": [
+          { "field": "body", "term": "{{word}}", "boost": 1.0 },
+          { "field": "body", "term": "{{word}}", "boost": 1.0 }
+        ]
+      }
+     }`,
+	`{
+      "about": "disjuncts on same term",
+      "query": {
+        "disjuncts": [
+          { "field": "body", "term": "{{word}}", "boost": 1.0 },
+          { "field": "body", "term": "{{word}}", "boost": 1.0 }
+        ]
+      }
+     }`,
+	`{
+      "about": "never-matching-title-conjuncts",
+      "query": {
+        "conjuncts": [
+          {"field": "body", "match": "{{word}}"},
+          {"field": "body", "match": "{{word}}"},
+          {"field": "title", "match": "notAnActualTitle"}
+        ]
+      }
+     }`,
+	`{
+      "about": "never-matching-title-disjuncts",
+      "query": {
+        "disjuncts": [
+          {"field": "body", "match": "{{word}}"},
+          {"field": "body", "match": "{{word}}"},
+          {"field": "title", "match": "notAnActualTitle"}
+        ]
+      }
+     }`,
+	`{
+      "about": "must-not-never-matches",
+      "query": {
+        "must_not": {"disjuncts": [
+          {"field": "title", "match": "notAnActualTitle"}
+        ]},
+        "should": {"disjuncts": [
+          {"field": "body", "match": "{{word}}"}
+        ]}
+      }
+     }`,
+	`{
+      "about": "must-not-only -- FAILS!!!",
+      "query": {
+        "must_not": {"disjuncts": [
+          {"field": "body", "term": "{{word}}"}
+        ]}
+      }
+     }`,
+	`{
+      "about": "must-not-same-as-must -- see: MB-27291",
+      "query": {
+        "must_not": {"disjuncts": [
+          {"field": "body", "match": "{{word}}"}
+        ]},
+        "must": {"conjuncts": [
+          {"field": "body", "match": "{{word}}"}
+        ]}
+      }
+     }`,
+	`{
+      "about": "must-not-same-as-should",
+      "query": {
+        "must_not": {"disjuncts": [
+          {"field": "body", "match": "{{word}}"}
+        ]},
+        "should": {"disjuncts": [
+          {"field": "body", "match": "{{word}}"}
+        ]}
+      }
+     }`,
+	`{
+      "about": "inspired by testrunner RQG issue -- see: MB-27291",
+      "query": {
+        "must_not": {"disjuncts": [
+          {"field": "title", "match": "Trista Allen"},
+          {"field": "body", "match": "{{word}}"}
+        ]},
+        "should": {"disjuncts": [
+          {"field": "title", "match": "Kallie Safiya Amara"},
+          {"field": "body", "match": "{{word}}"}
+        ]}
+      }
+     }`,
+}
+
+// -------------------------------------------------------
+
+// VersusTest holds the configuration and running state of one "versus"
+// comparison between two index implementations.
+type VersusTest struct {
+	t *testing.T
+
+	// Use environment variable VERBOSE=<integer> that's > 0 for more
+	// verbose output.
+	Verbose int
+
+	// Allow the user to focus on particular search templates, where
+	// the search template must contain the Focus string.  When empty,
+	// the FOCUS environment variable is consulted instead.
+	Focus string
+
+	NumDocs              int // Number of docs to insert.
+	MaxWordsPerDoc       int // Exclusive upper bound on the number of words in each doc's Body field.
+	NumWords             int // Total number of words in the dictionary.
+	BatchSize            int // Batch size when inserting docs.
+	NumAttemptsPerSearch int // For each search template, number of searches to try.
+
+	// The Bodies is an array with length NumDocs, where each entry
+	// is the words in a doc's Body field.
+	Bodies [][]string
+
+	CurAttempt  int // Attempt number within the current template; selects the generated word.
+	TotAttempts int // Running count of search attempts performed.
+}
+
+// -------------------------------------------------------
+
+// testVersusSearches is the default comparison callback used by
+// VersusTest.run.  For every search template (optionally filtered by
+// vt.Focus, or by the FOCUS environment variable when vt.Focus is empty)
+// it renders vt.NumAttemptsPerSearch requests, executes each one against
+// idxA and idxB, and reports differences in errors, hits, and the
+// remaining result fields through t.Errorf.  A template that cannot be
+// parsed, rendered, or unmarshalled aborts the test via t.Fatalf.
+func testVersusSearches(vt *VersusTest, idxA, idxB bleve.Index) {
+	t := vt.t
+
+	// The "word" template function depends only on the current attempt
+	// number, so every {{word}} in one rendered request is the same word.
+	funcMap := template.FuncMap{
+		"word": func() string {
+			return vt.genWord(vt.CurAttempt % vt.NumWords)
+		},
+	}
+
+	// reportHit logs one differing hit pair and, when the id is a valid
+	// index into vt.Bodies, the body text of the affected document.
+	reportHit := func(id string, hitA, hitB *search.DocumentMatch) {
+		t.Errorf("\n  hitA: %#v,\n  hitB: %#v", hitA, hitB)
+		idx, err := strconv.Atoi(id)
+		if err == nil && idx >= 0 && idx < len(vt.Bodies) {
+			t.Errorf("\n  body: %s", strings.Join(vt.Bodies[idx], " "))
+		}
+	}
+
+	// Optionally allow the caller to focus on particular search
+	// templates, where the search template must contain the vt.Focus
+	// string.
+	if vt.Focus == "" {
+		vt.Focus = os.Getenv("FOCUS")
+	}
+
+	for i, searchTemplate := range searchTemplates {
+		if vt.Focus != "" && !strings.Contains(searchTemplate, vt.Focus) {
+			continue
+		}
+
+		tmpl, err := template.New("search").Funcs(funcMap).Parse(searchTemplate)
+		if err != nil {
+			t.Fatalf("could not parse search template: %s, err: %v", searchTemplate, err)
+		}
+
+		for j := 0; j < vt.NumAttemptsPerSearch; j++ {
+			vt.CurAttempt = j
+
+			var buf bytes.Buffer
+			err = tmpl.Execute(&buf, vt)
+			if err != nil {
+				t.Fatalf("could not execute search template: %s, err: %v", searchTemplate, err)
+			}
+
+			bufBytes := buf.Bytes()
+
+			if vt.Verbose > 0 {
+				fmt.Printf("  %s\n", bufBytes)
+			}
+
+			// Named req rather than search so the imported search
+			// package is not shadowed inside this loop.
+			var req bleve.SearchRequest
+			err = json.Unmarshal(bufBytes, &req)
+			if err != nil {
+				t.Fatalf("could not unmarshal search: %s, err: %v", bufBytes, err)
+			}
+
+			req.Size = vt.NumDocs * 10 // Crank up limit to get all results.
+
+			// Each index gets its own copy of the request.
+			searchA := req
+			searchB := req
+
+			resA, errA := idxA.Search(&searchA)
+			resB, errB := idxB.Search(&searchB)
+			if errA != nil || errB != nil {
+				// Distinct error values compare unequal with != even
+				// when they describe the same failure, so compare the
+				// error text instead.
+				if errA == nil || errB == nil || errA.Error() != errB.Error() {
+					t.Errorf("search: (%d) %s,\n err mismatch, errA: %v, errB: %v",
+						i, bufBytes, errA, errB)
+				}
+
+				// A failed search has no usable result to compare.
+				vt.TotAttempts++
+				continue
+			}
+
+			// Scores might have float64 vs float32 wobbles, so truncate precision.
+			resA.MaxScore = math.Trunc(resA.MaxScore*1000.0) / 1000.0
+			resB.MaxScore = math.Trunc(resB.MaxScore*1000.0) / 1000.0
+
+			// Timings may be different between A & B, so force equality.
+			resA.Took = resB.Took
+
+			// Hits might have different ordering since some indexers
+			// (like upsidedown) have a natural secondary sort on id
+			// while others (like scorch) don't.  So, we compare by
+			// putting the hits from A & B into maps.
+			hitsA := hitsById(resA)
+			hitsB := hitsById(resB)
+			if !reflect.DeepEqual(hitsA, hitsB) {
+				t.Errorf("search: (%d) %s,\n res hits mismatch,\n len(hitsA): %d,\n len(hitsB): %d",
+					i, bufBytes, len(hitsA), len(hitsB))
+				t.Errorf("\n  hitsA: %#v,\n  hitsB: %#v",
+					hitsA, hitsB)
+
+				// Hits in A that are missing from, or differ in, B.
+				for id, hitA := range hitsA {
+					if hitB := hitsB[id]; !reflect.DeepEqual(hitA, hitB) {
+						reportHit(id, hitA, hitB)
+					}
+				}
+
+				// Hits that only B returned.
+				for id, hitB := range hitsB {
+					if _, inA := hitsA[id]; !inA {
+						reportHit(id, nil, hitB)
+					}
+				}
+			}
+
+			// Hits were compared above; drop them so the check below
+			// only covers the remaining result fields.
+			resA.Hits = nil
+			resB.Hits = nil
+
+			if !reflect.DeepEqual(resA, resB) {
+				resAj, _ := json.Marshal(resA)
+				resBj, _ := json.Marshal(resB)
+				t.Errorf("search: (%d) %s,\n res mismatch,\n resA: %s,\n resB: %s",
+					i, bufBytes, resAj, resBj)
+			}
+
+			if vt.Verbose > 0 {
+				fmt.Printf("  Total: (%t) %d\n", resA.Total == resB.Total, resA.Total)
+			}
+
+			vt.TotAttempts++
+		}
+	}
+}
+
+// hitsById returns the hits of res keyed by document id.  Each hit is
+// normalized in place first: Index, IndexInternalID and HitNumber are
+// cleared and Score is truncated to three decimal places, because these
+// fields might differ across indexer implementations.
+func hitsById(res *bleve.SearchResult) map[string]*search.DocumentMatch {
+	byID := make(map[string]*search.DocumentMatch, len(res.Hits))
+
+	for n := range res.Hits {
+		h := res.Hits[n]
+
+		h.Score = math.Trunc(h.Score*1000.0) / 1000.0
+		h.Index, h.IndexInternalID, h.HitNumber = "", nil, 0
+
+		byID[h.ID] = h
+	}
+
+	return byID
+}
+
+// -------------------------------------------------------
+
+// run creates two fresh indexes under fixed /tmp directories, one per
+// (index type, kvstore) pair, fills both with the same generated
+// document bodies, and hands them to cb.  A nil cb selects
+// testVersusSearches.  When vt.Verbose is not positive it is read from
+// the VERBOSE environment variable.  The index directories are removed
+// before the run and again when run returns.
+func (vt *VersusTest) run(indexTypeA, kvStoreA, indexTypeB, kvStoreB string,
+	cb func(versusTest *VersusTest, idxA, idxB bleve.Index)) {
+	if cb == nil {
+		cb = testVersusSearches
+	}
+
+	if vt.Verbose <= 0 {
+		// An unset or non-numeric VERBOSE parses to 0, i.e. quiet.
+		verbose, _ := strconv.Atoi(os.Getenv("VERBOSE"))
+		vt.Verbose = verbose
+	}
+
+	const (
+		dirA = "/tmp/bleve-versus-test-a"
+		dirB = "/tmp/bleve-versus-test-b"
+	)
+
+	wipeDirs := func() {
+		_ = os.RemoveAll(dirA)
+		_ = os.RemoveAll(dirB)
+	}
+
+	// Registered before the index Close defers below, so the directories
+	// are removed only after both indexes have been closed.
+	defer wipeDirs()
+
+	// Clear out leftovers from any earlier run.
+	wipeDirs()
+
+	mappingA := vt.makeIndexMapping()
+	mappingB := vt.makeIndexMapping()
+
+	idxA, err := bleve.NewUsing(dirA, mappingA, indexTypeA, kvStoreA,
+		map[string]interface{}{})
+	if err != nil || idxA == nil {
+		vt.t.Fatalf("new using err: %v", err)
+	}
+	defer func() { _ = idxA.Close() }()
+
+	idxB, err := bleve.NewUsing(dirB, mappingB, indexTypeB, kvStoreB,
+		map[string]interface{}{})
+	if err != nil || idxB == nil {
+		vt.t.Fatalf("new using err: %v", err)
+	}
+	defer func() { _ = idxB.Close() }()
+
+	// Fixed seed so the generated bodies are reproducible.
+	rand.Seed(0)
+
+	vt.Bodies = vt.genBodies()
+
+	for _, idx := range []bleve.Index{idxA, idxB} {
+		vt.insertBodies(idx)
+	}
+
+	cb(vt, idxA, idxB)
+}
+
+// -------------------------------------------------------
+
+// makeIndexMapping builds the index mapping used for both sides of the
+// comparison: the "title" and "body" fields share one text field mapping
+// (standard analyzer, term vectors on, not stored, not included in
+// _all), and the index default analyzer is "standard".
+func (vt *VersusTest) makeIndexMapping() mapping.IndexMapping {
+	textField := bleve.NewTextFieldMapping()
+	textField.Analyzer = "standard"
+	textField.IncludeTermVectors = true
+	textField.IncludeInAll = false
+	textField.Store = false
+
+	docMapping := bleve.NewDocumentMapping()
+	for _, field := range []string{"title", "body"} {
+		docMapping.AddFieldMappingsAt(field, textField)
+	}
+
+	indexMapping := bleve.NewIndexMapping()
+	indexMapping.DefaultAnalyzer = "standard"
+	indexMapping.DefaultMapping = docMapping
+
+	return indexMapping
+}
+
+// insertBodies indexes every entry of vt.Bodies into idx, using the
+// entry's position as both the document id and its "title" field, and
+// the space-joined words as its "body" field.  Documents are submitted
+// in batches of vt.BatchSize; a BatchSize below 1 is treated as 1.  Any
+// indexing error aborts the test via t.Fatalf.
+func (vt *VersusTest) insertBodies(idx bleve.Index) {
+	// Guard against a zero BatchSize, which would otherwise panic on
+	// the modulo below.
+	batchSize := vt.BatchSize
+	if batchSize < 1 {
+		batchSize = 1
+	}
+
+	batch := idx.NewBatch()
+	for i, bodyWords := range vt.Bodies {
+		title := strconv.Itoa(i)
+		body := strings.Join(bodyWords, " ")
+		err := batch.Index(title, map[string]interface{}{"title": title, "body": body})
+		if err != nil {
+			vt.t.Fatalf("batch.Index err: %v", err)
+		}
+
+		// Flush once batchSize docs have accumulated.  Counting from
+		// i+1 keeps the very first doc from being flushed on its own.
+		if (i+1)%batchSize == 0 {
+			err = idx.Batch(batch)
+			if err != nil {
+				vt.t.Fatalf("batch err: %v", err)
+			}
+			batch.Reset()
+		}
+	}
+
+	// Submit whatever is left over.  This batch is empty when the number
+	// of docs is a multiple of batchSize.
+	err := idx.Batch(batch)
+	if err != nil {
+		vt.t.Fatalf("last batch err: %v", err)
+	}
+}
+
+// genBodies generates vt.NumDocs document bodies, one genBody call per
+// doc, and returns them in generation order.
+func (vt *VersusTest) genBodies() (rv [][]string) {
+	for remaining := vt.NumDocs; remaining > 0; remaining-- {
+		body := vt.genBody()
+		rv = append(rv, body)
+	}
+	return rv
+}
+
+// genBody returns the words of one random document body: a random count
+// in [0, vt.MaxWordsPerDoc) of words, each picked at random from the
+// vt.NumWords-word dictionary.
+func (vt *VersusTest) genBody() (rv []string) {
+	for remaining := rand.Intn(vt.MaxWordsPerDoc); remaining > 0; remaining-- {
+		wordNum := rand.Intn(vt.NumWords)
+		rv = append(rv, vt.genWord(wordNum))
+	}
+	return rv
+}
+
+// genWord returns dictionary word number i, which is simply i written
+// in lower-case hexadecimal.
+func (vt *VersusTest) genWord(i int) string {
+	return strconv.FormatInt(int64(i), 16)
+}