
Merge pull request #689 from steveyen/scorch

MB-27291 - scorch compared to upsidedown/bolt using templated, generated searches
Steve Yen 2017-12-21 18:36:02 -08:00 committed by GitHub
commit 903e8797c7
4 changed files with 460 additions and 17 deletions
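
In brief: the new test file (test/versus_test.go, below) renders JSON search templates, expanding a {{word}} placeholder via a generated word() function, runs the same rendered request against both a scorch index and an upsidedown/bolt index, and compares the hits. A minimal, standalone sketch of just that templating step (made-up template and word value, not the test itself):

package main

import (
	"bytes"
	"fmt"
	"text/template"
)

func main() {
	// The real test supplies a freshly generated word per attempt.
	funcMap := template.FuncMap{
		"word": func() string { return "a1" },
	}

	searchTemplate := `{"query": {"query": "body:{{word}}"}}`

	tmpl, err := template.New("search").Funcs(funcMap).Parse(searchTemplate)
	if err != nil {
		panic(err)
	}

	var buf bytes.Buffer
	if err := tmpl.Execute(&buf, nil); err != nil {
		panic(err)
	}
	fmt.Println(buf.String()) // {"query": {"query": "body:a1"}}
}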

View File

@@ -381,7 +381,7 @@ func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
 	var res uint64
 	err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res)
 	if err != nil {
-		return res, err
+		return 0, err
 	}
 	return res, nil
 }
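
This one-line change makes the error path of docInternalToNumber return an explicit zero value rather than whatever happens to be in res when binary.Read fails. A minimal sketch of that pattern (decodeDocNumber is a hypothetical stand-in, not the bleve function):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// decodeDocNumber mirrors the fixed error path: on any read error it returns
// an explicit 0 rather than a possibly partial or stale result.
func decodeDocNumber(in []byte) (uint64, error) {
	var res uint64
	if err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res); err != nil {
		return 0, err
	}
	return res, nil
}

func main() {
	fmt.Println(decodeDocNumber([]byte{0, 0, 0, 0, 0, 0, 0, 42})) // 42 <nil>
	fmt.Println(decodeDocNumber([]byte{0, 42}))                   // 0 unexpected EOF
}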

View File

@@ -57,25 +57,25 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
 func (s *ConjunctionSearcher) computeQueryNorm() {
 	// first calculate sum of squared weights
 	sumOfSquaredWeights := 0.0
-	for _, termSearcher := range s.searchers {
-		sumOfSquaredWeights += termSearcher.Weight()
+	for _, searcher := range s.searchers {
+		sumOfSquaredWeights += searcher.Weight()
 	}
 	// now compute query norm from this
 	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
 	// finally tell all the downstream searchers the norm
-	for _, termSearcher := range s.searchers {
-		termSearcher.SetQueryNorm(s.queryNorm)
+	for _, searcher := range s.searchers {
+		searcher.SetQueryNorm(s.queryNorm)
 	}
 }
 
 func (s *ConjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
 	var err error
 	// get all searchers pointing at their first match
-	for i, termSearcher := range s.searchers {
+	for i, searcher := range s.searchers {
 		if s.currs[i] != nil {
 			ctx.DocumentMatchPool.Put(s.currs[i])
 		}
-		s.currs[i], err = termSearcher.Next(ctx)
+		s.currs[i], err = searcher.Next(ctx)
 		if err != nil {
 			return err
 		}
@@ -160,11 +160,11 @@ OUTER:
 	// we know all the searchers are pointing at the same thing
 	// so they all need to be bumped
-	for i, termSearcher := range s.searchers {
+	for i, searcher := range s.searchers {
 		if s.currs[i] != rv {
 			ctx.DocumentMatchPool.Put(s.currs[i])
 		}
-		s.currs[i], err = termSearcher.Next(ctx)
+		s.currs[i], err = searcher.Next(ctx)
 		if err != nil {
 			return nil, err
 		}
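
The hunks above, and the matching disjunction hunks that follow, are a pure rename of the loop variable from termSearcher to searcher; the scoring math is unchanged. For reference, the query norm computed here is 1/sqrt(sum of the child searchers' Weight() values), where, as the variable name sumOfSquaredWeights suggests, each Weight() is treated as an already squared value. A tiny sketch with made-up weights:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical Weight() values from two child searchers; computeQueryNorm
	// treats each as an already squared weight and simply sums them.
	weights := []float64{9.0, 16.0}

	sumOfSquaredWeights := 0.0
	for _, w := range weights {
		sumOfSquaredWeights += w
	}

	queryNorm := 1.0 / math.Sqrt(sumOfSquaredWeights)
	fmt.Println(queryNorm) // 1 / sqrt(25) = 0.2

	// Each child searcher is then handed this norm via SetQueryNorm so its
	// contribution to the final score is scaled consistently.
}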

View File

@@ -93,25 +93,25 @@ func newDisjunctionSearcher(indexReader index.IndexReader,
 func (s *DisjunctionSearcher) computeQueryNorm() {
 	// first calculate sum of squared weights
 	sumOfSquaredWeights := 0.0
-	for _, termSearcher := range s.searchers {
-		sumOfSquaredWeights += termSearcher.Weight()
+	for _, searcher := range s.searchers {
+		sumOfSquaredWeights += searcher.Weight()
 	}
 	// now compute query norm from this
 	s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
 	// finally tell all the downstream searchers the norm
-	for _, termSearcher := range s.searchers {
-		termSearcher.SetQueryNorm(s.queryNorm)
+	for _, searcher := range s.searchers {
+		searcher.SetQueryNorm(s.queryNorm)
 	}
 }
 
 func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
 	var err error
 	// get all searchers pointing at their first match
-	for i, termSearcher := range s.searchers {
+	for i, searcher := range s.searchers {
 		if s.currs[i] != nil {
 			ctx.DocumentMatchPool.Put(s.currs[i])
 		}
-		s.currs[i], err = termSearcher.Next(ctx)
+		s.currs[i], err = searcher.Next(ctx)
 		if err != nil {
 			return err
 		}
@@ -221,11 +221,14 @@ func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext,
 	}
 	// get all searchers pointing at their first match
 	var err error
-	for i, termSearcher := range s.searchers {
+	for i, searcher := range s.searchers {
 		if s.currs[i] != nil {
+			if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
+				continue
+			}
 			ctx.DocumentMatchPool.Put(s.currs[i])
 		}
-		s.currs[i], err = termSearcher.Advance(ctx, ID)
+		s.currs[i], err = searcher.Advance(ctx, ID)
 		if err != nil {
 			return nil, err
 		}
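
Besides the rename, the three added lines change Advance so that a child cursor already at or beyond the requested ID is kept as-is instead of being recycled and re-advanced. A rough sketch of that guard with hypothetical types (not the bleve API):

package main

import (
	"bytes"
	"fmt"
)

// cursor stands in for a cached DocumentMatch; only its internal ID matters here.
type cursor struct{ id []byte }

// advanceAll moves every cursor to at least target, but leaves alone any cursor
// that is already at or past target, mirroring the new Compare(ID) >= 0 guard.
func advanceAll(currs []*cursor, target []byte) {
	for i, c := range currs {
		if c != nil && bytes.Compare(c.id, target) >= 0 {
			continue // already at or past the target; nothing to do
		}
		// In the real searcher this would release c back to the pool and call
		// the child searcher's Advance(ctx, target).
		currs[i] = &cursor{id: target}
	}
}

func main() {
	currs := []*cursor{{id: []byte("doc1")}, {id: []byte("doc7")}}
	advanceAll(currs, []byte("doc5"))
	for _, c := range currs {
		fmt.Println(string(c.id)) // doc5, then doc7
	}
}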

test/versus_test.go (new file, 440 additions)
View File

@@ -0,0 +1,440 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package test
import (
"bytes"
"encoding/json"
"fmt"
"math"
"math/rand"
"os"
"reflect"
"strconv"
"strings"
"testing"
"text/template"
"github.com/blevesearch/bleve"
"github.com/blevesearch/bleve/index/scorch"
"github.com/blevesearch/bleve/index/store/boltdb"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search"
)
// Tests scorch indexer versus upsidedown/bolt indexer against various
// templated queries. Example usage from the bleve top-level directory...
//
// go test -v -run TestScorchVersusUpsideDownBolt ./test
// VERBOSE=1 FOCUS=Trista go test -v -run TestScorchVersusUpsideDownBolt ./test
//
func TestScorchVersusUpsideDownBolt(t *testing.T) {
(&VersusTest{
t: t,
NumDocs: 1000,
MaxWordsPerDoc: 20,
NumWords: 10,
BatchSize: 10,
NumAttemptsPerSearch: 100,
}).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil)
}
func TestScorchVersusUpsideDownBoltSmallMNSAM(t *testing.T) {
(&VersusTest{
t: t,
Focus: "must-not-same-as-must",
NumDocs: 5,
MaxWordsPerDoc: 2,
NumWords: 1,
BatchSize: 1,
NumAttemptsPerSearch: 1,
}).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil)
}
// -------------------------------------------------------
// Templates used to compare search results in the "versus" tests.
var searchTemplates = []string{
`{
"about": "expected to return zero hits",
"query": {
"query": "title:notARealTitle"
}
}`,
`{
"about": "try straight word()'s",
"query": {
"query": "body:{{word}}"
}
}`,
`{
"about": "conjuncts on same term",
"query": {
"conjuncts": [
{ "field": "body", "term": "{{word}}", "boost": 1.0 },
{ "field": "body", "term": "{{word}}", "boost": 1.0 }
]
}
}`,
`{
"about": "disjuncts on same term",
"query": {
"disjuncts": [
{ "field": "body", "term": "{{word}}", "boost": 1.0 },
{ "field": "body", "term": "{{word}}", "boost": 1.0 }
]
}
}`,
`{
"about": "never-matching-title-conjuncts",
"query": {
"conjuncts": [
{"field": "body", "match": "{{word}}"},
{"field": "body", "match": "{{word}}"},
{"field": "title", "match": "notAnActualTitle"}
]
}
}`,
`{
"about": "never-matching-title-disjuncts",
"query": {
"disjuncts": [
{"field": "body", "match": "{{word}}"},
{"field": "body", "match": "{{word}}"},
{"field": "title", "match": "notAnActualTitle"}
]
}
}`,
`{
"about": "must-not-never-matches",
"query": {
"must_not": {"disjuncts": [
{"field": "title", "match": "notAnActualTitle"}
]},
"should": {"disjuncts": [
{"field": "body", "match": "{{word}}"}
]}
}
}`,
`{
"about": "must-not-only -- FAILS!!!",
"query": {
"must_not": {"disjuncts": [
{"field": "body", "term": "{{word}}"}
]}
}
}`,
`{
"about": "must-not-same-as-must -- see: MB-27291",
"query": {
"must_not": {"disjuncts": [
{"field": "body", "match": "{{word}}"}
]},
"must": {"conjuncts": [
{"field": "body", "match": "{{word}}"}
]}
}
}`,
`{
"about": "must-not-same-as-should",
"query": {
"must_not": {"disjuncts": [
{"field": "body", "match": "{{word}}"}
]},
"should": {"disjuncts": [
{"field": "body", "match": "{{word}}"}
]}
}
}`,
`{
"about": "inspired by testrunner RQG issue -- see: MB-27291",
"query": {
"must_not": {"disjuncts": [
{"field": "title", "match": "Trista Allen"},
{"field": "body", "match": "{{word}}"}
]},
"should": {"disjuncts": [
{"field": "title", "match": "Kallie Safiya Amara"},
{"field": "body", "match": "{{word}}"}
]}
}
}`,
}
// -------------------------------------------------------
type VersusTest struct {
t *testing.T
// Set the environment variable VERBOSE to an integer > 0 for more
// verbose output.
Verbose int
// Allow the user to focus on particular search templates, where
// the search template must contain the Focus string.
Focus string
NumDocs int // Number of docs to insert.
MaxWordsPerDoc int // Max number words in each doc's Body field.
NumWords int // Total number of words in the dictionary.
BatchSize int // Batch size when inserting docs.
NumAttemptsPerSearch int // For each search template, number of searches to try.
// Bodies is a slice of length NumDocs, where each entry holds
// the words of a doc's Body field.
Bodies [][]string
CurAttempt int
TotAttempts int
}
// -------------------------------------------------------
func testVersusSearches(vt *VersusTest, idxA, idxB bleve.Index) {
t := vt.t
funcMap := template.FuncMap{
"word": func() string {
return vt.genWord(vt.CurAttempt % vt.NumWords)
},
}
// Optionally allow the caller to focus on particular search templates,
// where the search template must contain the vt.Focus string.
if vt.Focus == "" {
vt.Focus = os.Getenv("FOCUS")
}
for i, searchTemplate := range searchTemplates {
if vt.Focus != "" && !strings.Contains(searchTemplate, vt.Focus) {
continue
}
tmpl, err := template.New("search").Funcs(funcMap).Parse(searchTemplate)
if err != nil {
t.Fatalf("could not parse search template: %s, err: %v", searchTemplate, err)
}
for j := 0; j < vt.NumAttemptsPerSearch; j++ {
vt.CurAttempt = j
var buf bytes.Buffer
err = tmpl.Execute(&buf, vt)
if err != nil {
t.Fatalf("could not execute search template: %s, err: %v", searchTemplate, err)
}
bufBytes := buf.Bytes()
if vt.Verbose > 0 {
fmt.Printf(" %s\n", bufBytes)
}
var search bleve.SearchRequest
err = json.Unmarshal(bufBytes, &search)
if err != nil {
t.Fatalf("could not unmarshal search: %s, err: %v", bufBytes, err)
}
search.Size = vt.NumDocs * 10 // Crank up limit to get all results.
searchA := search
searchB := search
resA, errA := idxA.Search(&searchA)
resB, errB := idxB.Search(&searchB)
if errA != errB {
t.Errorf("search: (%d) %s,\n err mismatch, errA: %v, errB: %v",
i, bufBytes, errA, errB)
}
// Scores might have float64 vs float32 wobbles, so truncate precision.
resA.MaxScore = math.Trunc(resA.MaxScore*1000.0) / 1000.0
resB.MaxScore = math.Trunc(resB.MaxScore*1000.0) / 1000.0
// Timings may be different between A & B, so force equality.
resA.Took = resB.Took
// Hits might have different ordering since some indexers
// (like upsidedown) have a natural secondary sort on id
// while others (like scorch) don't. So, we compare by
// putting the hits from A & B into maps.
hitsA := hitsById(resA)
hitsB := hitsById(resB)
if !reflect.DeepEqual(hitsA, hitsB) {
t.Errorf("search: (%d) %s,\n res hits mismatch,\n len(hitsA): %d,\n len(hitsB): %d",
i, bufBytes, len(hitsA), len(hitsB))
t.Errorf("\n hitsA: %#v,\n hitsB: %#v",
hitsA, hitsB)
for id, hitA := range hitsA {
hitB := hitsB[id]
if !reflect.DeepEqual(hitA, hitB) {
t.Errorf("\n hitA: %#v,\n hitB: %#v", hitA, hitB)
idx, _ := strconv.Atoi(id)
t.Errorf("\n body: %s", strings.Join(vt.Bodies[idx], " "))
}
}
}
resA.Hits = nil
resB.Hits = nil
if !reflect.DeepEqual(resA, resB) {
resAj, _ := json.Marshal(resA)
resBj, _ := json.Marshal(resB)
t.Errorf("search: (%d) %s,\n res mismatch,\n resA: %s,\n resB: %s",
i, bufBytes, resAj, resBj)
}
if vt.Verbose > 0 {
fmt.Printf(" Total: (%t) %d\n", resA.Total == resB.Total, resA.Total)
}
vt.TotAttempts++
}
}
}
// Organizes the hits into a map keyed by id.
func hitsById(res *bleve.SearchResult) map[string]*search.DocumentMatch {
rv := make(map[string]*search.DocumentMatch, len(res.Hits))
for _, hit := range res.Hits {
// Clear out or truncate precision of hit fields that might be
// different across different indexer implementations.
hit.Index = ""
hit.Score = math.Trunc(hit.Score*1000.0) / 1000.0
hit.IndexInternalID = nil
hit.HitNumber = 0
rv[hit.ID] = hit
}
return rv
}
// -------------------------------------------------------
func (vt *VersusTest) run(indexTypeA, kvStoreA, indexTypeB, kvStoreB string,
cb func(versusTest *VersusTest, idxA, idxB bleve.Index)) {
if cb == nil {
cb = testVersusSearches
}
if vt.Verbose <= 0 {
vt.Verbose, _ = strconv.Atoi(os.Getenv("VERBOSE"))
}
dirA := "/tmp/bleve-versus-test-a"
dirB := "/tmp/bleve-versus-test-b"
defer func() {
_ = os.RemoveAll(dirA)
_ = os.RemoveAll(dirB)
}()
_ = os.RemoveAll(dirA)
_ = os.RemoveAll(dirB)
imA := vt.makeIndexMapping()
imB := vt.makeIndexMapping()
kvConfigA := map[string]interface{}{}
kvConfigB := map[string]interface{}{}
idxA, err := bleve.NewUsing(dirA, imA, indexTypeA, kvStoreA, kvConfigA)
if err != nil || idxA == nil {
vt.t.Fatalf("new using err: %v", err)
}
defer func() { _ = idxA.Close() }()
idxB, err := bleve.NewUsing(dirB, imB, indexTypeB, kvStoreB, kvConfigB)
if err != nil || idxB == nil {
vt.t.Fatalf("new using err: %v", err)
}
defer func() { _ = idxB.Close() }()
rand.Seed(0)
vt.Bodies = vt.genBodies()
vt.insertBodies(idxA)
vt.insertBodies(idxB)
cb(vt, idxA, idxB)
}
// -------------------------------------------------------
func (vt *VersusTest) makeIndexMapping() mapping.IndexMapping {
standardFM := bleve.NewTextFieldMapping()
standardFM.Store = false
standardFM.IncludeInAll = false
standardFM.IncludeTermVectors = true
standardFM.Analyzer = "standard"
dm := bleve.NewDocumentMapping()
dm.AddFieldMappingsAt("title", standardFM)
dm.AddFieldMappingsAt("body", standardFM)
im := bleve.NewIndexMapping()
im.DefaultMapping = dm
im.DefaultAnalyzer = "standard"
return im
}
func (vt *VersusTest) insertBodies(idx bleve.Index) {
batch := idx.NewBatch()
for i, bodyWords := range vt.Bodies {
title := fmt.Sprintf("%d", i)
body := strings.Join(bodyWords, " ")
err := batch.Index(title, map[string]interface{}{"title": title, "body": body})
if err != nil {
vt.t.Fatalf("batch.Index err: %v", err)
}
if i%vt.BatchSize == 0 {
err = idx.Batch(batch)
if err != nil {
vt.t.Fatalf("batch err: %v", err)
}
batch.Reset()
}
}
err := idx.Batch(batch)
if err != nil {
vt.t.Fatalf("last batch err: %v", err)
}
}
func (vt *VersusTest) genBodies() (rv [][]string) {
for i := 0; i < vt.NumDocs; i++ {
rv = append(rv, vt.genBody())
}
return rv
}
func (vt *VersusTest) genBody() (rv []string) {
m := rand.Intn(vt.MaxWordsPerDoc)
for j := 0; j < m; j++ {
rv = append(rv, vt.genWord(rand.Intn(vt.NumWords)))
}
return rv
}
func (vt *VersusTest) genWord(i int) string {
return fmt.Sprintf("%x", i)
}