0
0
Fork 0

added phrase search

This commit is contained in:
Marty Schoch 2014-07-03 14:54:50 -04:00
parent ef690e0ead
commit cc77b074fe
5 changed files with 332 additions and 5 deletions

View File

@ -28,32 +28,33 @@ func init() {
}
// Mock data used in many tests in this package.
//
// twoDocIndexDescIndexingOptions is the set of indexing options given to
// every "desc" field of the mock documents: the default text indexing
// options combined with term vectors.
// NOTE(review): term vectors are presumably what makes term positions
// available to the phrase searcher - confirm against the document package.
var twoDocIndexDescIndexingOptions = document.DEFAULT_TEXT_INDEXING_OPTIONS | document.INCLUDE_TERM_VECTORS
var twoDocIndexDocs = []*document.Document{
// must have 4/4 beer
document.NewDocument("1").
AddField(document.NewTextField("name", []byte("marty"))).
AddField(document.NewTextField("desc", []byte("beer beer beer beer"))).
AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("beer beer beer beer"), twoDocIndexDescIndexingOptions)).
AddField(document.NewTextField("street", []byte("couchbase way"))),
// must have 1/4 beer
document.NewDocument("2").
AddField(document.NewTextField("name", []byte("steve"))).
AddField(document.NewTextField("desc", []byte("angst beer couch database"))).
AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("angst beer couch database"), twoDocIndexDescIndexingOptions)).
AddField(document.NewTextField("street", []byte("couchbase way"))).
AddField(document.NewTextField("title", []byte("mister"))),
// must have 1/4 beer
document.NewDocument("3").
AddField(document.NewTextField("name", []byte("dustin"))).
AddField(document.NewTextField("desc", []byte("apple beer column dank"))).
AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("apple beer column dank"), twoDocIndexDescIndexingOptions)).
AddField(document.NewTextField("title", []byte("mister"))),
// must have 65/65 beer
document.NewDocument("4").
AddField(document.NewTextField("name", []byte("ravi"))).
AddField(document.NewTextField("desc", []byte("beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer"))),
AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer beer"), twoDocIndexDescIndexingOptions)),
// must have 0/x beer
document.NewDocument("5").
AddField(document.NewTextField("name", []byte("bobert"))).
AddField(document.NewTextField("desc", []byte("water"))).
AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("water"), twoDocIndexDescIndexingOptions)).
AddField(document.NewTextField("title", []byte("mister"))),
}

View File

@ -9,6 +9,9 @@
package search
import (
"encoding/json"
"fmt"
"github.com/couchbaselabs/bleve/index"
)
@ -17,3 +20,41 @@ type Query interface {
Searcher(index index.Index) (Searcher, error)
Validate() error
}
// ParseQuery decodes a JSON-encoded query and returns the matching Query
// implementation.
//
// The concrete type is selected from the top-level keys of the JSON object,
// checked in this order:
//   - "term" selects a TermQuery
//   - "must", "should" or "must_not" selects a TermBooleanQuery
//   - "terms" selects a PhraseQuery
//
// An error is returned when the input cannot be decoded as a JSON object,
// when it cannot be decoded into the selected type, or when none of the keys
// above is present.
func ParseQuery(input []byte) (Query, error) {
	// First pass: decode generically, only to learn which keys are present.
	var keys map[string]interface{}
	if err := json.Unmarshal(input, &keys); err != nil {
		return nil, err
	}
	has := func(key string) bool {
		_, present := keys[key]
		return present
	}

	// Second pass: decode the same bytes into the concrete query type.
	switch {
	case has("term"):
		var termQuery *TermQuery
		if err := json.Unmarshal(input, &termQuery); err != nil {
			return nil, err
		}
		return termQuery, nil
	case has("must") || has("should") || has("must_not"):
		var booleanQuery *TermBooleanQuery
		if err := json.Unmarshal(input, &booleanQuery); err != nil {
			return nil, err
		}
		return booleanQuery, nil
	case has("terms"):
		var phraseQuery *PhraseQuery
		if err := json.Unmarshal(input, &phraseQuery); err != nil {
			return nil, err
		}
		return phraseQuery, nil
	}
	return nil, fmt.Errorf("Unrecognized query")
}

29
search/query_phrase.go Normal file
View File

@ -0,0 +1,29 @@
package search
import (
"fmt"
"github.com/couchbaselabs/bleve/index"
)
// PhraseQuery describes a search for a sequence of terms.
//
// NOTE(review): judging by PhraseSearcher.Next, the terms are expected at
// consecutive positions inside the field of the first term - confirm.
type PhraseQuery struct {
	// Terms holds the term queries making up the phrase, in phrase order.
	Terms []*TermQuery `json:"terms,omitempty"`
	// PhrasePositions is not read by any code visible in this file.
	// TODO(review): document its meaning or remove it.
	PhrasePositions map[string]float64 `json:"phrase_positions,omitempty"`
	// BoostVal is the value reported by Boost.
	BoostVal float64 `json:"boost,omitempty"`
	// Explain is copied onto the conjunction query built by NewPhraseSearcher.
	Explain bool `json:"explain,omitempty"`
}
// Boost returns the boost value stored on the query (BoostVal).
func (q *PhraseQuery) Boost() float64 {
	return q.BoostVal
}
// Searcher builds a PhraseSearcher that executes this query against index.
//
// On failure it returns a nil Searcher together with the error. The result of
// NewPhraseSearcher is deliberately not returned directly: a nil
// *PhraseSearcher converted to the Searcher interface would compare as
// non-nil, so callers checking the returned Searcher against nil (or calling
// Close on it) would be misled.
func (q *PhraseQuery) Searcher(index index.Index) (Searcher, error) {
	searcher, err := NewPhraseSearcher(index, q)
	if err != nil {
		return nil, err
	}
	return searcher, nil
}
// Validate reports whether the query can be executed.
//
// It returns an error when the query has no terms at all. An empty but
// non-nil Terms slice is rejected as well as a nil one: the phrase searcher
// indexes Terms[0], so a query without terms must never reach it.
func (q *PhraseQuery) Validate() error {
	if len(q.Terms) == 0 {
		return fmt.Errorf("Phrase query must contain at least one term")
	}
	return nil
}

178
search/search_phrase.go Normal file
View File

@ -0,0 +1,178 @@
package search
import (
"math"
"github.com/couchbaselabs/bleve/index"
)
// PhraseSearcher executes a PhraseQuery. It wraps a TermConjunctionSearcher
// over the query's terms and filters that searcher's matches by term
// position (see Next).
type PhraseSearcher struct {
	// query is the phrase query being executed.
	query *PhraseQuery
	// index is the index handed to NewPhraseSearcher.
	index index.Index
	// mustSearcher is the conjunction searcher over all query terms. It is
	// nil when the query was built with nil Terms.
	mustSearcher *TermConjunctionSearcher
	// queryNorm is the normalization factor computed by computeQueryNorm.
	queryNorm float64
	// currMust is the latest match fetched from mustSearcher that Next has
	// not examined yet; nil once mustSearcher has no more matches.
	currMust *DocumentMatch
	// slop is not referenced by any code visible in this file.
	// TODO(review): presumably reserved for sloppy phrase matching - confirm.
	slop int
}
// NewPhraseSearcher creates a searcher for query over index.
//
// When the query has a non-nil Terms slice, a TermConjunctionSearcher over
// those terms (boost 1.0, Explain inherited from the query) is created as the
// downstream searcher; otherwise the phrase searcher has no downstream
// searcher. The query norm is then computed and the downstream searcher is
// positioned on its first match. Any error from building or positioning the
// downstream searcher is returned.
func NewPhraseSearcher(index index.Index, query *PhraseQuery) (*PhraseSearcher, error) {
	searcher := &PhraseSearcher{
		index: index,
		query: query,
	}

	// build the downstream searcher
	if query.Terms != nil {
		conjunctionTerms := make([]Query, 0, len(query.Terms))
		for _, term := range query.Terms {
			conjunctionTerms = append(conjunctionTerms, term)
		}
		conjunction, err := NewTermConjunctionSearcher(index, &TermConjunctionQuery{
			Terms:    conjunctionTerms,
			BoostVal: 1.0,
			Explain:  query.Explain,
		})
		if err != nil {
			return nil, err
		}
		searcher.mustSearcher = conjunction
	}

	searcher.computeQueryNorm()
	if err := searcher.initSearchers(); err != nil {
		return nil, err
	}
	return searcher, nil
}
// computeQueryNorm derives the query norm from the downstream searcher's
// weight, stores it in s.queryNorm and pushes it down to that searcher.
//
// The norm is 1/sqrt(sum of squared weights), where the sum is whatever the
// downstream searcher reports as its weight. Without a downstream searcher
// the sum is zero and nothing is pushed down.
func (s *PhraseSearcher) computeQueryNorm() {
	downstream := s.mustSearcher

	// accumulate the sum of squared weights
	total := 0.0
	if downstream != nil {
		total += downstream.Weight()
	}

	// turn the sum into the query norm
	s.queryNorm = 1.0 / math.Sqrt(total)

	// share the norm with the downstream searcher, if there is one
	if downstream == nil {
		return
	}
	downstream.SetQueryNorm(s.queryNorm)
}
// initSearchers positions the downstream searcher on its first match and
// records that match in s.currMust. It is a no-op when there is no
// downstream searcher. The error from the downstream searcher, if any, is
// returned.
func (s *PhraseSearcher) initSearchers() error {
	if s.mustSearcher == nil {
		return nil
	}
	first, err := s.mustSearcher.Next()
	// currMust is updated even when err is non-nil
	s.currMust = first
	return err
}
// advanceNextMust moves the downstream searcher to its next match and stores
// that match in s.currMust. Without a downstream searcher it does nothing.
// The downstream searcher's error, if any, is returned.
func (s *PhraseSearcher) advanceNextMust() error {
	downstream := s.mustSearcher
	if downstream == nil {
		return nil
	}
	match, err := downstream.Next()
	// the stored match is replaced regardless of err
	s.currMust = match
	if err != nil {
		return err
	}
	return nil
}
// Weight returns the weight of the downstream conjunction searcher.
//
// A phrase searcher built from a query with nil Terms has no downstream
// searcher; its weight is reported as 0 instead of dereferencing nil.
func (s *PhraseSearcher) Weight() float64 {
	if s.mustSearcher == nil {
		return 0
	}
	return s.mustSearcher.Weight()
}
// SetQueryNorm forwards qnorm to the downstream conjunction searcher.
//
// It is a no-op when the phrase searcher has no downstream searcher (query
// built with nil Terms), rather than dereferencing nil.
func (s *PhraseSearcher) SetQueryNorm(qnorm float64) {
	if s.mustSearcher == nil {
		return
	}
	s.mustSearcher.SetQueryNorm(qnorm)
}
// Next returns the next document in which the query terms occur as a phrase,
// or nil, nil when the downstream searcher has no more candidates.
//
// Every candidate produced by the downstream conjunction searcher is checked
// against the term locations recorded for the field of the first query term:
// starting from each location of the first term, term i of the query must
// have a location whose position equals that starting position plus i. A nil
// entry in the query's Terms is skipped and therefore matches any term at its
// position.
//
// For a matching document the Locations of the returned match are replaced
// by the locations that took part in at least one phrase occurrence.
//
// An error from advancing the downstream searcher is returned instead of
// being dropped.
func (s *PhraseSearcher) Next() (*DocumentMatch, error) {
	for s.currMust != nil {
		rvtlm := make(TermLocationMap, 0)
		freq := 0
		firstTerm := s.query.Terms[0]
		if termLocMap, ok := s.currMust.Locations[firstTerm.Field]; ok {
		OUTER:
			// every location of the first term is a potential phrase start;
			// a first term without locations yields no iterations at all
			for _, location := range termLocMap[firstTerm.Term] {
				crvtlm := make(TermLocationMap, 0)
			INNER:
				for i, nextTerm := range s.query.Terms {
					if nextTerm == nil {
						// gap in the phrase, nothing to verify here
						continue
					}
					nextLocations, ok := termLocMap[nextTerm.Term]
					if !ok {
						// the term has no locations in this field, so the
						// phrase cannot start at this location
						continue OUTER
					}
					// look through all of this term's locations to find
					// one at the expected offset
					for _, nextLocation := range nextLocations {
						if nextLocation.Pos == location.Pos+float64(i) {
							// found a location match for this term
							crvtlm.AddLocation(nextTerm.Term, nextLocation)
							continue INNER
						}
					}
					// no location of this term sits at the expected offset
					continue OUTER
				}
				// all the terms matched from this starting location
				freq++
				mergeTermLocationMaps(rvtlm, crvtlm)
			}
		}

		if freq > 0 {
			// return match, after moving on to the next candidate
			rv := s.currMust
			rv.Locations = FieldTermLocationMap{
				firstTerm.Field: rvtlm,
			}
			if err := s.advanceNextMust(); err != nil {
				return nil, err
			}
			return rv, nil
		}

		if err := s.advanceNextMust(); err != nil {
			return nil, err
		}
	}
	return nil, nil
}
// Advance moves the downstream searcher forward to ID and returns the first
// phrase match found from there on (see Next).
//
// The candidate returned by the downstream searcher's Advance becomes the
// current candidate; without that, Next would go on examining the candidate
// fetched before the call and could return a document that precedes ID.
// Errors from the downstream searcher are returned. Without a downstream
// searcher (query built with nil Terms) there is nothing to match and
// nil, nil is returned.
func (s *PhraseSearcher) Advance(ID string) (*DocumentMatch, error) {
	if s.mustSearcher == nil {
		return nil, nil
	}
	var err error
	s.currMust, err = s.mustSearcher.Advance(ID)
	if err != nil {
		return nil, err
	}
	return s.Next()
}
// Count returns an upper bound on the number of matches: the count reported
// by the downstream conjunction searcher. For now this is the worst case,
// since documents containing all terms but not as a phrase are included.
//
// Without a downstream searcher (query built with nil Terms) the count is 0.
func (s *PhraseSearcher) Count() uint64 {
	if s.mustSearcher == nil {
		return 0
	}
	return s.mustSearcher.Count()
}
// Close closes the downstream conjunction searcher, if there is one.
func (s *PhraseSearcher) Close() {
	downstream := s.mustSearcher
	if downstream == nil {
		return
	}
	downstream.Close()
}

View File

@ -0,0 +1,78 @@
// Copyright (c) 2013 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package search
import (
"testing"
"github.com/couchbaselabs/bleve/index"
)
// TestPhraseSearch runs phrase queries against the shared mock index and
// checks the IDs and scores of the returned documents, in order.
func TestPhraseSearch(t *testing.T) {
	tests := []struct {
		index   index.Index
		query   Query
		results []*DocumentMatch
	}{
		{
			index: twoDocIndex,
			query: &PhraseQuery{
				Terms: []*TermQuery{
					&TermQuery{
						Term:     "angst",
						Field:    "desc",
						BoostVal: 1.0,
						Explain:  true,
					},
					&TermQuery{
						Term:     "beer",
						Field:    "desc",
						BoostVal: 1.0,
						Explain:  true,
					},
				},
				Explain: true,
			},
			results: []*DocumentMatch{
				&DocumentMatch{
					ID:    "2",
					Score: 1.0807601687084403,
				},
			},
		},
	}

	for testIndex, test := range tests {
		// Each case runs in its own function so that the deferred Close
		// fires at the end of the case, not at the end of the whole test.
		func() {
			searcher, err := test.query.Searcher(test.index)
			if err != nil {
				// without this check the error would be overwritten by the
				// first call to Next below
				t.Fatalf("error creating searcher: %v for test %d", err, testIndex)
			}
			defer searcher.Close()

			next, err := searcher.Next()
			i := 0
			for err == nil && next != nil {
				if i < len(test.results) {
					if next.ID != test.results[i].ID {
						t.Errorf("expected result %d to have id %s got %s for test %d", i, test.results[i].ID, next.ID, testIndex)
					}
					if next.Score != test.results[i].Score {
						t.Errorf("expected result %d to have score %v got %v for test %d", i, test.results[i].Score, next.Score, testIndex)
						t.Logf("scoring explanation: %s", next.Expl)
					}
				}
				next, err = searcher.Next()
				i++
			}
			if err != nil {
				t.Fatalf("error iterating searcher: %v for test %d", err, testIndex)
			}
			if len(test.results) != i {
				t.Errorf("expected %d results got %d for test %d", len(test.results), i, testIndex)
			}
		}()
	}
}