0
0
Fork 0

Fix phrase / match-phrase search involving stop words

closes #122
This commit is contained in:
Marty Schoch 2014-11-25 10:07:54 -05:00
parent 316970df13
commit 67beaca6d6
8 changed files with 426 additions and 29 deletions

View File

@ -107,7 +107,7 @@ func ParseQuery(input []byte) (Query, error) {
if rv.Boost() == 0 {
rv.SetBoost(1)
}
for _, tq := range rv.Terms {
for _, tq := range rv.TermQueries {
if tq.Boost() == 0 {
tq.SetBoost(1)
}

View File

@ -12,6 +12,7 @@ package bleve
import (
"fmt"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
)
@ -56,7 +57,6 @@ func (q *matchPhraseQuery) SetField(f string) Query {
}
func (q *matchPhraseQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
field := q.FieldVal
if q.FieldVal == "" {
field = m.DefaultField
@ -75,18 +75,40 @@ func (q *matchPhraseQuery) Searcher(i index.IndexReader, m *IndexMapping, explai
tokens := analyzer.Analyze([]byte(q.MatchPhrase))
if len(tokens) > 0 {
ts := make([]string, len(tokens))
for i, token := range tokens {
ts[i] = string(token.Term)
}
phraseQuery := NewPhraseQuery(ts, field).SetBoost(q.BoostVal)
phrase := tokenStreamToPhrase(tokens)
phraseQuery := NewPhraseQuery(phrase, field).SetBoost(q.BoostVal)
return phraseQuery.Searcher(i, m, explain)
}
noneQuery := NewMatchNoneQuery()
return noneQuery.Searcher(i, m, explain)
}
// tokenStreamToPhrase converts an analyzed token stream into a positional
// slice of terms suitable for NewPhraseQuery. Positions that the analyzer
// removed (e.g. stop words) are left as empty strings, so phrase matching
// can still honor the original term offsets.
//
// Returns nil when the token stream is empty.
func tokenStreamToPhrase(tokens analysis.TokenStream) []string {
	// Sentinel: max int, so any real position becomes the new minimum.
	firstPosition := int(^uint(0) >> 1)
	lastPosition := 0
	for _, token := range tokens {
		if token.Position < firstPosition {
			firstPosition = token.Position
		}
		if token.Position > lastPosition {
			lastPosition = token.Position
		}
	}
	phraseLen := lastPosition - firstPosition + 1
	if phraseLen > 0 {
		// make() zero-initializes every slot to "", so gaps left by
		// removed tokens (stop words) read as empty strings with no
		// explicit fill loop needed.
		rv := make([]string, phraseLen)
		for _, token := range tokens {
			rv[token.Position-firstPosition] = string(token.Term)
		}
		return rv
	}
	return nil
}
// Validate always succeeds: a match phrase query has no parameters that
// require validation here. An empty analyzed phrase is handled at search
// time by falling back to a match-none searcher.
func (q *matchPhraseQuery) Validate() error {
return nil
}

View File

@ -19,8 +19,9 @@ import (
)
type phraseQuery struct {
Terms []Query `json:"terms"`
BoostVal float64 `json:"boost,omitempty"`
TermQueries []Query `json:"terms"`
BoostVal float64 `json:"boost,omitempty"`
terms []string
}
// NewPhraseQuery creates a new Query for finding
@ -29,13 +30,16 @@ type phraseQuery struct {
// order, at the correct index offsets, in the
// specified field.
func NewPhraseQuery(terms []string, field string) *phraseQuery {
termQueries := make([]Query, len(terms))
for i, term := range terms {
termQueries[i] = NewTermQuery(term).SetField(field)
termQueries := make([]Query, 0)
for _, term := range terms {
if term != "" {
termQueries = append(termQueries, NewTermQuery(term).SetField(field))
}
}
return &phraseQuery{
Terms: termQueries,
BoostVal: 1.0,
TermQueries: termQueries,
BoostVal: 1.0,
terms: terms,
}
}
@ -50,21 +54,16 @@ func (q *phraseQuery) SetBoost(b float64) Query {
func (q *phraseQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
terms := make([]string, len(q.Terms))
for i, term := range q.Terms {
terms[i] = term.(*termQuery).Term
}
conjunctionQuery := NewConjunctionQuery(q.Terms)
conjunctionQuery := NewConjunctionQuery(q.TermQueries)
conjunctionSearcher, err := conjunctionQuery.Searcher(i, m, explain)
if err != nil {
return nil, err
}
return searchers.NewPhraseSearcher(i, conjunctionSearcher.(*searchers.ConjunctionSearcher), terms)
return searchers.NewPhraseSearcher(i, conjunctionSearcher.(*searchers.ConjunctionSearcher), q.terms)
}
func (q *phraseQuery) Validate() error {
if len(q.Terms) < 1 {
if len(q.TermQueries) < 1 {
return ErrorPhraseQueryNoTerms
}
return nil
@ -79,17 +78,19 @@ func (q *phraseQuery) UnmarshalJSON(data []byte) error {
if err != nil {
return err
}
q.Terms = make([]Query, len(tmp.Terms))
q.TermQueries = make([]Query, len(tmp.Terms))
q.terms = make([]string, 0)
for i, term := range tmp.Terms {
query, err := ParseQuery(term)
if err != nil {
return err
}
q.Terms[i] = query
_, isTermQuery := query.(*termQuery)
q.TermQueries[i] = query
tq, isTermQuery := query.(*termQuery)
if !isTermQuery {
return fmt.Errorf("phrase query can only contain term queries")
}
q.terms = append(q.terms, tq.Term)
}
q.BoostVal = tmp.BoostVal
if q.BoostVal == 0 {

View File

@ -27,7 +27,6 @@ type PhraseSearcher struct {
}
func NewPhraseSearcher(indexReader index.IndexReader, mustSearcher *ConjunctionSearcher, terms []string) (*PhraseSearcher, error) {
// build our searcher
rv := PhraseSearcher{
indexReader: indexReader,
@ -112,7 +111,7 @@ func (s *PhraseSearcher) Next() (*search.DocumentMatch, error) {
for _, location := range locations {
crvtlm := make(search.TermLocationMap, 0)
INNER:
for i := 0; i < len(s.mustSearcher.searchers); i++ {
for i := 0; i < len(s.terms); i++ {
nextTerm := s.terms[i]
if nextTerm != "" {
// look through all this terms locations

View File

@ -11,22 +11,43 @@ package test
import (
"encoding/json"
"flag"
"io/ioutil"
"os"
"path/filepath"
"reflect"
"regexp"
"testing"
"github.com/blevesearch/bleve"
)
var dataset = flag.String("dataset", "", "only test datasets matching this regex")
var keepIndex = flag.Bool("keepIndex", false, "keep the index after testing")
func TestIntegration(t *testing.T) {
flag.Parse()
var err error
var datasetRegexp *regexp.Regexp
if *dataset != "" {
datasetRegexp, err = regexp.Compile(*dataset)
if err != nil {
t.Fatal(err)
}
}
fis, err := ioutil.ReadDir("tests")
if err != nil {
t.Fatal(err)
}
for _, fi := range fis {
if datasetRegexp != nil {
if !datasetRegexp.MatchString(fi.Name()) {
continue
}
}
if fi.IsDir() {
t.Logf("Running test: %s", fi.Name())
runTestDir(t, "tests"+string(filepath.Separator)+fi.Name())
@ -49,7 +70,9 @@ func runTestDir(t *testing.T, dir string) {
}
// open new index
defer os.RemoveAll("test.bleve")
if !*keepIndex {
defer os.RemoveAll("test.bleve")
}
index, err := bleve.New("test.bleve", &mapping)
if err != nil {
t.Errorf("error creating new index: %v", err)

View File

@ -0,0 +1,3 @@
{
"body": "Twenty Thousand Leagues Under The Sea"
}

View File

@ -0,0 +1,23 @@
{
"types": {
"book": {
"properties": {
"body": {
"fields": [
{
"include_term_vectors": true,
"include_in_all": true,
"index": true,
"store": true,
"analyzer": "en",
"type": "text"
}
],
"dynamic": true,
"enabled": true
}
}
}
},
"default_type": "book"
}

View File

@ -0,0 +1,326 @@
[
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Twenty"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Twenty Thousand"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Twenty Thousand Leagues"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Twenty Thousand Leagues Under"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Twenty Thousand Leagues Under the"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Twenty Thousand Leagues Under the Sea"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Thousand"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Thousand Leagues"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Thousand Leagues Under"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Thousand Leagues Under the"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Thousand Leagues Under the Sea"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Leagues"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Leagues Under"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Leagues Under the"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Leagues Under the Sea"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Under the Sea"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "the Sea"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
},
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "body",
"match_phrase": "Sea"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "a"
}
]
}
}
]