0
0
Fork 0

added regexp and wildcard queries

fixes #152
This commit is contained in:
Marty Schoch 2015-03-11 16:57:22 -04:00
parent 183fcd4b14
commit a41f229b14
5 changed files with 403 additions and 0 deletions

View File

@ -189,5 +189,23 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, hasRegexp := tmp["regexp"]
if hasRegexp {
var rv regexpQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, hasWildcard := tmp["wildcard"]
if hasWildcard {
var rv wildcardQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
return nil, ErrorUnknownQueryType
}

75
query_regexp.go Normal file
View File

@ -0,0 +1,75 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bleve
import (
"regexp"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/searchers"
)
// regexpQuery is a Query that matches terms against a regular expression.
type regexpQuery struct {
	// Regexp is the pattern source; it is (de)serialized under the "regexp" key.
	Regexp string `json:"regexp"`
	// FieldVal names the field to search; serialized as "field" and omitted when empty.
	FieldVal string `json:"field,omitempty"`
	// BoostVal is the query boost; serialized as "boost" and omitted when zero.
	BoostVal float64 `json:"boost,omitempty"`
	// compiled caches the compiled form of Regexp. It is unexported, so it
	// never takes part in JSON encoding or decoding.
	compiled *regexp.Regexp
}
// NewRegexpQuery returns a Query that finds documents containing
// terms matching the given regular expression. The returned query
// has a boost of 1.0 and no field set; the expression is not
// compiled here.
func NewRegexpQuery(regexp string) *regexpQuery {
	q := new(regexpQuery)
	q.Regexp = regexp
	q.BoostVal = 1.0
	return q
}
// Boost returns the boost value currently stored on the query.
func (q *regexpQuery) Boost() float64 {
	return q.BoostVal
}
// SetBoost stores b as the query's boost and returns the same query
// so that calls can be chained.
func (q *regexpQuery) SetBoost(b float64) Query {
	q.BoostVal = b
	return q
}
// Field returns the field name stored on the query; it is empty when
// no field has been set.
func (q *regexpQuery) Field() string {
	return q.FieldVal
}
// SetField stores f as the field to search and returns the same query
// so that calls can be chained.
func (q *regexpQuery) SetField(f string) Query {
	q.FieldVal = f
	return q
}
// Searcher builds the search.Searcher that executes this query against i.
//
// When the query has no field set, m.DefaultField is searched instead.
// The pattern is compiled on first use and cached on the query, so a
// previous successful Validate call is reused here. A compile failure or
// a failure to build the underlying regexp searcher is returned as the
// error, and in that case the returned search.Searcher is a true nil
// interface value.
func (q *regexpQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
	field := q.FieldVal
	if field == "" {
		field = m.DefaultField
	}

	if q.compiled == nil {
		compiled, err := regexp.Compile(q.Regexp)
		if err != nil {
			return nil, err
		}
		q.compiled = compiled
	}

	searcher, err := searchers.NewRegexpSearcher(i, q.compiled, field, q.BoostVal, explain)
	if err != nil {
		// Returning the constructor's results directly would wrap its nil
		// *RegexpSearcher in a non-nil interface; return a literal nil.
		return nil, err
	}
	return searcher, nil
}
// Validate compiles the query's regular expression and returns the
// compile error, if any. The result of the compilation is stored on
// the query (nil when compilation fails).
func (q *regexpQuery) Validate() error {
	compiled, err := regexp.Compile(q.Regexp)
	q.compiled = compiled
	return err
}

102
query_wildcard.go Normal file
View File

@ -0,0 +1,102 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bleve
import (
"regexp"
"strings"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
"github.com/blevesearch/bleve/search/searchers"
)
// wildcardRegexpReplacer rewrites a wildcard pattern into regular
// expression syntax in a single pass.
var wildcardRegexpReplacer = strings.NewReplacer(
	// regexp metacharacters that carry no meaning in a wildcard
	// pattern and therefore must be matched literally
	`+`, `\+`,
	`(`, `\(`,
	`)`, `\)`,
	`^`, `\^`,
	`$`, `\$`,
	`.`, `\.`,
	`{`, `\{`,
	`}`, `\}`,
	`[`, `\[`,
	`]`, `\]`,
	`|`, `\|`,
	`\`, `\\`,
	// the wildcard operators themselves
	`*`, `.*`,
	`?`, `.`,
)
// wildcardQuery is a Query that matches terms against a wildcard pattern.
type wildcardQuery struct {
	// Wildcard is the pattern source; it is (de)serialized under the "wildcard" key.
	Wildcard string `json:"wildcard"`
	// FieldVal names the field to search; serialized as "field" and omitted when empty.
	FieldVal string `json:"field,omitempty"`
	// BoostVal is the query boost; serialized as "boost" and omitted when zero.
	BoostVal float64 `json:"boost,omitempty"`
	// compiled caches the regular expression derived from Wildcard. It is
	// unexported, so it never takes part in JSON encoding or decoding.
	compiled *regexp.Regexp
}
// NewWildcardQuery returns a Query that finds documents containing
// terms matching the given wildcard pattern. Within the pattern,
// '*' matches any sequence of zero or more characters and '?'
// matches any single character. The returned query has a boost of
// 1.0 and no field set.
func NewWildcardQuery(wildcard string) *wildcardQuery {
	q := new(wildcardQuery)
	q.Wildcard = wildcard
	q.BoostVal = 1.0
	return q
}
// Boost returns the boost value currently stored on the query.
func (q *wildcardQuery) Boost() float64 {
	return q.BoostVal
}
// SetBoost stores b as the query's boost and returns the same query
// so that calls can be chained.
func (q *wildcardQuery) SetBoost(b float64) Query {
	q.BoostVal = b
	return q
}
// Field returns the field name stored on the query; it is empty when
// no field has been set.
func (q *wildcardQuery) Field() string {
	return q.FieldVal
}
// SetField stores f as the field to search and returns the same query
// so that calls can be chained.
func (q *wildcardQuery) SetField(f string) Query {
	q.FieldVal = f
	return q
}
// Searcher builds the search.Searcher that executes this query against i.
//
// When the query has no field set, m.DefaultField is searched instead.
// The wildcard is converted to a regular expression on first use and the
// result is cached on the query, so a previous successful Validate call
// is reused here. A conversion failure or a failure to build the
// underlying regexp searcher is returned as the error, and in that case
// the returned search.Searcher is a true nil interface value.
func (q *wildcardQuery) Searcher(i index.IndexReader, m *IndexMapping, explain bool) (search.Searcher, error) {
	field := q.FieldVal
	if field == "" {
		field = m.DefaultField
	}

	if q.compiled == nil {
		compiled, err := q.convertToRegexp()
		if err != nil {
			return nil, err
		}
		q.compiled = compiled
	}

	searcher, err := searchers.NewRegexpSearcher(i, q.compiled, field, q.BoostVal, explain)
	if err != nil {
		// Returning the constructor's results directly would wrap its nil
		// *RegexpSearcher in a non-nil interface; return a literal nil.
		return nil, err
	}
	return searcher, nil
}
// Validate converts the wildcard into a regular expression and returns
// the conversion error, if any. The result of the conversion is stored
// on the query.
func (q *wildcardQuery) Validate() error {
	compiled, err := q.convertToRegexp()
	q.compiled = compiled
	return err
}
// convertToRegexp translates the wildcard pattern with
// wildcardRegexpReplacer, anchors the result with '^' and '$' so the
// whole term has to match, and compiles it.
func (q *wildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
	translated := wildcardRegexpReplacer.Replace(q.Wildcard)
	return regexp.Compile("^" + translated + "$")
}

View File

@ -0,0 +1,108 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package searchers
import (
"regexp"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
)
// RegexpSearcher is a searcher whose operations are all delegated to an
// inner DisjunctionSearcher built by NewRegexpSearcher.
type RegexpSearcher struct {
	// indexReader, pattern, field and explain record the arguments the
	// searcher was constructed with.
	indexReader index.IndexReader
	pattern     *regexp.Regexp
	field       string
	explain     bool
	// searcher is the disjunction that performs the actual search.
	searcher *DisjunctionSearcher
}
// NewRegexpSearcher builds a searcher for documents whose field contains
// at least one term matched by pattern.
//
// The candidate terms are determined up front: when the pattern is a
// complete literal it is the only candidate; otherwise the field's term
// dictionary is enumerated (restricted to the pattern's literal prefix
// when it has one) and every term for which pattern.MatchString reports
// true is kept. One term searcher is created per candidate, each using
// boost, and they are combined in a DisjunctionSearcher.
//
// Any error from opening or iterating the dictionary, or from building
// the term or disjunction searchers, is returned with a nil searcher.
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, explain bool) (*RegexpSearcher, error) {
	prefixTerm, complete := pattern.LiteralPrefix()

	var candidateTerms []string
	if complete {
		// the pattern is a plain literal, so it is the only possible term
		candidateTerms = append(candidateTerms, prefixTerm)
	} else {
		var fieldDict index.FieldDict
		var err error
		if len(prefixTerm) > 0 {
			fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
		} else {
			fieldDict, err = indexReader.FieldDict(field)
		}
		// Check the lookup error before touching the dictionary; otherwise
		// it would be overwritten by the first Next call below.
		if err != nil {
			return nil, err
		}
		// NOTE(review): the dictionary is never released here; if
		// index.FieldDict exposes a Close method, confirm whether it
		// should be called once enumeration is done.

		// enumerate the terms and check each against the regexp
		tfd, err := fieldDict.Next()
		for err == nil && tfd != nil {
			if pattern.MatchString(tfd.Term) {
				candidateTerms = append(candidateTerms, tfd.Term)
			}
			tfd, err = fieldDict.Next()
		}
		if err != nil {
			return nil, err
		}
	}

	// build one term searcher per candidate, carrying the caller's boost
	qsearchers := make([]search.Searcher, 0, len(candidateTerms))
	for _, cterm := range candidateTerms {
		qsearcher, err := NewTermSearcher(indexReader, cterm, field, boost, explain)
		if err != nil {
			return nil, err
		}
		qsearchers = append(qsearchers, qsearcher)
	}

	// combine the term searchers into a single disjunction
	// (the 0 is presumably the minimum number of clauses that must
	// match; confirm against NewDisjunctionSearcher)
	searcher, err := NewDisjunctionSearcher(indexReader, qsearchers, 0, explain)
	if err != nil {
		return nil, err
	}

	return &RegexpSearcher{
		indexReader: indexReader,
		pattern:     pattern,
		field:       field,
		explain:     explain,
		searcher:    searcher,
	}, nil
}
// Count returns the count reported by the inner disjunction searcher.
func (s *RegexpSearcher) Count() uint64 {
	return s.searcher.Count()
}
// Weight returns the weight reported by the inner disjunction searcher.
func (s *RegexpSearcher) Weight() float64 {
	return s.searcher.Weight()
}
// SetQueryNorm forwards qnorm to the inner disjunction searcher.
func (s *RegexpSearcher) SetQueryNorm(qnorm float64) {
	s.searcher.SetQueryNorm(qnorm)
}
// Next returns whatever the inner disjunction searcher's Next returns.
func (s *RegexpSearcher) Next() (*search.DocumentMatch, error) {
	return s.searcher.Next()
}
// Advance forwards ID to the inner disjunction searcher's Advance and
// returns its result. Delegating to Next here would ignore ID and hand
// back the next match regardless of the requested position.
func (s *RegexpSearcher) Advance(ID string) (*search.DocumentMatch, error) {
	return s.searcher.Advance(ID)
}
// Close closes the inner disjunction searcher and returns its error.
func (s *RegexpSearcher) Close() error {
	return s.searcher.Close()
}
// Min always returns 0 for a RegexpSearcher.
func (s *RegexpSearcher) Min() int {
	return 0
}

View File

@ -0,0 +1,100 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package searchers
import (
"regexp"
"testing"
"github.com/blevesearch/bleve/search"
)
// TestRegexpSearch runs two regexp searchers against twoDocIndex and
// checks the IDs, scores and number of the matches each one produces.
func TestRegexpSearch(t *testing.T) {
	twoDocIndexReader, err := twoDocIndex.Reader()
	if err != nil {
		// nothing below can work without a reader, so stop immediately
		t.Fatal(err)
	}
	defer twoDocIndexReader.Close()

	pattern, err := regexp.Compile("ma.*")
	if err != nil {
		t.Fatal(err)
	}
	regexpSearcher, err := NewRegexpSearcher(twoDocIndexReader, pattern, "name", 1.0, true)
	if err != nil {
		t.Fatal(err)
	}

	patternCo, err := regexp.Compile("co.*")
	if err != nil {
		t.Fatal(err)
	}
	regexpSearcherCo, err := NewRegexpSearcher(twoDocIndexReader, patternCo, "desc", 1.0, true)
	if err != nil {
		t.Fatal(err)
	}

	tests := []struct {
		searcher search.Searcher
		results  []*search.DocumentMatch
	}{
		{
			searcher: regexpSearcher,
			results: []*search.DocumentMatch{
				{
					ID:    "1",
					Score: 1.916290731874155,
				},
			},
		},
		{
			searcher: regexpSearcherCo,
			results: []*search.DocumentMatch{
				{
					ID:    "2",
					Score: 0.33875554280828685,
				},
				{
					ID:    "3",
					Score: 0.33875554280828685,
				},
			},
		},
	}

	for testIndex, test := range tests {
		// the searchers are closed when the test function returns
		defer test.searcher.Close()

		next, err := test.searcher.Next()
		i := 0
		for err == nil && next != nil {
			// only compare while an expectation exists; surplus matches
			// are caught by the count check after the loop
			if i < len(test.results) {
				if next.ID != test.results[i].ID {
					t.Errorf("expected result %d to have id %s got %s for test %d", i, test.results[i].ID, next.ID, testIndex)
				}
				if next.Score != test.results[i].Score {
					t.Errorf("expected result %d to have score %v got %v for test %d", i, test.results[i].Score, next.Score, testIndex)
					t.Logf("scoring explanation: %s", next.Expl)
				}
			}
			next, err = test.searcher.Next()
			i++
		}
		if err != nil {
			t.Fatalf("error iterating searcher: %v for test %d", err, testIndex)
		}
		if len(test.results) != i {
			t.Errorf("expected %d results got %d for test %d", len(test.results), i, testIndex)
		}
	}
}