Merge pull request #520 from mschoch/faster_regexp
improve performance of regular expression and wildcard queries
This commit is contained in:
commit
f94a790156
|
@ -33,7 +33,9 @@ type RegexpQuery struct {
|
||||||
|
|
||||||
// NewRegexpQuery creates a new Query which finds
|
// NewRegexpQuery creates a new Query which finds
|
||||||
// documents containing terms that match the
|
// documents containing terms that match the
|
||||||
// specified regular expression.
|
// specified regular expression. The regexp pattern
|
||||||
|
// SHOULD NOT include ^ or $ modifiers, the search
|
||||||
|
// will only match entire terms even without them.
|
||||||
func NewRegexpQuery(regexp string) *RegexpQuery {
|
func NewRegexpQuery(regexp string) *RegexpQuery {
|
||||||
return &RegexpQuery{
|
return &RegexpQuery{
|
||||||
Regexp: regexp,
|
Regexp: regexp,
|
||||||
|
@ -76,14 +78,14 @@ func (q *RegexpQuery) Validate() error {
|
||||||
|
|
||||||
func (q *RegexpQuery) compile() error {
|
func (q *RegexpQuery) compile() error {
|
||||||
if q.compiled == nil {
|
if q.compiled == nil {
|
||||||
// require that pattern be anchored to start and end of term
|
// require that pattern NOT be anchored to start and end of term
|
||||||
actualRegexp := q.Regexp
|
actualRegexp := q.Regexp
|
||||||
if !strings.HasPrefix(actualRegexp, "^") {
|
if strings.HasPrefix(actualRegexp, "^") {
|
||||||
actualRegexp = "^" + actualRegexp
|
actualRegexp = actualRegexp[1:] // remove leading ^
|
||||||
}
|
|
||||||
if !strings.HasSuffix(actualRegexp, "$") {
|
|
||||||
actualRegexp = actualRegexp + "$"
|
|
||||||
}
|
}
|
||||||
|
// do not attempt to remove trailing $, its presence is not
|
||||||
|
// known to interfere with LiteralPrefix() the way ^ does
|
||||||
|
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
|
||||||
var err error
|
var err error
|
||||||
q.compiled, err = regexp.Compile(actualRegexp)
|
q.compiled, err = regexp.Compile(actualRegexp)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -101,6 +101,6 @@ func (q *WildcardQuery) Validate() error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
|
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
|
||||||
regexpString := "^" + wildcardRegexpReplacer.Replace(q.Wildcard) + "$"
|
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
|
||||||
return regexp.Compile(regexpString)
|
return regexp.Compile(regexpString)
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,11 @@ import (
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// NewRegexpSearcher creates a searcher which will match documents that
|
||||||
|
// contain terms which match the pattern regexp. The match must be EXACT
|
||||||
|
// matching the entire term. The provided regexp SHOULD NOT start with ^
|
||||||
|
// or end with $ as this can interfere with the implementation. Separately,
|
||||||
|
// matches will be checked to ensure they match the entire term.
|
||||||
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) {
|
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) {
|
||||||
|
|
||||||
prefixTerm, complete := pattern.LiteralPrefix()
|
prefixTerm, complete := pattern.LiteralPrefix()
|
||||||
|
@ -79,7 +84,8 @@ func findRegexpCandidateTerms(indexReader index.IndexReader, pattern *regexp.Reg
|
||||||
// enumerate the terms and check against regexp
|
// enumerate the terms and check against regexp
|
||||||
tfd, err := fieldDict.Next()
|
tfd, err := fieldDict.Next()
|
||||||
for err == nil && tfd != nil {
|
for err == nil && tfd != nil {
|
||||||
if pattern.MatchString(tfd.Term) {
|
matchPos := pattern.FindStringIndex(tfd.Term)
|
||||||
|
if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) {
|
||||||
rv = append(rv, tfd.Term)
|
rv = append(rv, tfd.Term)
|
||||||
if tooManyClauses(len(rv)) {
|
if tooManyClauses(len(rv)) {
|
||||||
return rv, tooManyClausesErr()
|
return rv, tooManyClausesErr()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user