0
0

Merge pull request #520 from mschoch/faster_regexp

improve performance of regular expression and wildcard queries
This commit is contained in:
Marty Schoch 2017-01-18 16:31:49 -05:00 committed by GitHub
commit f94a790156
3 changed files with 17 additions and 9 deletions

View File

@ -33,7 +33,9 @@ type RegexpQuery struct {
// NewRegexpQuery creates a new Query which finds
// documents containing terms that match the
// specified regular expression.
// specified regular expression. The regexp pattern
// SHOULD NOT include ^ or $ modifiers, the search
// will only match entire terms even without them.
func NewRegexpQuery(regexp string) *RegexpQuery {
return &RegexpQuery{
Regexp: regexp,
@ -76,14 +78,14 @@ func (q *RegexpQuery) Validate() error {
func (q *RegexpQuery) compile() error {
if q.compiled == nil {
// require that pattern be anchored to start and end of term
// require that pattern NOT be anchored to start and end of term
actualRegexp := q.Regexp
if !strings.HasPrefix(actualRegexp, "^") {
actualRegexp = "^" + actualRegexp
}
if !strings.HasSuffix(actualRegexp, "$") {
actualRegexp = actualRegexp + "$"
if strings.HasPrefix(actualRegexp, "^") {
actualRegexp = actualRegexp[1:] // remove leading ^
}
// do not attempt to remove trailing $, its presence is not
// known to interfere with LiteralPrefix() the way ^ does
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
var err error
q.compiled, err = regexp.Compile(actualRegexp)
if err != nil {

View File

@ -101,6 +101,6 @@ func (q *WildcardQuery) Validate() error {
}
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
regexpString := "^" + wildcardRegexpReplacer.Replace(q.Wildcard) + "$"
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
return regexp.Compile(regexpString)
}

View File

@ -21,6 +21,11 @@ import (
"github.com/blevesearch/bleve/search"
)
// NewRegexpSearcher creates a searcher which will match documents that
// contain terms which match the pattern regexp. The match must be EXACT
// matching the entire term. The provided regexp SHOULD NOT start with ^
// or end with $ as this can interfere with the implementation. Separately,
// matches will be checked to ensure they match the entire term.
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) {
prefixTerm, complete := pattern.LiteralPrefix()
@ -79,7 +84,8 @@ func findRegexpCandidateTerms(indexReader index.IndexReader, pattern *regexp.Reg
// enumerate the terms and check against regexp
tfd, err := fieldDict.Next()
for err == nil && tfd != nil {
if pattern.MatchString(tfd.Term) {
matchPos := pattern.FindStringIndex(tfd.Term)
if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) {
rv = append(rv, tfd.Term)
if tooManyClauses(len(rv)) {
return rv, tooManyClausesErr()