Merge pull request #520 from mschoch/faster_regexp
improve performance of regular expression and wildcard queries
This commit is contained in:
commit
f94a790156
|
@ -33,7 +33,9 @@ type RegexpQuery struct {
|
|||
|
||||
// NewRegexpQuery creates a new Query which finds
|
||||
// documents containing terms that match the
|
||||
// specified regular expression.
|
||||
// specified regular expression. The regexp pattern
|
||||
// SHOULD NOT include ^ or $ modifiers, the search
|
||||
// will only match entire terms even without them.
|
||||
func NewRegexpQuery(regexp string) *RegexpQuery {
|
||||
return &RegexpQuery{
|
||||
Regexp: regexp,
|
||||
|
@ -76,14 +78,14 @@ func (q *RegexpQuery) Validate() error {
|
|||
|
||||
func (q *RegexpQuery) compile() error {
|
||||
if q.compiled == nil {
|
||||
// require that pattern be anchored to start and end of term
|
||||
// require that pattern NOT be anchored to start and end of term
|
||||
actualRegexp := q.Regexp
|
||||
if !strings.HasPrefix(actualRegexp, "^") {
|
||||
actualRegexp = "^" + actualRegexp
|
||||
}
|
||||
if !strings.HasSuffix(actualRegexp, "$") {
|
||||
actualRegexp = actualRegexp + "$"
|
||||
if strings.HasPrefix(actualRegexp, "^") {
|
||||
actualRegexp = actualRegexp[1:] // remove leading ^
|
||||
}
|
||||
// do not attempt to remove trailing $, its presence is not
|
||||
// known to interfere with LiteralPrefix() the way ^ does
|
||||
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
|
||||
var err error
|
||||
q.compiled, err = regexp.Compile(actualRegexp)
|
||||
if err != nil {
|
||||
|
|
|
@ -101,6 +101,6 @@ func (q *WildcardQuery) Validate() error {
|
|||
}
|
||||
|
||||
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
|
||||
regexpString := "^" + wildcardRegexpReplacer.Replace(q.Wildcard) + "$"
|
||||
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
|
||||
return regexp.Compile(regexpString)
|
||||
}
|
||||
|
|
|
@ -21,6 +21,11 @@ import (
|
|||
"github.com/blevesearch/bleve/search"
|
||||
)
|
||||
|
||||
// NewRegexpSearcher creates a searcher which will match documents that
|
||||
// contain terms which match the pattern regexp. The match must be EXACT
|
||||
// matching the entire term. The provided regexp SHOULD NOT start with ^
|
||||
// or end with $ as this can interfere with the implementation. Separately,
|
||||
// matches will be checked to ensure they match the entire term.
|
||||
func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) {
|
||||
|
||||
prefixTerm, complete := pattern.LiteralPrefix()
|
||||
|
@ -79,7 +84,8 @@ func findRegexpCandidateTerms(indexReader index.IndexReader, pattern *regexp.Reg
|
|||
// enumerate the terms and check against regexp
|
||||
tfd, err := fieldDict.Next()
|
||||
for err == nil && tfd != nil {
|
||||
if pattern.MatchString(tfd.Term) {
|
||||
matchPos := pattern.FindStringIndex(tfd.Term)
|
||||
if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) {
|
||||
rv = append(rv, tfd.Term)
|
||||
if tooManyClauses(len(rv)) {
|
||||
return rv, tooManyClausesErr()
|
||||
|
|
Loading…
Reference in New Issue
Block a user