Marty Schoch 5023993895 replaced nex lexer with custom lexer
this improvement was started to improve code coverage
but also improves performance and adds support for escaping


The following quoted string enumerates the characters which
may be escaped.

"+-=&|><!(){}[]^\"~*?:\\/ "

Note that this list includes space.

In order to escape these characters, they are prefixed with the \
(backslash) character.  In all cases, using the escaped version
produces the character itself and is not interpreted by the lexer.

Two simple examples:

my\ name

Will be interpreted as a single argument to a match query
with the value "my name".

"contains a\" character"

Will be interpreted as a single argument to a phrase query
with the value `contains a " character`.


before$ go test -v -run=xxx -bench=BenchmarkLexer
BenchmarkLexer-4   	  100000	     13991 ns/op
ok  	github.com/blevesearch/bleve	1.570s

after$ go test -v -run=xxx -bench=BenchmarkLexer
BenchmarkLexer-4   	  500000	      3387 ns/op
ok  	github.com/blevesearch/bleve	1.740s
2016-09-01 13:16:07 -04:00

318 lines
7.1 KiB

// Copyright (c) 2016 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bleve
import (
// reservedChars enumerates the characters that may be escaped with a
// leading backslash in a query string. Note that the list includes space.
const reservedChars = "+-=&|><!(){}[]^\"~*?:\\/ "

// unescape returns the text that the escape sequence `\<escaped>` stands for.
//
// If escaped contains any character from reservedChars, the backslash is
// dropped and escaped is returned unchanged. Otherwise the sequence is not a
// recognized escape and it is returned with the backslash intact.
func unescape(escaped string) string {
	// see if this character can be escaped
	if strings.ContainsAny(escaped, reservedChars) {
		return escaped
	}
	// otherwise return it with the \ intact
	return "\\" + escaped
}
type queryStringLex struct {
in *bufio.Reader
buf string
currState lexState
currConsumed bool
inEscape bool
nextToken *yySymType
nextTokenType int
seenDot bool
nextRune rune
nextRuneSize int
atEOF bool
func (l *queryStringLex) reset() {
l.buf = ""
l.inEscape = false
l.seenDot = false
func (l *queryStringLex) Error(msg string) {
func (l *queryStringLex) Lex(lval *yySymType) int {
var err error
for l.nextToken == nil {
if l.currConsumed {
l.nextRune, l.nextRuneSize, err = l.in.ReadRune()
if err != nil && err == io.EOF {
l.nextRune = 0
l.atEOF = true
} else if err != nil {
return 0
l.currState, l.currConsumed = l.currState(l, l.nextRune, l.atEOF)
if l.currState == nil {
return 0
*lval = *l.nextToken
rv := l.nextTokenType
l.nextToken = nil
l.nextTokenType = 0
return rv
func newQueryStringLex(in io.Reader) *queryStringLex {
return &queryStringLex{
in: bufio.NewReader(in),
currState: startState,
currConsumed: true,
// lexState is one state of the lexer's state machine. It is called with the
// lexer, the current rune and whether end of input has been reached. It
// returns the state to run next (nil stops the machine) and whether the rune
// was consumed; an unconsumed rune is passed again to the next state.
type lexState func(l *queryStringLex, next rune, eof bool) (lexState, bool)
func startState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
if eof {
return nil, false
// handle inside escape case up front
if l.inEscape {
l.inEscape = false
l.buf += unescape(string(next))
return inStrState, true
switch next {
case '"':
return inPhraseState, true
case '+', '-', ':', '>', '<', '=':
l.buf += string(next)
return singleCharOpState, true
case '^':
return inBoostState, true
case '~':
return inTildeState, true
switch {
case !l.inEscape && next == '\\':
l.inEscape = true
return startState, true
case unicode.IsDigit(next):
l.buf += string(next)
return inNumOrStrState, true
case !unicode.IsSpace(next):
l.buf += string(next)
return inStrState, true
// doesnt look like anything, just eat it and stay here
return startState, true
func inPhraseState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
// unterminated phrase eats the phrase
if eof {
l.Error("unterminated quote")
return nil, false
// only a non-escaped " ends the phrase
if !l.inEscape && next == '"' {
// end phrase
l.nextTokenType = tPHRASE
l.nextToken = &yySymType{
s: l.buf,
logDebugTokens("PHRASE - '%s'", l.nextToken.s)
return startState, true
} else if !l.inEscape && next == '\\' {
l.inEscape = true
} else if l.inEscape {
// if in escape, end it
l.inEscape = false
l.buf += unescape(string(next))
} else {
l.buf += string(next)
return inPhraseState, true
func singleCharOpState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
l.nextToken = &yySymType{}
switch l.buf {
case "+":
l.nextTokenType = tPLUS
case "-":
l.nextTokenType = tMINUS
case ":":
l.nextTokenType = tCOLON
case ">":
l.nextTokenType = tGREATER
case "<":
l.nextTokenType = tLESS
case "=":
l.nextTokenType = tEQUAL
return startState, false
func inBoostState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
// only a non-escaped space ends the boost (or eof)
if eof || (!l.inEscape && next == ' ') {
// end boost
l.nextTokenType = tBOOST
if l.buf == "" {
l.buf = "1"
l.nextToken = &yySymType{
s: l.buf,
logDebugTokens("BOOST - '%s'", l.nextToken.s)
return startState, true
} else if !l.inEscape && next == '\\' {
l.inEscape = true
} else if l.inEscape {
// if in escape, end it
l.inEscape = false
l.buf += unescape(string(next))
} else {
l.buf += string(next)
return inBoostState, true
func inTildeState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
// only a non-escaped space ends the tilde (or eof)
if eof || (!l.inEscape && next == ' ') {
// end tilde
l.nextTokenType = tTILDE
if l.buf == "" {
l.buf = "1"
l.nextToken = &yySymType{
s: l.buf,
logDebugTokens("TILDE - '%s'", l.nextToken.s)
return startState, true
} else if !l.inEscape && next == '\\' {
l.inEscape = true
} else if l.inEscape {
// if in escape, end it
l.inEscape = false
l.buf += unescape(string(next))
} else {
l.buf += string(next)
return inTildeState, true
func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
// only a non-escaped space ends the tilde (or eof)
if eof || (!l.inEscape && next == ' ') {
// end number
l.nextTokenType = tNUMBER
l.nextToken = &yySymType{
s: l.buf,
logDebugTokens("NUMBER - '%s'", l.nextToken.s)
return startState, true
} else if !l.inEscape && next == '\\' {
l.inEscape = true
return inNumOrStrState, true
} else if l.inEscape {
// if in escape, end it
l.inEscape = false
l.buf += unescape(string(next))
// go directly to string, no successfully or unsuccessfully
// escaped string results in a valid number
return inStrState, true
// see where to go
if !l.seenDot && next == '.' {
// stay in this state
l.buf += string(next)
return inNumOrStrState, true
} else if unicode.IsDigit(next) {
l.buf += string(next)
return inNumOrStrState, true
// doesn't look like an number, transition
l.buf += string(next)
return inStrState, true
func inStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
// end on non-escped space, colon, tilde, boost (or eof)
if eof || (!l.inEscape && (next == ' ' || next == ':' || next == '^' || next == '~')) {
// end string
l.nextTokenType = tSTRING
l.nextToken = &yySymType{
s: l.buf,
logDebugTokens("STRING - '%s'", l.nextToken.s)
consumed := true
if !eof && (next == ':' || next == '^' || next == '~') {
consumed = false
return startState, consumed
} else if !l.inEscape && next == '\\' {
l.inEscape = true
} else if l.inEscape {
// if in escape, end it
l.inEscape = false
l.buf += unescape(string(next))
} else {
l.buf += string(next)
return inStrState, true
func logDebugTokens(format string, v ...interface{}) {
if debugLexer {
logger.Printf(format, v...)