replaced nex lexer with custom lexer
this improvement was started to improve code coverage but also improves performance and adds support for escaping: The following quoted string enumerates the characters which may be escaped. "+-=&|><!(){}[]^\"~*?:\\/ " Note that this list includes space. In order to escape these characters, they are prefixed with the \ (backslash) character. In all cases, using the escaped version produces the character itself and is not interpreted by the lexer. Two simple examples: my\ name will be interpreted as a single argument to a match query with the value "my name". "contains a\" character" will be interpreted as a single argument to a phrase query with the value `contains a " character`. Performance: before$ go test -v -run=xxx -bench=BenchmarkLexer BenchmarkLexer-4 100000 13991 ns/op PASS ok github.com/blevesearch/bleve 1.570s after$ go test -v -run=xxx -bench=BenchmarkLexer BenchmarkLexer-4 500000 3387 ns/op PASS ok github.com/blevesearch/bleve 1.740s
This commit is contained in:
parent
46f70bfa12
commit
5023993895
@ -1,54 +0,0 @@
|
||||
/\"((\\\")|(\\\\)|(\\\/)|(\\b)|(\\f)|(\\n)|(\\r)|(\\t)|(\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])|[^\"])*\"/ {
|
||||
lval.s = yylex.Text()[1:len(yylex.Text())-1]
|
||||
logDebugTokens("PHRASE - %s", lval.s);
|
||||
return tPHRASE
|
||||
}
|
||||
/\+/ { logDebugTokens("PLUS"); return tPLUS }
|
||||
/-/ { logDebugTokens("MINUS"); return tMINUS }
|
||||
/:/ { logDebugTokens("COLON"); return tCOLON }
|
||||
/\(/ { logDebugTokens("LPAREN"); return tLPAREN }
|
||||
/\)/ { logDebugTokens("RPAREN"); return tRPAREN }
|
||||
/>/ { logDebugTokens("GREATER"); return tGREATER }
|
||||
/</ { logDebugTokens("LESS"); return tLESS }
|
||||
/=/ { logDebugTokens("EQUAL"); return tEQUAL }
|
||||
/\^([0-9]|[1-9][0-9]*)(\.[0-9][0-9]*)?/
|
||||
{
|
||||
lval.s = yylex.Text()[1:]
|
||||
logDebugTokens("BOOST");
|
||||
return tBOOST }
|
||||
/\^/ {
|
||||
lval.s = "1"
|
||||
logDebugTokens("BOOST");
|
||||
return tBOOST
|
||||
}
|
||||
/~([0-9]|[1-9][0-9]*)/
|
||||
{
|
||||
lval.s = yylex.Text()[1:]
|
||||
logDebugTokens("TILDENUMBER - %s", lval.s);
|
||||
return tTILDE
|
||||
}
|
||||
/~/ {
|
||||
lval.s = "1"
|
||||
logDebugTokens("TILDE");
|
||||
return tTILDE
|
||||
}
|
||||
/-?([0-9]|[1-9][0-9]*)(\.[0-9][0-9]*)?/
|
||||
{
|
||||
lval.s = yylex.Text()
|
||||
logDebugTokens("NUMBER - %s", lval.s);
|
||||
return tNUMBER
|
||||
}
|
||||
/[ \t\n]+/ { logDebugTokens("WHITESPACE (count=%d)", len(yylex.Text())) /* eat up whitespace */ }
|
||||
/[^\t\n\f\r :^\+\*\?><=~-][^\t\n\f\r :^~]*/ {
|
||||
lval.s = yylex.Text()
|
||||
logDebugTokens("STRING - %s", lval.s);
|
||||
return tSTRING
|
||||
}
|
||||
//
|
||||
package bleve
|
||||
|
||||
func logDebugTokens(format string, v ...interface{}) {
|
||||
if debugLexer {
|
||||
logger.Printf(format, v...)
|
||||
}
|
||||
}
|
1520
query_string.nn.go
1520
query_string.nn.go
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,7 @@
|
||||
%{
|
||||
package bleve
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@ -18,7 +19,7 @@ n int
|
||||
f float64
|
||||
q Query}
|
||||
|
||||
%token tSTRING tPHRASE tPLUS tMINUS tCOLON tBOOST tLPAREN tRPAREN tNUMBER tSTRING tGREATER tLESS
|
||||
%token tSTRING tPHRASE tPLUS tMINUS tCOLON tBOOST tNUMBER tSTRING tGREATER tLESS
|
||||
tEQUAL tTILDE
|
||||
|
||||
%type <s> tSTRING
|
||||
@ -93,7 +94,10 @@ tSTRING {
|
||||
|
|
||||
tSTRING tTILDE {
|
||||
str := $1
|
||||
fuzziness, _ := strconv.ParseFloat($2, 64)
|
||||
fuzziness, err := strconv.ParseFloat($2, 64)
|
||||
if err != nil {
|
||||
yylex.(*lexerWrapper).lex.Error(fmt.Sprintf("invalid fuzziness value: %v", err))
|
||||
}
|
||||
logDebugGrammar("FUZZY STRING - %s %f", str, fuzziness)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(int(fuzziness))
|
||||
@ -103,7 +107,10 @@ tSTRING tTILDE {
|
||||
tSTRING tCOLON tSTRING tTILDE {
|
||||
field := $1
|
||||
str := $3
|
||||
fuzziness, _ := strconv.ParseFloat($4, 64)
|
||||
fuzziness, err := strconv.ParseFloat($4, 64)
|
||||
if err != nil {
|
||||
yylex.(*lexerWrapper).lex.Error(fmt.Sprintf("invalid fuzziness value: %v", err))
|
||||
}
|
||||
logDebugGrammar("FIELD - %s FUZZY STRING - %s %f", field, str, fuzziness)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(int(fuzziness))
|
||||
@ -239,7 +246,10 @@ searchSuffix:
|
||||
}
|
||||
|
|
||||
tBOOST {
|
||||
boost, _ := strconv.ParseFloat($1, 64)
|
||||
boost, err := strconv.ParseFloat($1, 64)
|
||||
if err != nil {
|
||||
yylex.(*lexerWrapper).lex.Error(fmt.Sprintf("invalid boost value: %v", err))
|
||||
}
|
||||
$$ = boost
|
||||
logDebugGrammar("BOOST %f", boost)
|
||||
};
|
||||
|
@ -4,6 +4,7 @@ import __yyfmt__ "fmt"
|
||||
|
||||
//line query_string.y:2
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@ -14,7 +15,7 @@ func logDebugGrammar(format string, v ...interface{}) {
|
||||
}
|
||||
}
|
||||
|
||||
//line query_string.y:15
|
||||
//line query_string.y:16
|
||||
type yySymType struct {
|
||||
yys int
|
||||
s string
|
||||
@ -29,13 +30,11 @@ const tPLUS = 57348
|
||||
const tMINUS = 57349
|
||||
const tCOLON = 57350
|
||||
const tBOOST = 57351
|
||||
const tLPAREN = 57352
|
||||
const tRPAREN = 57353
|
||||
const tNUMBER = 57354
|
||||
const tGREATER = 57355
|
||||
const tLESS = 57356
|
||||
const tEQUAL = 57357
|
||||
const tTILDE = 57358
|
||||
const tNUMBER = 57352
|
||||
const tGREATER = 57353
|
||||
const tLESS = 57354
|
||||
const tEQUAL = 57355
|
||||
const tTILDE = 57356
|
||||
|
||||
var yyToknames = [...]string{
|
||||
"$end",
|
||||
@ -47,8 +46,6 @@ var yyToknames = [...]string{
|
||||
"tMINUS",
|
||||
"tCOLON",
|
||||
"tBOOST",
|
||||
"tLPAREN",
|
||||
"tRPAREN",
|
||||
"tNUMBER",
|
||||
"tGREATER",
|
||||
"tLESS",
|
||||
@ -77,25 +74,25 @@ const yyPrivate = 57344
|
||||
var yyTokenNames []string
|
||||
var yyStates []string
|
||||
|
||||
const yyLast = 32
|
||||
const yyLast = 31
|
||||
|
||||
var yyAct = [...]int{
|
||||
|
||||
16, 18, 21, 13, 27, 24, 3, 1, 17, 19,
|
||||
20, 25, 22, 15, 26, 23, 9, 11, 31, 29,
|
||||
4, 14, 5, 6, 10, 30, 28, 2, 12, 8,
|
||||
0, 7,
|
||||
16, 18, 21, 13, 27, 24, 17, 19, 20, 25,
|
||||
22, 15, 26, 23, 9, 11, 31, 14, 29, 3,
|
||||
10, 30, 2, 28, 5, 6, 7, 1, 4, 12,
|
||||
8,
|
||||
}
|
||||
var yyPact = [...]int{
|
||||
|
||||
16, -1000, -1000, 16, 12, -1000, -1000, -1000, -6, 5,
|
||||
-1000, -1000, -1000, -1000, -1000, -4, -14, -1000, -1000, 0,
|
||||
-1, -1000, -1000, 14, -1000, -1000, 13, -1000, -1000, -1000,
|
||||
18, -1000, -1000, 18, 10, -1000, -1000, -1000, -6, 3,
|
||||
-1000, -1000, -1000, -1000, -1000, -4, -12, -1000, -1000, 0,
|
||||
-1, -1000, -1000, 13, -1000, -1000, 11, -1000, -1000, -1000,
|
||||
-1000, -1000,
|
||||
}
|
||||
var yyPgo = [...]int{
|
||||
|
||||
0, 29, 28, 20, 7, 27, 6,
|
||||
0, 30, 29, 28, 27, 22, 19,
|
||||
}
|
||||
var yyR1 = [...]int{
|
||||
|
||||
@ -112,9 +109,9 @@ var yyR2 = [...]int{
|
||||
var yyChk = [...]int{
|
||||
|
||||
-1000, -4, -5, -6, -3, 6, 7, -5, -1, 4,
|
||||
12, 5, -2, 9, 16, 8, 4, 12, 5, 13,
|
||||
14, 16, 12, 15, 5, 12, 15, 5, 12, 5,
|
||||
12, 5,
|
||||
10, 5, -2, 9, 14, 8, 4, 10, 5, 11,
|
||||
12, 14, 10, 13, 5, 10, 13, 5, 10, 5,
|
||||
10, 5,
|
||||
}
|
||||
var yyDef = [...]int{
|
||||
|
||||
@ -130,7 +127,7 @@ var yyTok1 = [...]int{
|
||||
var yyTok2 = [...]int{
|
||||
|
||||
2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
|
||||
12, 13, 14, 15, 16,
|
||||
12, 13, 14,
|
||||
}
|
||||
var yyTok3 = [...]int{
|
||||
0,
|
||||
@ -475,25 +472,25 @@ yydefault:
|
||||
|
||||
case 1:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:36
|
||||
//line query_string.y:37
|
||||
{
|
||||
logDebugGrammar("INPUT")
|
||||
}
|
||||
case 2:
|
||||
yyDollar = yyS[yypt-2 : yypt+1]
|
||||
//line query_string.y:41
|
||||
//line query_string.y:42
|
||||
{
|
||||
logDebugGrammar("SEARCH PARTS")
|
||||
}
|
||||
case 3:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:45
|
||||
//line query_string.y:46
|
||||
{
|
||||
logDebugGrammar("SEARCH PART")
|
||||
}
|
||||
case 4:
|
||||
yyDollar = yyS[yypt-3 : yypt+1]
|
||||
//line query_string.y:50
|
||||
//line query_string.y:51
|
||||
{
|
||||
query := yyDollar[2].q
|
||||
query.SetBoost(yyDollar[3].f)
|
||||
@ -508,27 +505,27 @@ yydefault:
|
||||
}
|
||||
case 5:
|
||||
yyDollar = yyS[yypt-0 : yypt+1]
|
||||
//line query_string.y:65
|
||||
//line query_string.y:66
|
||||
{
|
||||
yyVAL.n = queryShould
|
||||
}
|
||||
case 6:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:69
|
||||
//line query_string.y:70
|
||||
{
|
||||
logDebugGrammar("PLUS")
|
||||
yyVAL.n = queryMust
|
||||
}
|
||||
case 7:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:74
|
||||
//line query_string.y:75
|
||||
{
|
||||
logDebugGrammar("MINUS")
|
||||
yyVAL.n = queryMustNot
|
||||
}
|
||||
case 8:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:80
|
||||
//line query_string.y:81
|
||||
{
|
||||
str := yyDollar[1].s
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
@ -544,10 +541,13 @@ yydefault:
|
||||
}
|
||||
case 9:
|
||||
yyDollar = yyS[yypt-2 : yypt+1]
|
||||
//line query_string.y:94
|
||||
//line query_string.y:95
|
||||
{
|
||||
str := yyDollar[1].s
|
||||
fuzziness, _ := strconv.ParseFloat(yyDollar[2].s, 64)
|
||||
fuzziness, err := strconv.ParseFloat(yyDollar[2].s, 64)
|
||||
if err != nil {
|
||||
yylex.(*lexerWrapper).lex.Error(fmt.Sprintf("invalid fuzziness value: %v", err))
|
||||
}
|
||||
logDebugGrammar("FUZZY STRING - %s %f", str, fuzziness)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(int(fuzziness))
|
||||
@ -555,11 +555,14 @@ yydefault:
|
||||
}
|
||||
case 10:
|
||||
yyDollar = yyS[yypt-4 : yypt+1]
|
||||
//line query_string.y:103
|
||||
//line query_string.y:107
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
str := yyDollar[3].s
|
||||
fuzziness, _ := strconv.ParseFloat(yyDollar[4].s, 64)
|
||||
fuzziness, err := strconv.ParseFloat(yyDollar[4].s, 64)
|
||||
if err != nil {
|
||||
yylex.(*lexerWrapper).lex.Error(fmt.Sprintf("invalid fuzziness value: %v", err))
|
||||
}
|
||||
logDebugGrammar("FIELD - %s FUZZY STRING - %s %f", field, str, fuzziness)
|
||||
q := NewMatchQuery(str)
|
||||
q.SetFuzziness(int(fuzziness))
|
||||
@ -568,7 +571,7 @@ yydefault:
|
||||
}
|
||||
case 11:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:114
|
||||
//line query_string.y:121
|
||||
{
|
||||
str := yyDollar[1].s
|
||||
logDebugGrammar("STRING - %s", str)
|
||||
@ -577,7 +580,7 @@ yydefault:
|
||||
}
|
||||
case 12:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:121
|
||||
//line query_string.y:128
|
||||
{
|
||||
phrase := yyDollar[1].s
|
||||
logDebugGrammar("PHRASE - %s", phrase)
|
||||
@ -586,7 +589,7 @@ yydefault:
|
||||
}
|
||||
case 13:
|
||||
yyDollar = yyS[yypt-3 : yypt+1]
|
||||
//line query_string.y:128
|
||||
//line query_string.y:135
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
str := yyDollar[3].s
|
||||
@ -604,7 +607,7 @@ yydefault:
|
||||
}
|
||||
case 14:
|
||||
yyDollar = yyS[yypt-3 : yypt+1]
|
||||
//line query_string.y:144
|
||||
//line query_string.y:151
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
str := yyDollar[3].s
|
||||
@ -614,7 +617,7 @@ yydefault:
|
||||
}
|
||||
case 15:
|
||||
yyDollar = yyS[yypt-3 : yypt+1]
|
||||
//line query_string.y:152
|
||||
//line query_string.y:159
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
phrase := yyDollar[3].s
|
||||
@ -624,7 +627,7 @@ yydefault:
|
||||
}
|
||||
case 16:
|
||||
yyDollar = yyS[yypt-4 : yypt+1]
|
||||
//line query_string.y:160
|
||||
//line query_string.y:167
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
min, _ := strconv.ParseFloat(yyDollar[4].s, 64)
|
||||
@ -635,7 +638,7 @@ yydefault:
|
||||
}
|
||||
case 17:
|
||||
yyDollar = yyS[yypt-5 : yypt+1]
|
||||
//line query_string.y:169
|
||||
//line query_string.y:176
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
min, _ := strconv.ParseFloat(yyDollar[5].s, 64)
|
||||
@ -646,7 +649,7 @@ yydefault:
|
||||
}
|
||||
case 18:
|
||||
yyDollar = yyS[yypt-4 : yypt+1]
|
||||
//line query_string.y:178
|
||||
//line query_string.y:185
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
max, _ := strconv.ParseFloat(yyDollar[4].s, 64)
|
||||
@ -657,7 +660,7 @@ yydefault:
|
||||
}
|
||||
case 19:
|
||||
yyDollar = yyS[yypt-5 : yypt+1]
|
||||
//line query_string.y:187
|
||||
//line query_string.y:194
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
max, _ := strconv.ParseFloat(yyDollar[5].s, 64)
|
||||
@ -668,7 +671,7 @@ yydefault:
|
||||
}
|
||||
case 20:
|
||||
yyDollar = yyS[yypt-4 : yypt+1]
|
||||
//line query_string.y:196
|
||||
//line query_string.y:203
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
minInclusive := false
|
||||
@ -680,7 +683,7 @@ yydefault:
|
||||
}
|
||||
case 21:
|
||||
yyDollar = yyS[yypt-5 : yypt+1]
|
||||
//line query_string.y:206
|
||||
//line query_string.y:213
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
minInclusive := true
|
||||
@ -692,7 +695,7 @@ yydefault:
|
||||
}
|
||||
case 22:
|
||||
yyDollar = yyS[yypt-4 : yypt+1]
|
||||
//line query_string.y:216
|
||||
//line query_string.y:223
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
maxInclusive := false
|
||||
@ -704,7 +707,7 @@ yydefault:
|
||||
}
|
||||
case 23:
|
||||
yyDollar = yyS[yypt-5 : yypt+1]
|
||||
//line query_string.y:226
|
||||
//line query_string.y:233
|
||||
{
|
||||
field := yyDollar[1].s
|
||||
maxInclusive := true
|
||||
@ -716,15 +719,18 @@ yydefault:
|
||||
}
|
||||
case 24:
|
||||
yyDollar = yyS[yypt-0 : yypt+1]
|
||||
//line query_string.y:237
|
||||
//line query_string.y:244
|
||||
{
|
||||
yyVAL.f = 1.0
|
||||
}
|
||||
case 25:
|
||||
yyDollar = yyS[yypt-1 : yypt+1]
|
||||
//line query_string.y:241
|
||||
//line query_string.y:248
|
||||
{
|
||||
boost, _ := strconv.ParseFloat(yyDollar[1].s, 64)
|
||||
boost, err := strconv.ParseFloat(yyDollar[1].s, 64)
|
||||
if err != nil {
|
||||
yylex.(*lexerWrapper).lex.Error(fmt.Sprintf("invalid boost value: %v", err))
|
||||
}
|
||||
yyVAL.f = boost
|
||||
logDebugGrammar("BOOST %f", boost)
|
||||
}
|
||||
|
317
query_string_lex.go
Normal file
317
query_string_lex.go
Normal file
@ -0,0 +1,317 @@
|
||||
// Copyright (c) 2016 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package bleve
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// reservedChars enumerates every character that may be backslash-escaped
// in a query string (note: the list includes space).
const reservedChars = "+-=&|><!(){}[]^\"~*?:\\/ "

// unescape resolves a single backslash-escaped character. If the
// character is one of the reserved characters, the character itself is
// returned; otherwise the backslash is retained and returned with it.
func unescape(escaped string) string {
	if !strings.ContainsAny(escaped, reservedChars) {
		// not an escapable character, keep the \ intact
		return "\\" + escaped
	}
	return escaped
}
|
||||
|
||||
// queryStringLex is a hand-written state-machine lexer for the query
// string syntax, implementing the yyLexer interface consumed by the
// generated parser.
type queryStringLex struct {
	in           *bufio.Reader // source of input runes
	buf          string        // text accumulated for the token in progress
	currState    lexState      // current state function of the machine
	currConsumed bool          // whether the last rune was consumed (false => re-feed it)
	inEscape     bool          // true immediately after an unconsumed backslash
	nextToken    *yySymType    // completed token awaiting delivery to the parser
	nextTokenType int          // token type constant paired with nextToken
	seenDot      bool          // whether a '.' has been seen while scanning a number
	nextRune     rune          // most recently read rune
	nextRuneSize int           // byte size of nextRune as reported by ReadRune
	atEOF        bool          // input exhausted
}
|
||||
|
||||
func (l *queryStringLex) reset() {
|
||||
l.buf = ""
|
||||
l.inEscape = false
|
||||
l.seenDot = false
|
||||
}
|
||||
|
||||
// Error satisfies the yyLexer interface by panicking with the message.
// NOTE(review): the panic appears to be recovered by the parse driver's
// deferred recover (see doParse), which converts it into a parse error —
// confirm before calling this lexer outside that wrapper.
func (l *queryStringLex) Error(msg string) {
	panic(msg)
}
|
||||
|
||||
func (l *queryStringLex) Lex(lval *yySymType) int {
|
||||
var err error
|
||||
|
||||
for l.nextToken == nil {
|
||||
if l.currConsumed {
|
||||
l.nextRune, l.nextRuneSize, err = l.in.ReadRune()
|
||||
if err != nil && err == io.EOF {
|
||||
l.nextRune = 0
|
||||
l.atEOF = true
|
||||
} else if err != nil {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
l.currState, l.currConsumed = l.currState(l, l.nextRune, l.atEOF)
|
||||
if l.currState == nil {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
*lval = *l.nextToken
|
||||
rv := l.nextTokenType
|
||||
l.nextToken = nil
|
||||
l.nextTokenType = 0
|
||||
return rv
|
||||
}
|
||||
|
||||
func newQueryStringLex(in io.Reader) *queryStringLex {
|
||||
return &queryStringLex{
|
||||
in: bufio.NewReader(in),
|
||||
currState: startState,
|
||||
currConsumed: true,
|
||||
}
|
||||
}
|
||||
|
||||
type lexState func(l *queryStringLex, next rune, eof bool) (lexState, bool)
|
||||
|
||||
func startState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||
if eof {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// handle inside escape case up front
|
||||
if l.inEscape {
|
||||
l.inEscape = false
|
||||
l.buf += unescape(string(next))
|
||||
return inStrState, true
|
||||
}
|
||||
|
||||
switch next {
|
||||
case '"':
|
||||
return inPhraseState, true
|
||||
case '+', '-', ':', '>', '<', '=':
|
||||
l.buf += string(next)
|
||||
return singleCharOpState, true
|
||||
case '^':
|
||||
return inBoostState, true
|
||||
case '~':
|
||||
return inTildeState, true
|
||||
}
|
||||
|
||||
switch {
|
||||
case !l.inEscape && next == '\\':
|
||||
l.inEscape = true
|
||||
return startState, true
|
||||
case unicode.IsDigit(next):
|
||||
l.buf += string(next)
|
||||
return inNumOrStrState, true
|
||||
case !unicode.IsSpace(next):
|
||||
l.buf += string(next)
|
||||
return inStrState, true
|
||||
}
|
||||
|
||||
// doesnt look like anything, just eat it and stay here
|
||||
l.reset()
|
||||
return startState, true
|
||||
}
|
||||
|
||||
func inPhraseState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||
// unterminated phrase eats the phrase
|
||||
if eof {
|
||||
l.Error("unterminated quote")
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// only a non-escaped " ends the phrase
|
||||
if !l.inEscape && next == '"' {
|
||||
// end phrase
|
||||
l.nextTokenType = tPHRASE
|
||||
l.nextToken = &yySymType{
|
||||
s: l.buf,
|
||||
}
|
||||
logDebugTokens("PHRASE - '%s'", l.nextToken.s)
|
||||
l.reset()
|
||||
return startState, true
|
||||
} else if !l.inEscape && next == '\\' {
|
||||
l.inEscape = true
|
||||
} else if l.inEscape {
|
||||
// if in escape, end it
|
||||
l.inEscape = false
|
||||
l.buf += unescape(string(next))
|
||||
} else {
|
||||
l.buf += string(next)
|
||||
}
|
||||
|
||||
return inPhraseState, true
|
||||
}
|
||||
|
||||
func singleCharOpState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||
l.nextToken = &yySymType{}
|
||||
|
||||
switch l.buf {
|
||||
case "+":
|
||||
l.nextTokenType = tPLUS
|
||||
logDebugTokens("PLUS")
|
||||
case "-":
|
||||
l.nextTokenType = tMINUS
|
||||
logDebugTokens("MINUS")
|
||||
case ":":
|
||||
l.nextTokenType = tCOLON
|
||||
logDebugTokens("COLON")
|
||||
case ">":
|
||||
l.nextTokenType = tGREATER
|
||||
logDebugTokens("GREATER")
|
||||
case "<":
|
||||
l.nextTokenType = tLESS
|
||||
logDebugTokens("LESS")
|
||||
case "=":
|
||||
l.nextTokenType = tEQUAL
|
||||
logDebugTokens("EQUAL")
|
||||
}
|
||||
|
||||
l.reset()
|
||||
return startState, false
|
||||
}
|
||||
|
||||
func inBoostState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||
|
||||
// only a non-escaped space ends the boost (or eof)
|
||||
if eof || (!l.inEscape && next == ' ') {
|
||||
// end boost
|
||||
l.nextTokenType = tBOOST
|
||||
if l.buf == "" {
|
||||
l.buf = "1"
|
||||
}
|
||||
l.nextToken = &yySymType{
|
||||
s: l.buf,
|
||||
}
|
||||
logDebugTokens("BOOST - '%s'", l.nextToken.s)
|
||||
l.reset()
|
||||
return startState, true
|
||||
} else if !l.inEscape && next == '\\' {
|
||||
l.inEscape = true
|
||||
} else if l.inEscape {
|
||||
// if in escape, end it
|
||||
l.inEscape = false
|
||||
l.buf += unescape(string(next))
|
||||
} else {
|
||||
l.buf += string(next)
|
||||
}
|
||||
|
||||
return inBoostState, true
|
||||
}
|
||||
|
||||
func inTildeState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||
|
||||
// only a non-escaped space ends the tilde (or eof)
|
||||
if eof || (!l.inEscape && next == ' ') {
|
||||
// end tilde
|
||||
l.nextTokenType = tTILDE
|
||||
if l.buf == "" {
|
||||
l.buf = "1"
|
||||
}
|
||||
l.nextToken = &yySymType{
|
||||
s: l.buf,
|
||||
}
|
||||
logDebugTokens("TILDE - '%s'", l.nextToken.s)
|
||||
l.reset()
|
||||
return startState, true
|
||||
} else if !l.inEscape && next == '\\' {
|
||||
l.inEscape = true
|
||||
} else if l.inEscape {
|
||||
// if in escape, end it
|
||||
l.inEscape = false
|
||||
l.buf += unescape(string(next))
|
||||
} else {
|
||||
l.buf += string(next)
|
||||
}
|
||||
|
||||
return inTildeState, true
|
||||
}
|
||||
|
||||
func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||
// only a non-escaped space ends the tilde (or eof)
|
||||
if eof || (!l.inEscape && next == ' ') {
|
||||
// end number
|
||||
l.nextTokenType = tNUMBER
|
||||
l.nextToken = &yySymType{
|
||||
s: l.buf,
|
||||
}
|
||||
logDebugTokens("NUMBER - '%s'", l.nextToken.s)
|
||||
l.reset()
|
||||
return startState, true
|
||||
} else if !l.inEscape && next == '\\' {
|
||||
l.inEscape = true
|
||||
return inNumOrStrState, true
|
||||
} else if l.inEscape {
|
||||
// if in escape, end it
|
||||
l.inEscape = false
|
||||
l.buf += unescape(string(next))
|
||||
// go directly to string, no successfully or unsuccessfully
|
||||
// escaped string results in a valid number
|
||||
return inStrState, true
|
||||
}
|
||||
|
||||
// see where to go
|
||||
if !l.seenDot && next == '.' {
|
||||
// stay in this state
|
||||
l.buf += string(next)
|
||||
return inNumOrStrState, true
|
||||
} else if unicode.IsDigit(next) {
|
||||
l.buf += string(next)
|
||||
return inNumOrStrState, true
|
||||
}
|
||||
|
||||
// doesn't look like an number, transition
|
||||
l.buf += string(next)
|
||||
return inStrState, true
|
||||
}
|
||||
|
||||
func inStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||
// end on non-escped space, colon, tilde, boost (or eof)
|
||||
if eof || (!l.inEscape && (next == ' ' || next == ':' || next == '^' || next == '~')) {
|
||||
// end string
|
||||
l.nextTokenType = tSTRING
|
||||
l.nextToken = &yySymType{
|
||||
s: l.buf,
|
||||
}
|
||||
logDebugTokens("STRING - '%s'", l.nextToken.s)
|
||||
l.reset()
|
||||
|
||||
consumed := true
|
||||
if !eof && (next == ':' || next == '^' || next == '~') {
|
||||
consumed = false
|
||||
}
|
||||
|
||||
return startState, consumed
|
||||
} else if !l.inEscape && next == '\\' {
|
||||
l.inEscape = true
|
||||
} else if l.inEscape {
|
||||
// if in escape, end it
|
||||
l.inEscape = false
|
||||
l.buf += unescape(string(next))
|
||||
} else {
|
||||
l.buf += string(next)
|
||||
}
|
||||
|
||||
return inStrState, true
|
||||
}
|
||||
|
||||
func logDebugTokens(format string, v ...interface{}) {
|
||||
if debugLexer {
|
||||
logger.Printf(format, v...)
|
||||
}
|
||||
}
|
@ -7,11 +7,6 @@
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
//go:generate nex query_string.nex
|
||||
//go:generate sed -i "" -e s/Lexer/lexer/g query_string.nn.go
|
||||
//go:generate sed -i "" -e s/Newlexer/newLexer/g query_string.nn.go
|
||||
//go:generate sed -i "" -e s/debuglexer/debugLexer/g query_string.nn.go
|
||||
//go:generate go fmt query_string.nn.go
|
||||
//go:generate go tool yacc -o query_string.y.go query_string.y
|
||||
//go:generate sed -i "" -e 1d query_string.y.go
|
||||
|
||||
@ -26,21 +21,20 @@ var debugParser bool
|
||||
var debugLexer bool
|
||||
|
||||
func parseQuerySyntax(query string) (rq Query, err error) {
|
||||
lex := newLexerWrapper(newLexer(strings.NewReader(query)))
|
||||
lex := newLexerWrapper(newQueryStringLex(strings.NewReader(query)))
|
||||
doParse(lex)
|
||||
|
||||
if len(lex.errs) > 0 {
|
||||
return nil, fmt.Errorf(strings.Join(lex.errs, "\n"))
|
||||
} else {
|
||||
return lex.query, nil
|
||||
}
|
||||
return lex.query, nil
|
||||
}
|
||||
|
||||
func doParse(lex *lexerWrapper) {
|
||||
defer func() {
|
||||
r := recover()
|
||||
if r != nil {
|
||||
lex.Error("Errors while parsing.")
|
||||
lex.errs = append(lex.errs, fmt.Sprintf("parse error: %v", r))
|
||||
}
|
||||
}()
|
||||
|
||||
@ -54,23 +48,22 @@ const (
|
||||
)
|
||||
|
||||
type lexerWrapper struct {
|
||||
nex yyLexer
|
||||
lex yyLexer
|
||||
errs []string
|
||||
query *booleanQuery
|
||||
}
|
||||
|
||||
func newLexerWrapper(nex yyLexer) *lexerWrapper {
|
||||
func newLexerWrapper(lex yyLexer) *lexerWrapper {
|
||||
return &lexerWrapper{
|
||||
nex: nex,
|
||||
errs: []string{},
|
||||
lex: lex,
|
||||
query: NewBooleanQuery(nil, nil, nil),
|
||||
}
|
||||
}
|
||||
|
||||
func (this *lexerWrapper) Lex(lval *yySymType) int {
|
||||
return this.nex.Lex(lval)
|
||||
func (l *lexerWrapper) Lex(lval *yySymType) int {
|
||||
return l.lex.Lex(lval)
|
||||
}
|
||||
|
||||
func (this *lexerWrapper) Error(s string) {
|
||||
this.errs = append(this.errs, s)
|
||||
func (l *lexerWrapper) Error(s string) {
|
||||
l.errs = append(l.errs, s)
|
||||
}
|
||||
|
@ -406,17 +406,142 @@ func TestQuerySyntaxParserValid(t *testing.T) {
|
||||
},
|
||||
nil),
|
||||
},
|
||||
|
||||
// tests for escaping
|
||||
|
||||
// escape : as field delimeter
|
||||
{
|
||||
input: `name\:marty`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("name:marty"),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// first colon delimiter, second escaped
|
||||
{
|
||||
input: `name:marty\:couchbase`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("marty:couchbase").SetField("name"),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// escape space, single arguemnt to match query
|
||||
{
|
||||
input: `marty\ couchbase`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("marty couchbase"),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// escape leading plus, not a must clause
|
||||
{
|
||||
input: `\+marty`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("+marty"),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// escape leading minus, not a must not clause
|
||||
{
|
||||
input: `\-marty`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery("-marty"),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// escape quote inside of phrase
|
||||
{
|
||||
input: `"what does \"quote\" mean"`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchPhraseQuery(`what does "quote" mean`),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// escaping an unsupported character retains backslash
|
||||
{
|
||||
input: `can\ i\ escap\e`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery(`can i escap\e`),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// leading spaces
|
||||
{
|
||||
input: ` what`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery(`what`),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// no boost value defaults to 1
|
||||
{
|
||||
input: `term^`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery(`term`),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
// weird lexer cases, something that starts like a number
|
||||
// but contains escape and ends up as string
|
||||
{
|
||||
input: `3.0\:`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery(`3.0:`),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
{
|
||||
input: `3.0\a`,
|
||||
mapping: NewIndexMapping(),
|
||||
result: NewBooleanQuery(
|
||||
nil,
|
||||
[]Query{
|
||||
NewMatchQuery(`3.0\a`),
|
||||
},
|
||||
nil),
|
||||
},
|
||||
}
|
||||
|
||||
// turn on lexer debugging
|
||||
// debugLexer = true
|
||||
// logger = log.New(os.Stderr, "bleve", log.LstdFlags)
|
||||
// debugParser = true
|
||||
// logger = log.New(os.Stderr, "bleve ", log.LstdFlags)
|
||||
|
||||
for _, test := range tests {
|
||||
|
||||
q, err := parseQuerySyntax(test.input)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !reflect.DeepEqual(q, test.result) {
|
||||
t.Errorf("Expected %#v, got %#v: for %s", test.result, q, test.input)
|
||||
@ -440,6 +565,11 @@ func TestQuerySyntaxParserInvalid(t *testing.T) {
|
||||
{"field:~text"},
|
||||
{"field:^text"},
|
||||
{"field::text"},
|
||||
{`"this is the time`},
|
||||
{`cat^3\:`},
|
||||
{`cat^3\0`},
|
||||
{`cat~3\:`},
|
||||
{`cat~3\0`},
|
||||
}
|
||||
|
||||
// turn on lexer debugging
|
||||
@ -460,7 +590,7 @@ func BenchmarkLexer(b *testing.B) {
|
||||
var tokenTypes []int
|
||||
var tokens []yySymType
|
||||
r := strings.NewReader(`+field4:"test phrase 1"`)
|
||||
l := newLexer(r)
|
||||
l := newQueryStringLex(r)
|
||||
var lval yySymType
|
||||
rv := l.Lex(&lval)
|
||||
for rv > 0 {
|
||||
|
Loading…
Reference in New Issue
Block a user