0f16eccd6b
The tokenizer named "exception" is configured with a list of regexp strings under the key "exceptions"; these regexps match sequences you want treated as a single token. Matching sequences are NOT sent to the underlying tokenizer. The "tokenizer" option names the tokenizer that should be used for processing all text regions not matching any exception. An example configuration with simple patterns to match URLs and email addresses: map[string]interface{}{ "type": "exception", "tokenizer": "unicode", "exceptions": []interface{}{ `[hH][tT][tT][pP][sS]?://(\S)*`, `[fF][iI][lL][eE]://(\S)*`, `[fF][tT][pP]://(\S)*`, `\S+@\S+`, } }
158 lines
3.4 KiB
Go
158 lines
3.4 KiB
Go
package exception
|
|
|
|
import (
|
|
"reflect"
|
|
"testing"
|
|
|
|
"github.com/blevesearch/bleve/analysis"
|
|
_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
|
"github.com/blevesearch/bleve/registry"
|
|
)
|
|
|
|
func TestExceptionsTokenizer(t *testing.T) {
|
|
tests := []struct {
|
|
config map[string]interface{}
|
|
input []byte
|
|
patterns []string
|
|
result analysis.TokenStream
|
|
}{
|
|
{
|
|
input: []byte("test http://blevesearch.com/ words"),
|
|
config: map[string]interface{}{
|
|
"type": "exception",
|
|
"tokenizer": "unicode",
|
|
"exceptions": []interface{}{
|
|
`[hH][tT][tT][pP][sS]?://(\S)*`,
|
|
`[fF][iI][lL][eE]://(\S)*`,
|
|
`[fF][tT][pP]://(\S)*`,
|
|
},
|
|
},
|
|
result: analysis.TokenStream{
|
|
&analysis.Token{
|
|
Term: []byte("test"),
|
|
Position: 1,
|
|
Start: 0,
|
|
End: 4,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("http://blevesearch.com/"),
|
|
Position: 2,
|
|
Start: 5,
|
|
End: 28,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("words"),
|
|
Position: 3,
|
|
Start: 29,
|
|
End: 34,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
input: []byte("what ftp://blevesearch.com/ songs"),
|
|
config: map[string]interface{}{
|
|
"type": "exception",
|
|
"tokenizer": "unicode",
|
|
"exceptions": []interface{}{
|
|
`[hH][tT][tT][pP][sS]?://(\S)*`,
|
|
`[fF][iI][lL][eE]://(\S)*`,
|
|
`[fF][tT][pP]://(\S)*`,
|
|
},
|
|
},
|
|
result: analysis.TokenStream{
|
|
&analysis.Token{
|
|
Term: []byte("what"),
|
|
Position: 1,
|
|
Start: 0,
|
|
End: 4,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("ftp://blevesearch.com/"),
|
|
Position: 2,
|
|
Start: 5,
|
|
End: 27,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("songs"),
|
|
Position: 3,
|
|
Start: 28,
|
|
End: 33,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
input: []byte("please email marty@couchbase.com the URL https://blevesearch.com/"),
|
|
config: map[string]interface{}{
|
|
"type": "exception",
|
|
"tokenizer": "unicode",
|
|
"exceptions": []interface{}{
|
|
`[hH][tT][tT][pP][sS]?://(\S)*`,
|
|
`[fF][iI][lL][eE]://(\S)*`,
|
|
`[fF][tT][pP]://(\S)*`,
|
|
`\S+@\S+`,
|
|
},
|
|
},
|
|
result: analysis.TokenStream{
|
|
&analysis.Token{
|
|
Term: []byte("please"),
|
|
Position: 1,
|
|
Start: 0,
|
|
End: 6,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("email"),
|
|
Position: 2,
|
|
Start: 7,
|
|
End: 12,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("marty@couchbase.com"),
|
|
Position: 3,
|
|
Start: 13,
|
|
End: 32,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("the"),
|
|
Position: 4,
|
|
Start: 33,
|
|
End: 36,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("URL"),
|
|
Position: 5,
|
|
Start: 37,
|
|
End: 40,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("https://blevesearch.com/"),
|
|
Position: 6,
|
|
Start: 41,
|
|
End: 65,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
// remaining := unicode.NewUnicodeTokenizer()
|
|
for _, test := range tests {
|
|
|
|
// build the requested exception tokenizer
|
|
cache := registry.NewCache()
|
|
tokenizer, err := cache.DefineTokenizer("custom", test.config)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// pattern := strings.Join(test.patterns, "|")
|
|
// r, err := regexp.Compile(pattern)
|
|
// if err != nil {
|
|
// t.Fatal(err)
|
|
// }
|
|
// tokenizer := NewExceptionsTokenizer(r, remaining)
|
|
actual := tokenizer.Tokenize(test.input)
|
|
if !reflect.DeepEqual(actual, test.result) {
|
|
t.Errorf("expected %v, got %v", test.result, actual)
|
|
}
|
|
}
|
|
}
|