0
0
bleve/analysis/tokenizer/exception/exception_test.go
Marty Schoch 6bf9dd59ab BREAKING CHANGE - additional package renaming
i recently learned that package names should also prefer the
singular form, not the plural form
2016-10-01 17:20:59 -04:00

158 lines
3.4 KiB
Go

package exception
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
_ "github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/registry"
)
func TestExceptionsTokenizer(t *testing.T) {
tests := []struct {
config map[string]interface{}
input []byte
patterns []string
result analysis.TokenStream
}{
{
input: []byte("test http://blevesearch.com/ words"),
config: map[string]interface{}{
"type": "exception",
"tokenizer": "unicode",
"exceptions": []interface{}{
`[hH][tT][tT][pP][sS]?://(\S)*`,
`[fF][iI][lL][eE]://(\S)*`,
`[fF][tT][pP]://(\S)*`,
},
},
result: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Position: 1,
Start: 0,
End: 4,
},
&analysis.Token{
Term: []byte("http://blevesearch.com/"),
Position: 2,
Start: 5,
End: 28,
},
&analysis.Token{
Term: []byte("words"),
Position: 3,
Start: 29,
End: 34,
},
},
},
{
input: []byte("what ftp://blevesearch.com/ songs"),
config: map[string]interface{}{
"type": "exception",
"tokenizer": "unicode",
"exceptions": []interface{}{
`[hH][tT][tT][pP][sS]?://(\S)*`,
`[fF][iI][lL][eE]://(\S)*`,
`[fF][tT][pP]://(\S)*`,
},
},
result: analysis.TokenStream{
&analysis.Token{
Term: []byte("what"),
Position: 1,
Start: 0,
End: 4,
},
&analysis.Token{
Term: []byte("ftp://blevesearch.com/"),
Position: 2,
Start: 5,
End: 27,
},
&analysis.Token{
Term: []byte("songs"),
Position: 3,
Start: 28,
End: 33,
},
},
},
{
input: []byte("please email marty@couchbase.com the URL https://blevesearch.com/"),
config: map[string]interface{}{
"type": "exception",
"tokenizer": "unicode",
"exceptions": []interface{}{
`[hH][tT][tT][pP][sS]?://(\S)*`,
`[fF][iI][lL][eE]://(\S)*`,
`[fF][tT][pP]://(\S)*`,
`\S+@\S+`,
},
},
result: analysis.TokenStream{
&analysis.Token{
Term: []byte("please"),
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("email"),
Position: 2,
Start: 7,
End: 12,
},
&analysis.Token{
Term: []byte("marty@couchbase.com"),
Position: 3,
Start: 13,
End: 32,
},
&analysis.Token{
Term: []byte("the"),
Position: 4,
Start: 33,
End: 36,
},
&analysis.Token{
Term: []byte("URL"),
Position: 5,
Start: 37,
End: 40,
},
&analysis.Token{
Term: []byte("https://blevesearch.com/"),
Position: 6,
Start: 41,
End: 65,
},
},
},
}
// remaining := unicode.NewUnicodeTokenizer()
for _, test := range tests {
// build the requested exception tokenizer
cache := registry.NewCache()
tokenizer, err := cache.DefineTokenizer("custom", test.config)
if err != nil {
t.Fatal(err)
}
// pattern := strings.Join(test.patterns, "|")
// r, err := regexp.Compile(pattern)
// if err != nil {
// t.Fatal(err)
// }
// tokenizer := NewExceptionsTokenizer(r, remaining)
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.result) {
t.Errorf("expected %v, got %v", test.result, actual)
}
}
}