0
0
Fork 0

fixed regexp tokenizers to not produce empty tokens

This commit is contained in:
Marty Schoch 2016-09-14 16:22:20 -04:00
parent d01ff4ad8a
commit ffee3c3764
2 changed files with 56 additions and 8 deletions

View File

@ -34,17 +34,19 @@ func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer {
func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
matches := rt.r.FindAllIndex(input, -1)
rv := make(analysis.TokenStream, len(matches))
rv := make(analysis.TokenStream, 0, len(matches))
for i, match := range matches {
matchBytes := input[match[0]:match[1]]
token := analysis.Token{
Term: matchBytes,
Start: match[0],
End: match[1],
Position: i + 1,
Type: detectTokenType(matchBytes),
if match[1]-match[0] > 0 {
token := analysis.Token{
Term: matchBytes,
Start: match[0],
End: match[1],
Position: i + 1,
Type: detectTokenType(matchBytes),
}
rv = append(rv, &token)
}
rv[i] = &token
}
return rv
}

View File

@ -113,3 +113,49 @@ func TestBoundary(t *testing.T) {
}
}
}
// TestBugProducingEmptyTokens is a regression test: a pattern that can match
// the empty string (note the `*` quantifier) must not yield empty tokens.
func TestBugProducingEmptyTokens(t *testing.T) {
	wordRegex := regexp.MustCompile(`[0-9a-zA-Z_]*`)
	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("Chatha Edwards Sr."),
			output: analysis.TokenStream{
				{
					Term:     []byte("Chatha"),
					Start:    0,
					End:      6,
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Term:     []byte("Edwards"),
					Start:    7,
					End:      14,
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
				{
					Term:     []byte("Sr"),
					Start:    15,
					End:      17,
					Position: 3,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
	}
	for _, tc := range cases {
		tokenizer := NewRegexpTokenizer(wordRegex)
		actual := tokenizer.Tokenize(tc.input)
		if !reflect.DeepEqual(actual, tc.output) {
			t.Errorf("Expected %v, got %v for %s", tc.output, actual, string(tc.input))
		}
	}
}