fixed regexp tokenizers to not produce empty tokens
This commit is contained in:
parent
d01ff4ad8a
commit
ffee3c3764
|
@@ -34,17 +34,19 @@ func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer {
|
|||
|
||||
func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
matches := rt.r.FindAllIndex(input, -1)
|
||||
rv := make(analysis.TokenStream, len(matches))
|
||||
rv := make(analysis.TokenStream, 0, len(matches))
|
||||
for i, match := range matches {
|
||||
matchBytes := input[match[0]:match[1]]
|
||||
token := analysis.Token{
|
||||
Term: matchBytes,
|
||||
Start: match[0],
|
||||
End: match[1],
|
||||
Position: i + 1,
|
||||
Type: detectTokenType(matchBytes),
|
||||
if match[1]-match[0] > 0 {
|
||||
token := analysis.Token{
|
||||
Term: matchBytes,
|
||||
Start: match[0],
|
||||
End: match[1],
|
||||
Position: i + 1,
|
||||
Type: detectTokenType(matchBytes),
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
}
|
||||
rv[i] = &token
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
|
|
@@ -113,3 +113,49 @@ func TestBoundary(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBugProducingEmptyTokens(t *testing.T) {
|
||||
|
||||
wordRegex := regexp.MustCompile(`[0-9a-zA-Z_]*`)
|
||||
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
[]byte("Chatha Edwards Sr."),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
Start: 0,
|
||||
End: 6,
|
||||
Term: []byte("Chatha"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
Start: 7,
|
||||
End: 14,
|
||||
Term: []byte("Edwards"),
|
||||
Position: 2,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
Start: 15,
|
||||
End: 17,
|
||||
Term: []byte("Sr"),
|
||||
Position: 3,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
tokenizer := NewRegexpTokenizer(wordRegex)
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue