0
0
Fork 0

Fix token start/end/position values in camelCase tokenizer

This commit is contained in:
Ethan Koenig 2017-06-22 17:42:39 -04:00
parent 011b168f7b
commit 8994ad2e00
2 changed files with 24 additions and 7 deletions

View File

@ -50,11 +50,12 @@ func NewCamelCaseFilter() *CamelCaseFilter {
func (f *CamelCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
nextPosition := 1
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
p := NewParser(runeCount)
p := NewParser(runeCount, nextPosition, token.Start)
for i := 0; i < runeCount; i++ {
if i+1 >= runeCount {
p.Push(runes[i], nil)
@ -63,6 +64,7 @@ func (f *CamelCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
}
}
rv = append(rv, p.FlushTokens()...)
nextPosition = p.NextPosition()
}
return rv
}

View File

@ -18,10 +18,17 @@ import (
"github.com/blevesearch/bleve/analysis"
)
func buildTokenFromTerm(buffer []rune) *analysis.Token {
return &analysis.Token{
Term: analysis.BuildTermFromRunes(buffer),
// buildTokenFromTerm turns the given runes into a token stamped with the
// parser's current position and offsets, then advances the parser so the next
// token continues where this one ends. Offsets advance by the encoded length
// of the term, not by the number of runes.
func (p *Parser) buildTokenFromTerm(buffer []rune) *analysis.Token {
	term := analysis.BuildTermFromRunes(buffer)

	// Snapshot the values this token owns before moving the cursor forward.
	pos := p.position
	start := p.index
	end := start + len(term)

	p.position = pos + 1
	p.index = end

	return &analysis.Token{
		Term:     term,
		Position: pos,
		Start:    start,
		End:      end,
	}
}
// Parser accepts a symbol and passes it to the current state (representing a class).
@ -35,13 +42,17 @@ type Parser struct {
buffer []rune
current State
tokens []*analysis.Token
position int
index int
}
func NewParser(len int) *Parser {
// NewParser returns a Parser whose rune buffer and token slice are
// pre-allocated with capacity size. position is the Position given to the
// first token the parser builds and index is that token's Start offset; both
// advance as tokens are built.
//
// The first parameter is deliberately not named len, so the builtin len stays
// usable inside the function.
func NewParser(size, position, index int) *Parser {
	return &Parser{
		bufferLen: size,
		buffer:    make([]rune, 0, size),
		tokens:    make([]*analysis.Token, 0, size),
		position:  position,
		index:     index,
	}
}
@ -57,7 +68,7 @@ func (p *Parser) Push(sym rune, peek *rune) {
} else {
// the old state is no more, thus convert the buffer
p.tokens = append(p.tokens, buildTokenFromTerm(p.buffer))
p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
// let the new state begin
p.current = p.NewState(sym)
@ -89,6 +100,10 @@ func (p *Parser) NewState(sym rune) State {
}
// FlushTokens converts whatever remains in the buffer into a final token,
// appends it to the tokens collected so far, and returns the full list.
func (p *Parser) FlushTokens() []*analysis.Token {
	// Build the trailing token exactly once, through the Parser method, so it
	// carries the parser's position and start/end offsets.
	p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
	return p.tokens
}
// NextPosition returns the parser's current position counter, which is the
// Position the next token built by this parser would be given.
func (p *Parser) NextPosition() int {
return p.position
}