Merge pull request #386 from a-little-srdjan/BuildTermFromRunes-in-token-filters
removing duplicate code by reusing util.go in analysis
This commit is contained in:
commit d4097c1f29
camelcase_filter:
@@ -1,28 +1,15 @@
 package camelcase_filter
 
 import (
-	"unicode/utf8"
-
 	"github.com/blevesearch/bleve/analysis"
 )
 
 func buildTokenFromTerm(buffer []rune) *analysis.Token {
 	return &analysis.Token{
-		Term: buildTermFromRunes(buffer),
+		Term: analysis.BuildTermFromRunes(buffer),
 	}
 }
 
-// TODO: Lifted from ngram_filter. Expose as public and re-use?
-func buildTermFromRunes(runes []rune) []byte {
-	rv := make([]byte, 0, len(runes)*4)
-	for _, r := range runes {
-		runeBytes := make([]byte, utf8.RuneLen(r))
-		utf8.EncodeRune(runeBytes, r)
-		rv = append(rv, runeBytes...)
-	}
-	return rv
-}
-
 // Parser accepts a symbol and passes it to the current state (representing a class).
 // The state can accept it (and accumulate it). Otherwise, the parser creates a new state that
 // starts with the pushed symbol.
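The local copy of buildTermFromRunes deleted here (and again in edge_ngram_filter and ngram_filter below) is replaced by one exported helper. Going by the commit message, it lives in util.go in the analysis package; a sketch of what it presumably looks like, reconstructed from the identical copies being removed:

package analysis

import "unicode/utf8"

// BuildTermFromRunes encodes a slice of runes back into a UTF-8 byte slice.
// Sketch reconstructed from the three identical local helpers this commit deletes.
func BuildTermFromRunes(runes []rune) []byte {
	// pre-allocate for the worst case: every rune encodes to 4 bytes
	rv := make([]byte, 0, len(runes)*4)
	for _, r := range runes {
		runeBytes := make([]byte, utf8.RuneLen(r))
		utf8.EncodeRune(runeBytes, r)
		rv = append(rv, runeBytes...)
	}
	return rv
}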
edge_ngram_filter:
@@ -51,7 +51,7 @@ func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
 				// build an ngram of this size starting at i
 				if i-ngramSize > 0 {
-					ngramTerm := buildTermFromRunes(runes[i-ngramSize : i])
+					ngramTerm := analysis.BuildTermFromRunes(runes[i-ngramSize : i])
 					token := analysis.Token{
 						Position: token.Position,
 						Start:    token.Start,
@@ -68,7 +68,7 @@ func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
 				// build an ngram of this size starting at i
 				if i+ngramSize <= runeCount {
-					ngramTerm := buildTermFromRunes(runes[i : i+ngramSize])
+					ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
 					token := analysis.Token{
 						Position: token.Position,
 						Start:    token.Start,
@@ -85,16 +85,6 @@ func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	return rv
 }
 
-func buildTermFromRunes(runes []rune) []byte {
-	rv := make([]byte, 0, len(runes)*4)
-	for _, r := range runes {
-		runeBytes := make([]byte, utf8.RuneLen(r))
-		utf8.EncodeRune(runeBytes, r)
-		rv = append(rv, runeBytes...)
-	}
-	return rv
-}
-
 func EdgeNgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 	side := FRONT
 	back, ok := config["back"].(bool)
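The two call sites above differ only in how they slice: the back-side loop takes runes[i-ngramSize : i] (ngrams ending at position i), while the front-side loop takes runes[i : i+ngramSize] (ngrams starting at i). A minimal standalone illustration of the two directions; edgeNgrams is a hypothetical demo helper, not bleve API:

package main

import "fmt"

// edgeNgrams collects ngrams of sizes min..max anchored at one edge of
// the input. Hypothetical demo helper, not part of bleve.
func edgeNgrams(runes []rune, min, max int, back bool) []string {
	var out []string
	for size := min; size <= max && size <= len(runes); size++ {
		if back {
			// mirrors runes[i-ngramSize : i] with i at the end of the token
			out = append(out, string(runes[len(runes)-size:]))
		} else {
			// mirrors runes[i : i+ngramSize] with i at the start of the token
			out = append(out, string(runes[:size]))
		}
	}
	return out
}

func main() {
	r := []rune("bleve")
	fmt.Println(edgeNgrams(r, 1, 3, false)) // [b bl ble]
	fmt.Println(edgeNgrams(r, 1, 3, true))  // [e ve eve]
}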
ngram_filter:
@@ -43,7 +43,7 @@ func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
 				// build an ngram of this size starting at i
 				if i+ngramSize <= runeCount {
-					ngramTerm := buildTermFromRunes(runes[i : i+ngramSize])
+					ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
 					token := analysis.Token{
 						Position: token.Position,
 						Start:    token.Start,
@@ -60,16 +60,6 @@ func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	return rv
 }
 
-func buildTermFromRunes(runes []rune) []byte {
-	rv := make([]byte, 0, len(runes)*4)
-	for _, r := range runes {
-		runeBytes := make([]byte, utf8.RuneLen(r))
-		utf8.EncodeRune(runeBytes, r)
-		rv = append(rv, runeBytes...)
-	}
-	return rv
-}
-
 func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 	minVal, ok := config["min"]
 	if !ok {
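As an aside, the rune-to-bytes conversion these helpers perform is equivalent to the built-in conversion []byte(string(runes)); the explicit loop with its len(runes)*4 pre-allocation mainly avoids the intermediate string. A quick self-contained check:

package main

import (
	"bytes"
	"fmt"
	"unicode/utf8"
)

func main() {
	runes := []rune("héllo, 世界")

	// explicit encoding, as in the removed buildTermFromRunes helpers
	explicit := make([]byte, 0, len(runes)*4)
	for _, r := range runes {
		buf := make([]byte, utf8.RuneLen(r))
		utf8.EncodeRune(buf, r)
		explicit = append(explicit, buf...)
	}

	// idiomatic conversion through an intermediate string
	viaString := []byte(string(runes))

	fmt.Println(bytes.Equal(explicit, viaString)) // true
}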
truncate_token_filter:
@@ -10,7 +10,6 @@
 package truncate_token_filter
 
 import (
-	"bytes"
 	"fmt"
 	"unicode/utf8"
 
@@ -34,14 +33,7 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)
 		if wordLen > s.length {
-			runes := bytes.Runes(token.Term)[0:s.length]
-			newterm := make([]byte, 0, s.length*4)
-			for _, r := range runes {
-				runeBytes := make([]byte, utf8.RuneLen(r))
-				utf8.EncodeRune(runeBytes, r)
-				newterm = append(newterm, runeBytes...)
-			}
-			token.Term = newterm
+			token.Term = analysis.TruncateRunes(token.Term, wordLen-s.length)
 		}
 	}
 	return input
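Note the second argument to the new call: wordLen-s.length is the number of runes to drop from the end, not the number to keep, so the result still holds the first s.length runes exactly as the inlined code did. A sketch of what analysis.TruncateRunes presumably looks like, inferred from the logic it replaces (the actual implementation lives in the analysis package's util.go):

package analysis

import "bytes"

// TruncateRunes drops the last num runes from a UTF-8 encoded term.
// Sketch inferred from the inline code this commit replaces.
func TruncateRunes(input []byte, num int) []byte {
	runes := bytes.Runes(input)
	runes = runes[:len(runes)-num]
	return BuildTermFromRunes(runes)
}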