Changed many components to not have defaults.
Many of these defaults were arbitrary, and not having defaults lets us more easily flag them for configuration. Added a shingle filter and introduced a new token type for shingles.
This commit is contained in:
parent
8dd8fb8910
commit
8debf26cb7
|
@ -21,15 +21,20 @@ const Name = "custom"
|
|||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
|
||||
var charFilters []analysis.CharFilter
|
||||
charFilterNames, ok := config["char_filters"].([]string)
|
||||
charFilterNames, ok := config["char_filters"].([]interface{})
|
||||
if ok {
|
||||
charFilters = make([]analysis.CharFilter, len(charFilterNames))
|
||||
for i, charFilterName := range charFilterNames {
|
||||
charFilter, err := cache.CharFilterNamed(charFilterName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
charFilterNameString, ok := charFilterName.(string)
|
||||
if ok {
|
||||
charFilter, err := cache.CharFilterNamed(charFilterNameString)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
charFilters[i] = charFilter
|
||||
} else {
|
||||
return nil, fmt.Errorf("char filter name must be a string")
|
||||
}
|
||||
charFilters[i] = charFilter
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -44,15 +49,20 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
|||
}
|
||||
|
||||
var tokenFilters []analysis.TokenFilter
|
||||
tokenFilterNames, ok := config["token_filters"].([]string)
|
||||
tokenFilterNames, ok := config["token_filters"].([]interface{})
|
||||
if ok {
|
||||
tokenFilters = make([]analysis.TokenFilter, len(tokenFilterNames))
|
||||
for i, tokenFilterName := range tokenFilterNames {
|
||||
tokenFilter, err := cache.TokenFilterNamed(tokenFilterName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
tokenFilterNameString, ok := tokenFilterName.(string)
|
||||
if ok {
|
||||
tokenFilter, err := cache.TokenFilterNamed(tokenFilterNameString)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tokenFilters[i] = tokenFilter
|
||||
} else {
|
||||
return nil, fmt.Errorf("token filter name must be a string")
|
||||
}
|
||||
tokenFilters[i] = tokenFilter
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ package edge_ngram_filter
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
|
@ -100,16 +101,16 @@ func EdgeNgramFilterConstructor(config map[string]interface{}, cache *registry.C
|
|||
if ok && back {
|
||||
side = BACK
|
||||
}
|
||||
min := 1
|
||||
minVal, ok := config["min"].(float64)
|
||||
if ok {
|
||||
min = int(minVal)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify min")
|
||||
}
|
||||
max := 2
|
||||
min := int(minVal)
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if ok {
|
||||
max = int(maxVal)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify max")
|
||||
}
|
||||
max := int(maxVal)
|
||||
|
||||
return NewEdgeNgramFilter(side, min, max), nil
|
||||
}
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
package length_filter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
|
@ -59,6 +60,9 @@ func LengthFilterConstructor(config map[string]interface{}, cache *registry.Cach
|
|||
if ok {
|
||||
max = int(maxVal)
|
||||
}
|
||||
if min == max && max == 0 {
|
||||
return nil, fmt.Errorf("either min or max must be non-zero")
|
||||
}
|
||||
|
||||
return NewLengthFilter(min, max), nil
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ package ngram_filter
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
|
@ -70,16 +71,16 @@ func buildTermFromRunes(runes []rune) []byte {
|
|||
}
|
||||
|
||||
func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
min := 1
|
||||
minVal, ok := config["min"].(float64)
|
||||
if ok {
|
||||
min = int(minVal)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify min")
|
||||
}
|
||||
max := 2
|
||||
min := int(minVal)
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if ok {
|
||||
max = int(maxVal)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify max")
|
||||
}
|
||||
max := int(maxVal)
|
||||
|
||||
return NewNgramFilter(min, max), nil
|
||||
}
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
package shingle
|
||||
|
||||
import (
|
||||
"container/ring"
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "shingle"

// ShingleFilter combines adjacent tokens into "shingles" (word n-grams).
// For every incoming token it emits one shingle of each length from min to
// max that ends at that token, joining constituent terms with
// tokenSeparator. Position gaps in the input (e.g. tokens removed by a stop
// filter) are padded with the fill string. The ring buffer holds the most
// recent max tokens; itemsInRing counts how many slots are populated so far.
// NOTE(review): ring and itemsInRing persist across Filter calls, so one
// instance carries state from one token stream into the next — confirm each
// use site gets a fresh instance.
type ShingleFilter struct {
	min            int
	max            int
	outputOriginal bool
	tokenSeparator string
	fill           string
	ring           *ring.Ring
	itemsInRing    int
}
|
||||
|
||||
func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
|
||||
return &ShingleFilter{
|
||||
min: min,
|
||||
max: max,
|
||||
outputOriginal: outputOriginal,
|
||||
tokenSeparator: sep,
|
||||
fill: fill,
|
||||
ring: ring.New(max),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
currentPosition := 0
|
||||
for _, token := range input {
|
||||
if s.outputOriginal {
|
||||
rv = append(rv, token)
|
||||
}
|
||||
|
||||
// if there are gaps, insert filler tokens
|
||||
offset := token.Position - currentPosition
|
||||
for offset > 1 {
|
||||
fillerToken := analysis.Token{
|
||||
Position: 0,
|
||||
Start: -1,
|
||||
End: -1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
Term: []byte(s.fill),
|
||||
}
|
||||
s.ring.Value = &fillerToken
|
||||
if s.itemsInRing < s.max {
|
||||
s.itemsInRing++
|
||||
}
|
||||
rv = append(rv, s.shingleCurrentRingState()...)
|
||||
s.ring = s.ring.Next()
|
||||
offset--
|
||||
}
|
||||
currentPosition = token.Position
|
||||
|
||||
s.ring.Value = token
|
||||
if s.itemsInRing < s.max {
|
||||
s.itemsInRing++
|
||||
}
|
||||
rv = append(rv, s.shingleCurrentRingState()...)
|
||||
s.ring = s.ring.Next()
|
||||
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *ShingleFilter) shingleCurrentRingState() analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
for shingleN := s.min; shingleN <= s.max; shingleN++ {
|
||||
// if there are enough items in the ring
|
||||
// to produce a shingle of this size
|
||||
if s.itemsInRing >= shingleN {
|
||||
thisShingleRing := s.ring.Move(-(shingleN - 1))
|
||||
shingledBytes := make([]byte, 0)
|
||||
pos := 0
|
||||
start := -1
|
||||
end := 0
|
||||
for i := 0; i < shingleN; i++ {
|
||||
if i != 0 {
|
||||
shingledBytes = append(shingledBytes, []byte(s.tokenSeparator)...)
|
||||
}
|
||||
curr := thisShingleRing.Value.(*analysis.Token)
|
||||
if pos == 0 && curr.Position != 0 {
|
||||
pos = curr.Position
|
||||
}
|
||||
if start == -1 && curr.Start != -1 {
|
||||
start = curr.Start
|
||||
}
|
||||
if curr.End != -1 {
|
||||
end = curr.End
|
||||
}
|
||||
shingledBytes = append(shingledBytes, curr.Term...)
|
||||
thisShingleRing = thisShingleRing.Next()
|
||||
}
|
||||
token := analysis.Token{
|
||||
Type: analysis.Shingle,
|
||||
Term: shingledBytes,
|
||||
}
|
||||
if pos != 0 {
|
||||
token.Position = pos
|
||||
}
|
||||
if start != -1 {
|
||||
token.Start = start
|
||||
}
|
||||
if end != -1 {
|
||||
token.End = end
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func ShingleFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
minVal, ok := config["min"].(float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify min")
|
||||
}
|
||||
min := int(minVal)
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify max")
|
||||
}
|
||||
max := int(maxVal)
|
||||
|
||||
outputOriginal := false
|
||||
outVal, ok := config["output_original"].(bool)
|
||||
if ok {
|
||||
outputOriginal = outVal
|
||||
}
|
||||
|
||||
sep := " "
|
||||
sepVal, ok := config["separator"].(string)
|
||||
if ok {
|
||||
sep = sepVal
|
||||
}
|
||||
|
||||
fill := "_"
|
||||
fillVal, ok := config["filler"].(string)
|
||||
if ok {
|
||||
fill = fillVal
|
||||
}
|
||||
|
||||
return NewShingleFilter(min, max, outputOriginal, sep, fill), nil
|
||||
}
|
||||
|
||||
// init registers the shingle filter under its canonical name so it can be
// looked up from the token filter registry.
func init() {
	registry.RegisterTokenFilter(Name, ShingleFilterConstructor)
}
|
|
@ -0,0 +1,330 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package shingle
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestNgramFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
min int
|
||||
max int
|
||||
outputOriginal bool
|
||||
separator string
|
||||
filler string
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
min: 2,
|
||||
max: 2,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 3,
|
||||
max: 3,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 2,
|
||||
max: 3,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 3,
|
||||
max: 3,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ugly"),
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
Position: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
Position: 4,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ugly _ quick"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_ quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 1,
|
||||
max: 5,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text"),
|
||||
Position: 2,
|
||||
},
|
||||
// token 3 removed by stop filter
|
||||
&analysis.Token{
|
||||
Term: []byte("see"),
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("shingles"),
|
||||
Position: 5,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text _"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text _"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_ see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text _ see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text _ see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 5,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_ see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text _ see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text _ see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 2,
|
||||
max: 2,
|
||||
outputOriginal: true,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
shingleFilter := NewShingleFilter(test.min, test.max, test.outputOriginal, test.separator, test.filler)
|
||||
actual := shingleFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -12,6 +12,8 @@
|
|||
package stemmer_filter
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"bitbucket.org/tebeka/snowball"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
@ -63,11 +65,11 @@ func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream
|
|||
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
lang := "en"
|
||||
langVal, ok := config["lang"].(string)
|
||||
if ok {
|
||||
lang = langVal
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify stemmer language")
|
||||
}
|
||||
lang := langVal
|
||||
return NewStemmerFilter(lang)
|
||||
}
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ package truncate_token_filter
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
|
@ -51,12 +52,11 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenS
|
|||
}
|
||||
|
||||
func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
length := 25
|
||||
|
||||
lenVal, ok := config["length"].(float64)
|
||||
if ok {
|
||||
length = int(lenVal)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify length")
|
||||
}
|
||||
length := int(lenVal)
|
||||
|
||||
return NewTruncateTokenFilter(length), nil
|
||||
}
|
||||
|
|
|
@ -65,11 +65,11 @@ func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.Tok
|
|||
}
|
||||
|
||||
func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
form := NFKC
|
||||
formVal, ok := config["form"].(string)
|
||||
if ok {
|
||||
form = formVal
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify form")
|
||||
}
|
||||
form := formVal
|
||||
return NewUnicodeNormalizeFilter(form)
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ import (
|
|||
|
||||
const Name = "whitespace"
|
||||
|
||||
var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
|
||||
var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|[^\p{Z}\p{P}\p{C}]+`)
|
||||
|
||||
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil
|
||||
|
|
|
@ -25,15 +25,16 @@ const (
|
|||
Ideographic
|
||||
Numeric
|
||||
DateTime
|
||||
Shingle
|
||||
)
|
||||
|
||||
type Token struct {
|
||||
Start int
|
||||
End int
|
||||
Term []byte
|
||||
Position int
|
||||
Type TokenType
|
||||
KeyWord bool
|
||||
Start int `json:"start"`
|
||||
End int `json:"end"`
|
||||
Term []byte `json:"term"`
|
||||
Position int `json:"position"`
|
||||
Type TokenType `json:"type"`
|
||||
KeyWord bool `json:"keyword"`
|
||||
}
|
||||
|
||||
func (t *Token) String() string {
|
||||
|
|
|
@ -46,6 +46,7 @@ import (
|
|||
_ "github.com/blevesearch/bleve/analysis/token_filters/length_filter"
|
||||
_ "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
_ "github.com/blevesearch/bleve/analysis/token_filters/ngram_filter"
|
||||
_ "github.com/blevesearch/bleve/analysis/token_filters/shingle"
|
||||
_ "github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
|
||||
_ "github.com/blevesearch/bleve/analysis/token_filters/truncate_token_filter"
|
||||
_ "github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||
|
|
|
@ -443,3 +443,11 @@ func (im *IndexMapping) datetimeParserNameForPath(path string) string {
|
|||
|
||||
return im.DefaultDateTimeParser
|
||||
}
|
||||
|
||||
func (im *IndexMapping) AnalyzeText(analyzerName string, text []byte) (analysis.TokenStream, error) {
|
||||
analyzer, err := im.cache.AnalyzerNamed(analyzerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return analyzer.Analyze(text), nil
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue