introduced token type
This commit is contained in:
parent
c8918fe41a
commit
25540c736a
@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
|||||||
}
|
}
|
||||||
token.Start = offset
|
token.Start = offset
|
||||||
token.End = token.Start + len(token.Term)
|
token.End = token.Start + len(token.Term)
|
||||||
|
token.Type = analysis.AlphaNumeric
|
||||||
rv = append(rv, token)
|
rv = append(rv, token)
|
||||||
offset = token.End + 1
|
offset = token.End + 1
|
||||||
}
|
}
|
||||||
|
@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 19,
|
End: 19,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
output: analysis.TokenStream{
|
output: analysis.TokenStream{
|
||||||
@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 2,
|
End: 2,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 21,
|
End: 21,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
output: analysis.TokenStream{
|
output: analysis.TokenStream{
|
||||||
@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 2,
|
End: 2,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 72,
|
End: 72,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
output: analysis.TokenStream{
|
output: analysis.TokenStream{
|
||||||
@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 2,
|
End: 2,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 26,
|
End: 26,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
output: analysis.TokenStream{
|
output: analysis.TokenStream{
|
||||||
@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) {
|
|||||||
Start: 0,
|
Start: 0,
|
||||||
End: 2,
|
End: 2,
|
||||||
Position: 1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -12,14 +12,6 @@ import (
|
|||||||
"github.com/couchbaselabs/bleve/analysis"
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
)
|
)
|
||||||
|
|
||||||
// var DEFAULT_STOP_WORDS []string = []string{
|
|
||||||
// "a", "an", "and", "are", "as", "at", "be", "but", "by",
|
|
||||||
// "for", "if", "in", "into", "is", "it",
|
|
||||||
// "no", "not", "of", "on", "or", "such",
|
|
||||||
// "that", "the", "their", "then", "there", "these",
|
|
||||||
// "they", "this", "to", "was", "will", "with",
|
|
||||||
// }
|
|
||||||
|
|
||||||
type StopWordsFilter struct {
|
type StopWordsFilter struct {
|
||||||
stopWords StopWordsMap
|
stopWords StopWordsMap
|
||||||
}
|
}
|
||||||
@ -43,11 +35,3 @@ func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
|
|||||||
|
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
// func buildStopWordMap(words []string) map[string]bool {
|
|
||||||
// rv := make(map[string]bool, len(words))
|
|
||||||
// for _, word := range words {
|
|
||||||
// rv[word] = true
|
|
||||||
// }
|
|
||||||
// return rv
|
|
||||||
// }
|
|
||||||
|
@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
Start: match[0],
|
Start: match[0],
|
||||||
End: match[1],
|
End: match[1],
|
||||||
Position: i + 1,
|
Position: i + 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
}
|
}
|
||||||
rv[i] = &token
|
rv[i] = &token
|
||||||
}
|
}
|
||||||
|
@ -6,10 +6,11 @@
|
|||||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||||
// either express or implied. See the License for the specific language governing permissions
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
// and limitations under the License.
|
// and limitations under the License.
|
||||||
package simple_word_boundary
|
package regexp_tokenizer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"reflect"
|
"reflect"
|
||||||
|
"regexp"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/couchbaselabs/bleve/analysis"
|
"github.com/couchbaselabs/bleve/analysis"
|
||||||
@ -17,6 +18,8 @@ import (
|
|||||||
|
|
||||||
func TestBoundary(t *testing.T) {
|
func TestBoundary(t *testing.T) {
|
||||||
|
|
||||||
|
wordRegex := regexp.MustCompile(`\w+`)
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
input []byte
|
input []byte
|
||||||
output analysis.TokenStream
|
output analysis.TokenStream
|
||||||
@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) {
|
|||||||
[]byte("Hello World."),
|
[]byte("Hello World."),
|
||||||
analysis.TokenStream{
|
analysis.TokenStream{
|
||||||
{
|
{
|
||||||
0,
|
Start: 0,
|
||||||
5,
|
End: 5,
|
||||||
[]byte("Hello"),
|
Term: []byte("Hello"),
|
||||||
1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
6,
|
Start: 6,
|
||||||
11,
|
End: 11,
|
||||||
[]byte("World"),
|
Term: []byte("World"),
|
||||||
2,
|
Position: 2,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
tokenizer := NewSimpleWordBoundaryTokenizer()
|
tokenizer := NewRegexpTokenizer(wordRegex)
|
||||||
actual := tokenizer.Tokenize(test.input)
|
actual := tokenizer.Tokenize(test.input)
|
||||||
|
|
||||||
if !reflect.DeepEqual(actual, test.output) {
|
if !reflect.DeepEqual(actual, test.output) {
|
@ -1,29 +0,0 @@
|
|||||||
// Copyright (c) 2014 Couchbase, Inc.
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
||||||
// except in compliance with the License. You may obtain a copy of the License at
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
|
||||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
||||||
// either express or implied. See the License for the specific language governing permissions
|
|
||||||
// and limitations under the License.
|
|
||||||
package simple_word_boundary
|
|
||||||
|
|
||||||
import (
|
|
||||||
"regexp"
|
|
||||||
|
|
||||||
"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
|
|
||||||
)
|
|
||||||
|
|
||||||
const wordPattern = `\w+`
|
|
||||||
|
|
||||||
var wordRegex = regexp.MustCompile(wordPattern)
|
|
||||||
|
|
||||||
type SimpleWordBoundaryTokenizer struct {
|
|
||||||
*regexp_tokenizer.RegexpTokenizer
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
|
|
||||||
return &SimpleWordBoundaryTokenizer{
|
|
||||||
regexp_tokenizer.NewRegexpTokenizer(wordRegex),
|
|
||||||
}
|
|
||||||
}
|
|
@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
|||||||
Position: 1,
|
Position: 1,
|
||||||
Start: 0,
|
Start: 0,
|
||||||
End: len(input),
|
End: len(input),
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
|
|||||||
[]byte("Hello World"),
|
[]byte("Hello World"),
|
||||||
analysis.TokenStream{
|
analysis.TokenStream{
|
||||||
{
|
{
|
||||||
0,
|
Start: 0,
|
||||||
11,
|
End: 11,
|
||||||
[]byte("Hello World"),
|
Term: []byte("Hello World"),
|
||||||
1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
|
|||||||
[]byte("こんにちは世界"),
|
[]byte("こんにちは世界"),
|
||||||
analysis.TokenStream{
|
analysis.TokenStream{
|
||||||
{
|
{
|
||||||
0,
|
Start: 0,
|
||||||
21,
|
End: 21,
|
||||||
[]byte("こんにちは世界"),
|
Term: []byte("こんにちは世界"),
|
||||||
1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
|
|||||||
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||||
analysis.TokenStream{
|
analysis.TokenStream{
|
||||||
{
|
{
|
||||||
0,
|
Start: 0,
|
||||||
72,
|
End: 72,
|
||||||
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||||
1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStre
|
|||||||
End: int(indexB),
|
End: int(indexB),
|
||||||
Term: input[indexA:indexB],
|
Term: input[indexA:indexB],
|
||||||
Position: position,
|
Position: position,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
}
|
}
|
||||||
rv = append(rv, &token)
|
rv = append(rv, &token)
|
||||||
}
|
}
|
||||||
|
@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) {
|
|||||||
"en_US",
|
"en_US",
|
||||||
analysis.TokenStream{
|
analysis.TokenStream{
|
||||||
{
|
{
|
||||||
0,
|
Start: 0,
|
||||||
5,
|
End: 5,
|
||||||
[]byte("Hello"),
|
Term: []byte("Hello"),
|
||||||
1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
6,
|
Start: 6,
|
||||||
11,
|
End: 11,
|
||||||
[]byte("World"),
|
Term: []byte("World"),
|
||||||
2,
|
Position: 2,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) {
|
|||||||
"en_US",
|
"en_US",
|
||||||
analysis.TokenStream{
|
analysis.TokenStream{
|
||||||
{
|
{
|
||||||
0,
|
Start: 0,
|
||||||
15,
|
End: 15,
|
||||||
[]byte("こんにちは"),
|
Term: []byte("こんにちは"),
|
||||||
1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
15,
|
Start: 15,
|
||||||
21,
|
End: 21,
|
||||||
[]byte("世界"),
|
Term: []byte("世界"),
|
||||||
2,
|
Position: 2,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) {
|
|||||||
"th_TH",
|
"th_TH",
|
||||||
analysis.TokenStream{
|
analysis.TokenStream{
|
||||||
{
|
{
|
||||||
0,
|
Start: 0,
|
||||||
9,
|
End: 9,
|
||||||
[]byte("แยก"),
|
Term: []byte("แยก"),
|
||||||
1,
|
Position: 1,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
9,
|
Start: 9,
|
||||||
15,
|
End: 15,
|
||||||
[]byte("คำ"),
|
Term: []byte("คำ"),
|
||||||
2,
|
Position: 2,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
15,
|
Start: 15,
|
||||||
27,
|
End: 27,
|
||||||
[]byte("ภาษา"),
|
Term: []byte("ภาษา"),
|
||||||
3,
|
Position: 3,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
27,
|
Start: 27,
|
||||||
36,
|
End: 36,
|
||||||
[]byte("ไทย"),
|
Term: []byte("ไทย"),
|
||||||
4,
|
Position: 4,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
36,
|
Start: 36,
|
||||||
42,
|
End: 42,
|
||||||
[]byte("ก็"),
|
Term: []byte("ก็"),
|
||||||
5,
|
Position: 5,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
42,
|
Start: 42,
|
||||||
57,
|
End: 57,
|
||||||
[]byte("ทำได้"),
|
Term: []byte("ทำได้"),
|
||||||
6,
|
Position: 6,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
57,
|
Start: 57,
|
||||||
63,
|
End: 63,
|
||||||
[]byte("นะ"),
|
Term: []byte("นะ"),
|
||||||
7,
|
Position: 7,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
63,
|
Start: 63,
|
||||||
72,
|
End: 72,
|
||||||
[]byte("จ้ะ"),
|
Term: []byte("จ้ะ"),
|
||||||
8,
|
Position: 8,
|
||||||
|
Type: analysis.AlphaNumeric,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -16,11 +16,19 @@ type CharFilter interface {
|
|||||||
Filter([]byte) []byte
|
Filter([]byte) []byte
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type TokenType int
|
||||||
|
|
||||||
|
const (
|
||||||
|
AlphaNumeric TokenType = iota
|
||||||
|
Numeric
|
||||||
|
)
|
||||||
|
|
||||||
type Token struct {
|
type Token struct {
|
||||||
Start int
|
Start int
|
||||||
End int
|
End int
|
||||||
Term []byte
|
Term []byte
|
||||||
Position int
|
Position int
|
||||||
|
Type TokenType
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *Token) String() string {
|
func (t *Token) String() string {
|
||||||
|
Loading…
Reference in New Issue
Block a user