introduced token type
This commit is contained in:
parent
c8918fe41a
commit
25540c736a
@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
}
|
||||
token.Start = offset
|
||||
token.End = token.Start + len(token.Term)
|
||||
token.Type = analysis.AlphaNumeric
|
||||
rv = append(rv, token)
|
||||
offset = token.End + 1
|
||||
}
|
||||
|
@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 19,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 2,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 21,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 2,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 72,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 2,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 26,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) {
|
||||
Start: 0,
|
||||
End: 2,
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -12,14 +12,6 @@ import (
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
// var DEFAULT_STOP_WORDS []string = []string{
|
||||
// "a", "an", "and", "are", "as", "at", "be", "but", "by",
|
||||
// "for", "if", "in", "into", "is", "it",
|
||||
// "no", "not", "of", "on", "or", "such",
|
||||
// "that", "the", "their", "then", "there", "these",
|
||||
// "they", "this", "to", "was", "will", "with",
|
||||
// }
|
||||
|
||||
// StopWordsFilter is a token filter (it has a Filter method that takes and
// returns an analysis.TokenStream) that carries a set of stop words.
// NOTE(review): presumably Filter drops tokens whose term is in stopWords;
// the body of Filter is not visible here — confirm.
type StopWordsFilter struct {
|
||||
// stopWords is the stop-word set this filter was configured with.
stopWords StopWordsMap
|
||||
}
|
||||
@ -43,11 +35,3 @@ func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// func buildStopWordMap(words []string) map[string]bool {
|
||||
// rv := make(map[string]bool, len(words))
|
||||
// for _, word := range words {
|
||||
// rv[word] = true
|
||||
// }
|
||||
// return rv
|
||||
// }
|
||||
|
@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
Start: match[0],
|
||||
End: match[1],
|
||||
Position: i + 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
}
|
||||
rv[i] = &token
|
||||
}
|
||||
|
@ -6,10 +6,11 @@
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package simple_word_boundary
|
||||
package regexp_tokenizer
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
@ -17,6 +18,8 @@ import (
|
||||
|
||||
func TestBoundary(t *testing.T) {
|
||||
|
||||
wordRegex := regexp.MustCompile(`\w+`)
|
||||
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) {
|
||||
[]byte("Hello World."),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
5,
|
||||
[]byte("Hello"),
|
||||
1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Term: []byte("Hello"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
6,
|
||||
11,
|
||||
[]byte("World"),
|
||||
2,
|
||||
Start: 6,
|
||||
End: 11,
|
||||
Term: []byte("World"),
|
||||
Position: 2,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
tokenizer := NewSimpleWordBoundaryTokenizer()
|
||||
tokenizer := NewRegexpTokenizer(wordRegex)
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
@ -1,29 +0,0 @@
|
||||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package simple_word_boundary
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
|
||||
)
|
||||
|
||||
const wordPattern = `\w+`
|
||||
|
||||
var wordRegex = regexp.MustCompile(wordPattern)
|
||||
|
||||
type SimpleWordBoundaryTokenizer struct {
|
||||
*regexp_tokenizer.RegexpTokenizer
|
||||
}
|
||||
|
||||
func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
|
||||
return &SimpleWordBoundaryTokenizer{
|
||||
regexp_tokenizer.NewRegexpTokenizer(wordRegex),
|
||||
}
|
||||
}
|
@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: len(input),
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
|
||||
[]byte("Hello World"),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
11,
|
||||
[]byte("Hello World"),
|
||||
1,
|
||||
Start: 0,
|
||||
End: 11,
|
||||
Term: []byte("Hello World"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
|
||||
[]byte("こんにちは世界"),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
21,
|
||||
[]byte("こんにちは世界"),
|
||||
1,
|
||||
Start: 0,
|
||||
End: 21,
|
||||
Term: []byte("こんにちは世界"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
|
||||
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
72,
|
||||
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||
1,
|
||||
Start: 0,
|
||||
End: 72,
|
||||
Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStre
|
||||
End: int(indexB),
|
||||
Term: input[indexA:indexB],
|
||||
Position: position,
|
||||
Type: analysis.AlphaNumeric,
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
}
|
||||
|
@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) {
|
||||
"en_US",
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
5,
|
||||
[]byte("Hello"),
|
||||
1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Term: []byte("Hello"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
6,
|
||||
11,
|
||||
[]byte("World"),
|
||||
2,
|
||||
Start: 6,
|
||||
End: 11,
|
||||
Term: []byte("World"),
|
||||
Position: 2,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) {
|
||||
"en_US",
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
15,
|
||||
[]byte("こんにちは"),
|
||||
1,
|
||||
Start: 0,
|
||||
End: 15,
|
||||
Term: []byte("こんにちは"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
15,
|
||||
21,
|
||||
[]byte("世界"),
|
||||
2,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
Term: []byte("世界"),
|
||||
Position: 2,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) {
|
||||
"th_TH",
|
||||
analysis.TokenStream{
|
||||
{
|
||||
0,
|
||||
9,
|
||||
[]byte("แยก"),
|
||||
1,
|
||||
Start: 0,
|
||||
End: 9,
|
||||
Term: []byte("แยก"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
9,
|
||||
15,
|
||||
[]byte("คำ"),
|
||||
2,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
Term: []byte("คำ"),
|
||||
Position: 2,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
15,
|
||||
27,
|
||||
[]byte("ภาษา"),
|
||||
3,
|
||||
Start: 15,
|
||||
End: 27,
|
||||
Term: []byte("ภาษา"),
|
||||
Position: 3,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
27,
|
||||
36,
|
||||
[]byte("ไทย"),
|
||||
4,
|
||||
Start: 27,
|
||||
End: 36,
|
||||
Term: []byte("ไทย"),
|
||||
Position: 4,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
36,
|
||||
42,
|
||||
[]byte("ก็"),
|
||||
5,
|
||||
Start: 36,
|
||||
End: 42,
|
||||
Term: []byte("ก็"),
|
||||
Position: 5,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
42,
|
||||
57,
|
||||
[]byte("ทำได้"),
|
||||
6,
|
||||
Start: 42,
|
||||
End: 57,
|
||||
Term: []byte("ทำได้"),
|
||||
Position: 6,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
57,
|
||||
63,
|
||||
[]byte("นะ"),
|
||||
7,
|
||||
Start: 57,
|
||||
End: 63,
|
||||
Term: []byte("นะ"),
|
||||
Position: 7,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
63,
|
||||
72,
|
||||
[]byte("จ้ะ"),
|
||||
8,
|
||||
Start: 63,
|
||||
End: 72,
|
||||
Term: []byte("จ้ะ"),
|
||||
Position: 8,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -16,11 +16,19 @@ type CharFilter interface {
|
||||
Filter([]byte) []byte
|
||||
}
|
||||
|
||||
// TokenType classifies a Token; it is the type of the Token.Type field.
type TokenType int
|
||||
|
||||
// Known TokenType values, numbered from zero via iota.
const (
|
||||
// AlphaNumeric is the zero value of TokenType. Every tokenizer and filter
// touched by this change tags the tokens it emits with AlphaNumeric.
AlphaNumeric TokenType = iota
|
||||
// Numeric is declared here but is not assigned by any code visible in this
// change. NOTE(review): presumably intended for purely numeric terms — confirm.
Numeric
|
||||
)
|
||||
|
||||
// Token is one term of a TokenStream, together with where it was found in
// the input and what kind of term it is.
type Token struct {
|
||||
// Start is the offset in the input at which the term begins. The tokenizers
// in this change take it from regexp match indices and slice indices into a
// []byte, so it appears to be a byte offset (not a rune offset) — confirm.
Start int
|
||||
// End is the offset just past the last byte of the term; the Cld2 filter
// computes it as Start + len(Term).
End int
|
||||
// Term is the token text as raw bytes. The Unicode word-boundary tokenizer
// sets it to a sub-slice of its input rather than a copy.
Term []byte
|
||||
// Position is the ordinal of the token within its stream; the tokenizers in
// this change number tokens starting at 1.
Position int
|
||||
// Type is the token's classification (see TokenType).
Type TokenType
|
||||
}
|
||||
|
||||
func (t *Token) String() string {
|
||||
|
Loading…
Reference in New Issue
Block a user