
introduced token type

Marty Schoch 2014-07-31 13:54:12 -04:00
parent c8918fe41a
commit 25540c736a
11 changed files with 110 additions and 115 deletions

View File

@@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
 		}
 		token.Start = offset
 		token.End = token.Start + len(token.Term)
+		token.Type = analysis.AlphaNumeric
 		rv = append(rv, token)
 		offset = token.End + 1
 	}

View File

@@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 19,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 21,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 72,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 26,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},

View File

@@ -12,14 +12,6 @@ import (
 	"github.com/couchbaselabs/bleve/analysis"
 )

-// var DEFAULT_STOP_WORDS []string = []string{
-// 	"a", "an", "and", "are", "as", "at", "be", "but", "by",
-// 	"for", "if", "in", "into", "is", "it",
-// 	"no", "not", "of", "on", "or", "such",
-// 	"that", "the", "their", "then", "there", "these",
-// 	"they", "this", "to", "was", "will", "with",
-// }
-
 type StopWordsFilter struct {
 	stopWords StopWordsMap
 }
@@ -43,11 +35,3 @@ func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	return rv
 }

-// func buildStopWordMap(words []string) map[string]bool {
-// 	rv := make(map[string]bool, len(words))
-// 	for _, word := range words {
-// 		rv[word] = true
-// 	}
-// 	return rv
-// }

View File

@@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			Start: match[0],
 			End: match[1],
 			Position: i + 1,
+			Type: analysis.AlphaNumeric,
 		}
 		rv[i] = &token
 	}

View File

@@ -6,10 +6,11 @@
 // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 // either express or implied. See the License for the specific language governing permissions
 // and limitations under the License.
-package simple_word_boundary
+package regexp_tokenizer

 import (
 	"reflect"
+	"regexp"
 	"testing"

 	"github.com/couchbaselabs/bleve/analysis"
@@ -17,6 +18,8 @@ import (

 func TestBoundary(t *testing.T) {

+	wordRegex := regexp.MustCompile(`\w+`)
+
 	tests := []struct {
 		input  []byte
 		output analysis.TokenStream
@@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) {
 			[]byte("Hello World."),
 			analysis.TokenStream{
 				{
-					0,
-					5,
-					[]byte("Hello"),
-					1,
+					Start: 0,
+					End: 5,
+					Term: []byte("Hello"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					6,
-					11,
-					[]byte("World"),
-					2,
+					Start: 6,
+					End: 11,
+					Term: []byte("World"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
 	}

 	for _, test := range tests {
-		tokenizer := NewSimpleWordBoundaryTokenizer()
+		tokenizer := NewRegexpTokenizer(wordRegex)
 		actual := tokenizer.Tokenize(test.input)
 		if !reflect.DeepEqual(actual, test.output) {

View File

@@ -1,29 +0,0 @@
-// Copyright (c) 2014 Couchbase, Inc.
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-// except in compliance with the License. You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software distributed under the
-// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-// either express or implied. See the License for the specific language governing permissions
-// and limitations under the License.
-package simple_word_boundary
-
-import (
-	"regexp"
-
-	"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
-)
-
-const wordPattern = `\w+`
-
-var wordRegex = regexp.MustCompile(wordPattern)
-
-type SimpleWordBoundaryTokenizer struct {
-	*regexp_tokenizer.RegexpTokenizer
-}
-
-func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
-	return &SimpleWordBoundaryTokenizer{
-		regexp_tokenizer.NewRegexpTokenizer(wordRegex),
-	}
-}
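
Note: the deleted SimpleWordBoundaryTokenizer above was only a thin wrapper around the generic regexp tokenizer. The sketch below (not part of this commit; the standalone main package is hypothetical) shows the equivalent direct usage, based only on the calls visible in this diff:

package main

import (
	"fmt"
	"regexp"

	"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
)

func main() {
	// The same `\w+` word pattern the removed wrapper compiled is now
	// passed straight to the generic regexp tokenizer, as the rewritten
	// test above does.
	wordRegex := regexp.MustCompile(`\w+`)
	tokenizer := regexp_tokenizer.NewRegexpTokenizer(wordRegex)

	// After this commit each token also carries Type: analysis.AlphaNumeric.
	for _, token := range tokenizer.Tokenize([]byte("Hello World.")) {
		fmt.Printf("%d-%d %s\n", token.Start, token.End, token.Term)
	}
}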

View File

@@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			Position: 1,
 			Start: 0,
 			End: len(input),
+			Type: analysis.AlphaNumeric,
 		},
 	}
 }

View File

@@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("Hello World"),
 			analysis.TokenStream{
 				{
-					0,
-					11,
-					[]byte("Hello World"),
-					1,
+					Start: 0,
+					End: 11,
+					Term: []byte("Hello World"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("こんにちは世界"),
 			analysis.TokenStream{
 				{
-					0,
-					21,
-					[]byte("こんにちは世界"),
-					1,
+					Start: 0,
+					End: 21,
+					Term: []byte("こんにちは世界"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
 			analysis.TokenStream{
 				{
-					0,
-					72,
-					[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
-					1,
+					Start: 0,
+					End: 72,
+					Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},

View File

@@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			End: int(indexB),
 			Term: input[indexA:indexB],
 			Position: position,
+			Type: analysis.AlphaNumeric,
 		}
 		rv = append(rv, &token)
 	}

View File

@@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) {
 			"en_US",
 			analysis.TokenStream{
 				{
-					0,
-					5,
-					[]byte("Hello"),
-					1,
+					Start: 0,
+					End: 5,
+					Term: []byte("Hello"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					6,
-					11,
-					[]byte("World"),
-					2,
+					Start: 6,
+					End: 11,
+					Term: []byte("World"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) {
 			"en_US",
 			analysis.TokenStream{
 				{
-					0,
-					15,
-					[]byte("こんにちは"),
-					1,
+					Start: 0,
+					End: 15,
+					Term: []byte("こんにちは"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					15,
-					21,
-					[]byte("世界"),
-					2,
+					Start: 15,
+					End: 21,
+					Term: []byte("世界"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) {
 			"th_TH",
 			analysis.TokenStream{
 				{
-					0,
-					9,
-					[]byte("แยก"),
-					1,
+					Start: 0,
+					End: 9,
+					Term: []byte("แยก"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					9,
-					15,
-					[]byte("คำ"),
-					2,
+					Start: 9,
+					End: 15,
+					Term: []byte("คำ"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					15,
-					27,
-					[]byte("ภาษา"),
-					3,
+					Start: 15,
+					End: 27,
+					Term: []byte("ภาษา"),
+					Position: 3,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					27,
-					36,
-					[]byte("ไทย"),
-					4,
+					Start: 27,
+					End: 36,
+					Term: []byte("ไทย"),
+					Position: 4,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					36,
-					42,
-					[]byte("ก็"),
-					5,
+					Start: 36,
+					End: 42,
+					Term: []byte("ก็"),
+					Position: 5,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					42,
-					57,
-					[]byte("ทำได้"),
-					6,
+					Start: 42,
+					End: 57,
+					Term: []byte("ทำได้"),
+					Position: 6,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					57,
-					63,
-					[]byte("นะ"),
-					7,
+					Start: 57,
+					End: 63,
+					Term: []byte("นะ"),
+					Position: 7,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					63,
-					72,
-					[]byte("จ้ะ"),
-					8,
+					Start: 63,
+					End: 72,
+					Term: []byte("จ้ะ"),
+					Position: 8,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},

View File

@@ -16,11 +16,19 @@ type CharFilter interface {
 	Filter([]byte) []byte
 }

+type TokenType int
+
+const (
+	AlphaNumeric TokenType = iota
+	Numeric
+)
+
 type Token struct {
 	Start int
 	End int
 	Term []byte
 	Position int
+	Type TokenType
 }

 func (t *Token) String() string {
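
For illustration only, a minimal sketch of building a token with the new Type field, assuming nothing beyond the Token and TokenType definitions shown above (the standalone main package is hypothetical and not part of this commit):

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
)

func main() {
	// Construct a token the way the tokenizers in this commit now do,
	// stamping the new Type field with the AlphaNumeric constant.
	token := analysis.Token{
		Start:    0,
		End:      5,
		Term:     []byte("Hello"),
		Position: 1,
		Type:     analysis.AlphaNumeric,
	}
	fmt.Println(token.String())
}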