
introduced token type

Marty Schoch 2014-07-31 13:54:12 -04:00
parent c8918fe41a
commit 25540c736a
11 changed files with 110 additions and 115 deletions

View File

@@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
}
token.Start = offset
token.End = token.Start + len(token.Term)
token.Type = analysis.AlphaNumeric
rv = append(rv, token)
offset = token.End + 1
}

View File

@@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 19,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
@@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
@@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 21,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
@@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
@@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 72,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
@@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
@@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 26,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
@@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) {
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},

View File

@@ -12,14 +12,6 @@ import (
"github.com/couchbaselabs/bleve/analysis"
)
// var DEFAULT_STOP_WORDS []string = []string{
// "a", "an", "and", "are", "as", "at", "be", "but", "by",
// "for", "if", "in", "into", "is", "it",
// "no", "not", "of", "on", "or", "such",
// "that", "the", "their", "then", "there", "these",
// "they", "this", "to", "was", "will", "with",
// }
type StopWordsFilter struct {
stopWords StopWordsMap
}
@@ -43,11 +35,3 @@ func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
return rv
}
// func buildStopWordMap(words []string) map[string]bool {
// rv := make(map[string]bool, len(words))
// for _, word := range words {
// rv[word] = true
// }
// return rv
// }

View File

@@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
Start: match[0],
End: match[1],
Position: i + 1,
Type: analysis.AlphaNumeric,
}
rv[i] = &token
}

View File

@@ -6,10 +6,11 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package simple_word_boundary
package regexp_tokenizer
import (
"reflect"
"regexp"
"testing"
"github.com/couchbaselabs/bleve/analysis"
@@ -17,6 +18,8 @@ import (
func TestBoundary(t *testing.T) {
wordRegex := regexp.MustCompile(`\w+`)
tests := []struct {
input []byte
output analysis.TokenStream
@@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) {
[]byte("Hello World."),
analysis.TokenStream{
{
0,
5,
[]byte("Hello"),
1,
Start: 0,
End: 5,
Term: []byte("Hello"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
6,
11,
[]byte("World"),
2,
Start: 6,
End: 11,
Term: []byte("World"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
}
for _, test := range tests {
tokenizer := NewSimpleWordBoundaryTokenizer()
tokenizer := NewRegexpTokenizer(wordRegex)
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {

View File

@@ -1,29 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package simple_word_boundary
import (
"regexp"
"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
)
const wordPattern = `\w+`
var wordRegex = regexp.MustCompile(wordPattern)
type SimpleWordBoundaryTokenizer struct {
*regexp_tokenizer.RegexpTokenizer
}
func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
return &SimpleWordBoundaryTokenizer{
regexp_tokenizer.NewRegexpTokenizer(wordRegex),
}
}
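
With the dedicated simple_word_boundary package removed, the same word-boundary behaviour comes from constructing the regexp tokenizer directly, as the rewritten test above now does. A minimal usage sketch built only from calls visible in this diff; every token it emits now carries the new AlphaNumeric type:

    package main

    import (
        "fmt"
        "regexp"

        "github.com/couchbaselabs/bleve/analysis"
        "github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
    )

    func main() {
        // equivalent of the removed NewSimpleWordBoundaryTokenizer()
        tokenizer := regexp_tokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`))
        for _, token := range tokenizer.Tokenize([]byte("Hello World.")) {
            // each token now reports a type alongside term, offsets and position
            fmt.Printf("%s [%d:%d] pos=%d alphanumeric=%t\n",
                token.Term, token.Start, token.End, token.Position,
                token.Type == analysis.AlphaNumeric)
        }
    }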

View File

@@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
Position: 1,
Start: 0,
End: len(input),
Type: analysis.AlphaNumeric,
},
}
}
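
The single token tokenizer gets the same treatment: the one token it produces for the whole input is now typed. A sketch under the assumption that the package lives at analysis/tokenizers/single_token and exposes a NewSingleTokenTokenizer constructor; neither appears in this diff:

    package main

    import (
        "fmt"

        // assumed import path; only Tokenize and the token fields it sets
        // appear in the hunks above
        "github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
    )

    func main() {
        // assumed constructor name, not shown in this diff
        tokenizer := single_token.NewSingleTokenTokenizer()
        tokens := tokenizer.Tokenize([]byte("Hello World"))
        // exactly one token: Term "Hello World", Start 0, End 11,
        // Position 1, Type analysis.AlphaNumeric
        fmt.Println(len(tokens), string(tokens[0].Term), tokens[0].Type)
    }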

View File

@@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
[]byte("Hello World"),
analysis.TokenStream{
{
0,
11,
[]byte("Hello World"),
1,
Start: 0,
End: 11,
Term: []byte("Hello World"),
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
@@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
[]byte("こんにちは世界"),
analysis.TokenStream{
{
0,
21,
[]byte("こんにちは世界"),
1,
Start: 0,
End: 21,
Term: []byte("こんにちは世界"),
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
@@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
analysis.TokenStream{
{
0,
72,
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
1,
Start: 0,
End: 72,
Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},

View File

@@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream {
End: int(indexB),
Term: input[indexA:indexB],
Position: position,
Type: analysis.AlphaNumeric,
}
rv = append(rv, &token)
}

View File

@@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) {
"en_US",
analysis.TokenStream{
{
0,
5,
[]byte("Hello"),
1,
Start: 0,
End: 5,
Term: []byte("Hello"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
6,
11,
[]byte("World"),
2,
Start: 6,
End: 11,
Term: []byte("World"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
@@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) {
"en_US",
analysis.TokenStream{
{
0,
15,
[]byte("こんにちは"),
1,
Start: 0,
End: 15,
Term: []byte("こんにちは"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
15,
21,
[]byte("世界"),
2,
Start: 15,
End: 21,
Term: []byte("世界"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
@@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) {
"th_TH",
analysis.TokenStream{
{
0,
9,
[]byte("แยก"),
1,
Start: 0,
End: 9,
Term: []byte("แยก"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
9,
15,
[]byte("คำ"),
2,
Start: 9,
End: 15,
Term: []byte("คำ"),
Position: 2,
Type: analysis.AlphaNumeric,
},
{
15,
27,
[]byte("ภาษา"),
3,
Start: 15,
End: 27,
Term: []byte("ภาษา"),
Position: 3,
Type: analysis.AlphaNumeric,
},
{
27,
36,
[]byte("ไทย"),
4,
Start: 27,
End: 36,
Term: []byte("ไทย"),
Position: 4,
Type: analysis.AlphaNumeric,
},
{
36,
42,
[]byte("ก็"),
5,
Start: 36,
End: 42,
Term: []byte("ก็"),
Position: 5,
Type: analysis.AlphaNumeric,
},
{
42,
57,
[]byte("ทำได้"),
6,
Start: 42,
End: 57,
Term: []byte("ทำได้"),
Position: 6,
Type: analysis.AlphaNumeric,
},
{
57,
63,
[]byte("นะ"),
7,
Start: 57,
End: 63,
Term: []byte("นะ"),
Position: 7,
Type: analysis.AlphaNumeric,
},
{
63,
72,
[]byte("จ้ะ"),
8,
Start: 63,
End: 72,
Term: []byte("จ้ะ"),
Position: 8,
Type: analysis.AlphaNumeric,
},
},
},

View File

@@ -16,11 +16,19 @@ type CharFilter interface {
Filter([]byte) []byte
}
type TokenType int
const (
AlphaNumeric TokenType = iota
Numeric
)
type Token struct {
Start int
End int
Term []byte
Position int
Type TokenType
}
func (t *Token) String() string {
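
The new TokenType field lands on every token, so downstream consumers can branch on it. A small sketch using only the struct and constants introduced above (the Numeric value is declared here, though nothing in this commit emits it yet):

    package main

    import (
        "fmt"

        "github.com/couchbaselabs/bleve/analysis"
    )

    func describe(t *analysis.Token) string {
        // switch on the type introduced in this commit
        switch t.Type {
        case analysis.Numeric:
            return "numeric"
        case analysis.AlphaNumeric:
            return "alphanumeric"
        default:
            return "unknown"
        }
    }

    func main() {
        token := &analysis.Token{
            Term:     []byte("hello"),
            Start:    0,
            End:      5,
            Position: 1,
            Type:     analysis.AlphaNumeric,
        }
        fmt.Println(describe(token)) // alphanumeric
    }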