Merge branch 'sokolovstas-ru_analyzer'
This commit is contained in:
commit
e68b70aa82
|
@ -0,0 +1,57 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "ru"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopRuFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerRuFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopRuFilter,
|
||||
stemmerRuFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
|
@ -0,0 +1,122 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestRussianAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("километрах"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("километр"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("актеров"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("актер"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("как"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
// digits safe
|
||||
{
|
||||
input: []byte("text 1000"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("text"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("1000"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Вместе с тем о силе электромагнитной энергии имели представление еще"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("вмест"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("сил"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("электромагнитн"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("энерг"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("имел"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("представлен"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Но знание это хранилось в тайне"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("знан"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("эт"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("хран"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("тайн"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/russian"
|
||||
)
|
||||
|
||||
const SnowballStemmerName = "stemmer_ru_snowball"
|
||||
|
||||
type RussianStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewRussianStemmerFilter() *RussianStemmerFilter {
|
||||
return &RussianStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *RussianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
env := snowballstem.NewEnv(string(token.Term))
|
||||
russian.Stem(env)
|
||||
token.Term = []byte(env.Current())
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func RussianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewRussianStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(SnowballStemmerName, RussianStemmerFilterConstructor)
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestSnowballRussianStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("актеров"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("актер"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("километров"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("километр"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
}
|
|
@ -0,0 +1,267 @@
|
|||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ru"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var RussianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| this is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
| letter 'ё' is translated to 'е'.
|
||||
|
||||
и | and
|
||||
в | in/into
|
||||
во | alternative form
|
||||
не | not
|
||||
что | what/that
|
||||
он | he
|
||||
на | on/onto
|
||||
я | i
|
||||
с | from
|
||||
со | alternative form
|
||||
как | how
|
||||
а | milder form of 'no' (but)
|
||||
то | conjunction and form of 'that'
|
||||
все | all
|
||||
она | she
|
||||
так | so, thus
|
||||
его | him
|
||||
но | but
|
||||
да | yes/and
|
||||
ты | thou
|
||||
к | towards, by
|
||||
у | around, chez
|
||||
же | intensifier particle
|
||||
вы | you
|
||||
за | beyond, behind
|
||||
бы | conditional/subj. particle
|
||||
по | up to, along
|
||||
только | only
|
||||
ее | her
|
||||
мне | to me
|
||||
было | it was
|
||||
вот | here is/are, particle
|
||||
от | away from
|
||||
меня | me
|
||||
еще | still, yet, more
|
||||
нет | no, there isnt/arent
|
||||
о | about
|
||||
из | out of
|
||||
ему | to him
|
||||
теперь | now
|
||||
когда | when
|
||||
даже | even
|
||||
ну | so, well
|
||||
вдруг | suddenly
|
||||
ли | interrogative particle
|
||||
если | if
|
||||
уже | already, but homonym of 'narrower'
|
||||
или | or
|
||||
ни | neither
|
||||
быть | to be
|
||||
был | he was
|
||||
него | prepositional form of его
|
||||
до | up to
|
||||
вас | you accusative
|
||||
нибудь | indef. suffix preceded by hyphen
|
||||
опять | again
|
||||
уж | already, but homonym of 'adder'
|
||||
вам | to you
|
||||
сказал | he said
|
||||
ведь | particle 'after all'
|
||||
там | there
|
||||
потом | then
|
||||
себя | oneself
|
||||
ничего | nothing
|
||||
ей | to her
|
||||
может | usually with 'быть' as 'maybe'
|
||||
они | they
|
||||
тут | here
|
||||
где | where
|
||||
есть | there is/are
|
||||
надо | got to, must
|
||||
ней | prepositional form of ей
|
||||
для | for
|
||||
мы | we
|
||||
тебя | thee
|
||||
их | them, their
|
||||
чем | than
|
||||
была | she was
|
||||
сам | self
|
||||
чтоб | in order to
|
||||
без | without
|
||||
будто | as if
|
||||
человек | man, person, one
|
||||
чего | genitive form of 'what'
|
||||
раз | once
|
||||
тоже | also
|
||||
себе | to oneself
|
||||
под | beneath
|
||||
жизнь | life
|
||||
будет | will be
|
||||
ж | short form of intensifer particle 'же'
|
||||
тогда | then
|
||||
кто | who
|
||||
этот | this
|
||||
говорил | was saying
|
||||
того | genitive form of 'that'
|
||||
потому | for that reason
|
||||
этого | genitive form of 'this'
|
||||
какой | which
|
||||
совсем | altogether
|
||||
ним | prepositional form of 'его', 'они'
|
||||
здесь | here
|
||||
этом | prepositional form of 'этот'
|
||||
один | one
|
||||
почти | almost
|
||||
мой | my
|
||||
тем | instrumental/dative plural of 'тот', 'то'
|
||||
чтобы | full form of 'in order that'
|
||||
нее | her (acc.)
|
||||
кажется | it seems
|
||||
сейчас | now
|
||||
были | they were
|
||||
куда | where to
|
||||
зачем | why
|
||||
сказать | to say
|
||||
всех | all (acc., gen. preposn. plural)
|
||||
никогда | never
|
||||
сегодня | today
|
||||
можно | possible, one can
|
||||
при | by
|
||||
наконец | finally
|
||||
два | two
|
||||
об | alternative form of 'о', about
|
||||
другой | another
|
||||
хоть | even
|
||||
после | after
|
||||
над | above
|
||||
больше | more
|
||||
тот | that one (masc.)
|
||||
через | across, in
|
||||
эти | these
|
||||
нас | us
|
||||
про | about
|
||||
всего | in all, only, of all
|
||||
них | prepositional form of 'они' (they)
|
||||
какая | which, feminine
|
||||
много | lots
|
||||
разве | interrogative particle
|
||||
сказала | she said
|
||||
три | three
|
||||
эту | this, acc. fem. sing.
|
||||
моя | my, feminine
|
||||
впрочем | moreover, besides
|
||||
хорошо | good
|
||||
свою | ones own, acc. fem. sing.
|
||||
этой | oblique form of 'эта', fem. 'this'
|
||||
перед | in front of
|
||||
иногда | sometimes
|
||||
лучше | better
|
||||
чуть | a little
|
||||
том | preposn. form of 'that one'
|
||||
нельзя | one must not
|
||||
такой | such a one
|
||||
им | to them
|
||||
более | more
|
||||
всегда | always
|
||||
конечно | of course
|
||||
всю | acc. fem. sing of 'all'
|
||||
между | between
|
||||
|
||||
|
||||
| b: some paradigms
|
||||
|
|
||||
| personal pronouns
|
||||
|
|
||||
| я меня мне мной [мною]
|
||||
| ты тебя тебе тобой [тобою]
|
||||
| он его ему им [него, нему, ним]
|
||||
| она ее эи ею [нее, нэи, нею]
|
||||
| оно его ему им [него, нему, ним]
|
||||
|
|
||||
| мы нас нам нами
|
||||
| вы вас вам вами
|
||||
| они их им ими [них, ним, ними]
|
||||
|
|
||||
| себя себе собой [собою]
|
||||
|
|
||||
| demonstrative pronouns: этот (this), тот (that)
|
||||
|
|
||||
| этот эта это эти
|
||||
| этого эты это эти
|
||||
| этого этой этого этих
|
||||
| этому этой этому этим
|
||||
| этим этой этим [этою] этими
|
||||
| этом этой этом этих
|
||||
|
|
||||
| тот та то те
|
||||
| того ту то те
|
||||
| того той того тех
|
||||
| тому той тому тем
|
||||
| тем той тем [тою] теми
|
||||
| том той том тех
|
||||
|
|
||||
| determinative pronouns
|
||||
|
|
||||
| (a) весь (all)
|
||||
|
|
||||
| весь вся все все
|
||||
| всего всю все все
|
||||
| всего всей всего всех
|
||||
| всему всей всему всем
|
||||
| всем всей всем [всею] всеми
|
||||
| всем всей всем всех
|
||||
|
|
||||
| (b) сам (himself etc)
|
||||
|
|
||||
| сам сама само сами
|
||||
| самого саму само самих
|
||||
| самого самой самого самих
|
||||
| самому самой самому самим
|
||||
| самим самой самим [самою] самими
|
||||
| самом самой самом самих
|
||||
|
|
||||
| stems of verbs 'to be', 'to have', 'to do' and modal
|
||||
|
|
||||
| быть бы буд быв есть суть
|
||||
| име
|
||||
| дел
|
||||
| мог мож мочь
|
||||
| уме
|
||||
| хоч хот
|
||||
| долж
|
||||
| можн
|
||||
| нужн
|
||||
| нельзя
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(RussianStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package snowball
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/kljensen/snowball"
|
||||
)
|
||||
|
||||
const Name = "stemmer_snowball"
|
||||
|
||||
type SnowballStemmer struct {
|
||||
langauge string
|
||||
}
|
||||
|
||||
func NewSnowballStemmer(language string) *SnowballStemmer {
|
||||
return &SnowballStemmer{
|
||||
langauge: language,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if it is not a protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
stemmed, _ := snowball.Stem(string(token.Term), s.langauge, true)
|
||||
token.Term = []byte(stemmed)
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
language, ok := config["language"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify language")
|
||||
}
|
||||
return NewSnowballStemmer(language), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(Name, SnowballStemmerConstructor)
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package snowball
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestSnowballStemmer(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
// a term which does stem, but does not change length
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("busi"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marti"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewSnowballStemmer("english")
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream[3], ouputTokenStream[3])
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSnowballStemmer(b *testing.B) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewSnowballStemmer("english")
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
filter.Filter(inputTokenStream)
|
||||
}
|
||||
|
||||
}
|
|
@ -17,6 +17,14 @@
|
|||
"branch": "master",
|
||||
"notests": true
|
||||
},
|
||||
{
|
||||
"importpath": "github.com/blevesearch/snowballstem",
|
||||
"repository": "https://github.com/blevesearch/snowballstem",
|
||||
"vcs": "",
|
||||
"revision": "26b06a2c243d4f8ca5db3486f94409dd5b2a7467",
|
||||
"branch": "master",
|
||||
"notests": true
|
||||
},
|
||||
{
|
||||
"importpath": "github.com/boltdb/bolt",
|
||||
"repository": "https://github.com/boltdb/bolt",
|
||||
|
|
Loading…
Reference in New Issue