Added Russian analyzer with snowball stemmer
This commit is contained in:
parent
3351c3b046
commit
d8d57e6990
|
@ -0,0 +1,57 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
// AnalyzerName is the registry name under which the Russian analyzer is registered.
const AnalyzerName = "ru"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopRuFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerRuFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopRuFilter,
|
||||
stemmerRuFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
// init registers the Russian analyzer constructor under AnalyzerName so it
// can be resolved by name via the bleve registry.
func init() {
	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}
|
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// TestRussianAnalyzer runs the registered "ru" analyzer end to end over a
// small table of inputs, checking stemming ("километрах" -> "километр",
// "актеров" -> "актер") and stop-word removal ("как" -> empty stream).
// Only token terms are compared; positions/offsets are not asserted.
func TestRussianAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{
			input: []byte("километрах"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("километр"),
				},
			},
		},
		{
			input: []byte("актеров"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("актер"),
				},
			},
		},
		// stop word
		{
			input:  []byte("как"),
			output: analysis.TokenStream{},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
|
|
@ -0,0 +1,737 @@
|
|||
//! This file was generated automatically by the Snowball to Go compiler
|
||||
//! http://snowballstem.org/
|
||||
|
||||
package snowball
|
||||
|
||||
import (
|
||||
snowballRuntime "github.com/snowballstem/snowball/go"
|
||||
)
|
||||
|
||||
// A_0 is the generated suffix table searched by r_perfective_gerund
// (Among.B selects the action branch taken on a match).
var A_0 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u0432\u0448\u0438\u0441\u044C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0432", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0432", A: 3, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0432", A: 3, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0432\u0448\u0438", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0432\u0448\u0438", A: 6, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0432\u0448\u0438", A: 6, B: 2, F: nil},
}
|
||||
|
||||
// A_1 is the generated adjective-ending table searched by r_adjective.
var A_1 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u0435\u043C\u0443", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u043C\u0443", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0445", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0445", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0443\u044E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044E\u044E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u044E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u044E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044F\u044F", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0430\u044F", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u043C\u0438", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u043C\u0438", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0439", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0439", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0439", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u0439", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0433\u043E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u0433\u043E", A: -1, B: 1, F: nil},
}
|
||||
|
||||
// A_2 is the generated participle-suffix table searched by r_adjectival.
var A_2 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u0432\u0448", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0432\u0448", A: 0, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0432\u0448", A: 0, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0449", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044E\u0449", A: 3, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0443\u044E\u0449", A: 4, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043D\u043D", A: -1, B: 1, F: nil},
}
|
||||
|
||||
// A_3 is the generated reflexive-suffix table searched by r_reflexive.
var A_3 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u0441\u044C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0441\u044F", A: -1, B: 1, F: nil},
}
|
||||
|
||||
// A_4 is the generated verb-ending table searched by r_verb.
var A_4 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u044B\u0442", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u044E\u0442", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0443\u044E\u0442", A: 1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u044F\u0442", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0442", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0443\u0435\u0442", A: 4, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0442", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043D\u044B", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043D\u044B", A: 7, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0442\u044C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u0442\u044C", A: 9, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0442\u044C", A: 9, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0448\u044C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0448\u044C", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u044E", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0443\u044E", A: 14, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043B\u0430", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u043B\u0430", A: 16, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u043B\u0430", A: 16, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043D\u0430", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043D\u0430", A: 19, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0442\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0442\u0435", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0439\u0442\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0443\u0439\u0442\u0435", A: 23, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0439\u0442\u0435", A: 23, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043B\u0438", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u043B\u0438", A: 26, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u043B\u0438", A: 26, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0439", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0443\u0439", A: 29, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0439", A: 29, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043B", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u043B", A: 32, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u043B", A: 32, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u043C", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u043C", A: -1, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043D", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043D", A: 38, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043B\u043E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B\u043B\u043E", A: 40, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u043B\u043E", A: 40, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043D\u043E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043D\u043E", A: 43, B: 2, F: nil},
	&snowballRuntime.Among{Str: "\u043D\u043D\u043E", A: 43, B: 1, F: nil},
}
|
||||
|
||||
// A_5 is the generated noun-ending table searched by r_noun.
var A_5 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u0443", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044F\u0445", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u044F\u0445", A: 1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0430\u0445", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044B", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044E", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044C\u044E", A: 6, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u044E", A: 6, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044F", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044C\u044F", A: 9, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u044F", A: 9, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0430", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0432", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u0432", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044C\u0435", A: 15, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0435", A: 15, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0438", A: 18, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0438", A: 18, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044F\u043C\u0438", A: 18, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u044F\u043C\u0438", A: 21, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0430\u043C\u0438", A: 18, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0439", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0439", A: 24, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0435\u0439", A: 25, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0439", A: 24, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u0439", A: 24, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044F\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u044F\u043C", A: 29, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0430\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u0438\u0435\u043C", A: 32, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u043C", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E", A: -1, B: 1, F: nil},
}
|
||||
|
||||
// A_6 is the generated derivational-suffix table searched by r_derivational.
var A_6 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u043E\u0441\u0442", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043E\u0441\u0442\u044C", A: -1, B: 1, F: nil},
}
|
||||
|
||||
// A_7 is the generated tidy-up suffix table searched by r_tidy_up.
var A_7 = []*snowballRuntime.Among{
	&snowballRuntime.Among{Str: "\u0435\u0439\u0448", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u044C", A: -1, B: 3, F: nil},
	&snowballRuntime.Among{Str: "\u0435\u0439\u0448\u0435", A: -1, B: 1, F: nil},
	&snowballRuntime.Among{Str: "\u043D", A: -1, B: 2, F: nil},
}
|
||||
|
||||
// G_v is the generated vowel grouping bitmap, used with
// InGrouping/OutGrouping over the code-point range 1072-1103.
var G_v = []byte{33, 65, 8, 232}
|
||||
|
||||
// Context holds the per-word region marks (set by r_mark_regions) that the
// stemming routines consult: i_p2 is the R2 boundary, i_pV the RV boundary.
type Context struct {
	i_p2 int
	i_pV int
}
|
||||
|
||||
// r_mark_regions scans forward through the word to set context.i_pV (cursor
// after the first vowel) and context.i_p2 (cursor after the second
// vowel/non-vowel sequence). Both default to env.Limit when not reached.
// Generated code: the labeled loops emulate Snowball's backtracking; do not
// hand-edit the control flow.
func r_mark_regions(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	// (, line 57
	context.i_pV = env.Limit
	context.i_p2 = env.Limit
	// do, line 61
	var v_1 = env.Cursor
lab0:
	for {
		// (, line 61
		// gopast, line 62
	golab1:
		for {
		lab2:
			for {
				if !env.InGrouping(G_v, 1072, 1103) {
					break lab2
				}
				break golab1
			}
			if env.Cursor >= env.Limit {
				break lab0
			}
			env.NextChar()
		}
		// setmark pV, line 62
		context.i_pV = env.Cursor
		// gopast, line 62
	golab3:
		for {
		lab4:
			for {
				if !env.OutGrouping(G_v, 1072, 1103) {
					break lab4
				}
				break golab3
			}
			if env.Cursor >= env.Limit {
				break lab0
			}
			env.NextChar()
		}
		// gopast, line 63
	golab5:
		for {
		lab6:
			for {
				if !env.InGrouping(G_v, 1072, 1103) {
					break lab6
				}
				break golab5
			}
			if env.Cursor >= env.Limit {
				break lab0
			}
			env.NextChar()
		}
		// gopast, line 63
	golab7:
		for {
		lab8:
			for {
				if !env.OutGrouping(G_v, 1072, 1103) {
					break lab8
				}
				break golab7
			}
			if env.Cursor >= env.Limit {
				break lab0
			}
			env.NextChar()
		}
		// setmark p2, line 63
		context.i_p2 = env.Cursor
		break lab0
	}
	env.Cursor = v_1
	return true
}
|
||||
|
||||
// r_R2 reports whether the current cursor lies within the R2 region
// (i.e. at or past the i_p2 mark computed by r_mark_regions).
func r_R2(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	if !(context.i_p2 <= env.Cursor) {
		return false
	}
	return true
}
|
||||
|
||||
// r_perfective_gerund removes a perfective gerund suffix (table A_0) from the
// end of the word, operating backwards from the cursor. Branch 1 additionally
// requires a preceding "а"/"я". Returns false when no suffix matched or the
// deletion failed. Generated code; do not hand-edit.
func r_perfective_gerund(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 71
	// [, line 72
	env.Ket = env.Cursor
	// substring, line 72
	among_var = env.FindAmongB(A_0, context)
	if among_var == 0 {
		return false
	}
	// ], line 72
	env.Bra = env.Cursor
	if among_var == 0 {
		return false
	} else if among_var == 1 {
		// (, line 76
		// or, line 76
	lab0:
		for {
			var v_1 = env.Limit - env.Cursor
		lab1:
			for {
				// literal, line 76
				if !env.EqSB("\u0430") {
					break lab1
				}
				break lab0
			}
			env.Cursor = env.Limit - v_1
			// literal, line 76
			if !env.EqSB("\u044F") {
				return false
			}
			break lab0
		}
		// delete, line 76
		if !env.SliceDel() {
			return false
		}
	} else if among_var == 2 {
		// (, line 83
		// delete, line 83
		if !env.SliceDel() {
			return false
		}
	}
	return true
}
|
||||
|
||||
// r_adjective removes an adjectival ending (table A_1) from the end of the
// word. Returns false when no ending matched or the deletion failed.
// Generated code; do not hand-edit.
func r_adjective(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 87
	// [, line 88
	env.Ket = env.Cursor
	// substring, line 88
	among_var = env.FindAmongB(A_1, context)
	if among_var == 0 {
		return false
	}
	// ], line 88
	env.Bra = env.Cursor
	if among_var == 0 {
		return false
	} else if among_var == 1 {
		// (, line 97
		// delete, line 97
		if !env.SliceDel() {
			return false
		}
	}
	return true
}
|
||||
|
||||
// r_adjectival first removes an adjectival ending via r_adjective, then
// optionally strips a following participle suffix (table A_2); branch 1
// additionally requires a preceding "а"/"я". The participle step is a
// Snowball "try": on failure the cursor is restored and true is still
// returned. Generated code; do not hand-edit.
func r_adjectival(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 101
	// call adjective, line 102
	if !r_adjective(env, context) {
		return false
	}
	// try, line 109
	var v_1 = env.Limit - env.Cursor
lab0:
	for {
		// (, line 109
		// [, line 110
		env.Ket = env.Cursor
		// substring, line 110
		among_var = env.FindAmongB(A_2, context)
		if among_var == 0 {
			env.Cursor = env.Limit - v_1
			break lab0
		}
		// ], line 110
		env.Bra = env.Cursor
		if among_var == 0 {
			env.Cursor = env.Limit - v_1
			break lab0
		} else if among_var == 1 {
			// (, line 115
			// or, line 115
		lab1:
			for {
				var v_2 = env.Limit - env.Cursor
			lab2:
				for {
					// literal, line 115
					if !env.EqSB("\u0430") {
						break lab2
					}
					break lab1
				}
				env.Cursor = env.Limit - v_2
				// literal, line 115
				if !env.EqSB("\u044F") {
					env.Cursor = env.Limit - v_1
					break lab0
				}
				break lab1
			}
			// delete, line 115
			if !env.SliceDel() {
				return false
			}
		} else if among_var == 2 {
			// (, line 122
			// delete, line 122
			if !env.SliceDel() {
				return false
			}
		}
		break lab0
	}
	return true
}
|
||||
|
||||
// r_reflexive removes a reflexive suffix ("сь"/"ся", table A_3) from the end
// of the word. Returns false when no suffix matched or the deletion failed.
// Generated code; do not hand-edit.
func r_reflexive(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 128
	// [, line 129
	env.Ket = env.Cursor
	// substring, line 129
	among_var = env.FindAmongB(A_3, context)
	if among_var == 0 {
		return false
	}
	// ], line 129
	env.Bra = env.Cursor
	if among_var == 0 {
		return false
	} else if among_var == 1 {
		// (, line 132
		// delete, line 132
		if !env.SliceDel() {
			return false
		}
	}
	return true
}
|
||||
|
||||
// r_verb removes a verb ending (table A_4) from the end of the word; branch 1
// additionally requires a preceding "а"/"я". Returns false when no ending
// matched or the deletion failed. Generated code; do not hand-edit.
func r_verb(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 136
	// [, line 137
	env.Ket = env.Cursor
	// substring, line 137
	among_var = env.FindAmongB(A_4, context)
	if among_var == 0 {
		return false
	}
	// ], line 137
	env.Bra = env.Cursor
	if among_var == 0 {
		return false
	} else if among_var == 1 {
		// (, line 143
		// or, line 143
	lab0:
		for {
			var v_1 = env.Limit - env.Cursor
		lab1:
			for {
				// literal, line 143
				if !env.EqSB("\u0430") {
					break lab1
				}
				break lab0
			}
			env.Cursor = env.Limit - v_1
			// literal, line 143
			if !env.EqSB("\u044F") {
				return false
			}
			break lab0
		}
		// delete, line 143
		if !env.SliceDel() {
			return false
		}
	} else if among_var == 2 {
		// (, line 151
		// delete, line 151
		if !env.SliceDel() {
			return false
		}
	}
	return true
}
|
||||
|
||||
// r_noun removes a noun ending (table A_5) from the end of the word.
// Returns false when no ending matched or the deletion failed.
// Generated code; do not hand-edit.
func r_noun(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 159
	// [, line 160
	env.Ket = env.Cursor
	// substring, line 160
	among_var = env.FindAmongB(A_5, context)
	if among_var == 0 {
		return false
	}
	// ], line 160
	env.Bra = env.Cursor
	if among_var == 0 {
		return false
	} else if among_var == 1 {
		// (, line 167
		// delete, line 167
		if !env.SliceDel() {
			return false
		}
	}
	return true
}
|
||||
|
||||
// r_derivational removes a derivational suffix ("ост"/"ость", table A_6),
// but only when the match lies within the R2 region (r_R2). Returns false
// otherwise. Generated code; do not hand-edit.
func r_derivational(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 175
	// [, line 176
	env.Ket = env.Cursor
	// substring, line 176
	among_var = env.FindAmongB(A_6, context)
	if among_var == 0 {
		return false
	}
	// ], line 176
	env.Bra = env.Cursor
	// call R2, line 176
	if !r_R2(env, context) {
		return false
	}
	if among_var == 0 {
		return false
	} else if among_var == 1 {
		// (, line 179
		// delete, line 179
		if !env.SliceDel() {
			return false
		}
	}
	return true
}
|
||||
|
||||
// r_tidy_up performs the final cleanup pass (table A_7): branch 1 strips a
// superlative suffix and then collapses a trailing double "н" to a single
// one; branch 2 reduces "нн" to "н"; branch 3 drops a trailing soft sign.
// Generated code; do not hand-edit.
func r_tidy_up(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	_ = context
	var among_var int32
	// (, line 183
	// [, line 184
	env.Ket = env.Cursor
	// substring, line 184
	among_var = env.FindAmongB(A_7, context)
	if among_var == 0 {
		return false
	}
	// ], line 184
	env.Bra = env.Cursor
	if among_var == 0 {
		return false
	} else if among_var == 1 {
		// (, line 188
		// delete, line 188
		if !env.SliceDel() {
			return false
		}
		// [, line 189
		env.Ket = env.Cursor
		// literal, line 189
		if !env.EqSB("\u043D") {
			return false
		}
		// ], line 189
		env.Bra = env.Cursor
		// literal, line 189
		if !env.EqSB("\u043D") {
			return false
		}
		// delete, line 189
		if !env.SliceDel() {
			return false
		}
	} else if among_var == 2 {
		// (, line 192
		// literal, line 192
		if !env.EqSB("\u043D") {
			return false
		}
		// delete, line 192
		if !env.SliceDel() {
			return false
		}
	} else if among_var == 3 {
		// (, line 194
		// delete, line 194
		if !env.SliceDel() {
			return false
		}
	}
	return true
}
|
||||
|
||||
// Stem applies the Russian Snowball stemming algorithm to the word held in
// env, mutating env in place. It marks regions, then works backwards within
// the RV region: perfective gerund OR (optional reflexive, then adjectival /
// verb / noun), an optional trailing "и", the derivational step, and finally
// tidy-up. Returns false only when the word has no RV region (cursor before
// i_pV) or a deletion fails. Generated code; do not hand-edit.
func Stem(env *snowballRuntime.Env) bool {
	var context = &Context{
		i_p2: 0,
		i_pV: 0,
	}
	_ = context
	// (, line 199
	// do, line 201
	var v_1 = env.Cursor
lab0:
	for {
		// call mark_regions, line 201
		if !r_mark_regions(env, context) {
			break lab0
		}
		break lab0
	}
	env.Cursor = v_1
	// backwards, line 202
	env.LimitBackward = env.Cursor
	env.Cursor = env.Limit
	// setlimit, line 202
	var v_2 = env.Limit - env.Cursor
	// tomark, line 202
	if env.Cursor < context.i_pV {
		return false
	}
	env.Cursor = context.i_pV
	var v_3 = env.LimitBackward
	env.LimitBackward = env.Cursor
	env.Cursor = env.Limit - v_2
	// (, line 202
	// do, line 203
	var v_4 = env.Limit - env.Cursor
lab1:
	for {
		// (, line 203
		// or, line 204
	lab2:
		for {
			var v_5 = env.Limit - env.Cursor
		lab3:
			for {
				// call perfective_gerund, line 204
				if !r_perfective_gerund(env, context) {
					break lab3
				}
				break lab2
			}
			env.Cursor = env.Limit - v_5
			// (, line 205
			// try, line 205
			var v_6 = env.Limit - env.Cursor
		lab4:
			for {
				// call reflexive, line 205
				if !r_reflexive(env, context) {
					env.Cursor = env.Limit - v_6
					break lab4
				}
				break lab4
			}
			// or, line 206
		lab5:
			for {
				var v_7 = env.Limit - env.Cursor
			lab6:
				for {
					// call adjectival, line 206
					if !r_adjectival(env, context) {
						break lab6
					}
					break lab5
				}
				env.Cursor = env.Limit - v_7
			lab7:
				for {
					// call verb, line 206
					if !r_verb(env, context) {
						break lab7
					}
					break lab5
				}
				env.Cursor = env.Limit - v_7
				// call noun, line 206
				if !r_noun(env, context) {
					break lab1
				}
				break lab5
			}
			break lab2
		}
		break lab1
	}
	env.Cursor = env.Limit - v_4
	// try, line 209
	var v_8 = env.Limit - env.Cursor
lab8:
	for {
		// (, line 209
		// [, line 209
		env.Ket = env.Cursor
		// literal, line 209
		if !env.EqSB("\u0438") {
			env.Cursor = env.Limit - v_8
			break lab8
		}
		// ], line 209
		env.Bra = env.Cursor
		// delete, line 209
		if !env.SliceDel() {
			return false
		}
		break lab8
	}
	// do, line 212
	var v_9 = env.Limit - env.Cursor
lab9:
	for {
		// call derivational, line 212
		if !r_derivational(env, context) {
			break lab9
		}
		break lab9
	}
	env.Cursor = env.Limit - v_9
	// do, line 213
	var v_10 = env.Limit - env.Cursor
lab10:
	for {
		// call tidy_up, line 213
		if !r_tidy_up(env, context) {
			break lab10
		}
		break lab10
	}
	env.Cursor = env.Limit - v_10
	env.LimitBackward = v_3
	env.Cursor = env.LimitBackward
	return true
}
|
|
@ -0,0 +1,50 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/lang/ru/snowball"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
snowballRuntime "github.com/snowballstem/snowball/go"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the registry name of the Russian snowball stemmer token filter.
const SnowballStemmerName = "stemmer_ru_snowball"
|
||||
|
||||
// RussianStemmerFilter is a stateless token filter that stems each token
// with the generated Russian Snowball stemmer.
type RussianStemmerFilter struct {
}
|
||||
|
||||
// NewRussianStemmerFilter returns a new Russian snowball stemmer filter.
func NewRussianStemmerFilter() *RussianStemmerFilter {
	return &RussianStemmerFilter{}
}
|
||||
|
||||
// Filter stems every token in the stream in place and returns the same
// stream. Each term is round-tripped through a fresh snowball runtime Env.
func (s *RussianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {

		env := snowballRuntime.NewEnv(string(token.Term))
		// NOTE(review): Stem's bool result is discarded — presumably Env.Current()
		// yields the unchanged input when stemming bails out; confirm against the
		// snowball runtime before relying on this.
		snowball.Stem(env)
		token.Term = []byte(env.Current())
	}
	return input
}
|
||||
|
||||
// RussianStemmerFilterConstructor adapts NewRussianStemmerFilter to the
// registry's token filter constructor signature; config is unused.
func RussianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewRussianStemmerFilter(), nil
}
|
||||
|
||||
// init registers the Russian snowball stemmer filter under SnowballStemmerName.
func init() {
	registry.RegisterTokenFilter(SnowballStemmerName, RussianStemmerFilterConstructor)
}
|
|
@ -0,0 +1,67 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// TestSnowballRussianStemmer resolves the registered stemmer filter and
// checks two representative stems ("актеров" -> "актер",
// "километров" -> "километр") by deep-comparing the token streams.
func TestSnowballRussianStemmer(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("актеров"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("актер"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("километров"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("километр"),
				},
			},
		},
	}

	cache := registry.NewCache()
	filter, err := cache.TokenFilterNamed(SnowballStemmerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := filter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
|
|
@ -0,0 +1,33 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
// init registers the Russian stop-word token filter under StopName
// ("stop_ru") so analyzers can resolve it through a registry.Cache.
func init() {
	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}
|
|
@ -0,0 +1,256 @@
|
|||
package ru
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ru"
|
||||
|
||||
// RussianStopWords holds the Snowball project's Russian stop word list in its
// original source format: one stop word at the start of a line, with comments
// introduced by a vertical bar ('|'). TokenMapConstructor feeds these raw
// bytes to analysis.TokenMap.LoadBytes, which presumably strips the '|'
// comments and blank lines — TODO confirm against the token map loader.
var RussianStopWords = []byte(` | From http://snowball.tartarus.org/algorithms/russian/stop.txt


| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.

| this is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.

| letter 'ё' is translated to 'е'.

и | and
в | in/into
во | alternative form
не | not
что | what/that
он | he
на | on/onto
я | i
с | from
со | alternative form
как | how
а | milder form of 'no' (but)
то | conjunction and form of 'that'
все | all
она | she
так | so, thus
его | him
но | but
да | yes/and
ты | thou
к | towards, by
у | around, chez
же | intensifier particle
вы | you
за | beyond, behind
бы | conditional/subj. particle
по | up to, along
только | only
ее | her
мне | to me
было | it was
вот | here is/are, particle
от | away from
меня | me
еще | still, yet, more
нет | no, there isnt/arent
о | about
из | out of
ему | to him
теперь | now
когда | when
даже | even
ну | so, well
вдруг | suddenly
ли | interrogative particle
если | if
уже | already, but homonym of 'narrower'
или | or
ни | neither
быть | to be
был | he was
него | prepositional form of его
до | up to
вас | you accusative
нибудь | indef. suffix preceded by hyphen
опять | again
уж | already, but homonym of 'adder'
вам | to you
сказал | he said
ведь | particle 'after all'
там | there
потом | then
себя | oneself
ничего | nothing
ей | to her
может | usually with 'быть' as 'maybe'
они | they
тут | here
где | where
есть | there is/are
надо | got to, must
ней | prepositional form of ей
для | for
мы | we
тебя | thee
их | them, their
чем | than
была | she was
сам | self
чтоб | in order to
без | without
будто | as if
человек | man, person, one
чего | genitive form of 'what'
раз | once
тоже | also
себе | to oneself
под | beneath
жизнь | life
будет | will be
ж | short form of intensifer particle 'же'
тогда | then
кто | who
этот | this
говорил | was saying
того | genitive form of 'that'
потому | for that reason
этого | genitive form of 'this'
какой | which
совсем | altogether
ним | prepositional form of 'его', 'они'
здесь | here
этом | prepositional form of 'этот'
один | one
почти | almost
мой | my
тем | instrumental/dative plural of 'тот', 'то'
чтобы | full form of 'in order that'
нее | her (acc.)
кажется | it seems
сейчас | now
были | they were
куда | where to
зачем | why
сказать | to say
всех | all (acc., gen. preposn. plural)
никогда | never
сегодня | today
можно | possible, one can
при | by
наконец | finally
два | two
об | alternative form of 'о', about
другой | another
хоть | even
после | after
над | above
больше | more
тот | that one (masc.)
через | across, in
эти | these
нас | us
про | about
всего | in all, only, of all
них | prepositional form of 'они' (they)
какая | which, feminine
много | lots
разве | interrogative particle
сказала | she said
три | three
эту | this, acc. fem. sing.
моя | my, feminine
впрочем | moreover, besides
хорошо | good
свою | ones own, acc. fem. sing.
этой | oblique form of 'эта', fem. 'this'
перед | in front of
иногда | sometimes
лучше | better
чуть | a little
том | preposn. form of 'that one'
нельзя | one must not
такой | such a one
им | to them
более | more
всегда | always
конечно | of course
всю | acc. fem. sing of 'all'
между | between


| b: some paradigms
|
| personal pronouns
|
| я меня мне мной [мною]
| ты тебя тебе тобой [тобою]
| он его ему им [него, нему, ним]
| она ее эи ею [нее, нэи, нею]
| оно его ему им [него, нему, ним]
|
| мы нас нам нами
| вы вас вам вами
| они их им ими [них, ним, ними]
|
| себя себе собой [собою]
|
| demonstrative pronouns: этот (this), тот (that)
|
| этот эта это эти
| этого эты это эти
| этого этой этого этих
| этому этой этому этим
| этим этой этим [этою] этими
| этом этой этом этих
|
| тот та то те
| того ту то те
| того той того тех
| тому той тому тем
| тем той тем [тою] теми
| том той том тех
|
| determinative pronouns
|
| (a) весь (all)
|
| весь вся все все
| всего всю все все
| всего всей всего всех
| всему всей всему всем
| всем всей всем [всею] всеми
| всем всей всем всех
|
| (b) сам (himself etc)
|
| сам сама само сами
| самого саму само самих
| самого самой самого самих
| самому самой самому самим
| самим самой самим [самою] самими
| самом самой самом самих
|
| stems of verbs 'to be', 'to have', 'to do' and modal
|
| быть бы буд быв есть суть
| име
| дел
| мог мож мочь
| уме
| хоч хот
| долж
| можн
| нужн
| нельзя
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(RussianStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
// init registers the Russian stop-word token map under StopName ("stop_ru")
// so it can be resolved via cache.TokenMapNamed.
func init() {
	registry.RegisterTokenMap(StopName, TokenMapConstructor)
}
|
|
@ -0,0 +1,59 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package snowball
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/kljensen/snowball"
|
||||
)
|
||||
|
||||
const Name = "stemmer_snowball"
|
||||
|
||||
type SnowballStemmer struct {
|
||||
langauge string
|
||||
}
|
||||
|
||||
func NewSnowballStemmer(language string) *SnowballStemmer {
|
||||
return &SnowballStemmer{
|
||||
langauge: language,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if it is not a protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
stemmed, _ := snowball.Stem(string(token.Term), s.langauge, true)
|
||||
token.Term = []byte(stemmed)
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
language, ok := config["language"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify language")
|
||||
}
|
||||
return NewSnowballStemmer(language), nil
|
||||
}
|
||||
|
||||
// init registers the snowball stemmer token filter under Name
// ("stemmer_snowball") so it can be resolved through a registry.Cache.
func init() {
	registry.RegisterTokenFilter(Name, SnowballStemmerConstructor)
}
|
|
@ -0,0 +1,115 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package snowball
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestSnowballStemmer(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
// a term which does stem, but does not change length
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("busi"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marti"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewSnowballStemmer("english")
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream[3], ouputTokenStream[3])
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSnowballStemmer(b *testing.B) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewSnowballStemmer("english")
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
filter.Filter(inputTokenStream)
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue