0
0
Fork 0

Added Russian analyzer with snowball stemmer

This commit is contained in:
Stanislav Sokolov 2017-02-16 12:44:52 +05:00 committed by Sokolov Stanislav
parent 3351c3b046
commit d8d57e6990
9 changed files with 1444 additions and 0 deletions

View File

@ -0,0 +1,57 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
)
// AnalyzerName is the name under which this package's init function registers
// AnalyzerConstructor.
const AnalyzerName = "ru"
// AnalyzerConstructor assembles the Russian analyzer from components held in
// cache: the unicode tokenizer followed, in order, by the lowercase filter,
// the Russian stop word filter and the Russian snowball stemmer filter.
//
// config is not consulted. The first component lookup that fails aborts the
// construction and its error is returned unchanged.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}

	// The order of this list is the order in which the filters are applied.
	filterNames := []string{lowercase.Name, StopName, SnowballStemmerName}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, name := range filterNames {
		filter, err := cache.TokenFilterNamed(name)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor with the registry package under
// AnalyzerName.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,70 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// TestRussianAnalyzer runs the registered Russian analyzer over a few inputs
// and compares the resulting terms: two inflected nouns that must be stemmed
// and one stop word that must be dropped entirely.
func TestRussianAnalyzer(t *testing.T) {
	// single builds the one-token stream expected for a stemmed word.
	single := func(term string) analysis.TokenStream {
		return analysis.TokenStream{&analysis.Token{Term: []byte(term)}}
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{input: []byte("километрах"), output: single("километр")},
		{input: []byte("актеров"), output: single("актер")},
		// stop word
		{input: []byte("как"), output: analysis.TokenStream{}},
	}

	cache := registry.NewCache()
	russian, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, tc := range cases {
		got := russian.Analyze(tc.input)
		if len(got) != len(tc.output) {
			t.Fatalf("expected length: %d, got %d", len(tc.output), len(got))
		}
		// Only the terms are compared; other token fields are ignored.
		for i := range got {
			want := tc.output[i].Term
			if reflect.DeepEqual(got[i].Term, want) {
				continue
			}
			t.Errorf("expected term %s (% x) got %s (% x)", want, want, got[i].Term, got[i].Term)
		}
	}
}

View File

@ -0,0 +1,737 @@
//! This file was generated automatically by the Snowball to Go compiler
//! http://snowballstem.org/
package snowball
import (
snowballRuntime "github.com/snowballstem/snowball/go"
)
// A_0 is the suffix table searched by r_perfective_gerund via FindAmongB.
// B is presumably the code FindAmongB yields for a matching entry (the caller
// branches on 1 and 2); A presumably indexes a shorter entry of this table
// that the entry ends with, or is -1. F is left at its zero value (nil).
var A_0 = []*snowballRuntime.Among{
	{Str: "\u0432\u0448\u0438\u0441\u044C", A: -1, B: 1},
	{Str: "\u044B\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2},
	{Str: "\u0438\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2},
	{Str: "\u0432", A: -1, B: 1},
	{Str: "\u044B\u0432", A: 3, B: 2},
	{Str: "\u0438\u0432", A: 3, B: 2},
	{Str: "\u0432\u0448\u0438", A: -1, B: 1},
	{Str: "\u044B\u0432\u0448\u0438", A: 6, B: 2},
	{Str: "\u0438\u0432\u0448\u0438", A: 6, B: 2},
}
// A_1 is the suffix table searched by r_adjective via FindAmongB. Every entry
// carries B: 1, the only code r_adjective acts on. F is left at its zero
// value (nil).
var A_1 = []*snowballRuntime.Among{
	{Str: "\u0435\u043C\u0443", A: -1, B: 1},
	{Str: "\u043E\u043C\u0443", A: -1, B: 1},
	{Str: "\u044B\u0445", A: -1, B: 1},
	{Str: "\u0438\u0445", A: -1, B: 1},
	{Str: "\u0443\u044E", A: -1, B: 1},
	{Str: "\u044E\u044E", A: -1, B: 1},
	{Str: "\u0435\u044E", A: -1, B: 1},
	{Str: "\u043E\u044E", A: -1, B: 1},
	{Str: "\u044F\u044F", A: -1, B: 1},
	{Str: "\u0430\u044F", A: -1, B: 1},
	{Str: "\u044B\u0435", A: -1, B: 1},
	{Str: "\u0435\u0435", A: -1, B: 1},
	{Str: "\u0438\u0435", A: -1, B: 1},
	{Str: "\u043E\u0435", A: -1, B: 1},
	{Str: "\u044B\u043C\u0438", A: -1, B: 1},
	{Str: "\u0438\u043C\u0438", A: -1, B: 1},
	{Str: "\u044B\u0439", A: -1, B: 1},
	{Str: "\u0435\u0439", A: -1, B: 1},
	{Str: "\u0438\u0439", A: -1, B: 1},
	{Str: "\u043E\u0439", A: -1, B: 1},
	{Str: "\u044B\u043C", A: -1, B: 1},
	{Str: "\u0435\u043C", A: -1, B: 1},
	{Str: "\u0438\u043C", A: -1, B: 1},
	{Str: "\u043E\u043C", A: -1, B: 1},
	{Str: "\u0435\u0433\u043E", A: -1, B: 1},
	{Str: "\u043E\u0433\u043E", A: -1, B: 1},
}
// A_2 is the suffix table searched by r_adjectival (after r_adjective has
// succeeded) via FindAmongB. r_adjectival branches on B values 1 and 2.
// F is left at its zero value (nil).
var A_2 = []*snowballRuntime.Among{
	{Str: "\u0432\u0448", A: -1, B: 1},
	{Str: "\u044B\u0432\u0448", A: 0, B: 2},
	{Str: "\u0438\u0432\u0448", A: 0, B: 2},
	{Str: "\u0449", A: -1, B: 1},
	{Str: "\u044E\u0449", A: 3, B: 1},
	{Str: "\u0443\u044E\u0449", A: 4, B: 2},
	{Str: "\u0435\u043C", A: -1, B: 1},
	{Str: "\u043D\u043D", A: -1, B: 1},
}
// A_3 is the suffix table searched by r_reflexive via FindAmongB. F is left
// at its zero value (nil).
var A_3 = []*snowballRuntime.Among{
	{Str: "\u0441\u044C", A: -1, B: 1},
	{Str: "\u0441\u044F", A: -1, B: 1},
}
// A_4 is the suffix table searched by r_verb via FindAmongB. r_verb branches
// on B values 1 and 2. F is left at its zero value (nil).
var A_4 = []*snowballRuntime.Among{
	{Str: "\u044B\u0442", A: -1, B: 2},
	{Str: "\u044E\u0442", A: -1, B: 1},
	{Str: "\u0443\u044E\u0442", A: 1, B: 2},
	{Str: "\u044F\u0442", A: -1, B: 2},
	{Str: "\u0435\u0442", A: -1, B: 1},
	{Str: "\u0443\u0435\u0442", A: 4, B: 2},
	{Str: "\u0438\u0442", A: -1, B: 2},
	{Str: "\u043D\u044B", A: -1, B: 1},
	{Str: "\u0435\u043D\u044B", A: 7, B: 2},
	{Str: "\u0442\u044C", A: -1, B: 1},
	{Str: "\u044B\u0442\u044C", A: 9, B: 2},
	{Str: "\u0438\u0442\u044C", A: 9, B: 2},
	{Str: "\u0435\u0448\u044C", A: -1, B: 1},
	{Str: "\u0438\u0448\u044C", A: -1, B: 2},
	{Str: "\u044E", A: -1, B: 2},
	{Str: "\u0443\u044E", A: 14, B: 2},
	{Str: "\u043B\u0430", A: -1, B: 1},
	{Str: "\u044B\u043B\u0430", A: 16, B: 2},
	{Str: "\u0438\u043B\u0430", A: 16, B: 2},
	{Str: "\u043D\u0430", A: -1, B: 1},
	{Str: "\u0435\u043D\u0430", A: 19, B: 2},
	{Str: "\u0435\u0442\u0435", A: -1, B: 1},
	{Str: "\u0438\u0442\u0435", A: -1, B: 2},
	{Str: "\u0439\u0442\u0435", A: -1, B: 1},
	{Str: "\u0443\u0439\u0442\u0435", A: 23, B: 2},
	{Str: "\u0435\u0439\u0442\u0435", A: 23, B: 2},
	{Str: "\u043B\u0438", A: -1, B: 1},
	{Str: "\u044B\u043B\u0438", A: 26, B: 2},
	{Str: "\u0438\u043B\u0438", A: 26, B: 2},
	{Str: "\u0439", A: -1, B: 1},
	{Str: "\u0443\u0439", A: 29, B: 2},
	{Str: "\u0435\u0439", A: 29, B: 2},
	{Str: "\u043B", A: -1, B: 1},
	{Str: "\u044B\u043B", A: 32, B: 2},
	{Str: "\u0438\u043B", A: 32, B: 2},
	{Str: "\u044B\u043C", A: -1, B: 2},
	{Str: "\u0435\u043C", A: -1, B: 1},
	{Str: "\u0438\u043C", A: -1, B: 2},
	{Str: "\u043D", A: -1, B: 1},
	{Str: "\u0435\u043D", A: 38, B: 2},
	{Str: "\u043B\u043E", A: -1, B: 1},
	{Str: "\u044B\u043B\u043E", A: 40, B: 2},
	{Str: "\u0438\u043B\u043E", A: 40, B: 2},
	{Str: "\u043D\u043E", A: -1, B: 1},
	{Str: "\u0435\u043D\u043E", A: 43, B: 2},
	{Str: "\u043D\u043D\u043E", A: 43, B: 1},
}
// A_5 is the suffix table searched by r_noun via FindAmongB. Every entry
// carries B: 1, the only code r_noun acts on. F is left at its zero value
// (nil).
var A_5 = []*snowballRuntime.Among{
	{Str: "\u0443", A: -1, B: 1},
	{Str: "\u044F\u0445", A: -1, B: 1},
	{Str: "\u0438\u044F\u0445", A: 1, B: 1},
	{Str: "\u0430\u0445", A: -1, B: 1},
	{Str: "\u044B", A: -1, B: 1},
	{Str: "\u044C", A: -1, B: 1},
	{Str: "\u044E", A: -1, B: 1},
	{Str: "\u044C\u044E", A: 6, B: 1},
	{Str: "\u0438\u044E", A: 6, B: 1},
	{Str: "\u044F", A: -1, B: 1},
	{Str: "\u044C\u044F", A: 9, B: 1},
	{Str: "\u0438\u044F", A: 9, B: 1},
	{Str: "\u0430", A: -1, B: 1},
	{Str: "\u0435\u0432", A: -1, B: 1},
	{Str: "\u043E\u0432", A: -1, B: 1},
	{Str: "\u0435", A: -1, B: 1},
	{Str: "\u044C\u0435", A: 15, B: 1},
	{Str: "\u0438\u0435", A: 15, B: 1},
	{Str: "\u0438", A: -1, B: 1},
	{Str: "\u0435\u0438", A: 18, B: 1},
	{Str: "\u0438\u0438", A: 18, B: 1},
	{Str: "\u044F\u043C\u0438", A: 18, B: 1},
	{Str: "\u0438\u044F\u043C\u0438", A: 21, B: 1},
	{Str: "\u0430\u043C\u0438", A: 18, B: 1},
	{Str: "\u0439", A: -1, B: 1},
	{Str: "\u0435\u0439", A: 24, B: 1},
	{Str: "\u0438\u0435\u0439", A: 25, B: 1},
	{Str: "\u0438\u0439", A: 24, B: 1},
	{Str: "\u043E\u0439", A: 24, B: 1},
	{Str: "\u044F\u043C", A: -1, B: 1},
	{Str: "\u0438\u044F\u043C", A: 29, B: 1},
	{Str: "\u0430\u043C", A: -1, B: 1},
	{Str: "\u0435\u043C", A: -1, B: 1},
	{Str: "\u0438\u0435\u043C", A: 32, B: 1},
	{Str: "\u043E\u043C", A: -1, B: 1},
	{Str: "\u043E", A: -1, B: 1},
}
// A_6 is the suffix table searched by r_derivational via FindAmongB. F is
// left at its zero value (nil).
var A_6 = []*snowballRuntime.Among{
	{Str: "\u043E\u0441\u0442", A: -1, B: 1},
	{Str: "\u043E\u0441\u0442\u044C", A: -1, B: 1},
}
// A_7 is the suffix table searched by r_tidy_up via FindAmongB. r_tidy_up
// branches on B values 1, 2 and 3. F is left at its zero value (nil).
var A_7 = []*snowballRuntime.Among{
	{Str: "\u0435\u0439\u0448", A: -1, B: 1},
	{Str: "\u044C", A: -1, B: 3},
	{Str: "\u0435\u0439\u0448\u0435", A: -1, B: 1},
	{Str: "\u043D", A: -1, B: 2},
}
// G_v is the grouping table that r_mark_regions hands to env.InGrouping and
// env.OutGrouping together with the code point range 1072..1103
// (U+0430 'а' .. U+044F 'я').
// NOTE(review): presumably a bitmap of the vowel set "v" of the Snowball
// Russian algorithm — confirm against the Snowball source before editing.
var G_v = []byte{33, 65, 8, 232}
// Context carries the two region marks that r_mark_regions computes for one
// Stem call and that the other routines read back.
type Context struct {
// i_p2 is the "p2" mark set by r_mark_regions; r_R2 compares the cursor
// against it.
i_p2 int
// i_pV is the "pV" mark set by r_mark_regions; Stem moves the cursor to it
// to establish the backward limit.
i_pV int
}
// r_mark_regions computes the pV and p2 marks for the word in env and stores
// them in the *Context passed as ctx.
//
// Both marks start at env.Limit. Scanning forward from the current cursor,
// pV becomes the position just past the first character that satisfies
// InGrouping(G_v, ...); p2 becomes the position reached after three further
// "gopast" steps (out-of-group, in-group, out-of-group). If the end of the
// word is hit during any step, the marks set so far are kept and the scan
// stops. The cursor is restored before returning, and the result is always
// true. ctx must hold a *Context, otherwise the type assertion panics.
func r_mark_regions(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	context.i_pV = env.Limit
	context.i_p2 = env.Limit

	inGroup := func() bool { return env.InGrouping(G_v, 1072, 1103) }
	outOfGroup := func() bool { return env.OutGrouping(G_v, 1072, 1103) }

	// goPast advances one character at a time until hit succeeds; it gives
	// up (returning false) once the cursor has reached the limit.
	goPast := func(hit func() bool) bool {
		for !hit() {
			if env.Cursor >= env.Limit {
				return false
			}
			env.NextChar()
		}
		return true
	}

	start := env.Cursor
	if goPast(inGroup) {
		// setmark pV
		context.i_pV = env.Cursor
		if goPast(outOfGroup) && goPast(inGroup) && goPast(outOfGroup) {
			// setmark p2
			context.i_p2 = env.Cursor
		}
	}
	env.Cursor = start
	return true
}
// r_R2 reports whether the cursor is at or beyond the p2 mark stored in the
// *Context passed as ctx. It panics if ctx holds any other type.
func r_R2(env *snowballRuntime.Env, ctx interface{}) bool {
	regions := ctx.(*Context)
	return regions.i_p2 <= env.Cursor
}
// r_perfective_gerund tries to strip a suffix listed in A_0, searching
// backwards from the cursor.
//
// For entries with code 1 the suffix is deleted only when it is preceded by
// "\u0430" or "\u044F"; for entries with code 2 it is deleted outright. The
// result is false when no entry matches, when the required preceding letter
// is missing, or when the deletion fails.
func r_perfective_gerund(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_0, context)
	if code == 0 {
		return false
	}
	env.Bra = env.Cursor

	switch code {
	case 1:
		// 'а' or 'я': rewind between the two alternatives.
		mark := env.Limit - env.Cursor
		if !env.EqSB("\u0430") {
			env.Cursor = env.Limit - mark
			if !env.EqSB("\u044F") {
				return false
			}
		}
		return env.SliceDel()
	case 2:
		return env.SliceDel()
	}
	return true
}
// r_adjective tries to strip a suffix listed in A_1, searching backwards from
// the cursor. It returns false when nothing matches or the deletion fails.
func r_adjective(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_1, context)
	if code == 0 {
		return false
	}
	env.Bra = env.Cursor

	if code == 1 {
		return env.SliceDel()
	}
	return true
}
// r_adjectival strips an adjective ending via r_adjective and then optionally
// ("try") a further suffix listed in A_2.
//
// It fails only if r_adjective fails or a deletion fails. When the optional
// A_2 step does not apply — no entry matches, or a code-1 entry is not
// preceded by "\u0430" or "\u044F" — the cursor is rewound to where it stood
// after r_adjective and the result is still true.
func r_adjectival(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)
	if !r_adjective(env, context) {
		return false
	}

	// try
	tryMark := env.Limit - env.Cursor

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_2, context)
	if code == 0 {
		env.Cursor = env.Limit - tryMark
		return true
	}
	env.Bra = env.Cursor

	switch code {
	case 1:
		// 'а' or 'я': rewind between the two alternatives.
		orMark := env.Limit - env.Cursor
		if !env.EqSB("\u0430") {
			env.Cursor = env.Limit - orMark
			if !env.EqSB("\u044F") {
				env.Cursor = env.Limit - tryMark
				return true
			}
		}
		return env.SliceDel()
	case 2:
		return env.SliceDel()
	}
	return true
}
// r_reflexive tries to strip a suffix listed in A_3, searching backwards from
// the cursor. It returns false when nothing matches or the deletion fails.
func r_reflexive(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_3, context)
	if code == 0 {
		return false
	}
	env.Bra = env.Cursor

	if code == 1 {
		return env.SliceDel()
	}
	return true
}
// r_verb tries to strip a suffix listed in A_4, searching backwards from the
// cursor.
//
// For entries with code 1 the suffix is deleted only when it is preceded by
// "\u0430" or "\u044F"; for entries with code 2 it is deleted outright. The
// result is false when no entry matches, when the required preceding letter
// is missing, or when the deletion fails.
func r_verb(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_4, context)
	if code == 0 {
		return false
	}
	env.Bra = env.Cursor

	switch code {
	case 1:
		// 'а' or 'я': rewind between the two alternatives.
		mark := env.Limit - env.Cursor
		if !env.EqSB("\u0430") {
			env.Cursor = env.Limit - mark
			if !env.EqSB("\u044F") {
				return false
			}
		}
		return env.SliceDel()
	case 2:
		return env.SliceDel()
	}
	return true
}
// r_noun tries to strip a suffix listed in A_5, searching backwards from the
// cursor. It returns false when nothing matches or the deletion fails.
func r_noun(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_5, context)
	if code == 0 {
		return false
	}
	env.Bra = env.Cursor

	if code == 1 {
		return env.SliceDel()
	}
	return true
}
// r_derivational tries to strip a suffix listed in A_6, but only when the
// start of the matched suffix passes the r_R2 check (cursor at or beyond the
// p2 mark). It returns false when nothing matches, when the R2 check fails,
// or when the deletion fails.
func r_derivational(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_6, context)
	if code == 0 {
		return false
	}
	env.Bra = env.Cursor

	// R2
	if !r_R2(env, context) {
		return false
	}

	if code == 1 {
		return env.SliceDel()
	}
	return true
}
// r_tidy_up performs the final clean-up on the word, driven by the A_7 entry
// that matches backwards from the cursor:
//
//   - code 1: delete the suffix, then require "\u043D\u043D" before it and
//     delete one "\u043D";
//   - code 2: the matched "\u043D" is deleted only if another "\u043D"
//     precedes it;
//   - code 3: delete the suffix unconditionally.
//
// It returns false when nothing matches, when a required letter is missing,
// or when a deletion fails.
func r_tidy_up(env *snowballRuntime.Env, ctx interface{}) bool {
	context := ctx.(*Context)

	// [substring]
	env.Ket = env.Cursor
	code := env.FindAmongB(A_7, context)
	if code == 0 {
		return false
	}
	env.Bra = env.Cursor

	switch code {
	case 1:
		if !env.SliceDel() {
			return false
		}
		// ['н'] 'н' delete
		env.Ket = env.Cursor
		if !env.EqSB("\u043D") {
			return false
		}
		env.Bra = env.Cursor
		if !env.EqSB("\u043D") {
			return false
		}
		return env.SliceDel()
	case 2:
		if !env.EqSB("\u043D") {
			return false
		}
		return env.SliceDel()
	case 3:
		return env.SliceDel()
	}
	return true
}
// Stem runs the generated Russian stemming routine on the word held in env,
// modifying it in place.
//
// It computes the pV/p2 marks with r_mark_regions, then works backwards from
// the end of the word with the backward limit set to pV:
//
//  1. strip a perfective gerund ending, or else optionally a reflexive
//     ending followed by the first of adjectival / verb / noun that applies;
//  2. optionally delete a trailing "\u0438";
//  3. run r_derivational;
//  4. run r_tidy_up.
//
// Steps 1, 3 and 4 are "do" steps: their outcome is ignored and the cursor
// is restored afterwards. Stem returns false only if the cursor lies before
// pV when the limit is set, or if deleting the trailing "\u0438" fails;
// otherwise it returns true with the backward limit and cursor reset.
func Stem(env *snowballRuntime.Env) bool {
	context := &Context{}

	// do mark_regions: result not consulted, cursor put back.
	entry := env.Cursor
	r_mark_regions(env, context)
	env.Cursor = entry

	// backwards
	env.LimitBackward = env.Cursor
	env.Cursor = env.Limit

	// setlimit tomark pV
	distFromEnd := env.Limit - env.Cursor
	if env.Cursor < context.i_pV {
		return false
	}
	env.Cursor = context.i_pV
	outerLimitBackward := env.LimitBackward
	env.LimitBackward = env.Cursor
	env.Cursor = env.Limit - distFromEnd

	// do: perfective_gerund or (try reflexive (adjectival or verb or noun)).
	// Positions are kept as distances from env.Limit because deletions move
	// the limit.
	endingsMark := env.Limit - env.Cursor
	if !r_perfective_gerund(env, context) {
		env.Cursor = env.Limit - endingsMark

		// try reflexive
		reflexiveMark := env.Limit - env.Cursor
		if !r_reflexive(env, context) {
			env.Cursor = env.Limit - reflexiveMark
		}

		// adjectival or verb or noun; the cursor is rewound between
		// alternatives and the outcome of the last one is irrelevant.
		altMark := env.Limit - env.Cursor
		if !r_adjectival(env, context) {
			env.Cursor = env.Limit - altMark
			if !r_verb(env, context) {
				env.Cursor = env.Limit - altMark
				r_noun(env, context)
			}
		}
	}
	env.Cursor = env.Limit - endingsMark

	// try ['и'] delete
	trailingMark := env.Limit - env.Cursor
	env.Ket = env.Cursor
	if env.EqSB("\u0438") {
		env.Bra = env.Cursor
		if !env.SliceDel() {
			return false
		}
	} else {
		env.Cursor = env.Limit - trailingMark
	}

	// do derivational
	derivationalMark := env.Limit - env.Cursor
	r_derivational(env, context)
	env.Cursor = env.Limit - derivationalMark

	// do tidy_up
	tidyMark := env.Limit - env.Cursor
	r_tidy_up(env, context)
	env.Cursor = env.Limit - tidyMark

	env.LimitBackward = outerLimitBackward
	env.Cursor = env.LimitBackward
	return true
}

View File

@ -0,0 +1,50 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/lang/ru/snowball"
"github.com/blevesearch/bleve/registry"
snowballRuntime "github.com/snowballstem/snowball/go"
)
// SnowballStemmerName is the name under which this package's init function
// registers RussianStemmerFilterConstructor.
const SnowballStemmerName = "stemmer_ru_snowball"
// RussianStemmerFilter is a token filter whose Filter method stems each
// token's term with the Russian snowball stemmer. It has no fields.
type RussianStemmerFilter struct {
}
// NewRussianStemmerFilter returns a new, empty RussianStemmerFilter.
func NewRussianStemmerFilter() *RussianStemmerFilter {
return &RussianStemmerFilter{}
}
// Filter replaces the term of every token in input with its Russian snowball
// stem and returns the same stream; tokens are modified in place.
//
// Tokens flagged as KeyWord are protected from stemming and pass through
// unchanged, matching the behaviour of the generic snowball stemmer filter.
func (s *RussianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		// if it is a protected keyword, leave it alone
		if token.KeyWord {
			continue
		}
		env := snowballRuntime.NewEnv(string(token.Term))
		// The boolean result is deliberately not consulted: whether or not
		// Stem reports success, env.Current() yields the term to keep.
		snowball.Stem(env)
		token.Term = []byte(env.Current())
	}
	return input
}
// RussianStemmerFilterConstructor is the registry constructor for the Russian
// stemmer filter. It ignores config and cache and never returns an error.
func RussianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewRussianStemmerFilter(), nil
}
// init registers RussianStemmerFilterConstructor with the registry package
// under SnowballStemmerName.
func init() {
registry.RegisterTokenFilter(SnowballStemmerName, RussianStemmerFilterConstructor)
}

View File

@ -0,0 +1,67 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// TestSnowballRussianStemmer feeds single-token streams through the
// registered Russian stemmer filter and compares the whole resulting stream
// with the expected one.
func TestSnowballRussianStemmer(t *testing.T) {
	// single builds a stream holding one token with the given term.
	single := func(term string) analysis.TokenStream {
		return analysis.TokenStream{&analysis.Token{Term: []byte(term)}}
	}

	cases := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{input: single("актеров"), output: single("актер")},
		{input: single("километров"), output: single("километр")},
	}

	cache := registry.NewCache()
	stemmer, err := cache.TokenFilterNamed(SnowballStemmerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, tc := range cases {
		got := stemmer.Filter(tc.input)
		if reflect.DeepEqual(got, tc.output) {
			continue
		}
		t.Errorf("expected %s, got %s", tc.output[0].Term, got[0].Term)
	}
}

View File

@ -0,0 +1,33 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token/stop"
"github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds the Russian stop word filter from the
// token map registered under StopName. config is not consulted; a failed
// token map lookup is returned as the error.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	var filter analysis.TokenFilter = stop.NewStopTokensFilter(stopWords)
	return filter, nil
}
// init registers StopTokenFilterConstructor with the registry package under
// StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,256 @@
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// StopName is the name under which both the Russian stop word token map and
// the Russian stop token filter are registered.
const StopName = "stop_ru"
// RussianStopWords holds the Russian stop word list as raw text. As the
// list's own header explains, text after a vertical bar is a comment and each
// stop word is at the start of a line. TokenMapConstructor loads it with
// TokenMap.LoadBytes. The literal is runtime data: do not reformat it.
var RussianStopWords = []byte(` | From http://snowball.tartarus.org/algorithms/russian/stop.txt
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.
| this is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| letter 'ё' is translated to 'е'.
и | and
в | in/into
во | alternative form
не | not
что | what/that
он | he
на | on/onto
я | i
с | from
со | alternative form
как | how
а | milder form of 'no' (but)
то | conjunction and form of 'that'
все | all
она | she
так | so, thus
его | him
но | but
да | yes/and
ты | thou
к | towards, by
у | around, chez
же | intensifier particle
вы | you
за | beyond, behind
бы | conditional/subj. particle
по | up to, along
только | only
ее | her
мне | to me
было | it was
вот | here is/are, particle
от | away from
меня | me
еще | still, yet, more
нет | no, there isnt/arent
о | about
из | out of
ему | to him
теперь | now
когда | when
даже | even
ну | so, well
вдруг | suddenly
ли | interrogative particle
если | if
уже | already, but homonym of 'narrower'
или | or
ни | neither
быть | to be
был | he was
него | prepositional form of его
до | up to
вас | you accusative
нибудь | indef. suffix preceded by hyphen
опять | again
уж | already, but homonym of 'adder'
вам | to you
сказал | he said
ведь | particle 'after all'
там | there
потом | then
себя | oneself
ничего | nothing
ей | to her
может | usually with 'быть' as 'maybe'
они | they
тут | here
где | where
есть | there is/are
надо | got to, must
ней | prepositional form of ей
для | for
мы | we
тебя | thee
их | them, their
чем | than
была | she was
сам | self
чтоб | in order to
без | without
будто | as if
человек | man, person, one
чего | genitive form of 'what'
раз | once
тоже | also
себе | to oneself
под | beneath
жизнь | life
будет | will be
ж | short form of intensifer particle 'же'
тогда | then
кто | who
этот | this
говорил | was saying
того | genitive form of 'that'
потому | for that reason
этого | genitive form of 'this'
какой | which
совсем | altogether
ним | prepositional form of 'его', 'они'
здесь | here
этом | prepositional form of 'этот'
один | one
почти | almost
мой | my
тем | instrumental/dative plural of 'тот', 'то'
чтобы | full form of 'in order that'
нее | her (acc.)
кажется | it seems
сейчас | now
были | they were
куда | where to
зачем | why
сказать | to say
всех | all (acc., gen. preposn. plural)
никогда | never
сегодня | today
можно | possible, one can
при | by
наконец | finally
два | two
об | alternative form of 'о', about
другой | another
хоть | even
после | after
над | above
больше | more
тот | that one (masc.)
через | across, in
эти | these
нас | us
про | about
всего | in all, only, of all
них | prepositional form of 'они' (they)
какая | which, feminine
много | lots
разве | interrogative particle
сказала | she said
три | three
эту | this, acc. fem. sing.
моя | my, feminine
впрочем | moreover, besides
хорошо | good
свою | ones own, acc. fem. sing.
этой | oblique form of 'эта', fem. 'this'
перед | in front of
иногда | sometimes
лучше | better
чуть | a little
том | preposn. form of 'that one'
нельзя | one must not
такой | such a one
им | to them
более | more
всегда | always
конечно | of course
всю | acc. fem. sing of 'all'
между | between
| b: some paradigms
|
| personal pronouns
|
| я меня мне мной [мною]
| ты тебя тебе тобой [тобою]
| он его ему им [него, нему, ним]
| она ее эи ею [нее, нэи, нею]
| оно его ему им [него, нему, ним]
|
| мы нас нам нами
| вы вас вам вами
| они их им ими [них, ним, ними]
|
| себя себе собой [собою]
|
| demonstrative pronouns: этот (this), тот (that)
|
| этот эта это эти
| этого эты это эти
| этого этой этого этих
| этому этой этому этим
| этим этой этим [этою] этими
| этом этой этом этих
|
| тот та то те
| того ту то те
| того той того тех
| тому той тому тем
| тем той тем [тою] теми
| том той том тех
|
| determinative pronouns
|
| (a) весь (all)
|
| весь вся все все
| всего всю все все
| всего всей всего всех
| всему всей всему всем
| всем всей всем [всею] всеми
| всем всей всем всех
|
| (b) сам (himself etc)
|
| сам сама само сами
| самого саму само самих
| самого самой самого самих
| самому самой самому самим
| самим самой самим [самою] самими
| самом самой самом самих
|
| stems of verbs 'to be', 'to have', 'to do' and modal
|
| быть бы буд быв есть суть
| име
| дел
| мог мож мочь
| уме
| хоч хот
| долж
| можн
| нужн
| нельзя
`)
// TokenMapConstructor creates a token map and loads RussianStopWords into it.
// config and cache are not consulted. The map is returned together with any
// error reported by LoadBytes, so on failure the caller still receives the
// map in whatever state loading left it.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if loadErr := stopWords.LoadBytes(RussianStopWords); loadErr != nil {
		return stopWords, loadErr
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry package under StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,59 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"fmt"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/kljensen/snowball"
)
// Name is the name under which this package's init function registers
// SnowballStemmerConstructor.
const Name = "stemmer_snowball"
// SnowballStemmer is a token filter that stems terms with the snowball
// stemmer for one configured language.
type SnowballStemmer struct {
	// language is passed verbatim to snowball.Stem for every token.
	language string
}

// NewSnowballStemmer returns a SnowballStemmer for the given language. The
// language is not validated here; see Filter for how an unsupported language
// is handled.
func NewSnowballStemmer(language string) *SnowballStemmer {
	return &SnowballStemmer{
		language: language,
	}
}

// Filter replaces the term of every token in input with its stem and returns
// the same stream; tokens are modified in place.
//
// Tokens flagged as KeyWord are protected and pass through unchanged. If
// snowball.Stem reports an error (for example an unsupported language), the
// token keeps its original term instead of being overwritten with the empty
// result that accompanies the error.
func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		// if it is a protected keyword, do not stem it
		if token.KeyWord {
			continue
		}
		stemmed, err := snowball.Stem(string(token.Term), s.language, true)
		if err != nil {
			// Filter has no error channel; leaving the term untouched is
			// the least destructive outcome.
			continue
		}
		token.Term = []byte(stemmed)
	}
	return input
}
// SnowballStemmerConstructor is the registry constructor for the snowball
// stemmer filter. It requires config["language"] to be a string and passes it
// to NewSnowballStemmer; a missing or non-string value yields an error. cache
// is not consulted.
func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	if lang, isString := config["language"].(string); isString {
		return NewSnowballStemmer(lang), nil
	}
	return nil, fmt.Errorf("must specify language")
}
// init registers SnowballStemmerConstructor with the registry package under
// Name.
func init() {
registry.RegisterTokenFilter(Name, SnowballStemmerConstructor)
}

View File

@ -0,0 +1,115 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
// TestSnowballStemmer runs the English snowball stemmer filter over a stream
// of tokens and checks every output token against its expected stem. Each
// mismatching token is reported with its index, so a failure points at the
// token that actually differs.
func TestSnowballStemmer(t *testing.T) {
	tests := []struct {
		term    string
		keyWord bool
		want    string
	}{
		{term: "walking", want: "walk"},
		{term: "talked", want: "talk"},
		{term: "business", want: "busi"},
		// protected keyword: must pass through unchanged
		{term: "protected", keyWord: true, want: "protected"},
		{term: "cat", want: "cat"},
		{term: "done", want: "done"},
		// a term which does stem, but does not change length
		{term: "marty", want: "marti"},
	}

	inputTokenStream := make(analysis.TokenStream, 0, len(tests))
	expectedTokenStream := make(analysis.TokenStream, 0, len(tests))
	for _, test := range tests {
		inputTokenStream = append(inputTokenStream, &analysis.Token{
			Term:    []byte(test.term),
			KeyWord: test.keyWord,
		})
		expectedTokenStream = append(expectedTokenStream, &analysis.Token{
			Term:    []byte(test.want),
			KeyWord: test.keyWord,
		})
	}

	filter := NewSnowballStemmer("english")
	outputTokenStream := filter.Filter(inputTokenStream)

	if len(outputTokenStream) != len(expectedTokenStream) {
		t.Fatalf("expected %d tokens, got %d", len(expectedTokenStream), len(outputTokenStream))
	}
	for i := range expectedTokenStream {
		if !reflect.DeepEqual(outputTokenStream[i], expectedTokenStream[i]) {
			t.Errorf("token %d: expected %#v got %#v", i, expectedTokenStream[i], outputTokenStream[i])
		}
	}
}
// BenchmarkSnowballStemmer times the English snowball stemmer filter on a
// fixed six-token stream, one of which is a protected keyword. The same
// stream is passed on every iteration; since Filter rewrites terms in place,
// iterations after the first operate on the already-filtered terms.
func BenchmarkSnowballStemmer(b *testing.B) {
	words := []string{"walking", "talked", "business", "protected", "cat", "done"}
	stream := make(analysis.TokenStream, 0, len(words))
	for _, word := range words {
		stream = append(stream, &analysis.Token{
			Term:    []byte(word),
			KeyWord: word == "protected",
		})
	}

	stemmer := NewSnowballStemmer("english")
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		stemmer.Filter(stream)
	}
}