0
0
Fork 0

Merge branch 'sokolovstas-ru_analyzer'

This commit is contained in:
Marty Schoch 2018-01-10 15:16:45 -05:00
commit e68b70aa82
9 changed files with 777 additions and 0 deletions

View File

@ -0,0 +1,57 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
)
// AnalyzerName is the name under which the Russian analyzer is registered.
const AnalyzerName = "ru"
// AnalyzerConstructor assembles the Russian analyzer from components looked
// up in cache: the unicode tokenizer followed by the lowercase, Russian stop
// word and Russian Snowball stemmer token filters, applied in that order.
// The config argument is not consulted. The first failed lookup aborts
// construction and its error is returned.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}

	// Filters are resolved in the same order in which they run.
	filterNames := [...]string{lowercase.Name, StopName, SnowballStemmerName}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, filterName := range filterNames {
		filter, err := cache.TokenFilterNamed(filterName)
		if err != nil {
			return nil, err
		}
		filters = append(filters, filter)
	}

	return &analysis.Analyzer{
		Tokenizer:    unicodeTokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers AnalyzerConstructor with the registry under AnalyzerName.
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,122 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// TestRussianAnalyzer runs the registered Russian analyzer over sample inputs
// and checks the number of produced tokens and the term of each token.
func TestRussianAnalyzer(t *testing.T) {
	// terms builds a token stream carrying only the given terms; the test
	// compares nothing but stream length and token terms.
	terms := func(words ...string) analysis.TokenStream {
		stream := analysis.TokenStream{}
		for _, word := range words {
			stream = append(stream, &analysis.Token{Term: []byte(word)})
		}
		return stream
	}

	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stemming
		{input: []byte("километрах"), output: terms("километр")},
		{input: []byte("актеров"), output: terms("актер")},
		// stop word
		{input: []byte("как"), output: terms()},
		// digits safe
		{input: []byte("text 1000"), output: terms("text", "1000")},
		{
			input:  []byte("Вместе с тем о силе электромагнитной энергии имели представление еще"),
			output: terms("вмест", "сил", "электромагнитн", "энерг", "имел", "представлен"),
		},
		{
			input:  []byte("Но знание это хранилось в тайне"),
			output: terms("знан", "эт", "хран", "тайн"),
		},
	}

	analyzer, err := registry.NewCache().AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, tc := range cases {
		got := analyzer.Analyze(tc.input)
		// Abort before indexing into the expected stream with a mismatched length.
		if len(got) != len(tc.output) {
			t.Fatalf("expected length: %d, got %d", len(tc.output), len(got))
		}
		for i := range got {
			want := tc.output[i].Term
			if reflect.DeepEqual(got[i].Term, want) {
				continue
			}
			t.Errorf("expected term %s (% x) got %s (% x)", want, want, got[i].Term, got[i].Term)
		}
	}
}

View File

@ -0,0 +1,49 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/russian"
)
// SnowballStemmerName is the name under which the Russian Snowball stemmer
// token filter is registered.
const SnowballStemmerName = "stemmer_ru_snowball"
// RussianStemmerFilter is a token filter that stems token terms with the
// Snowball Russian stemmer. It carries no state.
type RussianStemmerFilter struct {
}
// NewRussianStemmerFilter returns a new, empty RussianStemmerFilter.
func NewRussianStemmerFilter() *RussianStemmerFilter {
	return new(RussianStemmerFilter)
}
// Filter replaces the term of each token in input with its Snowball Russian
// stem and returns the same stream; tokens are modified in place. Tokens
// flagged as keywords are protected and keep their original term.
func (s *RussianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		// if it is a protected keyword, do not stem it
		if token.KeyWord {
			continue
		}
		env := snowballstem.NewEnv(string(token.Term))
		russian.Stem(env)
		token.Term = []byte(env.Current())
	}
	return input
}
// RussianStemmerFilterConstructor creates the Russian Snowball stemmer token
// filter. Neither config nor cache is consulted, and the returned error is
// always nil.
func RussianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stemmer := NewRussianStemmerFilter()
	return stemmer, nil
}
// init registers RussianStemmerFilterConstructor with the registry under
// SnowballStemmerName.
func init() {
registry.RegisterTokenFilter(SnowballStemmerName, RussianStemmerFilterConstructor)
}

View File

@ -0,0 +1,67 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// TestSnowballRussianStemmer feeds single-token streams through the
// registered Russian Snowball stemmer filter and compares the resulting
// stream with the expected one.
func TestSnowballRussianStemmer(t *testing.T) {
	// single builds a one-token stream whose token carries only the given term.
	single := func(term string) analysis.TokenStream {
		return analysis.TokenStream{
			&analysis.Token{Term: []byte(term)},
		}
	}

	cases := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{input: single("актеров"), output: single("актер")},
		{input: single("километров"), output: single("километр")},
	}

	stemmer, err := registry.NewCache().TokenFilterNamed(SnowballStemmerName)
	if err != nil {
		t.Fatal(err)
	}

	for _, tc := range cases {
		got := stemmer.Filter(tc.input)
		if reflect.DeepEqual(got, tc.output) {
			continue
		}
		t.Errorf("expected %s, got %s", tc.output[0].Term, got[0].Term)
	}
}

View File

@ -0,0 +1,33 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token/stop"
"github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor creates the Russian stop word token filter. It
// looks up the token map registered under StopName in cache and wraps it in
// a stop tokens filter. The config argument is not consulted. A failed token
// map lookup is returned as the error.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	stopWords, lookupErr := cache.TokenMapNamed(StopName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	stopFilter := stop.NewStopTokensFilter(stopWords)
	return stopFilter, nil
}
// init registers StopTokenFilterConstructor with the registry under StopName.
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,267 @@
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// StopName is the name under which both the Russian stop word token map and
// the Russian stop word token filter are registered.
const StopName = "stop_ru"
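// RussianStopWords holds the Russian stop word list in Snowball format.
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// Backticks (`) were changed to ' so the list fits in a raw string literal.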
var RussianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.
| this is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| letter 'ё' is translated to 'е'.
и | and
в | in/into
во | alternative form
не | not
что | what/that
он | he
на | on/onto
я | i
с | from
со | alternative form
как | how
а | milder form of 'no' (but)
то | conjunction and form of 'that'
все | all
она | she
так | so, thus
его | him
но | but
да | yes/and
ты | thou
к | towards, by
у | around, chez
же | intensifier particle
вы | you
за | beyond, behind
бы | conditional/subj. particle
по | up to, along
только | only
ее | her
мне | to me
было | it was
вот | here is/are, particle
от | away from
меня | me
еще | still, yet, more
нет | no, there isnt/arent
о | about
из | out of
ему | to him
теперь | now
когда | when
даже | even
ну | so, well
вдруг | suddenly
ли | interrogative particle
если | if
уже | already, but homonym of 'narrower'
или | or
ни | neither
быть | to be
был | he was
него | prepositional form of его
до | up to
вас | you accusative
нибудь | indef. suffix preceded by hyphen
опять | again
уж | already, but homonym of 'adder'
вам | to you
сказал | he said
ведь | particle 'after all'
там | there
потом | then
себя | oneself
ничего | nothing
ей | to her
может | usually with 'быть' as 'maybe'
они | they
тут | here
где | where
есть | there is/are
надо | got to, must
ней | prepositional form of ей
для | for
мы | we
тебя | thee
их | them, their
чем | than
была | she was
сам | self
чтоб | in order to
без | without
будто | as if
человек | man, person, one
чего | genitive form of 'what'
раз | once
тоже | also
себе | to oneself
под | beneath
жизнь | life
будет | will be
ж | short form of intensifer particle 'же'
тогда | then
кто | who
этот | this
говорил | was saying
того | genitive form of 'that'
потому | for that reason
этого | genitive form of 'this'
какой | which
совсем | altogether
ним | prepositional form of 'его', 'они'
здесь | here
этом | prepositional form of 'этот'
один | one
почти | almost
мой | my
тем | instrumental/dative plural of 'тот', 'то'
чтобы | full form of 'in order that'
нее | her (acc.)
кажется | it seems
сейчас | now
были | they were
куда | where to
зачем | why
сказать | to say
всех | all (acc., gen. preposn. plural)
никогда | never
сегодня | today
можно | possible, one can
при | by
наконец | finally
два | two
об | alternative form of 'о', about
другой | another
хоть | even
после | after
над | above
больше | more
тот | that one (masc.)
через | across, in
эти | these
нас | us
про | about
всего | in all, only, of all
них | prepositional form of 'они' (they)
какая | which, feminine
много | lots
разве | interrogative particle
сказала | she said
три | three
эту | this, acc. fem. sing.
моя | my, feminine
впрочем | moreover, besides
хорошо | good
свою | ones own, acc. fem. sing.
этой | oblique form of 'эта', fem. 'this'
перед | in front of
иногда | sometimes
лучше | better
чуть | a little
том | preposn. form of 'that one'
нельзя | one must not
такой | such a one
им | to them
более | more
всегда | always
конечно | of course
всю | acc. fem. sing of 'all'
между | between
| b: some paradigms
|
| personal pronouns
|
| я меня мне мной [мною]
| ты тебя тебе тобой [тобою]
| он его ему им [него, нему, ним]
| она ее эи ею [нее, нэи, нею]
| оно его ему им [него, нему, ним]
|
| мы нас нам нами
| вы вас вам вами
| они их им ими [них, ним, ними]
|
| себя себе собой [собою]
|
| demonstrative pronouns: этот (this), тот (that)
|
| этот эта это эти
| этого эты это эти
| этого этой этого этих
| этому этой этому этим
| этим этой этим [этою] этими
| этом этой этом этих
|
| тот та то те
| того ту то те
| того той того тех
| тому той тому тем
| тем той тем [тою] теми
| том той том тех
|
| determinative pronouns
|
| (a) весь (all)
|
| весь вся все все
| всего всю все все
| всего всей всего всех
| всему всей всему всем
| всем всей всем [всею] всеми
| всем всей всем всех
|
| (b) сам (himself etc)
|
| сам сама само сами
| самого саму само самих
| самого самой самого самих
| самому самой самому самим
| самим самой самим [самою] самими
| самом самой самом самих
|
| stems of verbs 'to be', 'to have', 'to do' and modal
|
| быть бы буд быв есть суть
| име
| дел
| мог мож мочь
| уме
| хоч хот
| долж
| можн
| нужн
| нельзя
`)
// TokenMapConstructor creates a token map and loads RussianStopWords into it.
// Neither config nor cache is consulted. The token map is returned together
// with whatever error LoadBytes reported, so on failure the caller still
// receives the map alongside the non-nil error.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	stopWords := analysis.NewTokenMap()
	if loadErr := stopWords.LoadBytes(RussianStopWords); loadErr != nil {
		return stopWords, loadErr
	}
	return stopWords, nil
}
// init registers TokenMapConstructor with the registry under StopName.
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,59 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"fmt"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/kljensen/snowball"
)
// Name is the name under which the snowball stemmer token filter is registered.
const Name = "stemmer_snowball"
// SnowballStemmer is a token filter that stems token terms with the snowball
// package for one configured language.
type SnowballStemmer struct {
// langauge is the language name handed to snowball.Stem for every term.
langauge string
}
// NewSnowballStemmer returns a SnowballStemmer that stems terms for the given
// language. The language is stored as given and is not validated here.
func NewSnowballStemmer(language string) *SnowballStemmer {
	stemmer := new(SnowballStemmer)
	stemmer.langauge = language
	return stemmer
}
// Filter replaces the term of each token in input with its stem for the
// configured language and returns the same stream; tokens are modified in
// place. Tokens flagged as keywords are protected and keep their term, as do
// tokens whose term could not be stemmed.
func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		// if it is a protected keyword, do not stem it
		if token.KeyWord {
			continue
		}
		stemmed, err := snowball.Stem(string(token.Term), s.langauge, true)
		if err != nil {
			// Keep the original term instead of overwriting it with the
			// result of a failed call (presumably an unsupported
			// language — confirm against the snowball package).
			continue
		}
		token.Term = []byte(stemmed)
	}
	return input
}
// SnowballStemmerConstructor creates a snowball stemmer token filter for the
// language given by the string value of config["language"]. It returns an
// error when that entry is missing or is not a string. The cache argument is
// not consulted.
func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	if language, isString := config["language"].(string); isString {
		return NewSnowballStemmer(language), nil
	}
	return nil, fmt.Errorf("must specify language")
}
// init registers SnowballStemmerConstructor with the registry under Name.
func init() {
registry.RegisterTokenFilter(Name, SnowballStemmerConstructor)
}

View File

@ -0,0 +1,115 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
// TestSnowballStemmer runs the English snowball stemmer over a fixed token
// stream and checks every resulting token, including one flagged as a
// keyword that must pass through unchanged.
func TestSnowballStemmer(t *testing.T) {
	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walking"),
		},
		&analysis.Token{
			Term: []byte("talked"),
		},
		&analysis.Token{
			Term: []byte("business"),
		},
		&analysis.Token{
			Term:    []byte("protected"),
			KeyWord: true,
		},
		&analysis.Token{
			Term: []byte("cat"),
		},
		&analysis.Token{
			Term: []byte("done"),
		},
		// a term which does stem, but does not change length
		&analysis.Token{
			Term: []byte("marty"),
		},
	}

	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("talk"),
		},
		&analysis.Token{
			Term: []byte("busi"),
		},
		&analysis.Token{
			Term:    []byte("protected"),
			KeyWord: true,
		},
		&analysis.Token{
			Term: []byte("cat"),
		},
		&analysis.Token{
			Term: []byte("done"),
		},
		&analysis.Token{
			Term: []byte("marti"),
		},
	}

	filter := NewSnowballStemmer("english")
	outputTokenStream := filter.Filter(inputTokenStream)

	// Check the length first so the per-token comparison cannot index out of range.
	if len(outputTokenStream) != len(expectedTokenStream) {
		t.Fatalf("expected %d tokens, got %d", len(expectedTokenStream), len(outputTokenStream))
	}
	// Report each token that actually differs rather than a fixed position.
	for i, expected := range expectedTokenStream {
		if !reflect.DeepEqual(outputTokenStream[i], expected) {
			t.Errorf("token %d: expected %#v got %#v", i, expected, outputTokenStream[i])
		}
	}
}
// BenchmarkSnowballStemmer measures the English snowball stemmer filter over
// a small fixed token stream containing one keyword-protected token.
func BenchmarkSnowballStemmer(b *testing.B) {
	fixtures := []struct {
		term    []byte
		keyWord bool
	}{
		{term: []byte("walking")},
		{term: []byte("talked")},
		{term: []byte("business")},
		{term: []byte("protected"), keyWord: true},
		{term: []byte("cat")},
		{term: []byte("done")},
	}

	inputTokenStream := make(analysis.TokenStream, len(fixtures))
	for i, fixture := range fixtures {
		inputTokenStream[i] = &analysis.Token{
			Term:    fixture.term,
			KeyWord: fixture.keyWord,
		}
	}

	filter := NewSnowballStemmer("english")
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// Filter overwrites each token's Term with the stemmed result, so
		// restore the unstemmed terms before every run; otherwise all
		// iterations after the first would stem already-stemmed words.
		// Filter assigns a fresh slice to Term, so the fixture bytes stay
		// intact and this reset is a cheap slice-header assignment.
		for j, token := range inputTokenStream {
			token.Term = fixtures[j].term
		}
		filter.Filter(inputTokenStream)
	}
}

8
vendor/manifest vendored
View File

@ -17,6 +17,14 @@
"branch": "master",
"notests": true
},
{
"importpath": "github.com/blevesearch/snowballstem",
"repository": "https://github.com/blevesearch/snowballstem",
"vcs": "",
"revision": "26b06a2c243d4f8ca5db3486f94409dd5b2a7467",
"branch": "master",
"notests": true
},
{
"importpath": "github.com/boltdb/bolt",
"repository": "https://github.com/boltdb/bolt",