add analyzers for several languages
Having pure Go snowball stemmers allows us to add support for many languages into the core of bleve. Specifically we just added: Russian, Danish, Finnish, Hungarian, Dutch, Norwegian, Romanian, Swedish, Turkish
This commit is contained in:
parent
e68b70aa82
commit
09a61a7a38
|
@ -0,0 +1,56 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AnalyzerName is the name under which the Danish analyzer is registered.
const AnalyzerName = "da"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopDaFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerDaFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopDaFilter,
|
||||||
|
stemmerDaFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,71 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestDanishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("undersøg"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("undersøg"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 9,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("undersøgelse"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("undersøg"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 13,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("på"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if !reflect.DeepEqual(actual, test.output) {
|
||||||
|
t.Errorf("expected %v, got %v", test.output, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/danish"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SnowballStemmerName is the name under which the Danish snowball stemmer
// token filter is registered.
const SnowballStemmerName = "stemmer_da_snowball"
|
||||||
|
|
||||||
|
// DanishStemmerFilter is a token filter that stems terms with the Danish
// snowball stemmer. It has no fields.
type DanishStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewDanishStemmerFilter() *DanishStemmerFilter {
|
||||||
|
return &DanishStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
danish.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func DanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewDanishStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, DanishStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,134 @@
|
||||||
|
package da
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StopName is the name used to register both the Danish stop-word token map
// and the stop token filter built from it.
const StopName = "stop_da"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
// DanishStopWords is the Danish stop word list in snowball format: one stop
// word at the start of each line, with everything after a vertical bar being
// a comment. TokenMapConstructor loads it into a token map.
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"

| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.

| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.


og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/youself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(DanishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AnalyzerName is the name under which the Finnish analyzer is registered.
const AnalyzerName = "fi"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopFiFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerFiFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopFiFilter,
|
||||||
|
stemmerFiFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFinishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("edeltäjiinsä"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("edeltäj"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("edeltäjistään"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("edeltäj"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("olla"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/finnish"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SnowballStemmerName is the name under which the Finnish snowball stemmer
// token filter is registered.
const SnowballStemmerName = "stemmer_fi_snowball"
|
||||||
|
|
||||||
|
// FinnishStemmerFilter is a token filter that stems terms with the Finnish
// snowball stemmer. It has no fields.
type FinnishStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewFinnishStemmerFilter() *FinnishStemmerFilter {
|
||||||
|
return &FinnishStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *FinnishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
finnish.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func FinnishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewFinnishStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, FinnishStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,121 @@
|
||||||
|
package fi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StopName is the name used to register both the Finnish stop-word token map
// and the stop token filter built from it.
const StopName = "stop_fi"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// ` was changed to ' to allow for literal string
|
||||||
|
|
||||||
|
// FinnishStopWords is the Finnish stop word list in snowball format: stop
// words are whitespace-separated at the start of each line, and everything
// after a vertical bar is a comment. TokenMapConstructor loads it into a
// token map.
//
// Two entries differ from the list this was copied from: "tallä" and "tuotä"
// are not Finnish word forms and are replaced by the intended "tällä"
// (adessive of "tämä") and "tuota" (partitive of "tuo").
var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"

| forms of BE

olla
olen
olet
on
olemme
olette
ovat
ole | negative form

oli
olisi
olisit
olisin
olisimme
olisitte
olisivat
olit
olin
olimme
olitte
olivat
ollut
olleet

en | negation
et
ei
emme
ette
eivät

|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they

tämä tämän tätä tässä tästä tähän tällä tältä tälle tänä täksi | this
tuo tuon tuota tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they

kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
mitkä | (pl)

joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)

| conjunctions

että | that
ja | and
jos | if
koska | because
kuin | than
mutta | but
niin | so
sekä | and
sillä | for
tai | or
vaan | but
vai | or
vaikka | although


| prepositions

kanssa | with
mukaan | according to
noin | about
poikki | across
yli | over, across

| other

kun | when
niin | so
nyt | now
itse | self

`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(FinnishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package hu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AnalyzerName is the name under which the Hungarian analyzer is registered.
const AnalyzerName = "hu"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopHuFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerHuFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopHuFilter,
|
||||||
|
stemmerHuFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package hu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestHungarianAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("babakocsi"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("babakocs"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("babakocsijáért"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("babakocs"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("által"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package hu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/hungarian"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SnowballStemmerName is the name under which the Hungarian snowball stemmer
// token filter is registered.
const SnowballStemmerName = "stemmer_hu_snowball"
|
||||||
|
|
||||||
|
// HungarianStemmerFilter is a token filter that stems terms with the
// Hungarian snowball stemmer. It has no fields.
type HungarianStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewHungarianStemmerFilter() *HungarianStemmerFilter {
|
||||||
|
return &HungarianStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *HungarianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
hungarian.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func HungarianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewHungarianStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, HungarianStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package hu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,235 @@
|
||||||
|
package hu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StopName is the registry name of the Hungarian stop word token map; the
// package's stop token filter is registered under the same name.
const StopName = "stop_hu"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// Backticks (`) in the original list were replaced with apostrophes (') so
// that it can be embedded in a Go raw string literal.
|
||||||
|
|
||||||
|
var HungarianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| Hungarian stop word list
|
||||||
|
| prepared by Anna Tordai
|
||||||
|
|
||||||
|
a
|
||||||
|
ahogy
|
||||||
|
ahol
|
||||||
|
aki
|
||||||
|
akik
|
||||||
|
akkor
|
||||||
|
alatt
|
||||||
|
által
|
||||||
|
általában
|
||||||
|
amely
|
||||||
|
amelyek
|
||||||
|
amelyekben
|
||||||
|
amelyeket
|
||||||
|
amelyet
|
||||||
|
amelynek
|
||||||
|
ami
|
||||||
|
amit
|
||||||
|
amolyan
|
||||||
|
amíg
|
||||||
|
amikor
|
||||||
|
át
|
||||||
|
abban
|
||||||
|
ahhoz
|
||||||
|
annak
|
||||||
|
arra
|
||||||
|
arról
|
||||||
|
az
|
||||||
|
azok
|
||||||
|
azon
|
||||||
|
azt
|
||||||
|
azzal
|
||||||
|
azért
|
||||||
|
aztán
|
||||||
|
azután
|
||||||
|
azonban
|
||||||
|
bár
|
||||||
|
be
|
||||||
|
belül
|
||||||
|
benne
|
||||||
|
cikk
|
||||||
|
cikkek
|
||||||
|
cikkeket
|
||||||
|
csak
|
||||||
|
de
|
||||||
|
e
|
||||||
|
eddig
|
||||||
|
egész
|
||||||
|
egy
|
||||||
|
egyes
|
||||||
|
egyetlen
|
||||||
|
egyéb
|
||||||
|
egyik
|
||||||
|
egyre
|
||||||
|
ekkor
|
||||||
|
el
|
||||||
|
elég
|
||||||
|
ellen
|
||||||
|
elő
|
||||||
|
először
|
||||||
|
előtt
|
||||||
|
első
|
||||||
|
én
|
||||||
|
éppen
|
||||||
|
ebben
|
||||||
|
ehhez
|
||||||
|
emilyen
|
||||||
|
ennek
|
||||||
|
erre
|
||||||
|
ez
|
||||||
|
ezt
|
||||||
|
ezek
|
||||||
|
ezen
|
||||||
|
ezzel
|
||||||
|
ezért
|
||||||
|
és
|
||||||
|
fel
|
||||||
|
felé
|
||||||
|
hanem
|
||||||
|
hiszen
|
||||||
|
hogy
|
||||||
|
hogyan
|
||||||
|
igen
|
||||||
|
így
|
||||||
|
illetve
|
||||||
|
ill.
|
||||||
|
ill
|
||||||
|
ilyen
|
||||||
|
ilyenkor
|
||||||
|
ison
|
||||||
|
ismét
|
||||||
|
itt
|
||||||
|
jó
|
||||||
|
jól
|
||||||
|
jobban
|
||||||
|
kell
|
||||||
|
kellett
|
||||||
|
keresztül
|
||||||
|
keressünk
|
||||||
|
ki
|
||||||
|
kívül
|
||||||
|
között
|
||||||
|
közül
|
||||||
|
legalább
|
||||||
|
lehet
|
||||||
|
lehetett
|
||||||
|
legyen
|
||||||
|
lenne
|
||||||
|
lenni
|
||||||
|
lesz
|
||||||
|
lett
|
||||||
|
maga
|
||||||
|
magát
|
||||||
|
majd
|
||||||
|
majd
|
||||||
|
már
|
||||||
|
más
|
||||||
|
másik
|
||||||
|
meg
|
||||||
|
még
|
||||||
|
mellett
|
||||||
|
mert
|
||||||
|
mely
|
||||||
|
melyek
|
||||||
|
mi
|
||||||
|
mit
|
||||||
|
míg
|
||||||
|
miért
|
||||||
|
milyen
|
||||||
|
mikor
|
||||||
|
minden
|
||||||
|
mindent
|
||||||
|
mindenki
|
||||||
|
mindig
|
||||||
|
mint
|
||||||
|
mintha
|
||||||
|
mivel
|
||||||
|
most
|
||||||
|
nagy
|
||||||
|
nagyobb
|
||||||
|
nagyon
|
||||||
|
ne
|
||||||
|
néha
|
||||||
|
nekem
|
||||||
|
neki
|
||||||
|
nem
|
||||||
|
néhány
|
||||||
|
nélkül
|
||||||
|
nincs
|
||||||
|
olyan
|
||||||
|
ott
|
||||||
|
össze
|
||||||
|
ő
|
||||||
|
ők
|
||||||
|
őket
|
||||||
|
pedig
|
||||||
|
persze
|
||||||
|
rá
|
||||||
|
s
|
||||||
|
saját
|
||||||
|
sem
|
||||||
|
semmi
|
||||||
|
sok
|
||||||
|
sokat
|
||||||
|
sokkal
|
||||||
|
számára
|
||||||
|
szemben
|
||||||
|
szerint
|
||||||
|
szinte
|
||||||
|
talán
|
||||||
|
tehát
|
||||||
|
teljes
|
||||||
|
tovább
|
||||||
|
továbbá
|
||||||
|
több
|
||||||
|
úgy
|
||||||
|
ugyanis
|
||||||
|
új
|
||||||
|
újabb
|
||||||
|
újra
|
||||||
|
után
|
||||||
|
utána
|
||||||
|
utolsó
|
||||||
|
vagy
|
||||||
|
vagyis
|
||||||
|
valaki
|
||||||
|
valami
|
||||||
|
valamint
|
||||||
|
való
|
||||||
|
vagyok
|
||||||
|
van
|
||||||
|
vannak
|
||||||
|
volt
|
||||||
|
voltam
|
||||||
|
voltak
|
||||||
|
voltunk
|
||||||
|
vissza
|
||||||
|
vele
|
||||||
|
viszont
|
||||||
|
volna
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(HungarianStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package nl
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AnalyzerName is the name under which the Dutch analyzer is registered with
// the registry.
const AnalyzerName = "nl"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopNlFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerNlFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopNlFilter,
|
||||||
|
stemmerNlFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package nl
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestDutchAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("lichamelijk"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("licham"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("lichamelijke"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("licham"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("van"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package nl
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/dutch"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SnowballStemmerName is the name under which the Dutch snowball stemmer
// token filter is registered with the registry.
const SnowballStemmerName = "stemmer_nl_snowball"
|
||||||
|
|
||||||
|
// DutchStemmerFilter is a token filter that stems token terms with the Dutch
// snowball stemmer. It has no fields and therefore no configuration.
type DutchStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewDutchStemmerFilter() *DutchStemmerFilter {
|
||||||
|
return &DutchStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DutchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
dutch.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func DutchStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewDutchStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, DutchStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package nl
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,143 @@
|
||||||
|
package nl
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StopName is the registry name of the Dutch stop word token map; the
// package's stop token filter is registered under the same name.
const StopName = "stop_nl"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// Backticks (`) in the original list were replaced with apostrophes (') so
// that it can be embedded in a Go raw string literal.
|
||||||
|
|
||||||
|
var DutchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| A Dutch stop word list. Comments begin with vertical bar. Each stop
|
||||||
|
| word is at the start of a line.
|
||||||
|
|
||||||
|
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||||
|
| a large sample of Dutch text.
|
||||||
|
|
||||||
|
| Dutch stop words frequently exhibit homonym clashes. These are indicated
|
||||||
|
| clearly below.
|
||||||
|
|
||||||
|
de | the
|
||||||
|
en | and
|
||||||
|
van | of, from
|
||||||
|
ik | I, the ego
|
||||||
|
te | (1) chez, at etc, (2) to, (3) too
|
||||||
|
dat | that, which
|
||||||
|
die | that, those, who, which
|
||||||
|
in | in, inside
|
||||||
|
een | a, an, one
|
||||||
|
hij | he
|
||||||
|
het | the, it
|
||||||
|
niet | not, nothing, naught
|
||||||
|
zijn | (1) to be, being, (2) his, one's, its
|
||||||
|
is | is
|
||||||
|
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river
|
||||||
|
op | on, upon, at, in, up, used up
|
||||||
|
aan | on, upon, to (as dative)
|
||||||
|
met | with, by
|
||||||
|
als | like, such as, when
|
||||||
|
voor | (1) before, in front of, (2) furrow
|
||||||
|
had | had, past tense all persons sing. of 'hebben' (have)
|
||||||
|
er | there
|
||||||
|
maar | but, only
|
||||||
|
om | round, about, for etc
|
||||||
|
hem | him
|
||||||
|
dan | then
|
||||||
|
zou | should/would, past tense all persons sing. of 'zullen'
|
||||||
|
of | or, whether, if
|
||||||
|
wat | what, something, anything
|
||||||
|
mijn | possessive and noun 'mine'
|
||||||
|
men | people, 'one'
|
||||||
|
dit | this
|
||||||
|
zo | so, thus, in this way
|
||||||
|
door | through by
|
||||||
|
over | over, across
|
||||||
|
ze | she, her, they, them
|
||||||
|
zich | oneself
|
||||||
|
bij | (1) a bee, (2) by, near, at
|
||||||
|
ook | also, too
|
||||||
|
tot | till, until
|
||||||
|
je | you
|
||||||
|
mij | me
|
||||||
|
uit | out of, from
|
||||||
|
der | Old Dutch form of 'van der' still found in surnames
|
||||||
|
daar | (1) there, (2) because
|
||||||
|
haar | (1) her, their, them, (2) hair
|
||||||
|
naar | (1) unpleasant, unwell etc, (2) towards, (3) as
|
||||||
|
heb | present first person sing. of 'to have'
|
||||||
|
hoe | how, why
|
||||||
|
heeft | present third person sing. of 'to have'
|
||||||
|
hebben | 'to have' and various parts thereof
|
||||||
|
deze | this
|
||||||
|
u | you
|
||||||
|
want | (1) for, (2) mitten, (3) rigging
|
||||||
|
nog | yet, still
|
||||||
|
zal | 'shall', first and third person sing. of verb 'zullen' (will)
|
||||||
|
me | me
|
||||||
|
zij | she, they
|
||||||
|
nu | now
|
||||||
|
ge | 'thou', still used in Belgium and south Netherlands
|
||||||
|
geen | none
|
||||||
|
omdat | because
|
||||||
|
iets | something, somewhat
|
||||||
|
worden | to become, grow, get
|
||||||
|
toch | yet, still
|
||||||
|
al | all, every, each
|
||||||
|
waren | (1) 'were' (2) to wander, (3) wares, (3)
|
||||||
|
veel | much, many
|
||||||
|
meer | (1) more, (2) lake
|
||||||
|
doen | to do, to make
|
||||||
|
toen | then, when
|
||||||
|
moet | noun 'spot/mote' and present form of 'to must'
|
||||||
|
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be'
|
||||||
|
zonder | without
|
||||||
|
kan | noun 'can' and present form of 'to be able'
|
||||||
|
hun | their, them
|
||||||
|
dus | so, consequently
|
||||||
|
alles | all, everything, anything
|
||||||
|
onder | under, beneath
|
||||||
|
ja | yes, of course
|
||||||
|
eens | once, one day
|
||||||
|
hier | here
|
||||||
|
wie | who
|
||||||
|
werd | imperfect third person sing. of 'become'
|
||||||
|
altijd | always
|
||||||
|
doch | yet, but etc
|
||||||
|
wordt | present third person sing. of 'become'
|
||||||
|
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans
|
||||||
|
kunnen | to be able
|
||||||
|
ons | us/our
|
||||||
|
zelf | self
|
||||||
|
tegen | against, towards, at
|
||||||
|
na | after, near
|
||||||
|
reeds | already
|
||||||
|
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender
|
||||||
|
kon | could; past tense of 'to be able'
|
||||||
|
niets | nothing
|
||||||
|
uw | your
|
||||||
|
iemand | somebody
|
||||||
|
geweest | been; past participle of 'be'
|
||||||
|
andere | other
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(DutchStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package no
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AnalyzerName is the name under which the Norwegian analyzer is registered
// with the registry.
const AnalyzerName = "no"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopNoFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerNoFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopNoFilter,
|
||||||
|
stemmerNoFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package no
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNorwegianAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("havnedistriktene"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("havnedistrikt"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("havnedistrikter"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("havnedistrikt"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("det"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package no
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/norwegian"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SnowballStemmerName is the name under which the Norwegian snowball stemmer
// token filter is registered with the registry.
const SnowballStemmerName = "stemmer_no_snowball"
|
||||||
|
|
||||||
|
// NorwegianStemmerFilter is a token filter that stems token terms with the
// Norwegian snowball stemmer. It has no fields and therefore no configuration.
type NorwegianStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewNorwegianStemmerFilter() *NorwegianStemmerFilter {
|
||||||
|
return &NorwegianStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *NorwegianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
norwegian.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func NorwegianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewNorwegianStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, NorwegianStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package no
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,218 @@
|
||||||
|
package no
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StopName is the registry name of the Norwegian stop word token map; the
// package's stop token filter is registered under the same name.
const StopName = "stop_no"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// Backticks (`) in the original list were replaced with apostrophes (') so
// that it can be embedded in a Go raw string literal.
|
||||||
|
|
||||||
|
var NorwegianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
|
||||||
|
| word is at the start of a line.
|
||||||
|
|
||||||
|
| This stop word list is for the dominant bokmål dialect. Words unique
|
||||||
|
| to nynorsk are marked *.
|
||||||
|
|
||||||
|
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005
|
||||||
|
|
||||||
|
og | and
|
||||||
|
i | in
|
||||||
|
jeg | I
|
||||||
|
det | it/this/that
|
||||||
|
at | to (w. inf.)
|
||||||
|
en | a/an
|
||||||
|
et | a/an
|
||||||
|
den | it/this/that
|
||||||
|
til | to
|
||||||
|
er | is/am/are
|
||||||
|
som | who/that
|
||||||
|
på | on
|
||||||
|
de | they / you(formal)
|
||||||
|
med | with
|
||||||
|
han | he
|
||||||
|
av | of
|
||||||
|
ikke | not
|
||||||
|
ikkje | not *
|
||||||
|
der | there
|
||||||
|
så | so
|
||||||
|
var | was/were
|
||||||
|
meg | me
|
||||||
|
seg | you
|
||||||
|
men | but
|
||||||
|
ett | one
|
||||||
|
har | have
|
||||||
|
om | about
|
||||||
|
vi | we
|
||||||
|
min | my
|
||||||
|
mitt | my
|
||||||
|
ha | have
|
||||||
|
hadde | had
|
||||||
|
hun | she
|
||||||
|
nå | now
|
||||||
|
over | over
|
||||||
|
da | when/as
|
||||||
|
ved | by/know
|
||||||
|
fra | from
|
||||||
|
du | you
|
||||||
|
ut | out
|
||||||
|
sin | your
|
||||||
|
dem | them
|
||||||
|
oss | us
|
||||||
|
opp | up
|
||||||
|
man | you/one
|
||||||
|
kan | can
|
||||||
|
hans | his
|
||||||
|
hvor | where
|
||||||
|
eller | or
|
||||||
|
hva | what
|
||||||
|
skal | shall/must
|
||||||
|
selv | self (reflective)
|
||||||
|
sjøl | self (reflective)
|
||||||
|
her | here
|
||||||
|
alle | all
|
||||||
|
vil | will
|
||||||
|
bli | become
|
||||||
|
ble | became
|
||||||
|
blei | became *
|
||||||
|
blitt | have become
|
||||||
|
kunne | could
|
||||||
|
inn | in
|
||||||
|
når | when
|
||||||
|
være | be
|
||||||
|
kom | come
|
||||||
|
noen | some
|
||||||
|
noe | some
|
||||||
|
ville | would
|
||||||
|
dere | you
|
||||||
|
som | who/which/that
|
||||||
|
deres | their/theirs
|
||||||
|
kun | only/just
|
||||||
|
ja | yes
|
||||||
|
etter | after
|
||||||
|
ned | down
|
||||||
|
skulle | should
|
||||||
|
denne | this
|
||||||
|
for | for/because
|
||||||
|
deg | you
|
||||||
|
si | hers/his
|
||||||
|
sine | hers/his
|
||||||
|
sitt | hers/his
|
||||||
|
mot | against
|
||||||
|
å | to
|
||||||
|
meget | much
|
||||||
|
hvorfor | why
|
||||||
|
dette | this
|
||||||
|
disse | these/those
|
||||||
|
uten | without
|
||||||
|
hvordan | how
|
||||||
|
ingen | none
|
||||||
|
din | your
|
||||||
|
ditt | your
|
||||||
|
blir | become
|
||||||
|
samme | same
|
||||||
|
hvilken | which
|
||||||
|
hvilke | which (plural)
|
||||||
|
sånn | such a
|
||||||
|
inni | inside/within
|
||||||
|
mellom | between
|
||||||
|
vår | our
|
||||||
|
hver | each
|
||||||
|
hvem | who
|
||||||
|
vors | us/ours
|
||||||
|
hvis | whose
|
||||||
|
både | both
|
||||||
|
bare | only/just
|
||||||
|
enn | than
|
||||||
|
fordi | as/because
|
||||||
|
før | before
|
||||||
|
mange | many
|
||||||
|
også | also
|
||||||
|
slik | just
|
||||||
|
vært | been
|
||||||
|
være | to be
|
||||||
|
båe | both *
|
||||||
|
begge | both
|
||||||
|
siden | since
|
||||||
|
dykk | your *
|
||||||
|
dykkar | yours *
|
||||||
|
dei | they *
|
||||||
|
deira | them *
|
||||||
|
deires | theirs *
|
||||||
|
deim | them *
|
||||||
|
di | your (fem.) *
|
||||||
|
då | as/when *
|
||||||
|
eg | I *
|
||||||
|
ein | a/an *
|
||||||
|
eit | a/an *
|
||||||
|
eitt | a/an *
|
||||||
|
elles | or *
|
||||||
|
honom | he *
|
||||||
|
hjå | at *
|
||||||
|
ho | she *
|
||||||
|
hoe | she *
|
||||||
|
henne | her
|
||||||
|
hennar | her/hers
|
||||||
|
hennes | hers
|
||||||
|
hoss | how *
|
||||||
|
hossen | how *
|
||||||
|
ikkje | not *
|
||||||
|
ingi | noone *
|
||||||
|
inkje | noone *
|
||||||
|
korleis | how *
|
||||||
|
korso | how *
|
||||||
|
kva | what/which *
|
||||||
|
kvar | where *
|
||||||
|
kvarhelst | where *
|
||||||
|
kven | who/whom *
|
||||||
|
kvi | why *
|
||||||
|
kvifor | why *
|
||||||
|
me | we *
|
||||||
|
medan | while *
|
||||||
|
mi | my *
|
||||||
|
mine | my *
|
||||||
|
mykje | much *
|
||||||
|
no | now *
|
||||||
|
nokon | some (masc./neut.) *
|
||||||
|
noka | some (fem.) *
|
||||||
|
nokor | some *
|
||||||
|
noko | some *
|
||||||
|
nokre | some *
|
||||||
|
si | his/hers *
|
||||||
|
sia | since *
|
||||||
|
sidan | since *
|
||||||
|
so | so *
|
||||||
|
somt | some *
|
||||||
|
somme | some *
|
||||||
|
um | about*
|
||||||
|
upp | up *
|
||||||
|
vere | be *
|
||||||
|
vore | was *
|
||||||
|
verte | become *
|
||||||
|
vort | become *
|
||||||
|
varte | became *
|
||||||
|
vart | became *
|
||||||
|
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(NorwegianStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package ro
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "ro"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopRoFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerRoFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopRoFilter,
|
||||||
|
stemmerRoFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package ro
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRomanianAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("absenţa"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("absenţ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("absenţi"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("absenţ"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("îl"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package ro
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/romanian"
|
||||||
|
)
|
||||||
|
|
||||||
|
const SnowballStemmerName = "stemmer_ro_snowball"
|
||||||
|
|
||||||
|
// RomanianStemmerFilter is a token filter with no fields; its Filter method
// stems each term with the Snowball Romanian stemmer.
type RomanianStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewRomanianStemmerFilter() *RomanianStemmerFilter {
|
||||||
|
return &RomanianStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *RomanianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
romanian.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func RomanianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewRomanianStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, RomanianStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package ro
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,257 @@
|
||||||
|
package ro
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_ro"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||||
|
// Backticks (`) were changed to apostrophes (') so the list can be embedded in a Go raw string literal.
|
||||||
|
|
||||||
|
var RomanianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||||
|
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
acea
|
||||||
|
aceasta
|
||||||
|
această
|
||||||
|
aceea
|
||||||
|
acei
|
||||||
|
aceia
|
||||||
|
acel
|
||||||
|
acela
|
||||||
|
acele
|
||||||
|
acelea
|
||||||
|
acest
|
||||||
|
acesta
|
||||||
|
aceste
|
||||||
|
acestea
|
||||||
|
aceşti
|
||||||
|
aceştia
|
||||||
|
acolo
|
||||||
|
acum
|
||||||
|
ai
|
||||||
|
aia
|
||||||
|
aibă
|
||||||
|
aici
|
||||||
|
al
|
||||||
|
ăla
|
||||||
|
ale
|
||||||
|
alea
|
||||||
|
ălea
|
||||||
|
altceva
|
||||||
|
altcineva
|
||||||
|
am
|
||||||
|
ar
|
||||||
|
are
|
||||||
|
aş
|
||||||
|
aşadar
|
||||||
|
asemenea
|
||||||
|
asta
|
||||||
|
ăsta
|
||||||
|
astăzi
|
||||||
|
astea
|
||||||
|
ăstea
|
||||||
|
ăştia
|
||||||
|
asupra
|
||||||
|
aţi
|
||||||
|
au
|
||||||
|
avea
|
||||||
|
avem
|
||||||
|
aveţi
|
||||||
|
azi
|
||||||
|
bine
|
||||||
|
bucur
|
||||||
|
bună
|
||||||
|
ca
|
||||||
|
că
|
||||||
|
căci
|
||||||
|
când
|
||||||
|
care
|
||||||
|
cărei
|
||||||
|
căror
|
||||||
|
cărui
|
||||||
|
cât
|
||||||
|
câte
|
||||||
|
câţi
|
||||||
|
către
|
||||||
|
câtva
|
||||||
|
ce
|
||||||
|
cel
|
||||||
|
ceva
|
||||||
|
chiar
|
||||||
|
cînd
|
||||||
|
cine
|
||||||
|
cineva
|
||||||
|
cît
|
||||||
|
cîte
|
||||||
|
cîţi
|
||||||
|
cîtva
|
||||||
|
contra
|
||||||
|
cu
|
||||||
|
cum
|
||||||
|
cumva
|
||||||
|
curând
|
||||||
|
curînd
|
||||||
|
da
|
||||||
|
dă
|
||||||
|
dacă
|
||||||
|
dar
|
||||||
|
datorită
|
||||||
|
de
|
||||||
|
deci
|
||||||
|
deja
|
||||||
|
deoarece
|
||||||
|
departe
|
||||||
|
deşi
|
||||||
|
din
|
||||||
|
dinaintea
|
||||||
|
dintr
|
||||||
|
dintre
|
||||||
|
drept
|
||||||
|
după
|
||||||
|
ea
|
||||||
|
ei
|
||||||
|
el
|
||||||
|
ele
|
||||||
|
eram
|
||||||
|
este
|
||||||
|
eşti
|
||||||
|
eu
|
||||||
|
face
|
||||||
|
fără
|
||||||
|
fi
|
||||||
|
fie
|
||||||
|
fiecare
|
||||||
|
fii
|
||||||
|
fim
|
||||||
|
fiţi
|
||||||
|
iar
|
||||||
|
ieri
|
||||||
|
îi
|
||||||
|
îl
|
||||||
|
îmi
|
||||||
|
împotriva
|
||||||
|
în
|
||||||
|
înainte
|
||||||
|
înaintea
|
||||||
|
încât
|
||||||
|
încît
|
||||||
|
încotro
|
||||||
|
între
|
||||||
|
întrucât
|
||||||
|
întrucît
|
||||||
|
îţi
|
||||||
|
la
|
||||||
|
lângă
|
||||||
|
le
|
||||||
|
li
|
||||||
|
lîngă
|
||||||
|
lor
|
||||||
|
lui
|
||||||
|
mă
|
||||||
|
mâine
|
||||||
|
mea
|
||||||
|
mei
|
||||||
|
mele
|
||||||
|
mereu
|
||||||
|
meu
|
||||||
|
mi
|
||||||
|
mine
|
||||||
|
mult
|
||||||
|
multă
|
||||||
|
mulţi
|
||||||
|
ne
|
||||||
|
nicăieri
|
||||||
|
nici
|
||||||
|
nimeni
|
||||||
|
nişte
|
||||||
|
noastră
|
||||||
|
noastre
|
||||||
|
noi
|
||||||
|
noştri
|
||||||
|
nostru
|
||||||
|
nu
|
||||||
|
ori
|
||||||
|
oricând
|
||||||
|
oricare
|
||||||
|
oricât
|
||||||
|
orice
|
||||||
|
oricînd
|
||||||
|
oricine
|
||||||
|
oricît
|
||||||
|
oricum
|
||||||
|
oriunde
|
||||||
|
până
|
||||||
|
pe
|
||||||
|
pentru
|
||||||
|
peste
|
||||||
|
pînă
|
||||||
|
poate
|
||||||
|
pot
|
||||||
|
prea
|
||||||
|
prima
|
||||||
|
primul
|
||||||
|
prin
|
||||||
|
printr
|
||||||
|
sa
|
||||||
|
să
|
||||||
|
săi
|
||||||
|
sale
|
||||||
|
sau
|
||||||
|
său
|
||||||
|
se
|
||||||
|
şi
|
||||||
|
sînt
|
||||||
|
sîntem
|
||||||
|
sînteţi
|
||||||
|
spre
|
||||||
|
sub
|
||||||
|
sunt
|
||||||
|
suntem
|
||||||
|
sunteţi
|
||||||
|
ta
|
||||||
|
tăi
|
||||||
|
tale
|
||||||
|
tău
|
||||||
|
te
|
||||||
|
ţi
|
||||||
|
ţie
|
||||||
|
tine
|
||||||
|
toată
|
||||||
|
toate
|
||||||
|
tot
|
||||||
|
toţi
|
||||||
|
totuşi
|
||||||
|
tu
|
||||||
|
un
|
||||||
|
una
|
||||||
|
unde
|
||||||
|
undeva
|
||||||
|
unei
|
||||||
|
unele
|
||||||
|
uneori
|
||||||
|
unor
|
||||||
|
vă
|
||||||
|
vi
|
||||||
|
voastră
|
||||||
|
voastre
|
||||||
|
voi
|
||||||
|
voştri
|
||||||
|
vostru
|
||||||
|
vouă
|
||||||
|
vreo
|
||||||
|
vreun
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(RomanianStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package sv
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "sv"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopSvFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerSvFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
toLowerFilter,
|
||||||
|
stopSvFilter,
|
||||||
|
stemmerSvFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,70 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package sv
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestSwedishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("jaktkarlarne"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("jaktkarl"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("jaktkarlens"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("jaktkarl"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("och"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package sv
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/swedish"
|
||||||
|
)
|
||||||
|
|
||||||
|
const SnowballStemmerName = "stemmer_sv_snowball"
|
||||||
|
|
||||||
|
// SwedishStemmerFilter is a token filter with no fields; its Filter method
// stems each term with the Snowball Swedish stemmer.
type SwedishStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewSwedishStemmerFilter() *SwedishStemmerFilter {
|
||||||
|
return &SwedishStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *SwedishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
swedish.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func SwedishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewSwedishStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, SwedishStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package sv
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,157 @@
|
||||||
|
package sv
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
const StopName = "stop_sv"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// Backticks (`) were changed to apostrophes (') so the list can be embedded in a Go raw string literal.
|
||||||
|
|
||||||
|
var SwedishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
|
||||||
|
| This file is distributed under the BSD License.
|
||||||
|
| See http://snowball.tartarus.org/license.php
|
||||||
|
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||||
|
| - Encoding was converted to UTF-8.
|
||||||
|
| - This notice was added.
|
||||||
|
|
|
||||||
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||||
|
|
||||||
|
| A Swedish stop word list. Comments begin with vertical bar. Each stop
|
||||||
|
| word is at the start of a line.
|
||||||
|
|
||||||
|
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||||
|
| a large text sample.
|
||||||
|
|
||||||
|
| Swedish stop words occasionally exhibit homonym clashes. For example
|
||||||
|
| så = so, but also seed. These are indicated clearly below.
|
||||||
|
|
||||||
|
och | and
|
||||||
|
det | it, this/that
|
||||||
|
att | to (with infinitive)
|
||||||
|
i | in, at
|
||||||
|
en | a
|
||||||
|
jag | I
|
||||||
|
hon | she
|
||||||
|
som | who, that
|
||||||
|
han | he
|
||||||
|
på | on
|
||||||
|
den | it, this/that
|
||||||
|
med | with
|
||||||
|
var | where, each
|
||||||
|
sig | him(self) etc
|
||||||
|
för | for
|
||||||
|
så | so (also: seed)
|
||||||
|
till | to
|
||||||
|
är | is
|
||||||
|
men | but
|
||||||
|
ett | a
|
||||||
|
om | if; around, about
|
||||||
|
hade | had
|
||||||
|
de | they, these/those
|
||||||
|
av | of
|
||||||
|
icke | not, no
|
||||||
|
mig | me
|
||||||
|
du | you
|
||||||
|
henne | her
|
||||||
|
då | then, when
|
||||||
|
sin | his
|
||||||
|
nu | now
|
||||||
|
har | have
|
||||||
|
inte | inte någon = no one
|
||||||
|
hans | his
|
||||||
|
honom | him
|
||||||
|
skulle | 'sake'
|
||||||
|
hennes | her
|
||||||
|
där | there
|
||||||
|
min | my
|
||||||
|
man | one (pronoun)
|
||||||
|
ej | nor
|
||||||
|
vid | at, by, on (also: vast)
|
||||||
|
kunde | could
|
||||||
|
något | some etc
|
||||||
|
från | from, off
|
||||||
|
ut | out
|
||||||
|
när | when
|
||||||
|
efter | after, behind
|
||||||
|
upp | up
|
||||||
|
vi | we
|
||||||
|
dem | them
|
||||||
|
vara | be
|
||||||
|
vad | what
|
||||||
|
över | over
|
||||||
|
än | than
|
||||||
|
dig | you
|
||||||
|
kan | can
|
||||||
|
sina | his
|
||||||
|
här | here
|
||||||
|
ha | have
|
||||||
|
mot | towards
|
||||||
|
alla | all
|
||||||
|
under | under (also: wonder)
|
||||||
|
någon | some etc
|
||||||
|
eller | or (else)
|
||||||
|
allt | all
|
||||||
|
mycket | much
|
||||||
|
sedan | since
|
||||||
|
ju | why
|
||||||
|
denna | this/that
|
||||||
|
själv | myself, yourself etc
|
||||||
|
detta | this/that
|
||||||
|
åt | to
|
||||||
|
utan | without
|
||||||
|
varit | was
|
||||||
|
hur | how
|
||||||
|
ingen | no
|
||||||
|
mitt | my
|
||||||
|
ni | you
|
||||||
|
bli | to be, become
|
||||||
|
blev | from bli
|
||||||
|
oss | us
|
||||||
|
din | thy
|
||||||
|
dessa | these/those
|
||||||
|
några | some etc
|
||||||
|
deras | their
|
||||||
|
blir | from bli
|
||||||
|
mina | my
|
||||||
|
samma | (the) same
|
||||||
|
vilken | who, that
|
||||||
|
er | you, your
|
||||||
|
sådan | such a
|
||||||
|
vår | our
|
||||||
|
blivit | from bli
|
||||||
|
dess | its
|
||||||
|
inom | within
|
||||||
|
mellan | between
|
||||||
|
sådant | such a
|
||||||
|
varför | why
|
||||||
|
varje | each
|
||||||
|
vilka | who, that
|
||||||
|
ditt | thy
|
||||||
|
vem | who
|
||||||
|
vilket | who, that
|
||||||
|
sitta | his
|
||||||
|
sådana | such a
|
||||||
|
vart | each
|
||||||
|
dina | thy
|
||||||
|
vars | whose
|
||||||
|
vårt | our
|
||||||
|
våra | our
|
||||||
|
ert | your
|
||||||
|
era | your
|
||||||
|
vilkas | whose
|
||||||
|
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(SwedishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package tr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/apostrophe"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
||||||
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
const AnalyzerName = "tr"
|
||||||
|
|
||||||
|
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||||
|
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
aposFilter, err := cache.TokenFilterNamed(apostrophe.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stopTrFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
stemmerTrFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
rv := analysis.Analyzer{
|
||||||
|
Tokenizer: unicodeTokenizer,
|
||||||
|
TokenFilters: []analysis.TokenFilter{
|
||||||
|
aposFilter,
|
||||||
|
toLowerFilter,
|
||||||
|
stopTrFilter,
|
||||||
|
stemmerTrFilter,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,90 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package tr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestTurkishAnalyzer(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input []byte
|
||||||
|
output analysis.TokenStream
|
||||||
|
}{
|
||||||
|
// stemming
|
||||||
|
{
|
||||||
|
input: []byte("ağacı"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ağaç"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("ağaç"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ağaç"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// stop word
|
||||||
|
{
|
||||||
|
input: []byte("dolayı"),
|
||||||
|
output: analysis.TokenStream{},
|
||||||
|
},
|
||||||
|
// apostrophes
|
||||||
|
{
|
||||||
|
input: []byte("Kıbrıs'ta"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("kıbrıs"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: []byte("Van Gölü'ne"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("van"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("göl"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
for _, test := range tests {
|
||||||
|
actual := analyzer.Analyze(test.input)
|
||||||
|
if len(actual) != len(test.output) {
|
||||||
|
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||||
|
}
|
||||||
|
for i, tok := range actual {
|
||||||
|
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||||
|
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package tr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
|
"github.com/blevesearch/snowballstem"
|
||||||
|
"github.com/blevesearch/snowballstem/turkish"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SnowballStemmerName is the name under which the Turkish Snowball stemmer
// token filter is registered with the registry.
const SnowballStemmerName = "stemmer_tr_snowball"
|
||||||
|
|
||||||
|
// TurkishStemmerFilter is a token filter that stems token terms with the
// Snowball Turkish stemmer. It carries no state of its own.
type TurkishStemmerFilter struct{}
|
||||||
|
|
||||||
|
func NewTurkishStemmerFilter() *TurkishStemmerFilter {
|
||||||
|
return &TurkishStemmerFilter{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *TurkishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
|
for _, token := range input {
|
||||||
|
env := snowballstem.NewEnv(string(token.Term))
|
||||||
|
turkish.Stem(env)
|
||||||
|
token.Term = []byte(env.Current())
|
||||||
|
}
|
||||||
|
return input
|
||||||
|
}
|
||||||
|
|
||||||
|
func TurkishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
return NewTurkishStemmerFilter(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(SnowballStemmerName, TurkishStemmerFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package tr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token/stop"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||||
|
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return stop.NewStopTokensFilter(tokenMap), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||||
|
}
|
|
@ -0,0 +1,236 @@
|
||||||
|
package tr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StopName is the registry name used for the Turkish stop words: the token
// map is registered under it, and so is the stop token filter that looks that
// token map up.
const StopName = "stop_tr"
|
||||||
|
|
||||||
|
// this content was obtained from:
|
||||||
|
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||||
|
// Backticks (`) in the original list were replaced with apostrophes (') so
// that the content can be embedded in a Go raw string literal.
|
||||||
|
|
||||||
|
var TurkishStopWords = []byte(`# Turkish stopwords from LUCENE-559
|
||||||
|
# merged with the list from "Information Retrieval on Turkish Texts"
|
||||||
|
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
|
||||||
|
acaba
|
||||||
|
altmış
|
||||||
|
altı
|
||||||
|
ama
|
||||||
|
ancak
|
||||||
|
arada
|
||||||
|
aslında
|
||||||
|
ayrıca
|
||||||
|
bana
|
||||||
|
bazı
|
||||||
|
belki
|
||||||
|
ben
|
||||||
|
benden
|
||||||
|
beni
|
||||||
|
benim
|
||||||
|
beri
|
||||||
|
beş
|
||||||
|
bile
|
||||||
|
bin
|
||||||
|
bir
|
||||||
|
birçok
|
||||||
|
biri
|
||||||
|
birkaç
|
||||||
|
birkez
|
||||||
|
birşey
|
||||||
|
birşeyi
|
||||||
|
biz
|
||||||
|
bize
|
||||||
|
bizden
|
||||||
|
bizi
|
||||||
|
bizim
|
||||||
|
böyle
|
||||||
|
böylece
|
||||||
|
bu
|
||||||
|
buna
|
||||||
|
bunda
|
||||||
|
bundan
|
||||||
|
bunlar
|
||||||
|
bunları
|
||||||
|
bunların
|
||||||
|
bunu
|
||||||
|
bunun
|
||||||
|
burada
|
||||||
|
çok
|
||||||
|
çünkü
|
||||||
|
da
|
||||||
|
daha
|
||||||
|
dahi
|
||||||
|
de
|
||||||
|
defa
|
||||||
|
değil
|
||||||
|
diğer
|
||||||
|
diye
|
||||||
|
doksan
|
||||||
|
dokuz
|
||||||
|
dolayı
|
||||||
|
dolayısıyla
|
||||||
|
dört
|
||||||
|
edecek
|
||||||
|
eden
|
||||||
|
ederek
|
||||||
|
edilecek
|
||||||
|
ediliyor
|
||||||
|
edilmesi
|
||||||
|
ediyor
|
||||||
|
eğer
|
||||||
|
elli
|
||||||
|
en
|
||||||
|
etmesi
|
||||||
|
etti
|
||||||
|
ettiği
|
||||||
|
ettiğini
|
||||||
|
gibi
|
||||||
|
göre
|
||||||
|
halen
|
||||||
|
hangi
|
||||||
|
hatta
|
||||||
|
hem
|
||||||
|
henüz
|
||||||
|
hep
|
||||||
|
hepsi
|
||||||
|
her
|
||||||
|
herhangi
|
||||||
|
herkesin
|
||||||
|
hiç
|
||||||
|
hiçbir
|
||||||
|
için
|
||||||
|
iki
|
||||||
|
ile
|
||||||
|
ilgili
|
||||||
|
ise
|
||||||
|
işte
|
||||||
|
itibaren
|
||||||
|
itibariyle
|
||||||
|
kadar
|
||||||
|
karşın
|
||||||
|
katrilyon
|
||||||
|
kendi
|
||||||
|
kendilerine
|
||||||
|
kendini
|
||||||
|
kendisi
|
||||||
|
kendisine
|
||||||
|
kendisini
|
||||||
|
kez
|
||||||
|
ki
|
||||||
|
kim
|
||||||
|
kimden
|
||||||
|
kime
|
||||||
|
kimi
|
||||||
|
kimse
|
||||||
|
kırk
|
||||||
|
milyar
|
||||||
|
milyon
|
||||||
|
mu
|
||||||
|
mü
|
||||||
|
mı
|
||||||
|
nasıl
|
||||||
|
ne
|
||||||
|
neden
|
||||||
|
nedenle
|
||||||
|
nerde
|
||||||
|
nerede
|
||||||
|
nereye
|
||||||
|
niye
|
||||||
|
niçin
|
||||||
|
o
|
||||||
|
olan
|
||||||
|
olarak
|
||||||
|
oldu
|
||||||
|
olduğu
|
||||||
|
olduğunu
|
||||||
|
olduklarını
|
||||||
|
olmadı
|
||||||
|
olmadığı
|
||||||
|
olmak
|
||||||
|
olması
|
||||||
|
olmayan
|
||||||
|
olmaz
|
||||||
|
olsa
|
||||||
|
olsun
|
||||||
|
olup
|
||||||
|
olur
|
||||||
|
olursa
|
||||||
|
oluyor
|
||||||
|
on
|
||||||
|
ona
|
||||||
|
ondan
|
||||||
|
onlar
|
||||||
|
onlardan
|
||||||
|
onları
|
||||||
|
onların
|
||||||
|
onu
|
||||||
|
onun
|
||||||
|
otuz
|
||||||
|
oysa
|
||||||
|
öyle
|
||||||
|
pek
|
||||||
|
rağmen
|
||||||
|
sadece
|
||||||
|
sanki
|
||||||
|
sekiz
|
||||||
|
seksen
|
||||||
|
sen
|
||||||
|
senden
|
||||||
|
seni
|
||||||
|
senin
|
||||||
|
siz
|
||||||
|
sizden
|
||||||
|
sizi
|
||||||
|
sizin
|
||||||
|
şey
|
||||||
|
şeyden
|
||||||
|
şeyi
|
||||||
|
şeyler
|
||||||
|
şöyle
|
||||||
|
şu
|
||||||
|
şuna
|
||||||
|
şunda
|
||||||
|
şundan
|
||||||
|
şunları
|
||||||
|
şunu
|
||||||
|
tarafından
|
||||||
|
trilyon
|
||||||
|
tüm
|
||||||
|
üç
|
||||||
|
üzere
|
||||||
|
var
|
||||||
|
vardı
|
||||||
|
ve
|
||||||
|
veya
|
||||||
|
ya
|
||||||
|
yani
|
||||||
|
yapacak
|
||||||
|
yapılan
|
||||||
|
yapılması
|
||||||
|
yapıyor
|
||||||
|
yapmak
|
||||||
|
yaptı
|
||||||
|
yaptığı
|
||||||
|
yaptığını
|
||||||
|
yaptıkları
|
||||||
|
yedi
|
||||||
|
yerine
|
||||||
|
yetmiş
|
||||||
|
yine
|
||||||
|
yirmi
|
||||||
|
yoksa
|
||||||
|
yüz
|
||||||
|
zaten
|
||||||
|
`)
|
||||||
|
|
||||||
|
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||||
|
rv := analysis.NewTokenMap()
|
||||||
|
err := rv.LoadBytes(TurkishStopWords)
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||||
|
}
|
Loading…
Reference in New Issue