diff --git a/analysis/lang/da/analyzer_da.go b/analysis/lang/da/analyzer_da.go new file mode 100644 index 00000000..dca14177 --- /dev/null +++ b/analysis/lang/da/analyzer_da.go @@ -0,0 +1,56 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" + "github.com/blevesearch/bleve/registry" +) + +const AnalyzerName = "da" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopDaFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerDaFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopDaFilter, + stemmerDaFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/da/analyzer_da_test.go b/analysis/lang/da/analyzer_da_test.go new file mode 100644 index 00000000..d6a1c51a --- /dev/null +++ b/analysis/lang/da/analyzer_da_test.go @@ -0,0 +1,71 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package da + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestDanishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("undersøg"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("undersøg"), + Position: 1, + Start: 0, + End: 9, + }, + }, + }, + { + input: []byte("undersøgelse"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("undersøg"), + Position: 1, + Start: 0, + End: 13, + }, + }, + }, + // stop word + { + input: []byte("på"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %v, got %v", test.output, actual) + } + } +} diff --git a/analysis/lang/da/stemmer_da.go b/analysis/lang/da/stemmer_da.go new file mode 100644 index 00000000..e40e623a --- /dev/null +++ b/analysis/lang/da/stemmer_da.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/danish" +) + +const SnowballStemmerName = "stemmer_da_snowball" + +type DanishStemmerFilter struct { +} + +func NewDanishStemmerFilter() *DanishStemmerFilter { + return &DanishStemmerFilter{} +} + +func (s *DanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + danish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func DanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewDanishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, DanishStemmerFilterConstructor) +} diff --git a/analysis/lang/da/stop_filter_da.go b/analysis/lang/da/stop_filter_da.go new file mode 100644 index 00000000..a146d0b4 --- /dev/null +++ b/analysis/lang/da/stop_filter_da.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/da/stop_words_da.go b/analysis/lang/da/stop_words_da.go new file mode 100644 index 00000000..63a407a0 --- /dev/null +++ b/analysis/lang/da/stop_words_da.go @@ -0,0 +1,134 @@ +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_da" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. +vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(DanishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/fi/analyzer_fi.go b/analysis/lang/fi/analyzer_fi.go new file mode 100644 index 00000000..9482e6b3 --- /dev/null +++ b/analysis/lang/fi/analyzer_fi.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "fi" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopFiFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerFiFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopFiFilter, + stemmerFiFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/fi/analyzer_fi_test.go b/analysis/lang/fi/analyzer_fi_test.go new file mode 100644 index 00000000..035e7fdb --- /dev/null +++ b/analysis/lang/fi/analyzer_fi_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fi + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestFinishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("edeltäjiinsä"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("edeltäj"), + }, + }, + }, + { + input: []byte("edeltäjistään"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("edeltäj"), + }, + }, + }, + // stop word + { + input: []byte("olla"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/fi/stemmer_fi.go b/analysis/lang/fi/stemmer_fi.go new file mode 100644 index 00000000..14a6a1cb --- /dev/null +++ b/analysis/lang/fi/stemmer_fi.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/finnish" +) + +const SnowballStemmerName = "stemmer_fi_snowball" + +type FinnishStemmerFilter struct { +} + +func NewFinnishStemmerFilter() *FinnishStemmerFilter { + return &FinnishStemmerFilter{} +} + +func (s *FinnishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + finnish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func FinnishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewFinnishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, FinnishStemmerFilterConstructor) +} diff --git a/analysis/lang/fi/stop_filter_fi.go b/analysis/lang/fi/stop_filter_fi.go new file mode 100644 index 00000000..f3576a2b --- /dev/null +++ b/analysis/lang/fi/stop_filter_fi.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/fi/stop_words_fi.go b/analysis/lang/fi/stop_words_fi.go new file mode 100644 index 00000000..7cf0c9c1 --- /dev/null +++ b/analysis/lang/fi/stop_words_fi.go @@ -0,0 +1,121 @@ +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_fi" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about +poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(FinnishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/hu/analyzer_hu.go b/analysis/lang/hu/analyzer_hu.go new file mode 100644 index 00000000..6797a91e --- /dev/null +++ b/analysis/lang/hu/analyzer_hu.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "hu" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopHuFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerHuFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopHuFilter, + stemmerHuFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/hu/analyzer_hu_test.go b/analysis/lang/hu/analyzer_hu_test.go new file mode 100644 index 00000000..4a14dff6 --- /dev/null +++ b/analysis/lang/hu/analyzer_hu_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hu + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestHungarianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("babakocsi"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("babakocs"), + }, + }, + }, + { + input: []byte("babakocsijáért"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("babakocs"), + }, + }, + }, + // stop word + { + input: []byte("által"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/hu/stemmer_hu.go b/analysis/lang/hu/stemmer_hu.go new file mode 100644 index 00000000..b380818a --- /dev/null +++ b/analysis/lang/hu/stemmer_hu.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/hungarian" +) + +const SnowballStemmerName = "stemmer_hu_snowball" + +type HungarianStemmerFilter struct { +} + +func NewHungarianStemmerFilter() *HungarianStemmerFilter { + return &HungarianStemmerFilter{} +} + +func (s *HungarianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + hungarian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func HungarianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewHungarianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, HungarianStemmerFilterConstructor) +} diff --git a/analysis/lang/hu/stop_filter_hu.go b/analysis/lang/hu/stop_filter_hu.go new file mode 100644 index 00000000..a83fd4cc --- /dev/null +++ b/analysis/lang/hu/stop_filter_hu.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/hu/stop_words_hu.go b/analysis/lang/hu/stop_words_hu.go new file mode 100644 index 00000000..fe45d55e --- /dev/null +++ b/analysis/lang/hu/stop_words_hu.go @@ -0,0 +1,235 @@ +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_hu" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var HungarianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(HungarianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/nl/analyzer_nl.go b/analysis/lang/nl/analyzer_nl.go new file mode 100644 index 00000000..69853a9e --- /dev/null +++ b/analysis/lang/nl/analyzer_nl.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "nl" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopNlFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerNlFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopNlFilter, + stemmerNlFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/nl/analyzer_nl_test.go b/analysis/lang/nl/analyzer_nl_test.go new file mode 100644 index 00000000..21e851c3 --- /dev/null +++ b/analysis/lang/nl/analyzer_nl_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestDutchAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("lichamelijk"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("licham"), + }, + }, + }, + { + input: []byte("lichamelijke"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("licham"), + }, + }, + }, + // stop word + { + input: []byte("van"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/nl/stemmer_nl.go b/analysis/lang/nl/stemmer_nl.go new file mode 100644 index 00000000..049d9216 --- /dev/null +++ b/analysis/lang/nl/stemmer_nl.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/dutch" +) + +const SnowballStemmerName = "stemmer_nl_snowball" + +type DutchStemmerFilter struct { +} + +func NewDutchStemmerFilter() *DutchStemmerFilter { + return &DutchStemmerFilter{} +} + +func (s *DutchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + dutch.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func DutchStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewDutchStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, DutchStemmerFilterConstructor) +} diff --git a/analysis/lang/nl/stop_filter_nl.go b/analysis/lang/nl/stop_filter_nl.go new file mode 100644 index 00000000..218f0f42 --- /dev/null +++ b/analysis/lang/nl/stop_filter_nl.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/nl/stop_words_nl.go b/analysis/lang/nl/stop_words_nl.go new file mode 100644 index 00000000..4adae100 --- /dev/null +++ b/analysis/lang/nl/stop_words_nl.go @@ -0,0 +1,143 @@ +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_nl" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var DutchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(DutchStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/no/analyzer_no.go b/analysis/lang/no/analyzer_no.go new file mode 100644 index 00000000..57d749ea --- /dev/null +++ b/analysis/lang/no/analyzer_no.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "no" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopNoFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerNoFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopNoFilter, + stemmerNoFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/no/analyzer_no_test.go b/analysis/lang/no/analyzer_no_test.go new file mode 100644 index 00000000..c73f5f73 --- /dev/null +++ b/analysis/lang/no/analyzer_no_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestNorwegianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("havnedistriktene"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("havnedistrikt"), + }, + }, + }, + { + input: []byte("havnedistrikter"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("havnedistrikt"), + }, + }, + }, + // stop word + { + input: []byte("det"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/no/stemmer_no.go b/analysis/lang/no/stemmer_no.go new file mode 100644 index 00000000..e61e0247 --- /dev/null +++ b/analysis/lang/no/stemmer_no.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/norwegian" +) + +const SnowballStemmerName = "stemmer_no_snowball" + +type NorwegianStemmerFilter struct { +} + +func NewNorwegianStemmerFilter() *NorwegianStemmerFilter { + return &NorwegianStemmerFilter{} +} + +func (s *NorwegianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + norwegian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func NorwegianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewNorwegianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, NorwegianStemmerFilterConstructor) +} diff --git a/analysis/lang/no/stop_filter_no.go b/analysis/lang/no/stop_filter_no.go new file mode 100644 index 00000000..093688fa --- /dev/null +++ b/analysis/lang/no/stop_filter_no.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/no/stop_words_no.go b/analysis/lang/no/stop_words_no.go new file mode 100644 index 00000000..bfca3484 --- /dev/null +++ b/analysis/lang/no/stop_words_no.go @@ -0,0 +1,218 @@ +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_no" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var NorwegianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) * +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(NorwegianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/ro/analyzer_ro.go b/analysis/lang/ro/analyzer_ro.go new file mode 100644 index 00000000..e2938815 --- /dev/null +++ b/analysis/lang/ro/analyzer_ro.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "ro" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopRoFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerRoFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopRoFilter, + stemmerRoFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/ro/analyzer_ro_test.go b/analysis/lang/ro/analyzer_ro_test.go new file mode 100644 index 00000000..ee8b88f8 --- /dev/null +++ b/analysis/lang/ro/analyzer_ro_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ro + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestRomanianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("absenţa"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("absenţ"), + }, + }, + }, + { + input: []byte("absenţi"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("absenţ"), + }, + }, + }, + // stop word + { + input: []byte("îl"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/ro/stemmer_ro.go b/analysis/lang/ro/stemmer_ro.go new file mode 100644 index 00000000..3966215f --- /dev/null +++ b/analysis/lang/ro/stemmer_ro.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/romanian" +) + +const SnowballStemmerName = "stemmer_ro_snowball" + +type RomanianStemmerFilter struct { +} + +func NewRomanianStemmerFilter() *RomanianStemmerFilter { + return &RomanianStemmerFilter{} +} + +func (s *RomanianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + romanian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func RomanianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewRomanianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, RomanianStemmerFilterConstructor) +} diff --git a/analysis/lang/ro/stop_filter_ro.go b/analysis/lang/ro/stop_filter_ro.go new file mode 100644 index 00000000..a2f7f6dd --- /dev/null +++ b/analysis/lang/ro/stop_filter_ro.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/ro/stop_words_ro.go b/analysis/lang/ro/stop_words_ro.go new file mode 100644 index 00000000..e7d62d41 --- /dev/null +++ b/analysis/lang/ro/stop_words_ro.go @@ -0,0 +1,257 @@ +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_ro" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ +// ` was changed to ' to allow for literal string + +var RomanianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(RomanianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/sv/analyzer_sv.go b/analysis/lang/sv/analyzer_sv.go new file mode 100644 index 00000000..f650158d --- /dev/null +++ b/analysis/lang/sv/analyzer_sv.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "sv" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopSvFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerSvFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopSvFilter, + stemmerSvFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/sv/analyzer_sv_test.go b/analysis/lang/sv/analyzer_sv_test.go new file mode 100644 index 00000000..2d358b63 --- /dev/null +++ b/analysis/lang/sv/analyzer_sv_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSwedishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("jaktkarlarne"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("jaktkarl"), + }, + }, + }, + { + input: []byte("jaktkarlens"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("jaktkarl"), + }, + }, + }, + // stop word + { + input: []byte("och"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/sv/stemmer_sv.go b/analysis/lang/sv/stemmer_sv.go new file mode 100644 index 00000000..247f11bb --- /dev/null +++ b/analysis/lang/sv/stemmer_sv.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/swedish" +) + +const SnowballStemmerName = "stemmer_sv_snowball" + +type SwedishStemmerFilter struct { +} + +func NewSwedishStemmerFilter() *SwedishStemmerFilter { + return &SwedishStemmerFilter{} +} + +func (s *SwedishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + swedish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func SwedishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewSwedishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, SwedishStemmerFilterConstructor) +} diff --git a/analysis/lang/sv/stop_filter_sv.go b/analysis/lang/sv/stop_filter_sv.go new file mode 100644 index 00000000..46a533d1 --- /dev/null +++ b/analysis/lang/sv/stop_filter_sv.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/sv/stop_words_sv.go b/analysis/lang/sv/stop_words_sv.go new file mode 100644 index 00000000..b4022fd9 --- /dev/null +++ b/analysis/lang/sv/stop_words_sv.go @@ -0,0 +1,157 @@ +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_sv" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var SwedishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our +blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(SwedishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/tr/analyzer_tr.go b/analysis/lang/tr/analyzer_tr.go new file mode 100644 index 00000000..d52a1d5c --- /dev/null +++ b/analysis/lang/tr/analyzer_tr.go @@ -0,0 +1,63 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/apostrophe" + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "tr" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + aposFilter, err := cache.TokenFilterNamed(apostrophe.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopTrFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerTrFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + aposFilter, + toLowerFilter, + stopTrFilter, + stemmerTrFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/tr/analyzer_tr_test.go b/analysis/lang/tr/analyzer_tr_test.go new file mode 100644 index 00000000..fe898093 --- /dev/null +++ b/analysis/lang/tr/analyzer_tr_test.go @@ -0,0 +1,90 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tr + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestTurkishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("ağacı"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("ağaç"), + }, + }, + }, + { + input: []byte("ağaç"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("ağaç"), + }, + }, + }, + // stop word + { + input: []byte("dolayı"), + output: analysis.TokenStream{}, + }, + // apostrophes + { + input: []byte("Kıbrıs'ta"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kıbrıs"), + }, + }, + }, + { + input: []byte("Van Gölü'ne"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("van"), + }, + &analysis.Token{ + Term: []byte("göl"), + }, + }, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/tr/stemmer_tr.go b/analysis/lang/tr/stemmer_tr.go new file mode 100644 index 00000000..ba3034e1 --- /dev/null +++ b/analysis/lang/tr/stemmer_tr.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/turkish" +) + +const SnowballStemmerName = "stemmer_tr_snowball" + +type TurkishStemmerFilter struct { +} + +func NewTurkishStemmerFilter() *TurkishStemmerFilter { + return &TurkishStemmerFilter{} +} + +func (s *TurkishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + turkish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func TurkishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewTurkishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, TurkishStemmerFilterConstructor) +} diff --git a/analysis/lang/tr/stop_filter_tr.go b/analysis/lang/tr/stop_filter_tr.go new file mode 100644 index 00000000..5b616eb9 --- /dev/null +++ b/analysis/lang/tr/stop_filter_tr.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/tr/stop_words_tr.go b/analysis/lang/tr/stop_words_tr.go new file mode 100644 index 00000000..f96fb07e --- /dev/null +++ b/analysis/lang/tr/stop_words_tr.go @@ -0,0 +1,236 @@ +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_tr" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var TurkishStopWords = []byte(`# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile +bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi +hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(TurkishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +}