diff --git a/analysis/lang/de/analyzer_de.go b/analysis/lang/de/analyzer_de.go new file mode 100644 index 00000000..dcf4e8e9 --- /dev/null +++ b/analysis/lang/de/analyzer_de.go @@ -0,0 +1,61 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" + "github.com/blevesearch/bleve/registry" +) + +const AnalyzerName = "de" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopDeFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName) + if err != nil { + return nil, err + } + lightStemmerDeFilter, err := cache.TokenFilterNamed(LightStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopDeFilter, + normalizeDeFilter, + lightStemmerDeFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/de/analyzer_de_test.go b/analysis/lang/de/analyzer_de_test.go new file mode 100644 index 00000000..18e2683b --- /dev/null +++ b/analysis/lang/de/analyzer_de_test.go @@ -0,0 +1,155 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestGermanAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + input: []byte("Tisch"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("tisch"), + Position: 1, + Start: 0, + End: 5, + }, + }, + }, + { + input: []byte("Tische"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("tisch"), + Position: 1, + Start: 0, + End: 6, + }, + }, + }, + { + input: []byte("Tischen"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("tisch"), + Position: 1, + Start: 0, + End: 7, + }, + }, + }, + // german specials + { + input: []byte("Schaltflächen"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("schaltflach"), + Position: 1, + Start: 0, + End: 14, + }, + }, + }, + { + input: []byte("Schaltflaechen"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("schaltflach"), + Position: 1, + Start: 0, + End: 14, + }, + }, + }, + // tests added by marty to increase coverage + { + input: []byte("Blechern"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("blech"), + Position: 1, + Start: 0, + End: 8, + }, + }, + }, + { + input: []byte("Klecks"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kleck"), + Position: 1, + Start: 0, + End: 6, + }, + }, + }, + { + input: []byte("Mindestens"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("mindest"), + Position: 1, + Start: 0, + End: 10, + }, + }, + }, + { + input: []byte("Kugelfest"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kugelf"), + Position: 1, + Start: 0, + End: 9, + }, + }, + }, + { + input: []byte("Baldigst"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("baldig"), + Position: 1, + Start: 0, + End: 8, + }, + }, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %v, got %v", test.output, actual) + } + } +} diff --git a/analysis/lang/de/german_normalize.go b/analysis/lang/de/german_normalize.go new file mode 100644 index 00000000..370efcbe --- /dev/null +++ b/analysis/lang/de/german_normalize.go @@ -0,0 +1,95 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "bytes" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const NormalizeName = "normalize_de" + +const ( + N = 0 /* ordinary state */ + V = 1 /* stops 'u' from entering umlaut state */ + U = 2 /* umlaut state, allows e-deletion */ +) + +type GermanNormalizeFilter struct { +} + +func NewGermanNormalizeFilter() *GermanNormalizeFilter { + return &GermanNormalizeFilter{} +} + +func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + term := normalize(token.Term) + token.Term = term + } + return input +} + +func normalize(input []byte) []byte { + state := N + runes := bytes.Runes(input) + for i := 0; i < len(runes); i++ { + switch runes[i] { + case 'a', 'o': + state = U + case 'u': + if state == N { + state = U + } else { + state = V + } + case 'e': + if state == U { + runes = analysis.DeleteRune(runes, i) + i-- + } + state = V + case 'i', 'q', 'y': + state = V + case 'ä': + runes[i] = 'a' + state = V + case 'ö': + runes[i] = 'o' + state = V + case 'ü': + runes[i] = 'u' + state = V + case 'ß': + runes[i] = 's' + i++ + runes = analysis.InsertRune(runes, i, 's') + state = N + default: + state = N + } + } + return analysis.BuildTermFromRunes(runes) +} + +func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewGermanNormalizeFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor) +} diff --git a/analysis/lang/de/german_normalize_test.go b/analysis/lang/de/german_normalize_test.go new file mode 100644 index 00000000..b95a7190 --- /dev/null +++ b/analysis/lang/de/german_normalize_test.go @@ -0,0 +1,103 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" +) + +func TestGermanNormalizeFilter(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + // Tests that a/o/u + e is equivalent to the umlaut form + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("Schaltflächen"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("Schaltflachen"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("Schaltflaechen"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("Schaltflachen"), + }, + }, + }, + // Tests the specific heuristic that ue is not folded after a vowel or q. + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("dauer"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("dauer"), + }, + }, + }, + // Tests german specific folding of sharp-s + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("weißbier"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("weissbier"), + }, + }, + }, + // empty + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte(""), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte(""), + }, + }, + }, + } + + germanNormalizeFilter := NewGermanNormalizeFilter() + for _, test := range tests { + actual := germanNormalizeFilter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %#v, got %#v", test.output, actual) + t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/lang/de/light_stemmer_de.go b/analysis/lang/de/light_stemmer_de.go new file mode 100644 index 00000000..c1a093c5 --- /dev/null +++ b/analysis/lang/de/light_stemmer_de.go @@ -0,0 +1,116 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "bytes" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const LightStemmerName = "stemmer_de_light" + +type GermanLightStemmerFilter struct { +} + +func NewGermanLightStemmerFilter() *GermanLightStemmerFilter { + return &GermanLightStemmerFilter{} +} + +func (s *GermanLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + runes := bytes.Runes(token.Term) + runes = stem(runes) + token.Term = analysis.BuildTermFromRunes(runes) + } + return input +} + +func stem(input []rune) []rune { + + for i, r := range input { + switch r { + case 'ä', 'à', 'á', 'â': + input[i] = 'a' + case 'ö', 'ò', 'ó', 'ô': + input[i] = 'o' + case 'ï', 'ì', 'í', 'î': + input[i] = 'i' + case 'ü', 'ù', 'ú', 'û': + input[i] = 'u' + } + } + + input = step1(input) + return step2(input) +} + +func stEnding(ch rune) bool { + switch ch { + case 'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't': + return true + } + return false +} + +func step1(s []rune) []rune { + l := len(s) + if l > 5 && s[l-3] == 'e' && s[l-2] == 'r' && s[l-1] == 'n' { + return s[:l-3] + } + + if l > 4 && s[l-2] == 'e' { + switch s[l-1] { + case 'm', 'n', 'r', 's': + return s[:l-2] + } + } + + if l > 3 && s[l-1] == 'e' { + return s[:l-1] + } + + if l > 3 && s[l-1] == 's' && stEnding(s[l-2]) { + return s[:l-1] + } + + return s +} + +func step2(s []rune) []rune { + l := len(s) + if l > 5 && s[l-3] == 'e' && s[l-2] == 's' && s[l-1] == 't' { + return s[:l-3] + } + + if l > 4 && s[l-2] == 'e' && (s[l-1] == 'r' || s[l-1] == 'n') { + return s[:l-2] + } + + if l > 4 && s[l-2] == 's' && s[l-1] == 't' && stEnding(s[l-3]) { + return s[:l-2] + } + + return s +} + +func GermanLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewGermanLightStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(LightStemmerName, GermanLightStemmerFilterConstructor) +} diff --git a/analysis/lang/de/stop_filter_de.go b/analysis/lang/de/stop_filter_de.go new file mode 100644 index 00000000..bcc423f0 --- /dev/null +++ b/analysis/lang/de/stop_filter_de.go @@ -0,0 +1,33 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/de/stop_words_de.go b/analysis/lang/de/stop_words_de.go new file mode 100644 index 00000000..b71c8f70 --- /dev/null +++ b/analysis/lang/de/stop_words_de.go @@ -0,0 +1,318 @@ +package de + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_de" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. + + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(GermanStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +}