0
0

Merge pull request #586 from mschoch/add-german

add a pure Go German analyzer
This commit is contained in:
Marty Schoch 2017-04-29 19:46:08 -04:00 committed by GitHub
commit c0d5e75e70
7 changed files with 881 additions and 0 deletions

View File

@ -0,0 +1,61 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/registry"
)
// AnalyzerName is the name the German analyzer is registered under.
const AnalyzerName = "de"
// AnalyzerConstructor builds the German analyzer pipeline: unicode
// tokenization, then lowercasing, German stop word removal,
// umlaut/sharp-s normalization and light stemming, in that order.
// All components are resolved by name from the registry cache; the
// config map is unused.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}
	lowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
	if err != nil {
		return nil, err
	}
	stopFilter, err := cache.TokenFilterNamed(StopName)
	if err != nil {
		return nil, err
	}
	normalizeFilter, err := cache.TokenFilterNamed(NormalizeName)
	if err != nil {
		return nil, err
	}
	stemmerFilter, err := cache.TokenFilterNamed(LightStemmerName)
	if err != nil {
		return nil, err
	}
	return &analysis.Analyzer{
		Tokenizer: tokenizer,
		TokenFilters: []analysis.TokenFilter{
			lowerFilter,
			stopFilter,
			normalizeFilter,
			stemmerFilter,
		},
	}, nil
}
// init registers the German analyzer constructor under AnalyzerName.
func init() {
	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,155 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// TestGermanAnalyzer runs the full "de" analyzer pipeline (tokenize,
// lowercase, stop, normalize, light stem) over single words and checks
// the resulting token stream. Start/End are byte offsets into the
// input, so inputs containing umlauts span more bytes than runes.
func TestGermanAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("Tisch"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("tisch"),
					Position: 1,
					Start:    0,
					End:      5,
				},
			},
		},
		{
			input: []byte("Tische"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("tisch"),
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		{
			input: []byte("Tischen"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("tisch"),
					Position: 1,
					Start:    0,
					End:      7,
				},
			},
		},
		// german specials: the umlaut spelling and the "ae" digraph
		// spelling must normalize and stem to the same term
		{
			input: []byte("Schaltflächen"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("schaltflach"),
					Position: 1,
					Start:    0,
					End:      14, // 13 runes but 14 bytes (ä is 2 bytes)
				},
			},
		},
		{
			input: []byte("Schaltflaechen"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("schaltflach"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		// tests added by marty to increase coverage
		{
			input: []byte("Blechern"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("blech"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
		{
			input: []byte("Klecks"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("kleck"),
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		{
			input: []byte("Mindestens"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("mindest"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("Kugelfest"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("kugelf"),
					Position: 1,
					Start:    0,
					End:      9,
				},
			},
		},
		{
			input: []byte("Baldigst"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("baldig"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
	}
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}

View File

@ -0,0 +1,95 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// NormalizeName is the name the German normalization filter is registered under.
const NormalizeName = "normalize_de"
// States for the e-deletion machine in normalize: the classification of
// the previous rune determines whether a following 'e' may be dropped.
const (
	N = 0 /* ordinary state */
	V = 1 /* stops 'u' from entering umlaut state */
	U = 2 /* umlaut state, allows e-deletion */
)
// GermanNormalizeFilter folds German umlauts, "ae"/"oe"/"ue" digraphs
// and sharp-s into a single normalized form (see normalize).
type GermanNormalizeFilter struct {
}

// NewGermanNormalizeFilter returns a new, stateless German normalization filter.
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
	return &GermanNormalizeFilter{}
}
// Filter applies German normalization to every token in the stream,
// rewriting each token's term in place and returning the same stream.
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, tok := range input {
		tok.Term = normalize(tok.Term)
	}
	return input
}
// normalize folds German orthographic variants of a term to one form:
// ä/ö/ü become a/o/u, ß becomes "ss", and an 'e' is deleted when it
// immediately follows a vowel in the umlaut state, so the digraph
// spellings "ae"/"oe"/"ue" collapse to the same term as the umlaut
// spellings. A small state machine over the previous rune (N/V/U, see
// the constants above) controls when e-deletion is allowed.
func normalize(input []byte) []byte {
	state := N
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		case 'a', 'o':
			state = U
		case 'u':
			// 'u' only enters the umlaut state from the ordinary state;
			// after a vowel (e.g. "au" in "dauer") it must not trigger
			// e-deletion, so it goes to V instead
			if state == N {
				state = U
			} else {
				state = V
			}
		case 'e':
			if state == U {
				// drop the 'e' of an "ae"/"oe"/"ue" digraph; stay at
				// this index so the rune shifted into slot i is examined
				runes = analysis.DeleteRune(runes, i)
				i--
			}
			state = V
		case 'i', 'q', 'y':
			state = V
		case 'ä':
			runes[i] = 'a'
			state = V
		case 'ö':
			runes[i] = 'o'
			state = V
		case 'ü':
			runes[i] = 'u'
			state = V
		case 'ß':
			// expand sharp-s to "ss"
			runes[i] = 's'
			i++
			runes = analysis.InsertRune(runes, i, 's')
			state = N
		default:
			state = N
		}
	}
	return analysis.BuildTermFromRunes(runes)
}
// NormalizerFilterConstructor is the registry constructor for the German
// normalization token filter; config and cache are unused.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewGermanNormalizeFilter(), nil
}
// init registers the German normalization filter under NormalizeName.
func init() {
	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -0,0 +1,103 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
// TestGermanNormalizeFilter exercises the normalize filter in isolation
// (no lowercasing or stemming): umlaut and "ae"/"ue" digraph spellings
// must fold to the same term, "ue" after a vowel must be left alone,
// sharp-s must expand to "ss", and the empty term must pass through.
func TestGermanNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// Tests that a/o/u + e is equivalent to the umlaut form
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflächen"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflachen"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflaechen"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflachen"),
				},
			},
		},
		// Tests the specific heuristic that ue is not folded after a vowel or q.
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dauer"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dauer"),
				},
			},
		},
		// Tests german specific folding of sharp-s
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("weißbier"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("weissbier"),
				},
			},
		},
		// empty
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
	}
	germanNormalizeFilter := NewGermanNormalizeFilter()
	for _, test := range tests {
		actual := germanNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			// second message shows raw bytes, useful for invisible
			// differences such as encoding of folded characters
			t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
		}
	}
}

View File

@ -0,0 +1,116 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// LightStemmerName is the name the German light stemmer filter is registered under.
const LightStemmerName = "stemmer_de_light"
// GermanLightStemmerFilter applies a light German stemmer (accent folding
// plus two suffix-stripping passes, see stem) to each token.
type GermanLightStemmerFilter struct {
}

// NewGermanLightStemmerFilter returns a new, stateless German light stemmer filter.
func NewGermanLightStemmerFilter() *GermanLightStemmerFilter {
	return &GermanLightStemmerFilter{}
}
// Filter stems every token in the stream, rewriting each token's term
// in place and returning the same stream.
func (s *GermanLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, tok := range input {
		tok.Term = analysis.BuildTermFromRunes(stem(bytes.Runes(tok.Term)))
	}
	return input
}
func stem(input []rune) []rune {
for i, r := range input {
switch r {
case 'ä', 'à', 'á', 'â':
input[i] = 'a'
case 'ö', 'ò', 'ó', 'ô':
input[i] = 'o'
case 'ï', 'ì', 'í', 'î':
input[i] = 'i'
case 'ü', 'ù', 'ú', 'û':
input[i] = 'u'
}
}
input = step1(input)
return step2(input)
}
// stEnding reports whether ch is a consonant after which a trailing
// 's' (or "st" in step2) may be stripped.
func stEnding(ch rune) bool {
	for _, c := range "bdfghklmnt" {
		if c == ch {
			return true
		}
	}
	return false
}

// step1 strips one plural/inflection suffix: "ern", then "em"/"en"/
// "er"/"es", then a bare "e", then 's' after a valid consonant. The
// length guards keep short stems intact.
func step1(s []rune) []rune {
	n := len(s)
	if n > 5 && s[n-1] == 'n' && s[n-2] == 'r' && s[n-3] == 'e' {
		return s[:n-3]
	}
	if n > 4 && s[n-2] == 'e' {
		switch s[n-1] {
		case 'm', 'n', 'r', 's':
			return s[:n-2]
		}
	}
	if n > 3 && s[n-1] == 'e' {
		return s[:n-1]
	}
	if n > 3 && s[n-1] == 's' && stEnding(s[n-2]) {
		return s[:n-1]
	}
	return s
}

// step2 strips a derivational suffix from the result of step1: "est",
// then "er"/"en", then "st" after a valid consonant.
func step2(s []rune) []rune {
	n := len(s)
	if n > 5 && s[n-1] == 't' && s[n-2] == 's' && s[n-3] == 'e' {
		return s[:n-3]
	}
	if n > 4 && s[n-2] == 'e' && (s[n-1] == 'r' || s[n-1] == 'n') {
		return s[:n-2]
	}
	if n > 4 && s[n-1] == 't' && s[n-2] == 's' && stEnding(s[n-3]) {
		return s[:n-2]
	}
	return s
}
// GermanLightStemmerFilterConstructor is the registry constructor for the
// German light stemmer token filter; config and cache are unused.
func GermanLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewGermanLightStemmerFilter(), nil
}

// init registers the German light stemmer filter under LightStemmerName.
func init() {
	registry.RegisterTokenFilter(LightStemmerName, GermanLightStemmerFilterConstructor)
}

View File

@ -0,0 +1,33 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token/stop"
"github.com/blevesearch/bleve/registry"
)
// StopTokenFilterConstructor builds a stop token filter backed by the
// German stop word token map registered under StopName; config is unused.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	tm, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop.NewStopTokensFilter(tm), nil
}
// init registers the German stop token filter under StopName.
func init() {
	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -0,0 +1,318 @@
package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// StopName is the name the German stop word token map (and the stop
// filter built on it) are registered under.
const StopName = "stop_de"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The number of forms in this list is reduced significantly by passing it
| through the German stemmer.
aber | but
alle | all
allem
allen
aller
alles
als | than, as
also | so
am | an + dem
an | at
ander | other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch | also
auf | on
aus | out of
bei | by
bin | am
bis | until
bist | art
da | there
damit | with it
dann | then
der | the
den
des
dem
die
das
daß | that
derselbe | the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu | to that
dein | thy
deine
deinem
deinen
deiner
deines
denn | because
derer | of those
dessen | of him
dich | thee
dir | to thee
du | thou
dies | this
diese
diesem
diesen
dieser
dieses
doch | (several meanings)
dort | (over) there
durch | through
ein | a
eine
einem
einen
einer
eines
einig | some
einige
einigem
einigen
einiger
einiges
einmal | once
er | he
ihn | him
ihm | to him
es | it
etwas | something
euer | your
eure
eurem
euren
eurer
eures
für | for
gegen | towards
gewesen | p.p. of sein
hab | have
habe | have
haben | have
hat | has
hatte | had
hatten | had
hier | here
hin | there
hinter | behind
ich | I
mich | me
mir | to me
ihr | you, to her
ihre
ihrem
ihren
ihrer
ihres
euch | to you
im | in + dem
in | in
indem | while
ins | in + das
ist | is
jede | each, every
jedem
jeden
jeder
jedes
jene | that
jenem
jenen
jener
jenes
jetzt | now
kann | can
kein | no
keine
keinem
keinen
keiner
keines
können | can
könnte | could
machen | do
man | one
manche | some, many a
manchem
manchen
mancher
manches
mein | my
meine
meinem
meinen
meiner
meines
mit | with
muss | must
musste | had to
nach | to(wards)
nicht | not
nichts | nothing
noch | still, yet
nun | now
nur | only
ob | whether
oder | or
ohne | without
sehr | very
sein | his
seine
seinem
seinen
seiner
seines
selbst | self
sich | herself
sie | they, she
ihnen | to them
sind | are
so | so
solche | such
solchem
solchen
solcher
solches
soll | shall
sollte | should
sondern | but
sonst | else
über | over
um | about, around
und | and
uns | us
unse
unsem
unsen
unser
unses
unter | under
viel | much
vom | von + dem
von | from
vor | before
während | while
war | was
waren | were
warst | wast
was | what
weg | away, off
weil | because
weiter | further
welche | which
welchem
welchen
welcher
welches
wenn | when
werde | will
werden | will
wie | how
wieder | again
will | want
wir | we
wird | will
wirst | willst
wo | where
wollen | want
wollte | wanted
würde | would
würden | would
zu | to
zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between
`)
// TokenMapConstructor builds the German stop word token map from the
// embedded snowball word list; config and cache are unused.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	rv := analysis.NewTokenMap()
	if err := rv.LoadBytes(GermanStopWords); err != nil {
		// don't hand callers a partially-loaded map alongside the error
		return nil, err
	}
	return rv, nil
}
// init registers the German stop word token map under StopName.
func init() {
	registry.RegisterTokenMap(StopName, TokenMapConstructor)
}