Merge pull request #586 from mschoch/add-german

add a pure Go German analyzer
2017-04-29 19:46:08 -04:00 · 2017-04-29 19:46:08 -04:00 · c0d5e75e70
commit c0d5e75e70
parent 11a45d6f9c ce901a8870
7 changed files with 881 additions and 0 deletions
--- a/analysis/lang/de/analyzer_de.go
+++ b/analysis/lang/de/analyzer_de.go
@ -0,0 +1,61 @@
 //  Copyright (c) 2017 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // 		http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package de
 import (
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/analysis/token/lowercase"
 	"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
 	"github.com/blevesearch/bleve/registry"
 )
 // AnalyzerName is the name under which the German analyzer is registered.
 const AnalyzerName = "de"
 // AnalyzerConstructor assembles the German analyzer from components looked up
 // in cache: the unicode tokenizer followed, in order, by the lowercase,
 // German stop word, German normalization and German light stemmer token
 // filters. The config argument is not consulted. The first lookup error is
 // returned unchanged together with a nil analyzer.
 func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
 	tokenizer, err := cache.TokenizerNamed(unicode.Name)
 	if err != nil {
 		return nil, err
 	}
 	// Order matters: each filter consumes the output of the previous one.
 	filterNames := []string{lowercase.Name, StopName, NormalizeName, LightStemmerName}
 	filters := make([]analysis.TokenFilter, 0, len(filterNames))
 	for _, name := range filterNames {
 		filter, err := cache.TokenFilterNamed(name)
 		if err != nil {
 			return nil, err
 		}
 		filters = append(filters, filter)
 	}
 	return &analysis.Analyzer{
 		Tokenizer:    tokenizer,
 		TokenFilters: filters,
 	}, nil
 }
 // init registers AnalyzerConstructor with the registry under AnalyzerName.
 func init() {
 	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
 }
--- a/analysis/lang/de/analyzer_de_test.go
+++ b/analysis/lang/de/analyzer_de_test.go
@ -0,0 +1,155 @@
 //  Copyright (c) 2017 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // 		http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package de
 import (
 	"reflect"
 	"testing"
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
 )
 // TestGermanAnalyzer feeds single words through the registered German
 // analyzer and compares the resulting token stream with the expected one.
 func TestGermanAnalyzer(t *testing.T) {
 	// single builds the expected one-token stream: the token is at
 	// position 1, starts at offset 0 and ends at offset end.
 	single := func(term string, end int) analysis.TokenStream {
 		return analysis.TokenStream{
 			&analysis.Token{
 				Term:     []byte(term),
 				Position: 1,
 				Start:    0,
 				End:      end,
 			},
 		}
 	}
 	cases := []struct {
 		input  []byte
 		output analysis.TokenStream
 	}{
 		{input: []byte("Tisch"), output: single("tisch", 5)},
 		{input: []byte("Tische"), output: single("tisch", 6)},
 		{input: []byte("Tischen"), output: single("tisch", 7)},
 		// german specials
 		{input: []byte("Schaltflächen"), output: single("schaltflach", 14)},
 		{input: []byte("Schaltflaechen"), output: single("schaltflach", 14)},
 		// tests added by marty to increase coverage
 		{input: []byte("Blechern"), output: single("blech", 8)},
 		{input: []byte("Klecks"), output: single("kleck", 6)},
 		{input: []byte("Mindestens"), output: single("mindest", 10)},
 		{input: []byte("Kugelfest"), output: single("kugelf", 9)},
 		{input: []byte("Baldigst"), output: single("baldig", 8)},
 	}
 	analyzer, err := registry.NewCache().AnalyzerNamed(AnalyzerName)
 	if err != nil {
 		t.Fatal(err)
 	}
 	for _, tc := range cases {
 		got := analyzer.Analyze(tc.input)
 		if reflect.DeepEqual(got, tc.output) {
 			continue
 		}
 		t.Errorf("expected %v, got %v", tc.output, got)
 	}
 }
--- a/analysis/lang/de/german_normalize.go
+++ b/analysis/lang/de/german_normalize.go
@ -0,0 +1,95 @@
 //  Copyright (c) 2017 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // 		http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package de
 import (
 	"bytes"
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
 )
 // NormalizeName is the name under which the German normalization token
 // filter is registered.
 const NormalizeName = "normalize_de"
 // Scanner states used by normalize.
 const (
 	// N is the ordinary state.
 	N = iota
 	// V stops 'u' from entering the umlaut state.
 	V
 	// U is the umlaut state, which allows e-deletion.
 	U
 )
 // GermanNormalizeFilter is a token filter that rewrites each token's term
 // with normalize. It has no fields.
 type GermanNormalizeFilter struct {
 }
 // NewGermanNormalizeFilter returns a new GermanNormalizeFilter.
 func NewGermanNormalizeFilter() *GermanNormalizeFilter {
 	return &GermanNormalizeFilter{}
 }
 // Filter replaces the term of every token in input with its normalized form.
 // Tokens are updated in place and the same stream is returned.
 func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	for i := range input {
 		input[i].Term = normalize(input[i].Term)
 	}
 	return input
 }
 // normalize returns a copy of input with German spelling variants folded:
 //   - 'ä', 'ö', 'ü' become 'a', 'o', 'u';
 //   - 'ß' becomes "ss";
 //   - an 'e' is dropped when the scanner is in the umlaut state U, which is
 //     entered by 'a', 'o', or by a 'u' seen in the ordinary state N.
 //
 // Any rune not listed in the switch resets the scanner to N.
 func normalize(input []byte) []byte {
 	src := bytes.Runes(input)
 	out := make([]rune, 0, len(src))
 	state := N
 	for _, r := range src {
 		// Most handled runes leave the scanner in V; the cases below
 		// override that where needed.
 		next := V
 		switch r {
 		case 'a', 'o':
 			next = U
 		case 'u':
 			// Only a 'u' seen in the ordinary state may start an umlaut.
 			if state == N {
 				next = U
 			}
 		case 'e':
 			if state == U {
 				// e-deletion: skip the rune entirely.
 				state = V
 				continue
 			}
 		case 'i', 'q', 'y':
 			// kept as-is, scanner moves to V
 		case 'ä':
 			r = 'a'
 		case 'ö':
 			r = 'o'
 		case 'ü':
 			r = 'u'
 		case 'ß':
 			// Emit the first 's' here; the second is appended below.
 			out = append(out, 's')
 			r = 's'
 			next = N
 		default:
 			next = N
 		}
 		out = append(out, r)
 		state = next
 	}
 	return analysis.BuildTermFromRunes(out)
 }
 // NormalizerFilterConstructor returns a new GermanNormalizeFilter and a nil
 // error. Neither config nor cache is consulted.
 func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 	return NewGermanNormalizeFilter(), nil
 }
 // init registers NormalizerFilterConstructor with the registry under
 // NormalizeName.
 func init() {
 	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
 }
--- a/analysis/lang/de/german_normalize_test.go
+++ b/analysis/lang/de/german_normalize_test.go
@ -0,0 +1,103 @@
 //  Copyright (c) 2017 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // 		http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package de
 import (
 	"reflect"
 	"testing"
 	"github.com/blevesearch/bleve/analysis"
 )
 // TestGermanNormalizeFilter applies the German normalization filter to
 // single-token streams and compares the result with the expected stream.
 func TestGermanNormalizeFilter(t *testing.T) {
 	// one wraps term in a stream holding a single token.
 	one := func(term []byte) analysis.TokenStream {
 		return analysis.TokenStream{
 			&analysis.Token{
 				Term: term,
 			},
 		}
 	}
 	cases := []struct {
 		input  analysis.TokenStream
 		output analysis.TokenStream
 	}{
 		// Tests that a/o/u + e is equivalent to the umlaut form
 		{input: one([]byte("Schaltflächen")), output: one([]byte("Schaltflachen"))},
 		{input: one([]byte("Schaltflaechen")), output: one([]byte("Schaltflachen"))},
 		// Tests the specific heuristic that ue is not folded after a vowel or q.
 		{input: one([]byte("dauer")), output: one([]byte("dauer"))},
 		// Tests german specific folding of sharp-s
 		{input: one([]byte("weißbier")), output: one([]byte("weissbier"))},
 		// empty
 		{input: one([]byte("")), output: one([]byte(""))},
 	}
 	filter := NewGermanNormalizeFilter()
 	for _, tc := range cases {
 		got := filter.Filter(tc.input)
 		if reflect.DeepEqual(got, tc.output) {
 			continue
 		}
 		t.Errorf("expected %#v, got %#v", tc.output, got)
 		t.Errorf("expected %s(% x), got %s(% x)", tc.output[0].Term, tc.output[0].Term, got[0].Term, got[0].Term)
 	}
 }
--- a/analysis/lang/de/light_stemmer_de.go
+++ b/analysis/lang/de/light_stemmer_de.go
@ -0,0 +1,116 @@
 //  Copyright (c) 2017 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // 		http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package de
 import (
 	"bytes"
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
 )
 // LightStemmerName is the name under which the German light stemmer token
 // filter is registered.
 const LightStemmerName = "stemmer_de_light"
 // GermanLightStemmerFilter is a token filter that rewrites each token's term
 // with stem. It has no fields.
 type GermanLightStemmerFilter struct {
 }
 // NewGermanLightStemmerFilter returns a new GermanLightStemmerFilter.
 func NewGermanLightStemmerFilter() *GermanLightStemmerFilter {
 	return &GermanLightStemmerFilter{}
 }
 // Filter stems the term of every token in input: the term is decoded to
 // runes, passed through stem, and re-encoded. Tokens are updated in place
 // and the same stream is returned.
 func (s *GermanLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	for _, tok := range input {
 		tok.Term = analysis.BuildTermFromRunes(stem(bytes.Runes(tok.Term)))
 	}
 	return input
 }
 func stem(input []rune) []rune {
 	for i, r := range input {
 		switch r {
 		case 'ä', 'à', 'á', 'â':
 			input[i] = 'a'
 		case 'ö', 'ò', 'ó', 'ô':
 			input[i] = 'o'
 		case 'ï', 'ì', 'í', 'î':
 			input[i] = 'i'
 		case 'ü', 'ù', 'ú', 'û':
 			input[i] = 'u'
 		}
 	}
 	input = step1(input)
 	return step2(input)
 }
 // stEnding reports whether ch is one of the letters b, d, f, g, h, k, l, m,
 // n or t. step1 and step2 use it to decide whether a trailing "s" or "st"
 // may be removed after ch.
 func stEnding(ch rune) bool {
 	for _, allowed := range "bdfghklmnt" {
 		if ch == allowed {
 			return true
 		}
 	}
 	return false
 }
 // step1 removes at most one suffix from s and returns the shortened slice.
 // The rules are tried in this order, first match wins:
 //   - "ern" when s has more than 5 runes;
 //   - "em", "en", "er" or "es" when s has more than 4 runes;
 //   - "e" when s has more than 3 runes;
 //   - "s" preceded by an stEnding letter when s has more than 3 runes.
 func step1(s []rune) []rune {
 	n := len(s)
 	if n <= 3 {
 		// Every rule needs at least four runes.
 		return s
 	}
 	last, prev := s[n-1], s[n-2]
 	switch {
 	case n > 5 && s[n-3] == 'e' && prev == 'r' && last == 'n':
 		return s[:n-3]
 	case n > 4 && prev == 'e' && (last == 'm' || last == 'n' || last == 'r' || last == 's'):
 		return s[:n-2]
 	case last == 'e':
 		return s[:n-1]
 	case last == 's' && stEnding(prev):
 		return s[:n-1]
 	}
 	return s
 }
 // step2 removes at most one suffix from s and returns the shortened slice.
 // The rules are tried in this order, first match wins:
 //   - "est" when s has more than 5 runes;
 //   - "er" or "en" when s has more than 4 runes;
 //   - "st" preceded by an stEnding letter when s has more than 4 runes.
 func step2(s []rune) []rune {
 	n := len(s)
 	if n <= 4 {
 		// Every rule needs at least five runes.
 		return s
 	}
 	last, prev, third := s[n-1], s[n-2], s[n-3]
 	switch {
 	case n > 5 && third == 'e' && prev == 's' && last == 't':
 		return s[:n-3]
 	case prev == 'e' && (last == 'r' || last == 'n'):
 		return s[:n-2]
 	case prev == 's' && last == 't' && stEnding(third):
 		return s[:n-2]
 	}
 	return s
 }
 // GermanLightStemmerFilterConstructor returns a new GermanLightStemmerFilter
 // and a nil error. Neither config nor cache is consulted.
 func GermanLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 	return NewGermanLightStemmerFilter(), nil
 }
 // init registers GermanLightStemmerFilterConstructor with the registry under
 // LightStemmerName.
 func init() {
 	registry.RegisterTokenFilter(LightStemmerName, GermanLightStemmerFilterConstructor)
 }
--- a/analysis/lang/de/stop_filter_de.go
+++ b/analysis/lang/de/stop_filter_de.go
@ -0,0 +1,33 @@
 //  Copyright (c) 2017 Couchbase, Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // 		http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package de
 import (
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/analysis/token/stop"
 	"github.com/blevesearch/bleve/registry"
 )
 // StopTokenFilterConstructor looks up the token map named StopName in cache
 // and returns a stop token filter built from it. A lookup error is returned
 // unchanged together with a nil filter. The config argument is not consulted.
 func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 	stopWords, err := cache.TokenMapNamed(StopName)
 	if err != nil {
 		return nil, err
 	}
 	filter := stop.NewStopTokensFilter(stopWords)
 	return filter, nil
 }
 // init registers StopTokenFilterConstructor with the registry under StopName.
 func init() {
 	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
 }
--- a/analysis/lang/de/stop_words_de.go
+++ b/analysis/lang/de/stop_words_de.go
@ -0,0 +1,318 @@
 package de
 import (
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
 )
 // StopName is the name under which both the German stop word token map and
 // the German stop token filter are registered.
 const StopName = "stop_de"
 // GermanStopWords holds the German stop word list in the Snowball stop-file
 // format: each stop word is at the start of a line and a vertical bar begins
 // a comment.
 //
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
 // ` was changed to ' to allow for literal string
 var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
|  - Encoding was converted to UTF-8.
|  - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The number of forms in this list is reduced significantly by passing it
| through the German stemmer.
aber           |  but
alle           |  all
allem
allen
aller
alles
als            |  than, as
also           |  so
am             |  an + dem
an             |  at
ander          |  other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch           |  also
auf            |  on
aus            |  out of
bei            |  by
bin            |  am
bis            |  until
bist           |  art
da             |  there
damit          |  with it
dann           |  then
der            |  the
den
des
dem
die
das
daß            |  that
derselbe       |  the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu           |  to that
dein           |  thy
deine
deinem
deinen
deiner
deines
denn           |  because
derer          |  of those
dessen         |  of him
dich           |  thee
dir            |  to thee
du             |  thou
dies           |  this
diese
diesem
diesen
dieser
dieses
doch           |  (several meanings)
dort           |  (over) there
durch          |  through
ein            |  a
eine
einem
einen
einer
eines
einig          |  some
einige
einigem
einigen
einiger
einiges
einmal         |  once
er             |  he
ihn            |  him
ihm            |  to him
es             |  it
etwas          |  something
euer           |  your
eure
eurem
euren
eurer
eures
für            |  for
gegen          |  towards
gewesen        |  p.p. of sein
hab            |  have
habe           |  have
haben          |  have
hat            |  has
hatte          |  had
hatten         |  had
hier           |  here
hin            |  there
hinter         |  behind
ich            |  I
mich           |  me
mir            |  to me
ihr            |  you, to her
ihre
ihrem
ihren
ihrer
ihres
euch           |  to you
im             |  in + dem
in             |  in
indem          |  while
ins            |  in + das
ist            |  is
jede           |  each, every
jedem
jeden
jeder
jedes
jene           |  that
jenem
jenen
jener
jenes
jetzt          |  now
kann           |  can
kein           |  no
keine
keinem
keinen
keiner
keines
können         |  can
könnte         |  could
machen         |  do
man            |  one
manche         |  some, many a
manchem
manchen
mancher
manches
mein           |  my
meine
meinem
meinen
meiner
meines
mit            |  with
muss           |  must
musste         |  had to
nach           |  to(wards)
nicht          |  not
nichts         |  nothing
noch           |  still, yet
nun            |  now
nur            |  only
ob             |  whether
oder           |  or
ohne           |  without
sehr           |  very
sein           |  his
seine
seinem
seinen
seiner
seines
selbst         |  self
sich           |  herself
sie            |  they, she
ihnen          |  to them
sind           |  are
so             |  so
solche         |  such
solchem
solchen
solcher
solches
soll           |  shall
sollte         |  should
sondern        |  but
sonst          |  else
über           |  over
um             |  about, around
und            |  and
uns            |  us
unse
unsem
unsen
unser
unses
unter          |  under
viel           |  much
vom            |  von + dem
von            |  from
vor            |  before
während        |  while
war            |  was
waren          |  were
warst          |  wast
was            |  what
weg            |  away, off
weil           |  because
weiter         |  further
welche         |  which
welchem
welchen
welcher
welches
wenn           |  when
werde          |  will
werden         |  will
wie            |  how
wieder         |  again
will           |  want
wir            |  we
wird           |  will
wirst          |  willst
wo             |  where
wollen         |  want
wollte         |  wanted
würde          |  would
würden         |  would
zu             |  to
zum            |  zu + dem
zur            |  zu + der
zwar           |  indeed
zwischen       |  between
`)
 // TokenMapConstructor creates a new token map and loads GermanStopWords into
 // it. The map is returned together with any error reported by LoadBytes.
 // Neither config nor cache is consulted.
 func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
 	words := analysis.NewTokenMap()
 	if err := words.LoadBytes(GermanStopWords); err != nil {
 		return words, err
 	}
 	return words, nil
 }
 // init registers TokenMapConstructor with the registry under StopName.
 func init() {
 	registry.RegisterTokenMap(StopName, TokenMapConstructor)
 }