Added German normalizer.
Updated the German analyzer to use this normalizer; closes #65.
This commit is contained in:
parent
a4707ebb4e
commit
cd0e3fd85b
|
@ -0,0 +1,86 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package german_normalize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
// States for the umlaut-folding state machine in normalize. The state
// tracks what the previous rune was, so that an 'e' is only deleted
// when it follows a lone a/o/u (the ae/oe/ue digraph forms).
const (
	N = 0 /* ordinary state */
	V = 1 /* stops 'u' from entering umlaut state */
	U = 2 /* umlaut state, allows e-deletion */
)
|
||||
|
||||
// GermanNormalizeFilter is a token filter that normalizes German text:
// it folds the umlauts ä/ö/ü (and the digraphs ae/oe/ue) to the base
// vowel and expands ß to ss.
type GermanNormalizeFilter struct {
}
|
||||
|
||||
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
|
||||
return &GermanNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
rv = append(rv, token)
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
state := N
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case 'a', 'o':
|
||||
state = U
|
||||
case 'u':
|
||||
if state == N {
|
||||
state = U
|
||||
} else {
|
||||
state = V
|
||||
}
|
||||
case 'e':
|
||||
if state == U {
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
state = V
|
||||
case 'i', 'q', 'y':
|
||||
state = V
|
||||
case 'ä':
|
||||
runes[i] = 'a'
|
||||
state = V
|
||||
case 'ö':
|
||||
runes[i] = 'o'
|
||||
state = V
|
||||
case 'ü':
|
||||
runes[i] = 'u'
|
||||
state = V
|
||||
case 'ß':
|
||||
runes[i] = 's'
|
||||
i++
|
||||
// newrunes := make([]rune, len(runes)+1)
|
||||
// copy(newrunes, runes)
|
||||
// runes = newrunes
|
||||
// runes[i] = 's'
|
||||
runes = analysis.InsertRune(runes, i, 's')
|
||||
state = N
|
||||
default:
|
||||
state = N
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
package german_normalize
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/couchbaselabs/bleve/analysis"
|
||||
)
|
||||
|
||||
// TestGermanNormalizeFilter runs the filter over a table of single-token
// streams and compares the full resulting stream with reflect.DeepEqual.
func TestGermanNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// Tests that a/o/u + e is equivalent to the umlaut form
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflächen"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflachen"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflaechen"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflachen"),
				},
			},
		},
		// Tests the specific heuristic that ue is not folded after a vowel or q.
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dauer"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dauer"),
				},
			},
		},
		// Tests german specific folding of sharp-s (ß -> ss)
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("weißbier"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("weissbier"),
				},
			},
		},
		// empty term passes through unchanged
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
	}

	germanNormalizeFilter := NewGermanNormalizeFilter()
	for _, test := range tests {
		actual := germanNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			// Also print the terms as text and hex to make rune-level
			// differences visible.
			t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
		}
	}
}
|
|
@ -20,6 +20,18 @@ func DeleteRune(in []rune, pos int) []rune {
|
|||
return in[:len(in)-1]
|
||||
}
|
||||
|
||||
// InsertRune returns a new slice containing the runes of in with r
// inserted at index pos. The input slice is not modified.
func InsertRune(in []rune, pos int, r rune) []rune {
	// allocate once at the final size, then build the result by
	// appending the prefix, the new rune, and the suffix
	rv := make([]rune, 0, len(in)+1)
	rv = append(rv, in[:pos]...)
	rv = append(rv, r)
	rv = append(rv, in[pos:]...)
	return rv
}
|
||||
|
||||
func BuildTermFromRunes(runes []rune) []byte {
|
||||
rv := make([]byte, 0, len(runes)*4)
|
||||
for _, r := range runes {
|
||||
|
|
|
@ -33,3 +33,38 @@ func TestDeleteRune(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestInsertRune(t *testing.T) {
|
||||
tests := []struct {
|
||||
in []rune
|
||||
insPos int
|
||||
insRune rune
|
||||
out []rune
|
||||
}{
|
||||
{
|
||||
in: []rune{'a', 'b', 'c'},
|
||||
insPos: 1,
|
||||
insRune: 'x',
|
||||
out: []rune{'a', 'x', 'b', 'c'},
|
||||
},
|
||||
{
|
||||
in: []rune{'a', 'b', 'c'},
|
||||
insPos: 0,
|
||||
insRune: 'x',
|
||||
out: []rune{'x', 'a', 'b', 'c'},
|
||||
},
|
||||
{
|
||||
in: []rune{'a', 'b', 'c'},
|
||||
insPos: 3,
|
||||
insRune: 'x',
|
||||
out: []rune{'a', 'b', 'c', 'x'},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
actual := InsertRune(test.in, test.insPos, test.insRune)
|
||||
if !reflect.DeepEqual(actual, test.out) {
|
||||
t.Errorf("expected %#v, got %#v", test.out, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,6 +27,7 @@ import (
|
|||
"github.com/couchbaselabs/bleve/analysis/token_filters/arabic_normalize"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/cld2"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/german_normalize"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/length_filter"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/couchbaselabs/bleve/analysis/token_filters/persian_normalize"
|
||||
|
@ -296,6 +297,7 @@ func init() {
|
|||
Config.Analysis.TokenFilters["normalize_ckb"] = sorani_normalize.NewSoraniNormalizeFilter()
|
||||
Config.Analysis.TokenFilters["normalize_fa"] = persian_normalize.NewPersianNormalizeFilter()
|
||||
Config.Analysis.TokenFilters["normalize_ar"] = arabic_normalize.NewArabicNormalizeFilter()
|
||||
Config.Analysis.TokenFilters["normalize_de"] = german_normalize.NewGermanNormalizeFilter()
|
||||
|
||||
// register analyzers
|
||||
keywordAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "single", []string{})
|
||||
|
@ -318,7 +320,7 @@ func init() {
|
|||
Config.Analysis.Analyzers["fi"] = finnishAnalyzer
|
||||
frenchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_fr", "to_lower", "stop_token_fr", "stemmer_fr"})
|
||||
Config.Analysis.Analyzers["fr"] = frenchAnalyzer
|
||||
germanAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_de", "stemmer_de"})
|
||||
germanAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_de", "normalize_de", "stemmer_de"})
|
||||
Config.Analysis.Analyzers["de"] = germanAnalyzer
|
||||
hungarianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_hu", "stemmer_hu"})
|
||||
Config.Analysis.Analyzers["hu"] = hungarianAnalyzer
|
||||
|
|
Loading…
Reference in New Issue