
added German normalizer

updated German analyzer to use this normalizer
closes #65
Marty Schoch 2014-08-11 19:25:37 -04:00
parent a4707ebb4e
commit cd0e3fd85b
5 changed files with 233 additions and 1 deletion

@@ -0,0 +1,86 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package german_normalize

import (
	"bytes"

	"github.com/couchbaselabs/bleve/analysis"
)

const (
	N = 0 /* ordinary state */
	V = 1 /* stops 'u' from entering umlaut state */
	U = 2 /* umlaut state, allows e-deletion */
)

type GermanNormalizeFilter struct {
}

func NewGermanNormalizeFilter() *GermanNormalizeFilter {
	return &GermanNormalizeFilter{}
}

func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0, len(input))
	for _, token := range input {
		token.Term = normalize(token.Term)
		rv = append(rv, token)
	}
	return rv
}
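
// normalize applies the German folding rules to a single term: umlauts are
// replaced by their base vowels, the 'e' of an 'ae'/'oe'/'ue' digraph is
// deleted when the preceding vowel can begin such a digraph (state U), and
// 'ß' is expanded to 'ss'.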
func normalize(input []byte) []byte {
	state := N
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		case 'a', 'o':
			state = U
		case 'u':
			if state == N {
				state = U
			} else {
				state = V
			}
		case 'e':
			if state == U {
				runes = analysis.DeleteRune(runes, i)
				i--
			}
			state = V
		case 'i', 'q', 'y':
			state = V
		case 'ä':
			runes[i] = 'a'
			state = V
		case 'ö':
			runes[i] = 'o'
			state = V
		case 'ü':
			runes[i] = 'u'
			state = V
		case 'ß':
			runes[i] = 's'
			i++
			runes = analysis.InsertRune(runes, i, 's')
			state = N
		default:
			state = N
		}
	}
	return analysis.BuildTermFromRunes(runes)
}
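
To see the new filter end to end, here is a minimal sketch that feeds a few terms through it; it uses only the NewGermanNormalizeFilter constructor and the analysis.Token/analysis.TokenStream types that appear in this commit:

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/token_filters/german_normalize"
)

func main() {
	filter := german_normalize.NewGermanNormalizeFilter()
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("schaltflaechen")}, // "ae" folds to "a"
		&analysis.Token{Term: []byte("dauer")},          // "ue" after a vowel is kept
		&analysis.Token{Term: []byte("weißbier")},       // "ß" expands to "ss"
	}
	for _, token := range filter.Filter(stream) {
		fmt.Println(string(token.Term)) // schaltflachen, dauer, weissbier
	}
}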

@@ -0,0 +1,97 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package german_normalize

import (
	"reflect"
	"testing"

	"github.com/couchbaselabs/bleve/analysis"
)

func TestGermanNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// Tests that a/o/u + e is equivalent to the umlaut form
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflächen"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflachen"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflaechen"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Schaltflachen"),
				},
			},
		},
		// Tests the specific heuristic that ue is not folded after a vowel or q.
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dauer"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("dauer"),
				},
			},
		},
		// Tests the German-specific folding of sharp-s (ß) to ss
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("weißbier"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("weissbier"),
				},
			},
		},
		// empty term passes through unchanged
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
	}

	germanNormalizeFilter := NewGermanNormalizeFilter()
	for _, test := range tests {
		actual := germanNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
		}
	}
}
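
These cases run with the standard Go tool chain: go test github.com/couchbaselabs/bleve/analysis/token_filters/german_normalize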

@@ -20,6 +20,18 @@ func DeleteRune(in []rune, pos int) []rune {
	return in[:len(in)-1]
}
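
// InsertRune returns a new slice with r inserted at pos; the input slice is
// never modified, so callers must use the returned value.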
func InsertRune(in []rune, pos int, r rune) []rune {
	// create a new slice 1 rune larger
	rv := make([]rune, len(in)+1)
	// copy the characters before the insert pos
	copy(rv[0:pos], in[0:pos])
	// set the inserted rune
	rv[pos] = r
	// copy the characters after the insert pos
	copy(rv[pos+1:], in[pos:])
	return rv
}

func BuildTermFromRunes(runes []rune) []byte {
	rv := make([]byte, 0, len(runes)*4)
	for _, r := range runes {

@@ -33,3 +33,38 @@ func TestDeleteRune(t *testing.T) {
		}
	}
}

func TestInsertRune(t *testing.T) {
	tests := []struct {
		in      []rune
		insPos  int
		insRune rune
		out     []rune
	}{
		{
			in:      []rune{'a', 'b', 'c'},
			insPos:  1,
			insRune: 'x',
			out:     []rune{'a', 'x', 'b', 'c'},
		},
		{
			in:      []rune{'a', 'b', 'c'},
			insPos:  0,
			insRune: 'x',
			out:     []rune{'x', 'a', 'b', 'c'},
		},
		{
			in:      []rune{'a', 'b', 'c'},
			insPos:  3,
			insRune: 'x',
			out:     []rune{'a', 'b', 'c', 'x'},
		},
	}
	for _, test := range tests {
		actual := InsertRune(test.in, test.insPos, test.insRune)
		if !reflect.DeepEqual(actual, test.out) {
			t.Errorf("expected %#v, got %#v", test.out, actual)
		}
	}
}

@@ -27,6 +27,7 @@ import (
	"github.com/couchbaselabs/bleve/analysis/token_filters/arabic_normalize"
	"github.com/couchbaselabs/bleve/analysis/token_filters/cld2"
	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
	"github.com/couchbaselabs/bleve/analysis/token_filters/german_normalize"
	"github.com/couchbaselabs/bleve/analysis/token_filters/length_filter"
	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
	"github.com/couchbaselabs/bleve/analysis/token_filters/persian_normalize"
@@ -296,6 +297,7 @@ func init() {
	Config.Analysis.TokenFilters["normalize_ckb"] = sorani_normalize.NewSoraniNormalizeFilter()
	Config.Analysis.TokenFilters["normalize_fa"] = persian_normalize.NewPersianNormalizeFilter()
	Config.Analysis.TokenFilters["normalize_ar"] = arabic_normalize.NewArabicNormalizeFilter()
	Config.Analysis.TokenFilters["normalize_de"] = german_normalize.NewGermanNormalizeFilter()

	// register analyzers
	keywordAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "single", []string{})
@@ -318,7 +320,7 @@ func init() {
	Config.Analysis.Analyzers["fi"] = finnishAnalyzer
	frenchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_fr", "to_lower", "stop_token_fr", "stemmer_fr"})
	Config.Analysis.Analyzers["fr"] = frenchAnalyzer
	germanAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_de", "stemmer_de"})
	germanAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_de", "normalize_de", "stemmer_de"})
	Config.Analysis.Analyzers["de"] = germanAnalyzer
	hungarianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_hu", "stemmer_hu"})
	Config.Analysis.Analyzers["hu"] = hungarianAnalyzer
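
To exercise the updated pipeline end to end, here is a minimal sketch; it assumes the package-level bleve.Config populated in this init function and an Analyze method on the registered analyzer (neither the method nor its signature appears in this diff):

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve"
)

func main() {
	// the "de" analyzer now runs: to_lower -> stop_token_de -> normalize_de -> stemmer_de
	deAnalyzer := bleve.Config.Analysis.Analyzers["de"]

	// assumed: Analyze tokenizes the input and runs it through the filter chain
	for _, token := range deAnalyzer.Analyze([]byte("die Schaltflächen")) {
		fmt.Println(string(token.Term))
	}
}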