diff --git a/analysis/language/it/analyzer_it.go b/analysis/language/it/analyzer_it.go index c2a94375..9cb4021d 100644 --- a/analysis/language/it/analyzer_it.go +++ b/analysis/language/it/analyzer_it.go @@ -7,9 +7,6 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -// +build libstemmer full -// +build icu full - package it import ( @@ -17,13 +14,13 @@ import ( "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter" - "github.com/blevesearch/bleve/analysis/tokenizers/icu" + "github.com/blevesearch/bleve/analysis/tokenizers/unicode" ) const AnalyzerName = "it" func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { - icuTokenizer, err := cache.TokenizerNamed(icu.Name) + tokenizer, err := cache.TokenizerNamed(unicode.Name) if err != nil { return nil, err } @@ -39,12 +36,12 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( if err != nil { return nil, err } - stemmerItFilter, err := cache.TokenFilterNamed(StemmerName) + stemmerItFilter, err := cache.TokenFilterNamed(LightStemmerName) if err != nil { return nil, err } rv := analysis.Analyzer{ - Tokenizer: icuTokenizer, + Tokenizer: tokenizer, TokenFilters: []analysis.TokenFilter{ elisionFilter, toLowerFilter, diff --git a/analysis/language/it/analyzer_it_test.go b/analysis/language/it/analyzer_it_test.go index 9118bdeb..03cc8112 100644 --- a/analysis/language/it/analyzer_it_test.go +++ b/analysis/language/it/analyzer_it_test.go @@ -7,9 +7,6 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. 
-// +build libstemmer full -// +build icu full - package it import ( @@ -26,29 +23,44 @@ func TestItalianAnalyzer(t *testing.T) { output analysis.TokenStream }{ // stemming - // fails, stemming discrepencies - // abbandon intead of abbandonat - // { - // input: []byte("abbandonata"), - // output: analysis.TokenStream{ - // &analysis.Token{ - // Term: []byte("abbandonat"), - // }, - // }, - // }, - // { - // input: []byte("abbandonati"), - // output: analysis.TokenStream{ - // &analysis.Token{ - // Term: []byte("abbandonat"), - // }, - // }, - // }, + { + input: []byte("abbandonata"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("abbandonat"), + }, + }, + }, + { + input: []byte("abbandonati"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("abbandonat"), + }, + }, + }, // stop word { input: []byte("dallo"), output: analysis.TokenStream{}, }, + // contractions + { + input: []byte("dell'Italia"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("ital"), + }, + }, + }, + { + input: []byte("l'Italiano"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("italian"), + }, + }, + }, } cache := registry.NewCache() diff --git a/analysis/language/it/elision_it_test.go b/analysis/language/it/elision_it_test.go index b69624a6..5f261ba2 100644 --- a/analysis/language/it/elision_it_test.go +++ b/analysis/language/it/elision_it_test.go @@ -17,7 +17,7 @@ import ( "github.com/blevesearch/bleve/registry" ) -func TestFrenchElision(t *testing.T) { +func TestItalianElision(t *testing.T) { tests := []struct { input analysis.TokenStream output analysis.TokenStream diff --git a/analysis/language/it/light_stemmer_it.go b/analysis/language/it/light_stemmer_it.go new file mode 100644 index 00000000..282dda18 --- /dev/null +++ b/analysis/language/it/light_stemmer_it.go @@ -0,0 +1,96 @@ +// Copyright (c) 2015 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+package it
+
+import (
+	"bytes"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+// LightStemmerName is the name the Italian light stemmer filter is registered under.
+const LightStemmerName = "stemmer_it_light"
+
+// ItalianLightStemmerFilter is a token filter that rewrites every term with stem.
+type ItalianLightStemmerFilter struct {
+}
+
+// NewItalianLightStemmerFilter returns a new ItalianLightStemmerFilter.
+func NewItalianLightStemmerFilter() *ItalianLightStemmerFilter {
+	return &ItalianLightStemmerFilter{}
+}
+
+// NewItalianLightStemmerFilterFilter is the original, mis-typed constructor name.
+//
+// Deprecated: use NewItalianLightStemmerFilter instead.
+func NewItalianLightStemmerFilterFilter() *ItalianLightStemmerFilter {
+	return NewItalianLightStemmerFilter()
+}
+
+// Filter stems the term of every token in place and returns the same stream.
+func (s *ItalianLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		token.Term = analysis.BuildTermFromRunes(stem(bytes.Runes(token.Term)))
+	}
+	return input
+}
+
+// stem applies the light stemming rules to input and returns the result as a
+// prefix of that same slice. Input shorter than six runes is returned untouched.
+// Otherwise accented vowels are first overwritten in place with their plain
+// form; then a final 'e', 'i', 'a' or 'o' is cut off, along with the rune
+// before it when that rune is 'i' (or, for 'e' and 'i' endings, 'h').
+func stem(input []rune) []rune {
+	inputLen := len(input)
+	if inputLen < 6 {
+		return input
+	}
+
+	for i, r := range input {
+		switch r {
+		case 'à', 'á', 'â', 'ä':
+			input[i] = 'a'
+		case 'ò', 'ó', 'ô', 'ö':
+			input[i] = 'o'
+		case 'è', 'é', 'ê', 'ë':
+			input[i] = 'e'
+		case 'ù', 'ú', 'û', 'ü':
+			input[i] = 'u'
+		case 'ì', 'í', 'î', 'ï':
+			input[i] = 'i'
+		}
+	}
+
+	// Read the last two runes only now: the folding above may have changed them.
+	last, prev := input[inputLen-1], input[inputLen-2]
+	switch last {
+	case 'e', 'i':
+		if prev == 'i' || prev == 'h' {
+			return input[:inputLen-2]
+		}
+		return input[:inputLen-1]
+	case 'a', 'o':
+		if prev == 'i' {
+			return input[:inputLen-2]
+		}
+		return input[:inputLen-1]
+	}
+	return input
+}
+
+// ItalianLightStemmerFilterConstructor builds the filter for the registry; config and cache are unused.
+func ItalianLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewItalianLightStemmerFilter(), nil
+}
+
+func init() {
+	registry.RegisterTokenFilter(LightStemmerName, ItalianLightStemmerFilterConstructor)
+}
diff --git a/analysis/language/it/light_stemmer_it_test.go b/analysis/language/it/light_stemmer_it_test.go
new file mode 100644
index 00000000..1f81370c
--- /dev/null
+++ b/analysis/language/it/light_stemmer_it_test.go
@@ -0,0 +1,62 @@
+// Copyright (c) 2015 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+package it
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+// lightStemmerTerms returns the terms of a token stream as strings so that
+// failure messages stay readable and never index into an empty stream.
+func lightStemmerTerms(stream analysis.TokenStream) []string {
+	terms := make([]string, 0, len(stream))
+	for _, token := range stream {
+		terms = append(terms, string(token.Term))
+	}
+	return terms
+}
+
+// TestItalianLightStemmer runs single-term streams through the filter registered
+// as LightStemmerName and compares the resulting stream with the expected one.
+func TestItalianLightStemmer(t *testing.T) {
+	tests := []struct {
+		input  string
+		output string
+	}{
+		// a lone final vowel is dropped
+		{input: "ragazzo", output: "ragazz"},
+		{input: "ragazzi", output: "ragazz"},
+		// a final 'e' or 'i' preceded by 'i' or 'h' takes that letter with it
+		{input: "vecchie", output: "vecch"},
+		{input: "alberghi", output: "alberg"},
+		// accents are folded before the ending is examined
+		{input: "università", output: "universit"},
+		// terms shorter than six runes pass through untouched
+		{input: "città", output: "città"},
+	}
+
+	cache := registry.NewCache()
+	filter, err := cache.TokenFilterNamed(LightStemmerName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, test := range tests {
+		input := analysis.TokenStream{&analysis.Token{Term: []byte(test.input)}}
+		expected := analysis.TokenStream{&analysis.Token{Term: []byte(test.output)}}
+		actual := filter.Filter(input)
+		if !reflect.DeepEqual(actual, expected) {
+			t.Errorf("stemming %q: expected %q, got %q", test.input, test.output, lightStemmerTerms(actual))
+		}
+	}
+}