diff --git a/analysis/language/pt/analyzer_pt.go b/analysis/language/pt/analyzer_pt.go index 7fb4c35f..bd81427b 100644 --- a/analysis/language/pt/analyzer_pt.go +++ b/analysis/language/pt/analyzer_pt.go @@ -7,9 +7,6 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -// +build libstemmer full -// +build icu full - package pt import ( @@ -17,13 +14,13 @@ import ( "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter" - "github.com/blevesearch/bleve/analysis/tokenizers/icu" + "github.com/blevesearch/bleve/analysis/tokenizers/unicode" ) const AnalyzerName = "pt" func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { - icuTokenizer, err := cache.TokenizerNamed(icu.Name) + tokenizer, err := cache.TokenizerNamed(unicode.Name) if err != nil { return nil, err } @@ -35,12 +32,12 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( if err != nil { return nil, err } - stemmerPtFilter, err := cache.TokenFilterNamed(StemmerName) + stemmerPtFilter, err := cache.TokenFilterNamed(LightStemmerName) if err != nil { return nil, err } rv := analysis.Analyzer{ - Tokenizer: icuTokenizer, + Tokenizer: tokenizer, TokenFilters: []analysis.TokenFilter{ toLowerFilter, stopPtFilter, diff --git a/analysis/language/pt/analyzer_pt_test.go b/analysis/language/pt/analyzer_pt_test.go index 5c7fff43..d6a81549 100644 --- a/analysis/language/pt/analyzer_pt_test.go +++ b/analysis/language/pt/analyzer_pt_test.go @@ -7,9 +7,6 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -// +build libstemmer full -// +build icu full - package pt import ( @@ -26,24 +23,22 @@ func TestPortugueseAnalyzer(t *testing.T) { output analysis.TokenStream }{ // stemming - // fails due to stemming discrepencies - // got quilométr instead of quilometric - // { - // input: []byte("quilométricas"), - // output: analysis.TokenStream{ - // &analysis.Token{ - // Term: []byte("quilometric"), - // }, - // }, - // }, - // { - // input: []byte("quilométricos"), - // output: analysis.TokenStream{ - // &analysis.Token{ - // Term: []byte("quilometric"), - // }, - // }, - // }, + { + input: []byte("quilométricas"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("quilometric"), + }, + }, + }, + { + input: []byte("quilométricos"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("quilometric"), + }, + }, + }, // stop word { input: []byte("não"), diff --git a/analysis/language/pt/light_stemmer_pt.go b/analysis/language/pt/light_stemmer_pt.go new file mode 100644 index 00000000..93aac7f0 --- /dev/null +++ b/analysis/language/pt/light_stemmer_pt.go @@ -0,0 +1,190 @@ +// Copyright (c) 2015 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package pt + +import ( + "bytes" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const LightStemmerName = "stemmer_pt_light" + +type PortugueseLightStemmerFilter struct { +} + +func NewPortugueseLightStemmerFilter() *PortugueseLightStemmerFilter { + return &PortugueseLightStemmerFilter{} +} + +func (s *PortugueseLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + runes := bytes.Runes(token.Term) + runes = stem(runes) + token.Term = analysis.BuildTermFromRunes(runes) + } + return input +} + +func stem(input []rune) []rune { + + inputLen := len(input) + + if inputLen < 4 { + return input + } + + input = removeSuffix(input) + inputLen = len(input) + + if inputLen > 3 && input[inputLen-1] == 'a' { + input = normFeminine(input) + inputLen = len(input) + } + + if inputLen > 4 { + switch input[inputLen-1] { + case 'e', 'a', 'o': + input = input[0 : inputLen-1] + inputLen = len(input) + } + } + + for i := 0; i < inputLen; i++ { + switch input[i] { + case 'à', 'á', 'â', 'ä', 'ã': + input[i] = 'a' + case 'ò', 'ó', 'ô', 'ö', 'õ': + input[i] = 'o' + case 'è', 'é', 'ê', 'ë': + input[i] = 'e' + case 'ù', 'ú', 'û', 'ü': + input[i] = 'u' + case 'ì', 'í', 'î', 'ï': + input[i] = 'i' + case 'ç': + input[i] = 'c' + } + } + + return input +} + +func removeSuffix(input []rune) []rune { + + inputLen := len(input) + + if inputLen > 4 && analysis.RunesEndsWith(input, "es") { + switch input[inputLen-3] { + case 'r', 's', 'l', 'z': + return input[0 : inputLen-2] + } + } + + if inputLen > 3 && analysis.RunesEndsWith(input, "ns") { + input[inputLen-2] = 'm' + return input[0 : inputLen-1] + } + + if inputLen > 4 && (analysis.RunesEndsWith(input, "eis") || analysis.RunesEndsWith(input, "éis")) { + input[inputLen-3] = 'e' + input[inputLen-2] = 'l' + return input[0 : inputLen-1] + } + + if inputLen > 4 && analysis.RunesEndsWith(input, "ais") { + input[inputLen-2] = 'l' + return input[0 : inputLen-1] + } + + if inputLen > 4 && analysis.RunesEndsWith(input, "óis") { + input[inputLen-3] = 'o' + input[inputLen-2] = 'l' + return input[0 : inputLen-1] + } + + if inputLen > 4 && analysis.RunesEndsWith(input, "is") { + input[inputLen-1] = 'l' + return input + } + + if inputLen > 3 && + (analysis.RunesEndsWith(input, "ões") || + analysis.RunesEndsWith(input, "ães")) { + input = input[0 : inputLen-1] + inputLen = len(input) + input[inputLen-2] = 'ã' + input[inputLen-1] = 'o' + return input + } + + if inputLen > 6 && analysis.RunesEndsWith(input, "mente") { + return input[0 : inputLen-5] + } + + if inputLen > 3 && input[inputLen-1] == 's' { + return input[0 : inputLen-1] + } + return input +} + +func normFeminine(input []rune) []rune { + inputLen := len(input) + + if inputLen > 7 && + (analysis.RunesEndsWith(input, "inha") || + analysis.RunesEndsWith(input, "iaca") || + analysis.RunesEndsWith(input, "eira")) { + input[inputLen-1] = 'o' + return input + } + + if inputLen > 6 { + if analysis.RunesEndsWith(input, "osa") || + analysis.RunesEndsWith(input, "ica") || + analysis.RunesEndsWith(input, "ida") || + analysis.RunesEndsWith(input, "ada") || + analysis.RunesEndsWith(input, "iva") || + analysis.RunesEndsWith(input, "ama") { + input[inputLen-1] = 'o' + return input + } + + if analysis.RunesEndsWith(input, "ona") { + input[inputLen-3] = 'ã' + input[inputLen-2] = 'o' + return input[0 : inputLen-1] + } + + if analysis.RunesEndsWith(input, "ora") { + return input[0 : inputLen-1] + } + + if analysis.RunesEndsWith(input, "esa") { + input[inputLen-3] = 'ê' + return input[0 : inputLen-1] + } + + if analysis.RunesEndsWith(input, "na") { + input[inputLen-1] = 'o' + return input + } + } + return input +} + +func PortugueseLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewPortugueseLightStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(LightStemmerName, PortugueseLightStemmerFilterConstructor) +} diff --git a/analysis/language/pt/light_stemmer_pt_test.go b/analysis/language/pt/light_stemmer_pt_test.go new file mode 100644 index 00000000..11afa0c4 --- /dev/null +++ b/analysis/language/pt/light_stemmer_pt_test.go @@ -0,0 +1,399 @@ +// Copyright (c) 2015 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package pt + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestPortugueseLightStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("doutores"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("doutor"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("doutor"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("doutor"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("homens"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("homem"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("homem"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("homem"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("papéis"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("papel"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("papel"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("papel"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("normais"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("normal"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("normal"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("normal"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("lencóis"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("lencol"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("lencol"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("lencol"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("barris"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("barril"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("barril"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("barril"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("botões"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("bota"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("botão"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("bota"), + }, + }, + }, + // longer + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("o"), + }, + &analysis.Token{ + Term: []byte("debate"), + }, + &analysis.Token{ + Term: []byte("político"), + }, + &analysis.Token{ + Term: []byte("pelo"), + }, + &analysis.Token{ + Term: []byte("menos"), + }, + &analysis.Token{ + Term: []byte("o"), + }, + &analysis.Token{ + Term: []byte("que"), + }, + &analysis.Token{ + Term: []byte("vem"), + }, + &analysis.Token{ + Term: []byte("a"), + }, + &analysis.Token{ + Term: []byte("público"), + }, + &analysis.Token{ + Term: []byte("parece"), + }, + &analysis.Token{ + Term: []byte("de"), + }, + &analysis.Token{ + Term: []byte("modo"), + }, + &analysis.Token{ + Term: []byte("nada"), + }, + &analysis.Token{ + Term: []byte("surpreendente"), + }, + &analysis.Token{ + Term: []byte("restrito"), + }, + &analysis.Token{ + Term: []byte("a"), + }, + &analysis.Token{ + Term: []byte("temas"), + }, + &analysis.Token{ + Term: []byte("menores"), + }, + &analysis.Token{ + Term: []byte("mas"), + }, + &analysis.Token{ + Term: []byte("há"), + }, + &analysis.Token{ + Term: []byte("evidentemente"), + }, + &analysis.Token{ + Term: []byte("grandes"), + }, + &analysis.Token{ + Term: []byte("questões"), + }, + &analysis.Token{ + Term: []byte("em"), + }, + &analysis.Token{ + Term: []byte("jogo"), + }, + &analysis.Token{ + Term: []byte("nas"), + }, + &analysis.Token{ + Term: []byte("eleições"), + }, + &analysis.Token{ + Term: []byte("que"), + }, + &analysis.Token{ + Term: []byte("se"), + }, + &analysis.Token{ + Term: []byte("aproximam"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("o"), + }, + &analysis.Token{ + Term: []byte("debat"), + }, + &analysis.Token{ + Term: []byte("politic"), + }, + &analysis.Token{ + Term: []byte("pelo"), + }, + &analysis.Token{ + Term: []byte("meno"), + }, + &analysis.Token{ + Term: []byte("o"), + }, + &analysis.Token{ + Term: []byte("que"), + }, + &analysis.Token{ + Term: []byte("vem"), + }, + &analysis.Token{ + Term: []byte("a"), + }, + &analysis.Token{ + Term: []byte("public"), + }, + &analysis.Token{ + Term: []byte("parec"), + }, + &analysis.Token{ + Term: []byte("de"), + }, + &analysis.Token{ + Term: []byte("modo"), + }, + &analysis.Token{ + Term: []byte("nada"), + }, + &analysis.Token{ + Term: []byte("surpreendent"), + }, + &analysis.Token{ + Term: []byte("restrit"), + }, + &analysis.Token{ + Term: []byte("a"), + }, + &analysis.Token{ + Term: []byte("tema"), + }, + &analysis.Token{ + Term: []byte("menor"), + }, + &analysis.Token{ + Term: []byte("mas"), + }, + &analysis.Token{ + Term: []byte("há"), + }, + &analysis.Token{ + Term: []byte("evident"), + }, + &analysis.Token{ + Term: []byte("grand"), + }, + &analysis.Token{ + Term: []byte("questa"), + }, + &analysis.Token{ + Term: []byte("em"), + }, + &analysis.Token{ + Term: []byte("jogo"), + }, + &analysis.Token{ + Term: []byte("nas"), + }, + &analysis.Token{ + Term: []byte("eleica"), + }, + &analysis.Token{ + Term: []byte("que"), + }, + &analysis.Token{ + Term: []byte("se"), + }, + &analysis.Token{ + Term: []byte("aproximam"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(LightStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +}