diff --git a/analysis/language/ar/stemmer_ar.go b/analysis/language/ar/stemmer_ar.go
new file mode 100644
index 00000000..6343746e
--- /dev/null
+++ b/analysis/language/ar/stemmer_ar.go
@@ -0,0 +1,132 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+package ar
+
+import (
+	"bytes"
+	"unicode/utf8"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+// StemmerName is the name under which the Arabic stemmer token filter is
+// registered.
+const StemmerName = "stemmer_ar"
+
+// Length limits that keep short words from being stemmed away entirely.
+// They are counted in runes and follow
+// org.apache.lucene.analysis.ar.ArabicStemmer.
+const (
+	// minStemRunes is the fewest runes that must remain after an affix is
+	// removed.
+	minStemRunes = 2
+	// minStemRunesOneLetterPrefix is the stricter limit for a one-letter
+	// prefix (the conjunction "و").
+	minStemRunesOneLetterPrefix = 3
+)
+
+// prefixes lists the removable prefixes in the order they are tried. They were
+// obtained from org.apache.lucene.analysis.ar.ArabicStemmer.
+var prefixes = [][]byte{
+	[]byte("ال"),
+	[]byte("وال"),
+	[]byte("بال"),
+	[]byte("كال"),
+	[]byte("فال"),
+	[]byte("لل"),
+	[]byte("و"),
+}
+
+// suffixes lists the removable suffixes in the order they are tried. Because
+// only the first match is removed, every two-letter suffix is listed before the
+// one-letter suffix it ends with ("يه" before "ه", "ية" before "ة"); otherwise
+// the longer suffix could never match.
+var suffixes = [][]byte{
+	[]byte("ها"),
+	[]byte("ان"),
+	[]byte("ات"),
+	[]byte("ون"),
+	[]byte("ين"),
+	[]byte("يه"),
+	[]byte("ية"),
+	[]byte("ه"),
+	[]byte("ة"),
+	[]byte("ي"),
+}
+
+// ArabicStemmerFilter is a token filter that strips common Arabic prefixes and
+// suffixes from token terms.
+type ArabicStemmerFilter struct{}
+
+// NewArabicStemmerFilter returns a new ArabicStemmerFilter.
+func NewArabicStemmerFilter() *ArabicStemmerFilter {
+	return &ArabicStemmerFilter{}
+}
+
+// Filter replaces the term of every token in input with its stem and returns
+// the same stream.
+func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		token.Term = stem(token.Term)
+	}
+	return input
+}
+
+// canStemPrefix reports whether term starts with prefix and enough runes
+// would remain after removing it.
+func canStemPrefix(term, prefix []byte) bool {
+	if !bytes.HasPrefix(term, prefix) {
+		return false
+	}
+	required := minStemRunes
+	if utf8.RuneCount(prefix) == 1 {
+		required = minStemRunesOneLetterPrefix
+	}
+	return utf8.RuneCount(term[len(prefix):]) >= required
+}
+
+// canStemSuffix reports whether term ends with suffix and enough runes would
+// remain after removing it.
+func canStemSuffix(term, suffix []byte) bool {
+	if !bytes.HasSuffix(term, suffix) {
+		return false
+	}
+	return utf8.RuneCount(term[:len(term)-len(suffix)]) >= minStemRunes
+}
+
+// stem removes at most one prefix and then at most one suffix from input. The
+// result is a sub-slice of input; no bytes are copied. An affix that would
+// leave too few runes behind is left in place.
+func stem(input []byte) []byte {
+	for _, p := range prefixes {
+		if canStemPrefix(input, p) {
+			input = input[len(p):]
+			break
+		}
+	}
+	for _, s := range suffixes {
+		if canStemSuffix(input, s) {
+			input = input[:len(input)-len(s)]
+			break
+		}
+	}
+	return input
+}
+
+// StemmerFilterConstructor returns a new Arabic stemmer filter. The config and
+// cache arguments are not used.
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewArabicStemmerFilter(), nil
+}
+
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}
diff --git a/analysis/language/ar/stemmer_ar_test.go b/analysis/language/ar/stemmer_ar_test.go
new file mode 100644
index 00000000..23684a8d
--- /dev/null
+++ b/analysis/language/ar/stemmer_ar_test.go
@@ -0,0 +1,61 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+package ar
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/analysis"
+)
+
+func TestArabicStemmerFilter(t *testing.T) {
+	tests := []struct {
+		input  string
+		output string
+	}{
+		// no affixes
+		{input: "سلام", output: "سلام"},
+		// prefix only
+		{input: "السلام", output: "سلام"},
+		// suffix only
+		{input: "سلامة", output: "سلام"},
+		// prefix and suffix
+		{input: "السلامة", output: "سلام"},
+		// only one prefix is removed
+		{input: "الوصل", output: "وصل"},
+		// the longer prefix is tried before the bare conjunction
+		{input: "والصل", output: "صل"},
+		// a two-letter suffix is removed whole
+		{input: "عربية", output: "عرب"},
+		// too short to lose the one-letter prefix
+		{input: "ولد", output: "ولد"},
+		// a term that is nothing but an affix is kept
+		{input: "ال", output: "ال"},
+		{input: "ان", output: "ان"},
+		// empty
+		{input: "", output: ""},
+	}
+
+	arabicStemmerFilter := NewArabicStemmerFilter()
+	for _, test := range tests {
+		input := analysis.TokenStream{
+			&analysis.Token{Term: []byte(test.input)},
+		}
+		expected := analysis.TokenStream{
+			&analysis.Token{Term: []byte(test.output)},
+		}
+		actual := arabicStemmerFilter.Filter(input)
+		if !reflect.DeepEqual(actual, expected) {
+			t.Errorf("input %q: expected %q (% x), got %q (% x)", test.input,
+				expected[0].Term, expected[0].Term, actual[0].Term, actual[0].Term)
+		}
+	}
+}