Merge pull request #157 from saljam/arabic

Further improvements to the Arabic analyzer
Marty Schoch 2015-02-11 15:49:19 -05:00
commit 2f607d73f3
4 changed files with 345 additions and 25 deletions

@@ -14,6 +14,7 @@ import (
 	"github.com/blevesearch/bleve/registry"
 	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
 	"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
 )
@@ -28,6 +29,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 	if err != nil {
 		return nil, err
 	}
+	normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
 	stopArFilter, err := cache.TokenFilterNamed(StopName)
 	if err != nil {
 		return nil, err
@@ -44,6 +46,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 		Tokenizer: tokenizer,
 		TokenFilters: []analysis.TokenFilter{
 			toLowerFilter,
+			normalizeFilter,
 			stopArFilter,
 			normalizeArFilter,
 			stemmerArFilter,
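With this change the NFKC unicode_normalize filter sits second in the analyzer chain, after lowercasing and ahead of the Arabic stop, character-normalization, and stemmer filters, so tokens written in Arabic presentation forms are folded to canonical letters before any Arabic-specific filter inspects them. A minimal sketch of the NFKC fold itself, using golang.org/x/text/unicode/norm purely as a stand-in for bleve's filter:

package main

import (
	"fmt"

	"golang.org/x/text/unicode/norm"
)

func main() {
	// "ﺍﻟﺴﻼﻢ" uses Arabic presentation forms (U+FE70..U+FEFF);
	// NFKC folds them to the canonical letters that the Arabic
	// stop list and stemmer tables are written against.
	fmt.Println(norm.NFKC.String("ﺍﻟﺴﻼﻢ")) // السلام
}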

@@ -150,6 +150,18 @@ func TestArabicAnalyzer(t *testing.T) {
 				},
 			},
 		},
+		// presentation form normalization
+		{
+			input: []byte("ﺍﻟﺴﻼﻢ"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("سلام"),
+					Position: 1,
+					Start:    0,
+					End:      15,
+				},
+			},
+		},
 	}

 	cache := registry.NewCache()
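Note the expected End of 15: token offsets point into the original input bytes, and each of the five presentation-form code points is three bytes in UTF-8, while the normalized term is rebuilt from two-byte canonical letters. A quick standard-library check of that arithmetic:

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	in := "ﺍﻟﺴﻼﻢ" // presentation forms: 3 bytes per code point
	fmt.Println(len(in), utf8.RuneCountInString(in)) // 15 5

	term := "سلام" // canonical letters: 2 bytes per code point
	fmt.Println(len(term), utf8.RuneCountInString(term)) // 8 4
}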

@@ -19,26 +19,26 @@ import (
 const StemmerName = "stemmer_ar"

 // These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
-var prefixes = [][]byte{
-	[]byte("ال"),
-	[]byte("وال"),
-	[]byte("بال"),
-	[]byte("كال"),
-	[]byte("فال"),
-	[]byte("لل"),
-	[]byte("و"),
-}
-var suffixes = [][]byte{
-	[]byte("ها"),
-	[]byte("ان"),
-	[]byte("ات"),
-	[]byte("ون"),
-	[]byte("ين"),
-	[]byte("يه"),
-	[]byte("ية"),
-	[]byte("ه"),
-	[]byte("ة"),
-	[]byte("ي"),
-}
+var prefixes = [][]rune{
+	[]rune("ال"),
+	[]rune("وال"),
+	[]rune("بال"),
+	[]rune("كال"),
+	[]rune("فال"),
+	[]rune("لل"),
+	[]rune("و"),
+}
+var suffixes = [][]rune{
+	[]rune("ها"),
+	[]rune("ان"),
+	[]rune("ات"),
+	[]rune("ون"),
+	[]rune("ين"),
+	[]rune("يه"),
+	[]rune("ية"),
+	[]rune("ه"),
+	[]rune("ة"),
+	[]rune("ي"),
+}

 type ArabicStemmerFilter struct{}
@@ -55,21 +55,53 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS
 	return input
 }

+func canStemPrefix(input, prefix []rune) bool {
+	// Wa- prefix requires at least 3 characters.
+	if len(prefix) == 1 && len(input) < 4 {
+		return false
+	}
+	// Other prefixes require only 2.
+	if len(input)-len(prefix) < 2 {
+		return false
+	}
+	for i := range prefix {
+		if prefix[i] != input[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func canStemSuffix(input, suffix []rune) bool {
+	// All suffixes require at least 2 characters after stemming.
+	if len(input)-len(suffix) < 2 {
+		return false
+	}
+	stemEnd := len(input) - len(suffix)
+	for i := range suffix {
+		if suffix[i] != input[stemEnd+i] {
+			return false
+		}
+	}
+	return true
+}
+
 func stem(input []byte) []byte {
+	runes := bytes.Runes(input)
 	// Strip a single prefix.
 	for _, p := range prefixes {
-		if bytes.HasPrefix(input, p) {
-			input = input[len(p):]
+		if canStemPrefix(runes, p) {
+			runes = runes[len(p):]
 			break
 		}
 	}
 	// Strip off multiple suffixes, in their order in the suffixes array.
 	for _, s := range suffixes {
-		if bytes.HasSuffix(input, s) {
-			input = input[:len(input)-len(s)]
+		if canStemSuffix(runes, s) {
+			runes = runes[:len(runes)-len(s)]
 		}
 	}
-	return input
+	return analysis.BuildTermFromRunes(runes)
 }

 func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
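Moving the affix tables from [][]byte to [][]rune is what makes the new length guards workable: the rules borrowed from Lucene's ArabicStemmer count characters (stripping و must leave at least three, any other affix at least two), and Arabic letters occupy two bytes each in UTF-8, so byte lengths would overcount. The ShouldntStem case in the test file below ("الو") turns on exactly this; a small standard-library illustration:

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	w := "الو" // three letters, six bytes
	fmt.Println(len(w))                    // 6 — a naive byte-based "at least 4" guard would pass
	fmt.Println(utf8.RuneCountInString(w)) // 3 — the rune-based guard refuses to strip the و prefix
}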

@@ -21,6 +21,279 @@ func TestArabicStemmerFilter(t *testing.T) {
 		input  analysis.TokenStream
 		output analysis.TokenStream
 	}{
+		// AlPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("الحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// WalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("والحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// BalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("بالحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// KalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("كالحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// FalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("فالحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// LlPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("للاخر"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("اخر"),
+				},
+			},
+		},
+		// WaPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("وحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// AhSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("زوجها"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("زوج"),
+				},
+			},
+		},
+		// AnSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدان"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// AtSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدات"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// WnSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدون"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YnSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدين"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YhSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهديه"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YpSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدية"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// HSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهده"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// PSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدة"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدي"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// ComboPrefSuf
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("وساهدون"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// ComboSuf
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدهات"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// ShouldntStem
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("الو"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("الو"),
+				},
+			},
+		},
+		// NonArabic
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("English"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("English"),
+				},
+			},
+		},
 		{
 			input: analysis.TokenStream{
 				&analysis.Token{
@@ -93,7 +366,7 @@ func TestArabicStemmerFilter(t *testing.T) {
 				},
 			},
 		},
-		// empty
+		// Empty
 		{
 			input: analysis.TokenStream{
 				&analysis.Token{