
added hindi stemmer

closes #40
Marty Schoch 2014-08-11 22:29:47 -04:00
parent c65f7415ff
commit 3481ec9cef
5 changed files with 449 additions and 1 deletion


@@ -0,0 +1,136 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

package hindi_stemmer_filter

import (
	"bytes"
	"unicode/utf8"

	"github.com/couchbaselabs/bleve/analysis"
)

// HindiStemmerFilter is a token filter that applies a lightweight,
// suffix-stripping stemmer to Hindi terms.
type HindiStemmerFilter struct {
}

func NewHindiStemmerFilter() *HindiStemmerFilter {
	return &HindiStemmerFilter{}
}

func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	for _, token := range input {
		// if not a protected keyword, stem it
		if !token.KeyWord {
			stemmed := stem(token.Term)
			token.Term = stemmed
		}
		rv = append(rv, token)
	}
	return rv
}
// stem strips the longest matching suffix from the term, checking suffix
// groups from longest (5 runes) down to shortest (1 rune).
func stem(input []byte) []byte {
	inputLen := utf8.RuneCount(input)

	// suffixes of length 5
	if inputLen > 6 &&
		(bytes.HasSuffix(input, []byte("ाएंगी")) ||
			bytes.HasSuffix(input, []byte("ाएंगे")) ||
			bytes.HasSuffix(input, []byte("ाऊंगी")) ||
			bytes.HasSuffix(input, []byte("ाऊंगा")) ||
			bytes.HasSuffix(input, []byte("ाइयाँ")) ||
			bytes.HasSuffix(input, []byte("ाइयों")) ||
			bytes.HasSuffix(input, []byte("ाइयां"))) {
		return analysis.TruncateRunes(input, 5)
	}

	// suffixes of length 4
	if inputLen > 5 &&
		(bytes.HasSuffix(input, []byte("ाएगी")) ||
			bytes.HasSuffix(input, []byte("ाएगा")) ||
			bytes.HasSuffix(input, []byte("ाओगी")) ||
			bytes.HasSuffix(input, []byte("ाओगे")) ||
			bytes.HasSuffix(input, []byte("एंगी")) ||
			bytes.HasSuffix(input, []byte("ेंगी")) ||
			bytes.HasSuffix(input, []byte("एंगे")) ||
			bytes.HasSuffix(input, []byte("ेंगे")) ||
			bytes.HasSuffix(input, []byte("ूंगी")) ||
			bytes.HasSuffix(input, []byte("ूंगा")) ||
			bytes.HasSuffix(input, []byte("ातीं")) ||
			bytes.HasSuffix(input, []byte("नाओं")) ||
			bytes.HasSuffix(input, []byte("नाएं")) ||
			bytes.HasSuffix(input, []byte("ताओं")) ||
			bytes.HasSuffix(input, []byte("ताएं")) ||
			bytes.HasSuffix(input, []byte("ियाँ")) ||
			bytes.HasSuffix(input, []byte("ियों")) ||
			bytes.HasSuffix(input, []byte("ियां"))) {
		return analysis.TruncateRunes(input, 4)
	}

	// suffixes of length 3
	if inputLen > 4 &&
		(bytes.HasSuffix(input, []byte("ाकर")) ||
			bytes.HasSuffix(input, []byte("ाइए")) ||
			bytes.HasSuffix(input, []byte("ाईं")) ||
			bytes.HasSuffix(input, []byte("ाया")) ||
			bytes.HasSuffix(input, []byte("ेगी")) ||
			bytes.HasSuffix(input, []byte("ेगा")) ||
			bytes.HasSuffix(input, []byte("ोगी")) ||
			bytes.HasSuffix(input, []byte("ोगे")) ||
			bytes.HasSuffix(input, []byte("ाने")) ||
			bytes.HasSuffix(input, []byte("ाना")) ||
			bytes.HasSuffix(input, []byte("ाते")) ||
			bytes.HasSuffix(input, []byte("ाती")) ||
			bytes.HasSuffix(input, []byte("ाता")) ||
			bytes.HasSuffix(input, []byte("तीं")) ||
			bytes.HasSuffix(input, []byte("ाओं")) ||
			bytes.HasSuffix(input, []byte("ाएं")) ||
			bytes.HasSuffix(input, []byte("ुओं")) ||
			bytes.HasSuffix(input, []byte("ुएं")) ||
			bytes.HasSuffix(input, []byte("ुआं"))) {
		return analysis.TruncateRunes(input, 3)
	}

	// suffixes of length 2
	if inputLen > 3 &&
		(bytes.HasSuffix(input, []byte("कर")) ||
			bytes.HasSuffix(input, []byte("ाओ")) ||
			bytes.HasSuffix(input, []byte("िए")) ||
			bytes.HasSuffix(input, []byte("ाई")) ||
			bytes.HasSuffix(input, []byte("ाए")) ||
			bytes.HasSuffix(input, []byte("ने")) ||
			bytes.HasSuffix(input, []byte("नी")) ||
			bytes.HasSuffix(input, []byte("ना")) ||
			bytes.HasSuffix(input, []byte("ते")) ||
			bytes.HasSuffix(input, []byte("ीं")) ||
			bytes.HasSuffix(input, []byte("ती")) ||
			bytes.HasSuffix(input, []byte("ता")) ||
			bytes.HasSuffix(input, []byte("ाँ")) ||
			bytes.HasSuffix(input, []byte("ां")) ||
			bytes.HasSuffix(input, []byte("ों")) ||
			bytes.HasSuffix(input, []byte("ें"))) {
		return analysis.TruncateRunes(input, 2)
	}

	// suffixes of length 1
	if inputLen > 2 &&
		(bytes.HasSuffix(input, []byte("ो")) ||
			bytes.HasSuffix(input, []byte("े")) ||
			bytes.HasSuffix(input, []byte("ू")) ||
			bytes.HasSuffix(input, []byte("ु")) ||
			bytes.HasSuffix(input, []byte("ी")) ||
			bytes.HasSuffix(input, []byte("ि")) ||
			bytes.HasSuffix(input, []byte("ा"))) {
		return analysis.TruncateRunes(input, 1)
	}

	// no suffix matched
	return input
}
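
For context, a minimal usage sketch (not part of the commit; it only relies on types and import paths that appear elsewhere in this diff). The filter walks a TokenStream and strips the longest matching Hindi suffix from each non-keyword token:

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/token_filters/hindi_stemmer_filter"
)

func main() {
	filter := hindi_stemmer_filter.NewHindiStemmerFilter()
	tokens := analysis.TokenStream{
		&analysis.Token{Term: []byte("लडकों")}, // inflected form taken from the test cases below
	}
	for _, token := range filter.Filter(tokens) {
		fmt.Println(string(token.Term)) // prints लडक
	}
}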


@@ -0,0 +1,302 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

package hindi_stemmer_filter

import (
	"reflect"
	"testing"

	"github.com/couchbaselabs/bleve/analysis"
)
func TestHindiStemmerFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// masc noun inflections
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("लडका")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("लडक")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("लडके")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("लडक")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("लडकों")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("लडक")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("गुरु")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("गुर")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("गुरुओं")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("गुर")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("दोस्त")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("दोस्त")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("दोस्तों")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("दोस्त")}},
		},
		// feminine noun inflections
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("लडकी")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("लडक")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("लडकियों")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("लडक")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("किताब")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("किताब")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("किताबें")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("किताब")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("किताबों")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("किताब")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("आध्यापीका")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("आध्यापीक")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("आध्यापीकाएं")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("आध्यापीक")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("आध्यापीकाओं")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("आध्यापीक")}},
		},
		// some verb forms
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("खाना")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("खा")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("खाता")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("खा")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("खाती")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("खा")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("खा")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("खा")}},
		},
		// exceptions
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("कठिनाइयां")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("कठिन")}},
		},
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("कठिन")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("कठिन")}},
		},
		// empty
		{
			input:  analysis.TokenStream{&analysis.Token{Term: []byte("")}},
			output: analysis.TokenStream{&analysis.Token{Term: []byte("")}},
		},
	}

	hindiStemmerFilter := NewHindiStemmerFilter()
	for _, test := range tests {
		actual := hindiStemmerFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}


@@ -15,7 +15,7 @@ import (
"github.com/couchbaselabs/bleve/analysis"
)
func TestSoraniStemmerFilter(t *testing.T) {
func TestSoraniNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream


@@ -9,6 +9,7 @@
package analysis

import (
	"bytes"
	"unicode/utf8"
)
@@ -41,3 +42,10 @@ func BuildTermFromRunes(runes []rune) []byte {
	}
	return rv
}

// TruncateRunes removes the last num runes from the input term and rebuilds
// the remainder, so multi-byte characters are dropped whole.
func TruncateRunes(input []byte, num int) []byte {
	runes := bytes.Runes(input)
	runes = runes[:len(runes)-num]
	out := BuildTermFromRunes(runes)
	return out
}
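
A small illustrative sketch (assuming only the analysis import path used throughout this commit) of why truncation works on runes rather than bytes: Devanagari characters are multi-byte in UTF-8, so dropping runes keeps the term valid:

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
)

func main() {
	// "लडके" is 4 runes but 12 bytes; removing one rune drops the
	// final character whole instead of leaving broken UTF-8.
	stem := analysis.TruncateRunes([]byte("लडके"), 1)
	fmt.Println(string(stem)) // prints लडक
}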


@@ -29,6 +29,7 @@ import (
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/analysis/token_filters/german_normalize"
"github.com/couchbaselabs/bleve/analysis/token_filters/hindi_normalize"
"github.com/couchbaselabs/bleve/analysis/token_filters/hindi_stemmer_filter"
"github.com/couchbaselabs/bleve/analysis/token_filters/length_filter"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/token_filters/persian_normalize"
@@ -219,6 +220,7 @@ func init() {
	Config.Analysis.TokenFilters["stemmer_sv"] = stemmer_filter.MustNewStemmerFilter("swedish")
	Config.Analysis.TokenFilters["stemmer_tr"] = stemmer_filter.MustNewStemmerFilter("turkish")
	Config.Analysis.TokenFilters["stemmer_ckb"] = sorani_stemmer_filter.NewSoraniStemmerFilter()
	Config.Analysis.TokenFilters["stemmer_hi"] = hindi_stemmer_filter.NewHindiStemmerFilter()

	// register stop token filters
	Config.Analysis.TokenFilters["stop_token_da"] = stop_words_filter.NewStopWordsFilter(