diff --git a/analysis/token_filters/elision_filter/articles_ca.go b/analysis/token_filters/elision_filter/articles_ca.go
new file mode 100644
index 00000000..7563b2ff
--- /dev/null
+++ b/analysis/token_filters/elision_filter/articles_ca.go
@@ -0,0 +1,13 @@
+package elision_filter
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
+
+var CatalanArticles = []byte(`
+d
+l
+m
+n
+s
+t
+`)
diff --git a/analysis/token_filters/elision_filter/articles_fr.go b/analysis/token_filters/elision_filter/articles_fr.go
new file mode 100644
index 00000000..93a21ad1
--- /dev/null
+++ b/analysis/token_filters/elision_filter/articles_fr.go
@@ -0,0 +1,20 @@
+package elision_filter
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
+
+var FrenchArticles = []byte(`
+l
+m
+t
+qu
+n
+s
+j
+d
+c
+jusqu
+quoiqu
+lorsqu
+puisqu
+`)
diff --git a/analysis/token_filters/elision_filter/articles_ga.go b/analysis/token_filters/elision_filter/articles_ga.go
new file mode 100644
index 00000000..e545c988
--- /dev/null
+++ b/analysis/token_filters/elision_filter/articles_ga.go
@@ -0,0 +1,10 @@
+package elision_filter
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
+
+var IrishArticles = []byte(`
+d
+m
+b
+`)
diff --git a/analysis/token_filters/elision_filter/articles_it.go b/analysis/token_filters/elision_filter/articles_it.go
new file mode 100644
index 00000000..403ab810
--- /dev/null
+++ b/analysis/token_filters/elision_filter/articles_it.go
@@ -0,0 +1,28 @@
+package elision_filter
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
+
+var ItalianArticles = []byte(`
+c
+l
+all
+dall
+dell
+nell
+sull
+coll
+pell
+gl
+agl
+dagl
+degl
+negl
+sugl
+un
+m
+t
+s
+v
+d
+`)
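A quick sketch (not part of the diff) of how one of these article lists becomes a token map, using the `analysis.NewWordMap`/`LoadBytes` API that the tests below rely on; I'm assuming `LoadBytes` skips the blank first and last lines of the backquoted literals:

```go
package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
)

func main() {
	// parse the newline-separated French article list into a word map
	articles := analysis.NewWordMap()
	if err := articles.LoadBytes(elision_filter.FrenchArticles); err != nil {
		panic(err)
	}

	// WordMap is indexed by the article string, which is exactly how
	// the elision filter below checks candidate prefixes
	_, ok := articles["qu"]
	fmt.Println(ok) // true
}
```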
diff --git a/analysis/token_filters/elision_filter/elision_filter.go b/analysis/token_filters/elision_filter/elision_filter.go
new file mode 100644
index 00000000..9d6dfc2a
--- /dev/null
+++ b/analysis/token_filters/elision_filter/elision_filter.go
@@ -0,0 +1,54 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+package elision_filter
+
+import (
+	"bytes"
+	"unicode/utf8"
+
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+const RIGHT_SINGLE_QUOTATION_MARK = "’"
+const APOSTROPHE = "'"
+
+const APOSTROPHES = APOSTROPHE + RIGHT_SINGLE_QUOTATION_MARK
+
+type ElisionFilter struct {
+	articles analysis.WordMap
+}
+
+func NewElisionFilter(articles analysis.WordMap) *ElisionFilter {
+	return &ElisionFilter{
+		articles: articles,
+	}
+}
+
+func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	rv := make(analysis.TokenStream, 0)
+
+	for _, token := range input {
+		firstApostrophe := bytes.IndexAny(token.Term, APOSTROPHES)
+		if firstApostrophe >= 0 {
+			// found an apostrophe
+			prefix := token.Term[0:firstApostrophe]
+			// see if the prefix matches one of the articles
+			_, articleMatch := s.articles[string(prefix)]
+			if articleMatch {
+				// the apostrophe may be multi-byte (e.g. U+2019),
+				// so advance by its full rune width, not one byte
+				_, size := utf8.DecodeRune(token.Term[firstApostrophe:])
+				token.Term = token.Term[firstApostrophe+size:]
+			}
+		}
+		rv = append(rv, token)
+	}
+
+	return rv
+}
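One subtlety in Filter above: bytes.IndexAny returns a byte offset, and the right single quotation mark U+2019 is three bytes in UTF-8, so the slice past the apostrophe has to advance by the decoded rune width rather than a fixed +1. A standalone illustration using only the standard library:

```go
package main

import (
	"bytes"
	"fmt"
	"unicode/utf8"
)

func main() {
	term := []byte("l’avion") // typographic apostrophe U+2019

	// byte offset of the first apostrophe, as Filter computes it
	idx := bytes.IndexAny(term, "'’")
	fmt.Println(idx) // 1

	// U+2019 occupies three bytes, so term[idx+1:] would leave two
	// stray continuation bytes in front of "avion"
	_, size := utf8.DecodeRune(term[idx:])
	fmt.Println(size)                    // 3
	fmt.Println(string(term[idx+size:])) // avion
}
```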
diff --git a/analysis/token_filters/elision_filter/elision_filter_test.go b/analysis/token_filters/elision_filter/elision_filter_test.go
new file mode 100644
index 00000000..12ef7a7b
--- /dev/null
+++ b/analysis/token_filters/elision_filter/elision_filter_test.go
@@ -0,0 +1,116 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+package elision_filter
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+func TestElisionFilter(t *testing.T) {
+
+	frenchArticlesMap := analysis.NewWordMap()
+	err := frenchArticlesMap.LoadBytes(FrenchArticles)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	italianArticlesMap := analysis.NewWordMap()
+	err = italianArticlesMap.LoadBytes(ItalianArticles)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	catalanArticlesMap := analysis.NewWordMap()
+	err = catalanArticlesMap.LoadBytes(CatalanArticles)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	irishArticlesMap := analysis.NewWordMap()
+	err = irishArticlesMap.LoadBytes(IrishArticles)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tests := []struct {
+		articleMap analysis.WordMap
+		input      analysis.TokenStream
+		output     analysis.TokenStream
+	}{
+		{
+			articleMap: frenchArticlesMap,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("l'avion"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("avion"),
+				},
+			},
+		},
+		{
+			articleMap: italianArticlesMap,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("dell'Italia"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Italia"),
+				},
+			},
+		},
+		{
+			articleMap: catalanArticlesMap,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("l'Institut"),
+				},
+				&analysis.Token{
+					Term: []byte("d'Estudis"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Institut"),
+				},
+				&analysis.Token{
+					Term: []byte("Estudis"),
+				},
+			},
+		},
+		{
+			articleMap: irishArticlesMap,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("b'fhearr"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("fhearr"),
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		elisionFilter := NewElisionFilter(test.articleMap)
+		actual := elisionFilter.Filter(test.input)
+		if !reflect.DeepEqual(actual, test.output) {
+			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
+		}
+	}
+}
diff --git a/config.go b/config.go
index 72075a8f..9783af99 100644
--- a/config.go
+++ b/config.go
@@ -24,6 +24,7 @@ import (
 	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
 
 	"github.com/couchbaselabs/bleve/analysis/token_filters/cld2"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
 	"github.com/couchbaselabs/bleve/analysis/token_filters/length_filter"
 	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
 	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
@@ -153,6 +154,12 @@ func init() {
 	Config.Analysis.TokenMaps["ckb_stop"] = Config.MustLoadStopWords(stop_words_filter.SoraniStopWords)
 	Config.Analysis.TokenMaps["th_stop"] = Config.MustLoadStopWords(stop_words_filter.ThaiStopWords)
 
+	// register article token maps for elision filters
+	Config.Analysis.TokenMaps["fr_articles"] = Config.MustLoadStopWords(elision_filter.FrenchArticles)
+	Config.Analysis.TokenMaps["it_articles"] = Config.MustLoadStopWords(elision_filter.ItalianArticles)
+	Config.Analysis.TokenMaps["ca_articles"] = Config.MustLoadStopWords(elision_filter.CatalanArticles)
+	Config.Analysis.TokenMaps["ga_articles"] = Config.MustLoadStopWords(elision_filter.IrishArticles)
+
 	// register char filters
 	htmlCharFilterRegexp := regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
 	htmlCharFilter := regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte{' '})
@@ -187,6 +194,7 @@ func init() {
 	Config.Analysis.TokenFilters["stemmer_sv"] = stemmer_filter.MustNewStemmerFilter("swedish")
 	Config.Analysis.TokenFilters["stemmer_tr"] = stemmer_filter.MustNewStemmerFilter("turkish")
+	// register stop token filters
 	Config.Analysis.TokenFilters["stop_token_da"] = stop_words_filter.NewStopWordsFilter(
 		Config.Analysis.TokenMaps["da_stop"])
 	Config.Analysis.TokenFilters["stop_token_nl"] = stop_words_filter.NewStopWordsFilter(
 		Config.Analysis.TokenMaps["nl_stop"])
@@ -244,6 +252,16 @@ func init() {
 	Config.Analysis.TokenFilters["stop_token_th"] = stop_words_filter.NewStopWordsFilter(
 		Config.Analysis.TokenMaps["th_stop"])
 
+	// register elision filters
+	Config.Analysis.TokenFilters["elision_fr"] = elision_filter.NewElisionFilter(
+		Config.Analysis.TokenMaps["fr_articles"])
+	Config.Analysis.TokenFilters["elision_it"] = elision_filter.NewElisionFilter(
+		Config.Analysis.TokenMaps["it_articles"])
+	Config.Analysis.TokenFilters["elision_ca"] = elision_filter.NewElisionFilter(
+		Config.Analysis.TokenMaps["ca_articles"])
+	Config.Analysis.TokenFilters["elision_ga"] = elision_filter.NewElisionFilter(
+		Config.Analysis.TokenMaps["ga_articles"])
+
 	// register analyzers
 	keywordAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "single", []string{})
 	Config.Analysis.Analyzers["keyword"] = keywordAnalyzer
@@ -263,13 +281,13 @@ func init() {
 	Config.Analysis.Analyzers["en"] = englishAnalyzer
 	finnishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_fi", "stemmer_fi"})
 	Config.Analysis.Analyzers["fi"] = finnishAnalyzer
-	frenchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_fr", "stemmer_fr"})
+	frenchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_fr", "to_lower", "stop_token_fr", "stemmer_fr"})
 	Config.Analysis.Analyzers["fr"] = frenchAnalyzer
 	germanAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_de", "stemmer_de"})
 	Config.Analysis.Analyzers["de"] = germanAnalyzer
 	hungarianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_hu", "stemmer_hu"})
 	Config.Analysis.Analyzers["hu"] = hungarianAnalyzer
-	italianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_it", "stemmer_it"})
+	italianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_it", "to_lower", "stop_token_it", "stemmer_it"})
 	Config.Analysis.Analyzers["it"] = italianAnalyzer
 	norwegianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_no", "stemmer_no"})
 	Config.Analysis.Analyzers["no"] = norwegianAnalyzer
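Note the ordering in the fr and it analyzer chains: the elision filter runs first, so the stop word and stemmer stages see the bare word (avion) rather than the contracted form (l'avion). A minimal sketch of that first stage in isolation, wired roughly the way config.go does it (the main() scaffolding is only illustrative):

```go
package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
)

func main() {
	// build the fr_articles token map and the elision_fr filter
	articles := analysis.NewWordMap()
	if err := articles.LoadBytes(elision_filter.FrenchArticles); err != nil {
		panic(err)
	}
	elision := elision_filter.NewElisionFilter(articles)

	// what to_lower, stop_token_fr and stemmer_fr receive downstream
	input := analysis.TokenStream{
		&analysis.Token{Term: []byte("l'avion")},
		&analysis.Token{Term: []byte("qu'il")},
	}
	for _, token := range elision.Filter(input) {
		fmt.Println(string(token.Term)) // avion, il
	}
}
```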