
major refactor of analysis files, now wired up to registry

ultimately this is to make it more convenient for us to wire up
different elements of the analysis pipeline, without having to
preload everything into memory before we need it
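
The pattern repeated in every file in this diff is the same: an init() registers a named constructor, and a registry cache builds each component lazily the first time it is requested by name. Below is a stripped-down, self-contained sketch of just that idea; the names (Register, Cache, TokenFilterNamed) mirror the ones used in this commit, but this is an illustration only, not bleve's actual registry implementation.

package main

import "fmt"

// TokenFilter stands in for any analysis component that can be built on demand.
type TokenFilter interface {
	Name() string
}

type lowerCase struct{}

func (lowerCase) Name() string { return "to_lower" }

// Constructor mirrors the signature used throughout this commit: a config map
// plus a cache the constructor can use to resolve its own dependencies.
type Constructor func(config map[string]interface{}, cache *Cache) (TokenFilter, error)

var tokenFilters = map[string]Constructor{}

// Register is what the init() funcs in this diff call at import time; it only
// records the constructor, it does not build anything yet.
func Register(name string, cons Constructor) {
	tokenFilters[name] = cons
}

// Cache builds components the first time they are asked for by name, so nothing
// is preloaded into memory before it is needed.
type Cache struct {
	built map[string]TokenFilter
}

func NewCache() *Cache {
	return &Cache{built: map[string]TokenFilter{}}
}

func (c *Cache) TokenFilterNamed(name string) (TokenFilter, error) {
	if tf, ok := c.built[name]; ok {
		return tf, nil
	}
	cons, ok := tokenFilters[name]
	if !ok {
		return nil, fmt.Errorf("no token filter with name %s registered", name)
	}
	tf, err := cons(nil, c)
	if err != nil {
		return nil, err
	}
	c.built[name] = tf
	return tf, nil
}

func main() {
	Register("to_lower", func(config map[string]interface{}, cache *Cache) (TokenFilter, error) {
		return lowerCase{}, nil
	})
	cache := NewCache()
	tf, err := cache.TokenFilterNamed("to_lower") // constructed only now, on first use
	fmt.Println(tf.Name(), err)                   // to_lower <nil>
}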

separately, the index layer now has a mechanism for storing
internal key/value pairs. this is expected to be used by the
top layer to store the mapping, and possibly other pieces of
data, but it is not exposed to the user at the top.
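
This excerpt does not show the index layer's new API, so the sketch below is purely hypothetical: it assumes SetInternal/GetInternal-style methods (names not taken from this diff) to illustrate what a key/value side channel for data like the stored mapping could look like, kept separate from user-visible documents.

package main

import "fmt"

// InternalStore is a hypothetical view of the mechanism described above: a
// key/value side channel the top layer can use (for example to persist the
// index mapping) without those rows ever being exposed as user documents.
type InternalStore interface {
	SetInternal(key, val []byte) error
	GetInternal(key []byte) ([]byte, error)
}

// memStore is an in-memory stand-in for whatever the index layer actually uses.
type memStore struct {
	rows map[string][]byte
}

func newMemStore() *memStore {
	return &memStore{rows: map[string][]byte{}}
}

func (s *memStore) SetInternal(key, val []byte) error {
	s.rows[string(key)] = val
	return nil
}

func (s *memStore) GetInternal(key []byte) ([]byte, error) {
	v, ok := s.rows[string(key)]
	if !ok {
		return nil, fmt.Errorf("no internal value stored under %q", key)
	}
	return v, nil
}

func main() {
	var store InternalStore = newMemStore()
	_ = store.SetInternal([]byte("_mapping"), []byte(`{"default_analyzer":"standard"}`))
	v, _ := store.GetInternal([]byte("_mapping"))
	fmt.Printf("%s\n", v)
}
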
Marty Schoch 2014-08-13 21:14:47 -04:00
parent 3481ec9cef
commit c526a38369
187 changed files with 5078 additions and 963 deletions

View File

@ -0,0 +1,44 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package detect_lang_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "detect_lang"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed("single")
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
detectLangFilter, err := cache.TokenFilterNamed("detect_lang")
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
detectLangFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,32 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package keyword_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "keyword"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package simple_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "simple"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,46 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package standard_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/language/en"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "standard"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(en.StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -0,0 +1,30 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
import (
"regexp"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "html"
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
replaceBytes := []byte(" ")
return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}

View File

@ -10,9 +10,15 @@ package regexp_char_filter
import (
"bytes"
"fmt"
"regexp"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "regexp"
type RegexpCharFilter struct {
r *regexp.Regexp
replacement []byte
@ -28,3 +34,24 @@ func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter
func (s *RegexpCharFilter) Filter(input []byte) []byte {
return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) })
}
func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
regexpStr, ok := config["regexp"].(string)
if !ok {
return nil, fmt.Errorf("must specify regexp")
}
r, err := regexp.Compile(regexpStr)
if err != nil {
return nil, fmt.Errorf("unable to build regexp char filter: %v", err)
}
replaceBytes := []byte(" ")
replaceStr, ok := config["replace"].(string)
if ok {
replaceBytes = []byte(replaceStr)
}
return NewRegexpCharFilter(r, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, RegexpCharFilterConstructor)
}
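
A small usage sketch for the config-driven constructor added above; it assumes the import path used throughout this commit and that analysis.CharFilter exposes the Filter method shown here, and it passes nil for the cache since this particular constructor never touches it.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
)

func main() {
	config := map[string]interface{}{
		"regexp":  `[0-9]`, // mask every digit
		"replace": "*",
	}
	// the cache argument is unused by this constructor, so nil is fine here
	filter, err := regexp_char_filter.RegexpCharFilterConstructor(config, nil)
	if err != nil {
		panic(err) // e.g. "must specify regexp" or a regexp compile error
	}
	// each matched span is replaced byte-for-byte with the replacement
	fmt.Printf("%s\n", filter.Filter([]byte("card 4111"))) // card ****
}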

View File

@ -0,0 +1,30 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package zero_width_non_joiner
import (
"regexp"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "zero_width_spaces"
var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`)
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
replaceBytes := []byte(" ")
return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}

View File

@ -0,0 +1,39 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package datetime_optional
import (
"time"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "dateTimeOptional"
const rfc3339NoTimezone = "2006-01-02T15:04:05"
const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
const rfc3339NoTime = "2006-01-02"
var layouts = []string{
time.RFC3339Nano,
time.RFC3339,
rfc3339NoTimezone,
rfc3339NoTimezoneNoT,
rfc3339NoTime,
}
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
return flexible_go.NewFlexibleGoDateTimeParser(layouts), nil
}
func init() {
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}

View File

@ -9,11 +9,15 @@
package flexible_go
import (
"fmt"
"time"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "flexiblego"
type FlexibleGoDateTimeParser struct {
layouts []string
}
@ -33,3 +37,22 @@ func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error
}
return time.Time{}, analysis.INVALID_DATETIME
}
func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
layouts, ok := config["layouts"].([]interface{})
if !ok {
return nil, fmt.Errorf("must specify layouts")
}
layoutStrs := make([]string, 0)
for _, layout := range layouts {
layoutStr, ok := layout.(string)
if ok {
layoutStrs = append(layoutStrs, layoutStr)
}
}
return NewFlexibleGoDateTimeParser(layoutStrs), nil
}
func init() {
registry.RegisterDateTimeParser(Name, FlexibleGoDateTimeParserConstructor)
}
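
Similarly, a usage sketch for the layouts-driven constructor above; the layouts value is given as []interface{} because that is the shape this constructor expects (the form it would take after JSON config decoding), and nil is passed for the unused cache.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go"
)

func main() {
	config := map[string]interface{}{
		"layouts": []interface{}{"2006-01-02", "01/02/2006"},
	}
	parser, err := flexible_go.FlexibleGoDateTimeParserConstructor(config, nil)
	if err != nil {
		panic(err) // "must specify layouts" when the key is missing
	}
	// layouts are tried in order until one parses the input
	t, err := parser.ParseDateTime("2014-08-13")
	fmt.Println(t, err)
}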

View File

@ -162,6 +162,5 @@ func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
result := tf1.MergeAll("tf2", tf2)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
//t.Logf("%#v", tf1[0])
}
}

View File

@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package arabic_normalize
package ar
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_ar"
const (
ALEF = '\u0627'
ALEF_MADDA = '\u0622'
@ -70,3 +73,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewArabicNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package arabic_normalize
package ar
import (
"reflect"

View File

@ -6,32 +6,22 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package stop_words_filter
package ar
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
type StopWordsFilter struct {
stopWords analysis.WordMap
}
func NewStopWordsFilter(stopWords analysis.WordMap) *StopWordsFilter {
return &StopWordsFilter{
stopWords: stopWords,
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0)
for _, token := range input {
word := string(token.Term)
_, isStopWord := f.stopWords[word]
if !isStopWord {
rv = append(rv, token)
}
}
return rv
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package ar
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ar"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
@ -130,3 +137,13 @@ var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is di
لدى
جميع
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ArabicStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bg
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package bg
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_bg"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -198,3 +205,13 @@ var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is
щом
я
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(BulgarianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,30 @@
package ca
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var CatalanArticles = []byte(`
d
l
m
n
s
t
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_ca"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestCatalanElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'Institut"),
},
&analysis.Token{
Term: []byte("d'Estudis"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Institut"),
},
&analysis.Token{
Term: []byte("Estudis"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package ca
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -225,3 +232,13 @@ vostra
vostre
vostres
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
"github.com/couchbaselabs/bleve/registry"
)
const AnalyzerName = "ckb"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopCkbFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
normCkbFilter,
toLowerFilter,
stopCkbFilter,
stemmerCkbFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -6,15 +6,18 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package sorani_normalize
package ckb
import (
"bytes"
"unicode"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_ckb"
const (
YEH = '\u064A'
DOTLESS_YEH = '\u0649'
@ -103,3 +106,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package sorani_normalize
package ckb
import (
"reflect"

View File

@ -6,15 +6,18 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package sorani_stemmer_filter
package ckb
import (
"bytes"
"unicode/utf8"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_ckb"
type SoraniStemmerFilter struct {
}
@ -133,3 +136,11 @@ func buildTermFromRunes(runes []rune) []byte {
}
return rv
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniStemmerFilter(), nil
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -6,14 +6,13 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package sorani_stemmer_filter
package ckb
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/sorani_normalize"
"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
)
@ -24,7 +23,7 @@ func TestSoraniStemmerFilter(t *testing.T) {
analyzer := analysis.Analyzer{
Tokenizer: single_token.NewSingleTokenTokenizer(),
TokenFilters: []analysis.TokenFilter{
sorani_normalize.NewSoraniNormalizeFilter(),
NewSoraniNormalizeFilter(),
NewSoraniStemmerFilter(),
},
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package ckb
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ckb"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -141,3 +148,13 @@ var SoraniStopWords = []byte(`# set of kurdish stopwords
# like
وەک
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SoraniStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cs
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package cs
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_cs"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -177,3 +184,13 @@ jež
jakož
načež
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CzechStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,50 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package da
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
"github.com/couchbaselabs/bleve/registry"
)
const AnalyzerName = "da"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDaFilter,
stemmerDaFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -6,18 +6,20 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package rune_tokenizer
package da
import (
"unicode"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
type WhitespaceClassifier struct{}
const StemmerName = "stemmer_da"
func NewWhitespaceClassifier() *WhitespaceClassifier {
return &WhitespaceClassifier{}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("da")
}
func (c *WhitespaceClassifier) InToken(r rune) bool {
return !unicode.IsSpace(r)
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package da
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package da
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_da"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -115,3 +122,13 @@ thi | for (conj)
jer | you
sådan | such, like this/like that
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(DanishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
"github.com/couchbaselabs/bleve/registry"
)
const AnalyzerName = "de"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDeFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDeFilter,
normalizeDeFilter,
stemmerDeFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package german_normalize
package de
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_de"
const (
N = 0 /* ordinary state */
V = 1 /* stops 'u' from entering umlaut state */
@ -84,3 +87,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewGermanNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package german_normalize
package de
import (
"reflect"

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_de"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("de")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_de"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -299,3 +306,13 @@ zwar | indeed
zwischen | between
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GermanStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package el
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package el
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_el"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -83,3 +90,13 @@ var GreekStopWords = []byte(`# Lucene Greek Stopwords list
οσο
οτι
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GreekStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "en"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerEnFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEnFilter,
stemmerEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_en"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("en")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,69 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestEnglishStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
},
}
cache := registry.NewCache()
stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := stemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_en"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -324,3 +331,13 @@ very
| high
| long
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(EnglishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "es"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEsFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerEsFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEsFilter,
stemmerEsFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_es"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("es")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_es"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -361,3 +368,13 @@ tenidas
tened
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SpanishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package eu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package eu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_eu"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -104,3 +111,13 @@ zuek
zuen
zuten
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(BasqueStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,65 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/char_filters/zero_width_non_joiner"
"github.com/couchbaselabs/bleve/analysis/language/ar"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "fa"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
zFilter, err := cache.CharFilterNamed(zero_width_non_joiner.Name)
if err != nil {
return nil, err
}
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
if err != nil {
return nil, err
}
normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopFaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
CharFilters: []analysis.CharFilter{
zFilter,
},
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normArFilter,
normFaFilter,
stopFaFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package persian_normalize
package fa
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_fa"
const (
YEH = '\u064A'
FARSI_YEH = '\u06CC'
@ -62,3 +65,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPersianNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package persian_normalize
package fa
import (
"reflect"

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package fa
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_fa"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -318,3 +325,13 @@ var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is d
عنوان
بود
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(PersianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "fi"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopFiFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFiFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopFiFilter,
stemmerFiFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_fi"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("fi")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}
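The Snowball-backed languages (fi, fr, hu, it, nl below) all register their stemmer the same way: hand the language code to stemmer_filter.NewStemmerFilter and expose the result under a per-language name. A rough sketch of building the Finnish stemmer directly, outside the registry; the sample word is arbitrary and the printed stem depends on the Snowball implementation.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
)

func main() {
	// build the Finnish Snowball stemmer directly, outside the registry
	stemmer, err := stemmer_filter.NewStemmerFilter("fi")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("taloissa")}, // arbitrary Finnish word
	}
	for _, token := range stemmer.Filter(in) {
		fmt.Println(string(token.Term))
	}
}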

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_fi"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -102,3 +109,13 @@ nyt | now
itse | self
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FinnishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "fr"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopFrFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFrFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
elisionFilter,
toLowerFilter,
stopFrFilter,
stemmerFrFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,37 @@
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var FrenchArticles = []byte(`
l
m
t
qu
n
s
j
d
c
jusqu
quoiqu
lorsqu
puisqu
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_fr"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}
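The articles token map above exists to feed this elision filter, which strips a leading article plus apostrophe from each token. A hedged sketch of wiring the two together by hand, without the registry; the expected behaviour matches the test that follows.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
)

func main() {
	// a hand-built articles map; the fr package loads a larger list the same way via LoadBytes
	articles := analysis.NewTokenMap()
	if err := articles.LoadBytes([]byte("l\nd\n")); err != nil {
		fmt.Println("error:", err)
		return
	}
	elision := elision_filter.NewElisionFilter(articles)
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("l'avion")},
	}
	out := elision.Filter(in)
	fmt.Println(string(out[0].Term)) // "avion", as in the elision test below
}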

View File

@ -6,50 +6,44 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package rune_tokenizer
package fr
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestWhitespaceTokenizer(t *testing.T) {
classifier := NewWhitespaceClassifier()
func TestFrenchElision(t *testing.T) {
tests := []struct {
input []byte
input analysis.TokenStream
output analysis.TokenStream
}{
{
[]byte("Hello World"),
analysis.TokenStream{
{
Start: 0,
End: 5,
Term: []byte("Hello"),
Position: 1,
Type: analysis.AlphaNumeric,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'avion"),
},
{
Start: 6,
End: 11,
Term: []byte("World"),
Position: 2,
Type: analysis.AlphaNumeric,
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("avion"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
tokenizer := NewRuneTokenizer(classifier)
actual := tokenizer.Tokenize(test.input)
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_fr"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("fr")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -191,3 +198,13 @@ sans | without
soi | oneself
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}
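Each stop token filter constructor resolves its word list from the cache under the same name it is registered with ("stop_fr" here), so neither the filter nor the list is built until something asks for the filter. A sketch of pulling the registered filter out of a fresh cache and applying it; the blank-import path for the fr package is an assumption, and the sample tokens are arbitrary.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	_ "github.com/couchbaselabs/bleve/analysis/language/fr" // assumed path; imported only so the fr init() registrations run
	"github.com/couchbaselabs/bleve/registry"
)

func main() {
	cache := registry.NewCache()

	// resolving the "stop_fr" filter also resolves the "stop_fr" token map it depends on
	stopFilter, err := cache.TokenFilterNamed("stop_fr")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("le")},
		&analysis.Token{Term: []byte("avion")},
	}
	for _, token := range stopFilter.Filter(in) {
		fmt.Println(string(token.Term)) // stop words such as "le" should be removed
	}
}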

View File

@ -0,0 +1,27 @@
package ga
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_ga"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var IrishArticles = []byte(`
d
m
b
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IrishArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ga
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_ga"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -0,0 +1,49 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ga
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestIrishElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("b'fhearr"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("fhearr"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ga
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package ga
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ga"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -115,3 +122,13 @@ um
óna
ónár
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IrishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package gl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package gl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_gl"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -166,3 +173,13 @@ voso
vosos
vós
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GalicianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hindi_normalize
package hi
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_hi"
type HindiNormalizeFilter struct {
}
@ -123,3 +126,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewHindiNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hindi_normalize
package hi
import (
"reflect"

View File

@ -6,15 +6,18 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hindi_stemmer_filter
package hi
import (
"bytes"
"unicode/utf8"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_hi"
type HindiStemmerFilter struct {
}
@ -134,3 +137,11 @@ func stem(input []byte) []byte {
return input
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewHindiStemmerFilter(), nil
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}
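Unlike the Snowball-backed stemmers, the Hindi stemmer is implemented in this package and its constructor takes no arguments, so the registry wrapper above is a one-liner. A small sketch of using it directly; the hi import path is an assumption and the sample word is arbitrary.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/language/hi" // assumed path; only the package name "hi" is visible in this diff
)

func main() {
	// the Hindi stemmer needs no configuration, so it can be built directly
	stemmer := hi.NewHindiStemmerFilter()
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("लडकियों")}, // arbitrary Devanagari word
	}
	for _, token := range stemmer.Filter(in) {
		fmt.Println(string(token.Term))
	}
}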

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hindi_stemmer_filter
package hi
import (
"reflect"

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package hi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_hi"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -240,3 +247,13 @@ var HindiStopWords = []byte(`# Also see http://www.opensource.org/licenses/bsd-l
जेस
नहि
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(HindiStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "hu"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopHuFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerHuFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopHuFilter,
stemmerHuFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_hu"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("hu")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_hu"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -216,3 +223,13 @@ vele
viszont
volna
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(HungarianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hy
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package hy
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_hy"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -51,3 +58,13 @@ var ArmenianStopWords = []byte(`# example set of Armenian stopwords.
վրա
և
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ArmenianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package id
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package id
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_id"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -364,3 +371,13 @@ yaitu
yakni
yang
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IndonesianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "it"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopItFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerItFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
elisionFilter,
toLowerFilter,
stopItFilter,
stemmerItFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,45 @@
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_it"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var ItalianArticles = []byte(`
c
l
all
dall
dell
nell
sull
coll
pell
gl
agl
dagl
degl
negl
sugl
un
m
t
s
v
d
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ItalianArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_it"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -0,0 +1,49 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestItalianElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("dell'Italia"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Italia"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_it"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("it")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package stop_words_filter
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_it"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -308,3 +315,13 @@ stessimo
stessero
stando
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ItalianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package nl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "nl"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopNlFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerNlFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopNlFilter,
stemmerNlFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package nl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_nl"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("nl")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package nl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}
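Taken together, every file in this diff repeats one wiring pattern: a constructor that takes (config, cache), pulls any dependencies out of the cache by name, and an init() that registers the constructor, so nothing is built until an analyzer that needs it is requested. A hedged sketch of the same pattern for a hypothetical custom stop filter; the names "stop_custom" and "custom_words" are illustrative only.

package custom

import (
	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
	"github.com/couchbaselabs/bleve/registry"
)

// "stop_custom" and "custom_words" are illustrative names, not registered by this commit
const Name = "stop_custom"

func CustomStopFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	// the word list is resolved by name when the filter is first built, not at import time
	tokenMap, err := cache.TokenMapNamed("custom_words")
	if err != nil {
		return nil, err
	}
	return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}

func init() {
	registry.RegisterTokenFilter(Name, CustomStopFilterConstructor)
}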

Some files were not shown because too many files have changed in this diff.