
major refactor of analysis files, now wired up to registry

ultimately this is to make it more convenient for us to wire up
different elements of the analysis pipeline, without having to
preload everything into memory before we need it

separately, the index layer now has a mechanism for storing
internal key/value pairs. this is expected to be used by the
top layer to store the mapping, and possibly other pieces of
data, but it is not exposed to the user at the top.
This commit is contained in:
Marty Schoch 2014-08-13 21:14:47 -04:00
parent 3481ec9cef
commit c526a38369
187 changed files with 5078 additions and 963 deletions
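
For reference, every analyzer, tokenizer, token filter, token map, char filter and date time parser touched below follows the same shape: a constructor that takes its config plus the registry cache (used to look up anything it depends on, by name), and an init() that registers the constructor under a name. A minimal sketch of that pattern, assuming a made-up pass-through filter; only the signatures and registry calls are taken from the files that follow:

package example_filter

import (
    "github.com/couchbaselabs/bleve/analysis"
    "github.com/couchbaselabs/bleve/registry"
)

const Name = "example_pass"

// a trivial pass-through filter, invented here purely to carry the pattern
type PassFilter struct{}

func (f *PassFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
    return input
}

// the constructor receives its config and the registry cache; real filters
// use the cache to resolve the token maps or filters they depend on by name
func FilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    return &PassFilter{}, nil
}

func init() {
    // components register themselves by name; nothing is constructed until
    // a cache lookup (e.g. cache.TokenFilterNamed(Name)) actually asks for it
    registry.RegisterTokenFilter(Name, FilterConstructor)
}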


@ -0,0 +1,44 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package detect_lang_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "detect_lang"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed("single")
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
detectLangFilter, err := cache.TokenFilterNamed("detect_lang")
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
detectLangFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}


@ -0,0 +1,32 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package keyword_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "keyword"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}


@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package simple_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "simple"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}


@ -0,0 +1,46 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package standard_analyzer
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/language/en"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "standard"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(en.StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}


@ -0,0 +1,30 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
import (
"regexp"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "html"
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
replaceBytes := []byte(" ")
return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}


@ -10,9 +10,15 @@ package regexp_char_filter
import (
"bytes"
"fmt"
"regexp"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "regexp"
type RegexpCharFilter struct {
r *regexp.Regexp
replacement []byte
@ -28,3 +34,24 @@ func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter
func (s *RegexpCharFilter) Filter(input []byte) []byte {
return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) })
}
func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
regexpStr, ok := config["regexp"].(string)
if !ok {
return nil, fmt.Errorf("must specify regexp")
}
r, err := regexp.Compile(regexpStr)
if err != nil {
return nil, fmt.Errorf("unable to build regexp char filter: %v", err)
}
replaceBytes := []byte(" ")
replaceStr, ok := config["replace"].(string)
if ok {
replaceBytes = []byte(replaceStr)
}
return NewRegexpCharFilter(r, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, RegexpCharFilterConstructor)
}


@ -0,0 +1,30 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package zero_width_non_joiner
import (
"regexp"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "zero_width_spaces"
var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`)
func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
replaceBytes := []byte(" ")
return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, replaceBytes), nil
}
func init() {
registry.RegisterCharFilter(Name, CharFilterConstructor)
}


@ -0,0 +1,39 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
import (
"time"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "dateTimeOptional"
const rfc3339NoTimezone = "2006-01-02T15:04:05"
const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
const rfc3339NoTime = "2006-01-02"
var layouts = []string{
time.RFC3339Nano,
time.RFC3339,
rfc3339NoTimezone,
rfc3339NoTimezoneNoT,
rfc3339NoTime,
}
func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
return flexible_go.NewFlexibleGoDateTimeParser(layouts), nil
}
func init() {
registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
}


@ -9,11 +9,15 @@
package flexible_go
import (
"fmt"
"time"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const Name = "flexiblego"
type FlexibleGoDateTimeParser struct {
layouts []string
}
@ -33,3 +37,22 @@ func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error
}
return time.Time{}, analysis.INVALID_DATETIME
}
func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
layouts, ok := config["layouts"].([]interface{})
if !ok {
return nil, fmt.Errorf("must specify layouts")
}
layoutStrs := make([]string, 0)
for _, layout := range layouts {
layoutStr, ok := layout.(string)
if ok {
layoutStrs = append(layoutStrs, layoutStr)
}
}
return NewFlexibleGoDateTimeParser(layoutStrs), nil
}
func init() {
registry.RegisterDateTimeParser(Name, FlexibleGoDateTimeParserConstructor)
}


@ -162,6 +162,5 @@ func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
result := tf1.MergeAll("tf2", tf2)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
-//t.Logf("%#v", tf1[0])
}
}


@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package arabic_normalize
package ar
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_ar"
const (
ALEF = '\u0627'
ALEF_MADDA = '\u0622'
@ -70,3 +73,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewArabicNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}


@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package arabic_normalize
package ar
import (
"reflect"


@ -6,32 +6,22 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package stop_words_filter
package ar
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
-type StopWordsFilter struct {
-stopWords analysis.WordMap
-}
-func NewStopWordsFilter(stopWords analysis.WordMap) *StopWordsFilter {
-return &StopWordsFilter{
-stopWords: stopWords,
-}
-}
-func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-rv := make(analysis.TokenStream, 0)
-for _, token := range input {
-word := string(token.Term)
-_, isStopWord := f.stopWords[word]
-if !isStopWord {
-rv = append(rv, token)
-}
-}
-return rv
-}
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package ar
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ar"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
@ -130,3 +137,13 @@ var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is di
لدى
جميع
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ArabicStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bg
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package bg
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_bg"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -198,3 +205,13 @@ var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is
щом
я
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(BulgarianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,30 @@
package ca
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var CatalanArticles = []byte(`
d
l
m
n
s
t
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}


@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_ca"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}


@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestFrenchElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'Institut"),
},
&analysis.Token{
Term: []byte("d'Estudis"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Institut"),
},
&analysis.Token{
Term: []byte("Estudis"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ca
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package ca
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -225,3 +232,13 @@ vostra
vostre
vostres
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
"github.com/couchbaselabs/bleve/registry"
)
const AnalyzerName = "ckb"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopCkbFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
normCkbFilter,
toLowerFilter,
stopCkbFilter,
stemmerCkbFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}


@ -6,15 +6,18 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package sorani_normalize
package ckb
import (
"bytes"
"unicode"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_ckb"
const (
YEH = '\u064A'
DOTLESS_YEH = '\u0649'
@ -103,3 +106,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}


@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package sorani_normalize
package ckb
import (
"reflect"


@ -6,15 +6,18 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package sorani_stemmer_filter
package ckb
import (
"bytes"
"unicode/utf8"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_ckb"
type SoraniStemmerFilter struct {
}
@ -133,3 +136,11 @@ func buildTermFromRunes(runes []rune) []byte {
}
return rv
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniStemmerFilter(), nil
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}


@ -6,14 +6,13 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package sorani_stemmer_filter
package ckb
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
-"github.com/couchbaselabs/bleve/analysis/token_filters/sorani_normalize"
"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
)
@ -24,7 +23,7 @@ func TestSoraniStemmerFilter(t *testing.T) {
analyzer := analysis.Analyzer{
Tokenizer: single_token.NewSingleTokenTokenizer(),
TokenFilters: []analysis.TokenFilter{
-sorani_normalize.NewSoraniNormalizeFilter(),
NewSoraniNormalizeFilter(),
NewSoraniStemmerFilter(),
},
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ckb
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package ckb
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ckb"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -141,3 +148,13 @@ var SoraniStopWords = []byte(`# set of kurdish stopwords
# like
وەک
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SoraniStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cs
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package cs
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_cs"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -177,3 +184,13 @@ jež
jakož
načež
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CzechStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,50 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package da
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
"github.com/couchbaselabs/bleve/registry"
)
const AnalyzerName = "da"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDaFilter,
stemmerDaFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}


@ -6,18 +6,20 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package rune_tokenizer
package da
import (
-"unicode"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
-type WhitespaceClassifier struct{}
-func NewWhitespaceClassifier() *WhitespaceClassifier {
-return &WhitespaceClassifier{}
-}
-func (c *WhitespaceClassifier) InToken(r rune) bool {
-return !unicode.IsSpace(r)
-}
const StemmerName = "stemmer_da"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("da")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package da
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package da
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_da"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -115,3 +122,13 @@ thi | for (conj)
jer | you
sådan | such, like this/like that
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(DanishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
"github.com/couchbaselabs/bleve/registry"
)
const AnalyzerName = "de"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDeFilter,
normalizeDeFilter,
stemmerDeFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}


@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package german_normalize
package de
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_de"
const (
N = 0 /* ordinary state */
V = 1 /* stops 'u' from entering umlaut state */
@ -84,3 +87,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewGermanNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}


@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
-package german_normalize
package de
import (
"reflect"


@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_de"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("de")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package de
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_de"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -299,3 +306,13 @@ zwar | indeed
zwischen | between
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GermanStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package el
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package el
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_el"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -83,3 +90,13 @@ var GreekStopWords = []byte(`# Lucene Greek Stopwords list
οσο
οτι
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GreekStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "en"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerEnFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEnFilter,
stemmerEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}


@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_en"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("en")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}


@ -0,0 +1,69 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestEnglishStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
},
}
cache := registry.NewCache()
stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := stemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}


@ -1,4 +1,11 @@
-package stop_words_filter
package en
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_en"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -324,3 +331,13 @@ very
| high
| long
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(EnglishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}


@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "es"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEsFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerEsFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEsFilter,
stemmerEsFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}


@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_es"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("es")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}


@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package es
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_es"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -361,3 +368,13 @@ tenidas
tened
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SpanishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}
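
Each stop word set in this change is wired up in two halves that only meet by name: the word list plus a TokenMap constructor registered under StopName, and a stop token filter constructor that asks the cache for that same StopName. Collapsed into a single hypothetical file (the package name, "stop_custom", and MyStopWords are illustrative, not part of this change), the pattern looks like this:

package custom

import (
    "github.com/couchbaselabs/bleve/analysis"
    "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
    "github.com/couchbaselabs/bleve/registry"
)

const StopName = "stop_custom"

// an illustrative word list; the real lists in this change load the same way
var MyStopWords = []byte(`foo
bar
`)

// builds the token map from the raw bytes
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
    rv := analysis.NewTokenMap()
    err := rv.LoadBytes(MyStopWords)
    return rv, err
}

// resolves the token map lazily, by the shared name, when the filter is first built
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    tokenMap, err := cache.TokenMapNamed(StopName)
    if err != nil {
        return nil, err
    }
    return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}

func init() {
    registry.RegisterTokenMap(StopName, TokenMapConstructor)
    registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}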

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package eu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package eu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_eu"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -104,3 +111,13 @@ zuek
zuen
zuten
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(BasqueStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,65 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/char_filters/zero_width_non_joiner"
"github.com/couchbaselabs/bleve/analysis/language/ar"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "fa"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
zFilter, err := cache.CharFilterNamed(zero_width_non_joiner.Name)
if err != nil {
return nil, err
}
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
if err != nil {
return nil, err
}
normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopFaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
CharFilters: []analysis.CharFilter{
zFilter,
},
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normArFilter,
normFaFilter,
stopFaFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_fa"
const (
YEH = '\u064A'
FARSI_YEH = '\u06CC'
@ -62,3 +65,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPersianNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"reflect"

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fa
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package fa
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_fa"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -318,3 +325,13 @@ var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is d
عنوان
بود
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(PersianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "fi"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopFiFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFiFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopFiFilter,
stemmerFiFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_fi"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("fi")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package fi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_fi"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -102,3 +109,13 @@ nyt | now
itse | self
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FinnishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "fr"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopFrFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFrFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
elisionFilter,
toLowerFilter,
stopFrFilter,
stemmerFrFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,37 @@
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var FrenchArticles = []byte(`
l
m
t
qu
n
s
j
d
c
jusqu
quoiqu
lorsqu
puisqu
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_fr"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}
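
The test that follows exercises the elision filter through the registry, but the filter can also be built directly from a token map, which makes the role of the articles list easier to see. A small sketch using only calls that appear in this change (NewTokenMap, LoadBytes, NewElisionFilter, Filter); the expected result for "l'avion" matches the test below, and the tiny articles list here is illustrative rather than the registered "articles_fr" map.

package main

import (
    "fmt"

    "github.com/couchbaselabs/bleve/analysis"
    "github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
)

func main() {
    // a tiny articles map; the full French list is registered as "articles_fr" above
    articles := analysis.NewTokenMap()
    if err := articles.LoadBytes([]byte("l\nqu\n")); err != nil {
        panic(err)
    }
    filter := elision_filter.NewElisionFilter(articles)
    in := analysis.TokenStream{
        &analysis.Token{Term: []byte("l'avion")},
    }
    out := filter.Filter(in)
    fmt.Println(string(out[0].Term)) // avion, with the leading article elided
}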

View File

@ -6,50 +6,44 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestFrenchElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'avion"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("avion"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_fr"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("fr")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package fr
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -191,3 +198,13 @@ sans | without
soi | oneself
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
package ga
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_ga"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var IrishArticles = []byte(`
d
m
b
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IrishArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ga
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_ga"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -0,0 +1,49 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ga
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestIrishElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("b'fhearr"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("fhearr"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ga
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package ga
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_ga"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -115,3 +122,13 @@ um
óna
ónár
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IrishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package gl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package gl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_gl"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -166,3 +173,13 @@ voso
vosos
vós
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GalicianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -6,14 +6,17 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"bytes"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const NormalizeName = "normalize_hi"
type HindiNormalizeFilter struct {
}
@ -123,3 +126,11 @@ func normalize(input []byte) []byte {
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewHindiNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"reflect"

View File

@ -6,15 +6,18 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"bytes"
"unicode/utf8"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_hi"
type HindiStemmerFilter struct {
}
@ -134,3 +137,11 @@ func stem(input []byte) []byte {
return input
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewHindiStemmerFilter(), nil
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -6,7 +6,7 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"reflect"

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package hi
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_hi"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -240,3 +247,13 @@ var HindiStopWords = []byte(`# Also see http://www.opensource.org/licenses/bsd-l
जेस
नहि
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(HindiStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "hu"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopHuFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerHuFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopHuFilter,
stemmerHuFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_hu"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("hu")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package hu
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_hu"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -216,3 +223,13 @@ vele
viszont
volna
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(HungarianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hy
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package hy
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_hy"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -51,3 +58,13 @@ var ArmenianStopWords = []byte(`# example set of Armenian stopwords.
վրա
և
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ArmenianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package id
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package id
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_id"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@ -364,3 +371,13 @@ yaitu
yakni
yang
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IndonesianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "it"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopItFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerItFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
elisionFilter,
toLowerFilter,
stopItFilter,
stemmerItFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,45 @@
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const ArticlesName = "articles_it"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var ItalianArticles = []byte(`
c
l
all
dall
dell
nell
sull
coll
pell
gl
agl
dagl
degl
negl
sugl
un
m
t
s
v
d
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ItalianArticles)
return rv, err
}
func init() {
registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
}

View File

@ -0,0 +1,31 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"fmt"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
"github.com/couchbaselabs/bleve/registry"
)
const ElisionName = "elision_it"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision_filter.NewElisionFilter(articlesTokenMap), nil
}
func init() {
registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
}

View File

@ -0,0 +1,49 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
func TestItalianElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("dell'Italia"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Italia"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_it"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("it")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,4 +1,11 @@
package it
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
)
const StopName = "stop_it"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@ -308,3 +315,13 @@ stessimo
stessero
stando
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ItalianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -0,0 +1,51 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package nl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/registry"
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
)
const AnalyzerName = "nl"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopNlFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerNlFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopNlFilter,
stemmerNlFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,25 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package nl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
"github.com/couchbaselabs/bleve/registry"
)
const StemmerName = "stemmer_nl"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("nl")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -0,0 +1,27 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package nl
import (
"github.com/couchbaselabs/bleve/analysis"
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/couchbaselabs/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

Some files were not shown because too many files have changed in this diff.