
major refactor of bleve configuration

see #221 for full details
Marty Schoch 2015-09-16 17:10:59 -04:00
parent 17c64d37c7
commit f81b2be334
145 changed files with 314 additions and 9649 deletions
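Most hunks below make one of two mechanical changes: component registrations move from repeated string literals to each package's exported Name constant, and analyzers that requested the cgo-backed icu tokenizer now request the pure-Go unicode tokenizer (or are deleted outright along with their build-tagged stemmer and stop-word dependencies). A minimal sketch of the Name-constant registration pattern follows; the "example" converter package is hypothetical, and its Convert signature is assumed from the truncated constructor signatures in the byte array converter hunks, not taken verbatim from the repository.

// Hypothetical converter illustrating the registration pattern this commit
// standardizes: one exported Name constant shared by callers and init().
package example_byte_array_converter

import (
    "github.com/blevesearch/bleve/analysis"
    "github.com/blevesearch/bleve/registry"
)

// Name is exported so the registration below and any lookups share one
// constant instead of repeating the string literal "example".
const Name = "example"

type ExampleByteArrayConverter struct{}

// Convert passes the raw bytes through unchanged; the signature is assumed
// to match the other byte array converters shown in this diff.
func (c *ExampleByteArrayConverter) Convert(in []byte) (interface{}, error) {
    return in, nil
}

func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis.ByteArrayConverter, error) {
    return &ExampleByteArrayConverter{}, nil
}

func init() {
    // Register under the exported constant rather than a string literal,
    // which is exactly the change the "ignore", "json", and "string"
    // converter hunks below make.
    registry.RegisterByteArrayConverter(Name, Constructor)
}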

View File

@ -1,49 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build cld2 full
package detect_lang_analyzer
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/cld2"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/single_token"
"github.com/blevesearch/bleve/registry"
)
const Name = "detect_lang"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
detectLangFilter, err := cache.TokenFilterNamed(cld2.Name)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: keywordTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
detectLangFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}

View File

@ -14,6 +14,8 @@ import (
"github.com/blevesearch/bleve/registry"
)
const Name = "ignore"
type IgnoreByteArrayConverter struct{}
func NewIgnoreByteArrayConverter() *IgnoreByteArrayConverter {
@ -29,5 +31,5 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis
}
func init() {
registry.RegisterByteArrayConverter("ignore", Constructor)
registry.RegisterByteArrayConverter(Name, Constructor)
}

View File

@ -16,6 +16,8 @@ import (
"github.com/blevesearch/bleve/registry"
)
const Name = "json"
type JSONByteArrayConverter struct{}
func NewJSONByteArrayConverter() *JSONByteArrayConverter {
@ -36,5 +38,5 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis
}
func init() {
registry.RegisterByteArrayConverter("json", Constructor)
registry.RegisterByteArrayConverter(Name, Constructor)
}

View File

@ -14,6 +14,8 @@ import (
"github.com/blevesearch/bleve/registry"
)
const Name = "string"
type StringByteArrayConverter struct{}
func NewStringByteArrayConverter() *StringByteArrayConverter {
@ -29,5 +31,5 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis
}
func init() {
registry.RegisterByteArrayConverter("string", Constructor)
registry.RegisterByteArrayConverter(Name, Constructor)
}

View File

@ -7,7 +7,7 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package html_char_filter
package datetime_optional
import (
"time"

View File

@ -7,21 +7,19 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package ckb
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/blevesearch/bleve/registry"
)
const AnalyzerName = "ckb"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
@ -42,7 +40,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
normCkbFilter,
toLowerFilter,

View File

@ -7,8 +7,6 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package ckb
import (

View File

@ -1,54 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/registry"
)
const AnalyzerName = "da"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDaFilter,
stemmerDaFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,69 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package da
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestDanishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("undersøg"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("undersøg"),
Position: 1,
Start: 0,
End: 9,
},
},
},
{
input: []byte("undersøgelse"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("undersøg"),
Position: 1,
Start: 0,
End: 13,
},
},
},
// stop word
{
input: []byte("på"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_da"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("da")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,134 +0,0 @@
package da
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_da"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/youself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(DanishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,59 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/registry"
)
const AnalyzerName = "de"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopDeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDeFilter,
normalizeDeFilter,
stemmerDeFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,97 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestGermanAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte("Tisch"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 5,
},
},
},
{
input: []byte("Tische"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("Tischen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 7,
},
},
},
// german specials
{
input: []byte("Schaltflächen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("schaltflach"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{
input: []byte("Schaltflaechen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("schaltflach"),
Position: 1,
Start: 0,
End: 14,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -1,94 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const NormalizeName = "normalize_de"
const (
N = 0 /* ordinary state */
V = 1 /* stops 'u' from entering umlaut state */
U = 2 /* umlaut state, allows e-deletion */
)
type GermanNormalizeFilter struct {
}
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
return &GermanNormalizeFilter{}
}
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
func normalize(input []byte) []byte {
state := N
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case 'a', 'o':
state = U
case 'u':
if state == N {
state = U
} else {
state = V
}
case 'e':
if state == U {
runes = analysis.DeleteRune(runes, i)
i--
}
state = V
case 'i', 'q', 'y':
state = V
case 'ä':
runes[i] = 'a'
state = V
case 'ö':
runes[i] = 'o'
state = V
case 'ü':
runes[i] = 'u'
state = V
case 'ß':
runes[i] = 's'
i++
// newrunes := make([]rune, len(runes)+1)
// copy(newrunes, runes)
// runes = newrunes
// runes[i] = 's'
runes = analysis.InsertRune(runes, i, 's')
state = N
default:
state = N
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewGermanNormalizeFilter(), nil
}
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}

View File

@ -1,98 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestGermanNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// Tests that a/o/u + e is equivalent to the umlaut form
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflächen"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflachen"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflaechen"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflachen"),
},
},
},
// Tests the specific heuristic that ue is not folded after a vowel or q.
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("dauer"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("dauer"),
},
},
},
// Tests german specific folding of sharp-s
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("weißbier"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("weissbier"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
germanNormalizeFilter := NewGermanNormalizeFilter()
for _, test := range tests {
actual := germanNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_de"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("de")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,318 +0,0 @@
package de
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_de"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The number of forms in this list is reduced significantly by passing it
| through the German stemmer.
aber | but
alle | all
allem
allen
aller
alles
als | than, as
also | so
am | an + dem
an | at
ander | other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch | also
auf | on
aus | out of
bei | by
bin | am
bis | until
bist | art
da | there
damit | with it
dann | then
der | the
den
des
dem
die
das
daß | that
derselbe | the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu | to that
dein | thy
deine
deinem
deinen
deiner
deines
denn | because
derer | of those
dessen | of him
dich | thee
dir | to thee
du | thou
dies | this
diese
diesem
diesen
dieser
dieses
doch | (several meanings)
dort | (over) there
durch | through
ein | a
eine
einem
einen
einer
eines
einig | some
einige
einigem
einigen
einiger
einiges
einmal | once
er | he
ihn | him
ihm | to him
es | it
etwas | something
euer | your
eure
eurem
euren
eurer
eures
für | for
gegen | towards
gewesen | p.p. of sein
hab | have
habe | have
haben | have
hat | has
hatte | had
hatten | had
hier | here
hin | there
hinter | behind
ich | I
mich | me
mir | to me
ihr | you, to her
ihre
ihrem
ihren
ihrer
ihres
euch | to you
im | in + dem
in | in
indem | while
ins | in + das
ist | is
jede | each, every
jedem
jeden
jeder
jedes
jene | that
jenem
jenen
jener
jenes
jetzt | now
kann | can
kein | no
keine
keinem
keinen
keiner
keines
können | can
könnte | could
machen | do
man | one
manche | some, many a
manchem
manchen
mancher
manches
mein | my
meine
meinem
meinen
meiner
meines
mit | with
muss | must
musste | had to
nach | to(wards)
nicht | not
nichts | nothing
noch | still, yet
nun | now
nur | only
ob | whether
oder | or
ohne | without
sehr | very
sein | his
seine
seinem
seinen
seiner
seines
selbst | self
sich | herself
sie | they, she
ihnen | to them
sind | are
so | so
solche | such
solchem
solchen
solcher
solches
soll | shall
sollte | should
sondern | but
sonst | else
über | over
um | about, around
und | and
uns | us
unse
unsem
unsen
unser
unses
unter | under
viel | much
vom | von + dem
von | from
vor | before
während | while
war | was
waren | were
warst | wast
was | what
weg | away, off
weil | because
weiter | further
welche | which
welchem
welchen
welcher
welches
wenn | when
werde | will
werden | will
wie | how
wieder | again
will | want
wir | we
wird | will
wirst | willst
wo | where
wollen | want
wollte | wanted
würde | would
würden | would
zu | to
zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GermanStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package en
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_en"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("en")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,72 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package en
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestEnglishStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
},
},
}
cache := registry.NewCache()
stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := stemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package es
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "es"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEsFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerEsFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEsFilter,
stemmerEsFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,64 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package es
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestSpanishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("chicana"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chican"),
Position: 1,
Start: 0,
End: 7,
},
},
},
{
input: []byte("chicano"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chican"),
Position: 1,
Start: 0,
End: 7,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package es
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_es"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("es")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package es
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,380 +0,0 @@
package es
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_es"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | from, of
la | the, her
que | who, that
el | the
en | in
y | and
a | to
los | the, them
del | de + el
se | himself, from him etc
las | the, them
por | for, by, etc
un | a
para | for
con | with
no | no
una | a
su | his, her
al | a + el
| es from SER
lo | him
como | how
más | more
pero | pero
sus | su plural
le | to him, her
ya | already
o | or
| fue from SER
este | this
| ha from HABER
sí | himself etc
porque | because
esta | this
| son from SER
entre | between
| está from ESTAR
cuando | when
muy | very
sin | without
sobre | on
| ser from SER
| tiene from TENER
también | also
me | me
hasta | until
hay | there is/are
donde | where
| han from HABER
quien | whom, that
| están from ESTAR
| estado from ESTAR
desde | from
todo | all
nos | us
durante | during
| estados from ESTAR
todos | all
uno | a
les | to them
ni | nor
contra | against
otros | other
| fueron from SER
ese | that
eso | that
| había from HABER
ante | before
ellos | they
e | and (variant of y)
esto | this
mí | me
antes | before
algunos | some
qué | what?
unos | a
yo | I
otro | other
otras | other
otra | other
él | he
tanto | so much, many
esa | that
estos | these
mucho | much, many
quienes | who
nada | nothing
muchos | many
cual | who
| sea from SER
poco | few
ella | she
estar | to be
| haber from HABER
estas | these
| estaba from ESTAR
| estamos from ESTAR
algunas | some
algo | something
nosotros | we
| other forms
mi | me
mis | mi plural
tú | thou
te | thee
ti | thee
tu | thy
tus | tu plural
ellas | they
nosotras | we
vosotros | you
vosotras | you
os | you
mío | mine
mía |
míos |
mías |
tuyo | thine
tuya |
tuyos |
tuyas |
suyo | his, hers, theirs
suya |
suyos |
suyas |
nuestro | ours
nuestra |
nuestros |
nuestras |
vuestro | yours
vuestra |
vuestros |
vuestras |
esos | those
esas | those
| forms of estar, to be (not including the infinitive):
estoy
estás
está
estamos
estáis
están
esté
estés
estemos
estéis
estén
estaré
estarás
estará
estaremos
estaréis
estarán
estaría
estarías
estaríamos
estaríais
estarían
estaba
estabas
estábamos
estabais
estaban
estuve
estuviste
estuvo
estuvimos
estuvisteis
estuvieron
estuviera
estuvieras
estuviéramos
estuvierais
estuvieran
estuviese
estuvieses
estuviésemos
estuvieseis
estuviesen
estando
estado
estada
estados
estadas
estad
| forms of haber, to have (not including the infinitive):
he
has
ha
hemos
habéis
han
haya
hayas
hayamos
hayáis
hayan
habré
habrás
habrá
habremos
habréis
habrán
habría
habrías
habríamos
habríais
habrían
había
habías
habíamos
habíais
habían
hube
hubiste
hubo
hubimos
hubisteis
hubieron
hubiera
hubieras
hubiéramos
hubierais
hubieran
hubiese
hubieses
hubiésemos
hubieseis
hubiesen
habiendo
habido
habida
habidos
habidas
| forms of ser, to be (not including the infinitive):
soy
eres
es
somos
sois
son
sea
seas
seamos
seáis
sean
seré
serás
será
seremos
seréis
serán
sería
serías
seríamos
seríais
serían
era
eras
éramos
erais
eran
fui
fuiste
fue
fuimos
fuisteis
fueron
fuera
fueras
fuéramos
fuerais
fueran
fuese
fueses
fuésemos
fueseis
fuesen
siendo
sido
| sed also means 'thirst'
| forms of tener, to have (not including the infinitive):
tengo
tienes
tiene
tenemos
tenéis
tienen
tenga
tengas
tengamos
tengáis
tengan
tendré
tendrás
tendrá
tendremos
tendréis
tendrán
tendría
tendrías
tendríamos
tendríais
tendrían
tenía
tenías
teníamos
teníais
tenían
tuve
tuviste
tuvo
tuvimos
tuvisteis
tuvieron
tuviera
tuvieras
tuviéramos
tuvierais
tuvieran
tuviese
tuvieses
tuviésemos
tuvieseis
tuviesen
teniendo
tenido
tenida
tenidos
tenidas
tened
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SpanishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -7,8 +7,6 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package fa
import (
@ -18,7 +16,7 @@ import (
"github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner"
"github.com/blevesearch/bleve/analysis/language/ar"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
)
const AnalyzerName = "fa"
@ -28,7 +26,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
if err != nil {
return nil, err
}
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
@ -52,7 +50,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
CharFilters: []analysis.CharFilter{
zFilter,
},
Tokenizer: icuTokenizer,
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normArFilter,

View File

@ -7,8 +7,6 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package fa
import (

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package fi
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "fi"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopFiFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFiFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopFiFilter,
stemmerFiFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,68 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package fi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestFinishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("edeltäjiinsä"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("edeltäj"),
},
},
},
{
input: []byte("edeltäjistään"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("edeltäj"),
},
},
},
// stop word
{
input: []byte("olla"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package fi
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_fi"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("fi")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package fi
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,121 +0,0 @@
package fi
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_fi"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| forms of BE
olla
olen
olet
on
olemme
olette
ovat
ole | negative form
oli
olisi
olisit
olisin
olisimme
olisitte
olisivat
olit
olin
olimme
olitte
olivat
ollut
olleet
en | negation
et
ei
emme
ette
eivät
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
mitkä | (pl)
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
| conjunctions
että | that
ja | and
jos | if
koska | because
kuin | than
mutta | but
niin | so
sekä | and
sillä | for
tai | or
vaan | but
vai | or
vaikka | although
| prepositions
kanssa | with
mukaan | according to
noin | about
poikki | across
yli | over, across
| other
kun | when
niin | so
nyt | now
itse | self
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FinnishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package fr
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_fr"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("fr")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package hu
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "hu"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopHuFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerHuFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopHuFilter,
stemmerHuFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,68 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package hu
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestHungarianAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("babakocsi"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("babakocs"),
},
},
},
{
input: []byte("babakocsijáért"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("babakocs"),
},
},
},
// stop word
{
input: []byte("által"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package hu
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_hu"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("hu")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package hu
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,235 +0,0 @@
package hu
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_hu"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var HungarianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| Hungarian stop word list
| prepared by Anna Tordai
a
ahogy
ahol
aki
akik
akkor
alatt
által
általában
amely
amelyek
amelyekben
amelyeket
amelyet
amelynek
ami
amit
amolyan
amíg
amikor
át
abban
ahhoz
annak
arra
arról
az
azok
azon
azt
azzal
azért
aztán
azután
azonban
bár
be
belül
benne
cikk
cikkek
cikkeket
csak
de
e
eddig
egész
egy
egyes
egyetlen
egyéb
egyik
egyre
ekkor
el
elég
ellen
elő
először
előtt
első
én
éppen
ebben
ehhez
emilyen
ennek
erre
ez
ezt
ezek
ezen
ezzel
ezért
és
fel
felé
hanem
hiszen
hogy
hogyan
igen
így
illetve
ill.
ill
ilyen
ilyenkor
ison
ismét
itt
jól
jobban
kell
kellett
keresztül
keressünk
ki
kívül
között
közül
legalább
lehet
lehetett
legyen
lenne
lenni
lesz
lett
maga
magát
majd
majd
már
más
másik
meg
még
mellett
mert
mely
melyek
mi
mit
míg
miért
milyen
mikor
minden
mindent
mindenki
mindig
mint
mintha
mivel
most
nagy
nagyobb
nagyon
ne
néha
nekem
neki
nem
néhány
nélkül
nincs
olyan
ott
össze
ő
ők
őket
pedig
persze
s
saját
sem
semmi
sok
sokat
sokkal
számára
szemben
szerint
szinte
talán
tehát
teljes
tovább
továbbá
több
úgy
ugyanis
új
újabb
újra
után
utána
utolsó
vagy
vagyis
valaki
valami
valamint
való
vagyok
van
vannak
volt
voltam
voltak
voltunk
vissza
vele
viszont
volna
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(HungarianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package it
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_it"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("it")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -7,8 +7,6 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build kagome full
package ja
import (

View File

@ -7,8 +7,6 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build kagome full
package ja
import (

View File

@ -7,8 +7,6 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build kagome full
package ja
import (

View File

@ -7,8 +7,6 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build kagome full
package ja
import (

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package nl
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "nl"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopNlFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerNlFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopNlFilter,
stemmerNlFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,68 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package nl
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestDutchAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("lichamelijk"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("licham"),
},
},
},
{
input: []byte("lichamelijke"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("licham"),
},
},
},
// stop word
{
input: []byte("van"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package nl
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_nl"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("nl")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package nl
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,143 +0,0 @@
package nl
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_nl"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var DutchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Dutch stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large sample of Dutch text.
| Dutch stop words frequently exhibit homonym clashes. These are indicated
| clearly below.
de | the
en | and
van | of, from
ik | I, the ego
te | (1) chez, at etc, (2) to, (3) too
dat | that, which
die | that, those, who, which
in | in, inside
een | a, an, one
hij | he
het | the, it
niet | not, nothing, naught
zijn | (1) to be, being, (2) his, one's, its
is | is
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river
op | on, upon, at, in, up, used up
aan | on, upon, to (as dative)
met | with, by
als | like, such as, when
voor | (1) before, in front of, (2) furrow
had | had, past tense all persons sing. of 'hebben' (have)
er | there
maar | but, only
om | round, about, for etc
hem | him
dan | then
zou | should/would, past tense all persons sing. of 'zullen'
of | or, whether, if
wat | what, something, anything
mijn | possessive and noun 'mine'
men | people, 'one'
dit | this
zo | so, thus, in this way
door | through by
over | over, across
ze | she, her, they, them
zich | oneself
bij | (1) a bee, (2) by, near, at
ook | also, too
tot | till, until
je | you
mij | me
uit | out of, from
der | Old Dutch form of 'van der' still found in surnames
daar | (1) there, (2) because
haar | (1) her, their, them, (2) hair
naar | (1) unpleasant, unwell etc, (2) towards, (3) as
heb | present first person sing. of 'to have'
hoe | how, why
heeft | present third person sing. of 'to have'
hebben | 'to have' and various parts thereof
deze | this
u | you
want | (1) for, (2) mitten, (3) rigging
nog | yet, still
zal | 'shall', first and third person sing. of verb 'zullen' (will)
me | me
zij | she, they
nu | now
ge | 'thou', still used in Belgium and south Netherlands
geen | none
omdat | because
iets | something, somewhat
worden | to become, grow, get
toch | yet, still
al | all, every, each
waren | (1) 'were' (2) to wander, (3) wares, (3)
veel | much, many
meer | (1) more, (2) lake
doen | to do, to make
toen | then, when
moet | noun 'spot/mote' and present form of 'to must'
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be'
zonder | without
kan | noun 'can' and present form of 'to be able'
hun | their, them
dus | so, consequently
alles | all, everything, anything
onder | under, beneath
ja | yes, of course
eens | once, one day
hier | here
wie | who
werd | imperfect third person sing. of 'become'
altijd | always
doch | yet, but etc
wordt | present third person sing. of 'become'
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans
kunnen | to be able
ons | us/our
zelf | self
tegen | against, towards, at
na | after, near
reeds | already
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender
kon | could; past tense of 'to be able'
niets | nothing
uw | your
iemand | somebody
geweest | been; past participle of 'be'
andere | other
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(DutchStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package no
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "no"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopNoFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerNoFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopNoFilter,
stemmerNoFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,68 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package no
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestNorwegianAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("havnedistriktene"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("havnedistrikt"),
},
},
},
{
input: []byte("havnedistrikter"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("havnedistrikt"),
},
},
},
// stop word
{
input: []byte("det"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package no
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_no"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("no")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package no
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,218 +0,0 @@
package no
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_no"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var NorwegianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This stop word list is for the dominant bokmål dialect. Words unique
| to nynorsk are marked *.
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005
og | and
i | in
jeg | I
det | it/this/that
at | to (w. inf.)
en | a/an
et | a/an
den | it/this/that
til | to
er | is/am/are
som | who/that
på | on
de | they / you(formal)
med | with
han | he
av | of
ikke | not
ikkje | not *
der | there
så | so
var | was/were
meg | me
seg | you
men | but
ett | one
har | have
om | about
vi | we
min | my
mitt | my
ha | have
hadde | had
hun | she
nå | now
over | over
da | when/as
ved | by/know
fra | from
du | you
ut | out
sin | your
dem | them
oss | us
opp | up
man | you/one
kan | can
hans | his
hvor | where
eller | or
hva | what
skal | shall/must
selv | self (reflective)
sjøl | self (reflective)
her | here
alle | all
vil | will
bli | become
ble | became
blei | became *
blitt | have become
kunne | could
inn | in
når | when
være | be
kom | come
noen | some
noe | some
ville | would
dere | you
som | who/which/that
deres | their/theirs
kun | only/just
ja | yes
etter | after
ned | down
skulle | should
denne | this
for | for/because
deg | you
si | hers/his
sine | hers/his
sitt | hers/his
mot | against
å | to
meget | much
hvorfor | why
dette | this
disse | these/those
uten | without
hvordan | how
ingen | none
din | your
ditt | your
blir | become
samme | same
hvilken | which
hvilke | which (plural)
sånn | such a
inni | inside/within
mellom | between
vår | our
hver | each
hvem | who
vors | us/ours
hvis | whose
både | both
bare | only/just
enn | than
fordi | as/because
før | before
mange | many
også | also
slik | just
vært | been
være | to be
båe | both *
begge | both
siden | since
dykk | your *
dykkar | yours *
dei | they *
deira | them *
deires | theirs *
deim | them *
di | your (fem.) *
då | as/when *
eg | I *
ein | a/an *
eit | a/an *
eitt | a/an *
elles | or *
honom | he *
hjå | at *
ho | she *
hoe | she *
henne | her
hennar | her/hers
hennes | hers
hoss | how *
hossen | how *
ikkje | not *
ingi | noone *
inkje | noone *
korleis | how *
korso | how *
kva | what/which *
kvar | where *
kvarhelst | where *
kven | who/whom *
kvi | why *
kvifor | why *
me | we *
medan | while *
mi | my *
mine | my *
mykje | much *
no | now *
nokon | some (masc./neut.) *
noka | some (fem.) *
nokor | some *
noko | some *
nokre | some *
si | his/hers *
sia | since *
sidan | since *
so | so *
somt | some *
somme | some *
um | about*
upp | up *
vere | be *
vore | was *
verte | become *
vort | become *
varte | became *
vart | became *
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(NorwegianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package porter
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_porter_classic"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("porter")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}
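// This registers the classic Porter algorithm as a standalone token filter
// named "stemmer_porter_classic"; unlike the per-language stemmers above, no
// analyzer in this excerpt consumes it directly, presumably so it can be wired
// into custom analyzers by name. A minimal lookup sketch (assuming the
// libstemmer or full build tag is set):
//
//	cache := registry.NewCache()
//	porter, err := cache.TokenFilterNamed("stemmer_porter_classic")
//	if err != nil {
//		// libstemmer support not compiled in
//	}
//	_ = porter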

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package pt
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_pt"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("pt")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package ro
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "ro"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopRoFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerRoFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopRoFilter,
stemmerRoFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,68 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package ro
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestRomanianAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("absenţa"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("absenţ"),
},
},
},
{
input: []byte("absenţi"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("absenţ"),
},
},
},
// stop word
{
input: []byte("îl"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package ro
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_ro"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("ro")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ro
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,257 +0,0 @@
package ro
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_ro"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var RomanianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
acea
aceasta
această
aceea
acei
aceia
acel
acela
acele
acelea
acest
acesta
aceste
acestea
aceşti
aceştia
acolo
acum
ai
aia
aibă
aici
al
ăla
ale
alea
ălea
altceva
altcineva
am
ar
are
aşadar
asemenea
asta
ăsta
astăzi
astea
ăstea
ăştia
asupra
aţi
au
avea
avem
aveţi
azi
bine
bucur
bună
ca
căci
când
care
cărei
căror
cărui
cât
câte
câţi
către
câtva
ce
cel
ceva
chiar
cînd
cine
cineva
cît
cîte
cîţi
cîtva
contra
cu
cum
cumva
curând
curînd
da
dacă
dar
datorită
de
deci
deja
deoarece
departe
deşi
din
dinaintea
dintr
dintre
drept
după
ea
ei
el
ele
eram
este
eşti
eu
face
fără
fi
fie
fiecare
fii
fim
fiţi
iar
ieri
îi
îl
îmi
împotriva
în
înainte
înaintea
încât
încît
încotro
între
întrucât
întrucît
îţi
la
lângă
le
li
lîngă
lor
lui
mâine
mea
mei
mele
mereu
meu
mi
mine
mult
multă
mulţi
ne
nicăieri
nici
nimeni
nişte
noastră
noastre
noi
noştri
nostru
nu
ori
oricând
oricare
oricât
orice
oricînd
oricine
oricît
oricum
oriunde
până
pe
pentru
peste
pînă
poate
pot
prea
prima
primul
prin
printr
sa
săi
sale
sau
său
se
şi
sînt
sîntem
sînteţi
spre
sub
sunt
suntem
sunteţi
ta
tăi
tale
tău
te
ţi
ţie
tine
toată
toate
tot
toţi
totuşi
tu
un
una
unde
undeva
unei
unele
uneori
unor
vi
voastră
voastre
voi
voştri
vostru
vouă
vreo
vreun
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(RomanianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "ru"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopRuFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerRuFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopRuFilter,
stemmerRuFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,98 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package ru
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestRussianAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// digits safe
{
input: []byte("text 1000"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("text"),
},
&analysis.Token{
Term: []byte("1000"),
},
},
},
{
input: []byte("Вместе с тем о силе электромагнитной энергии имели представление еще"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("вмест"),
},
&analysis.Token{
Term: []byte("сил"),
},
&analysis.Token{
Term: []byte("электромагнитн"),
},
&analysis.Token{
Term: []byte("энерг"),
},
&analysis.Token{
Term: []byte("имел"),
},
&analysis.Token{
Term: []byte("представлен"),
},
},
},
{
input: []byte("Но знание это хранилось в тайне"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("знан"),
},
&analysis.Token{
Term: []byte("эт"),
},
&analysis.Token{
Term: []byte("хран"),
},
&analysis.Token{
Term: []byte("тайн"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_ru"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("ru")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,267 +0,0 @@
package ru
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_ru"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var RussianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.
| this is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| letter 'ё' is translated to 'е'.
и | and
в | in/into
во | alternative form
не | not
что | what/that
он | he
на | on/onto
я | i
с | from
со | alternative form
как | how
а | milder form of 'no' (but)
то | conjunction and form of 'that'
все | all
она | she
так | so, thus
его | him
но | but
да | yes/and
ты | thou
к | towards, by
у | around, chez
же | intensifier particle
вы | you
за | beyond, behind
бы | conditional/subj. particle
по | up to, along
только | only
ее | her
мне | to me
было | it was
вот | here is/are, particle
от | away from
меня | me
еще | still, yet, more
нет | no, there isnt/arent
о | about
из | out of
ему | to him
теперь | now
когда | when
даже | even
ну | so, well
вдруг | suddenly
ли | interrogative particle
если | if
уже | already, but homonym of 'narrower'
или | or
ни | neither
быть | to be
был | he was
него | prepositional form of его
до | up to
вас | you accusative
нибудь | indef. suffix preceded by hyphen
опять | again
уж | already, but homonym of 'adder'
вам | to you
сказал | he said
ведь | particle 'after all'
там | there
потом | then
себя | oneself
ничего | nothing
ей | to her
может | usually with 'быть' as 'maybe'
они | they
тут | here
где | where
есть | there is/are
надо | got to, must
ней | prepositional form of ей
для | for
мы | we
тебя | thee
их | them, their
чем | than
была | she was
сам | self
чтоб | in order to
без | without
будто | as if
человек | man, person, one
чего | genitive form of 'what'
раз | once
тоже | also
себе | to oneself
под | beneath
жизнь | life
будет | will be
ж | short form of intensifer particle 'же'
тогда | then
кто | who
этот | this
говорил | was saying
того | genitive form of 'that'
потому | for that reason
этого | genitive form of 'this'
какой | which
совсем | altogether
ним | prepositional form of 'его', 'они'
здесь | here
этом | prepositional form of 'этот'
один | one
почти | almost
мой | my
тем | instrumental/dative plural of 'тот', 'то'
чтобы | full form of 'in order that'
нее | her (acc.)
кажется | it seems
сейчас | now
были | they were
куда | where to
зачем | why
сказать | to say
всех | all (acc., gen. preposn. plural)
никогда | never
сегодня | today
можно | possible, one can
при | by
наконец | finally
два | two
об | alternative form of 'о', about
другой | another
хоть | even
после | after
над | above
больше | more
тот | that one (masc.)
через | across, in
эти | these
нас | us
про | about
всего | in all, only, of all
них | prepositional form of 'они' (they)
какая | which, feminine
много | lots
разве | interrogative particle
сказала | she said
три | three
эту | this, acc. fem. sing.
моя | my, feminine
впрочем | moreover, besides
хорошо | good
свою | ones own, acc. fem. sing.
этой | oblique form of 'эта', fem. 'this'
перед | in front of
иногда | sometimes
лучше | better
чуть | a little
том | preposn. form of 'that one'
нельзя | one must not
такой | such a one
им | to them
более | more
всегда | always
конечно | of course
всю | acc. fem. sing of 'all'
между | between
| b: some paradigms
|
| personal pronouns
|
| я меня мне мной [мною]
| ты тебя тебе тобой [тобою]
| он его ему им [него, нему, ним]
| она ее эи ею [нее, нэи, нею]
| оно его ему им [него, нему, ним]
|
| мы нас нам нами
| вы вас вам вами
| они их им ими [них, ним, ними]
|
| себя себе собой [собою]
|
| demonstrative pronouns: этот (this), тот (that)
|
| этот эта это эти
| этого эты это эти
| этого этой этого этих
| этому этой этому этим
| этим этой этим [этою] этими
| этом этой этом этих
|
| тот та то те
| того ту то те
| того той того тех
| тому той тому тем
| тем той тем [тою] теми
| том той том тех
|
| determinative pronouns
|
| (a) весь (all)
|
| весь вся все все
| всего всю все все
| всего всей всего всех
| всему всей всему всем
| всем всей всем [всею] всеми
| всем всей всем всех
|
| (b) сам (himself etc)
|
| сам сама само сами
| самого саму само самих
| самого самой самого самих
| самому самой самому самим
| самим самой самим [самою] самими
| самом самой самом самих
|
| stems of verbs 'to be', 'to have', 'to do' and modal
|
| быть бы буд быв есть суть
| име
| дел
| мог мож мочь
| уме
| хоч хот
| долж
| можн
| нужн
| нельзя
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(RussianStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,55 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package sv
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "sv"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopSvFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerSvFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopSvFilter,
stemmerSvFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,68 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package sv
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestSwedishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("jaktkarlarne"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("jaktkarl"),
},
},
},
{
input: []byte("jaktkarlens"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("jaktkarl"),
},
},
},
// stop word
{
input: []byte("och"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package sv
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_sv"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("sv")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package sv
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,157 +0,0 @@
package sv
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_sv"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var SwedishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Swedish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| Swedish stop words occasionally exhibit homonym clashes. For example
| så = so, but also seed. These are indicated clearly below.
och | and
det | it, this/that
att | to (with infinitive)
i | in, at
en | a
jag | I
hon | she
som | who, that
han | he
på | on
den | it, this/that
med | with
var | where, each
sig | him(self) etc
för | for
så | so (also: seed)
till | to
är | is
men | but
ett | a
om | if; around, about
hade | had
de | they, these/those
av | of
icke | not, no
mig | me
du | you
henne | her
då | then, when
sin | his
nu | now
har | have
inte | inte någon = no one
hans | his
honom | him
skulle | 'sake'
hennes | her
där | there
min | my
man | one (pronoun)
ej | nor
vid | at, by, on (also: vast)
kunde | could
något | some etc
från | from, off
ut | out
när | when
efter | after, behind
upp | up
vi | we
dem | them
vara | be
vad | what
över | over
än | than
dig | you
kan | can
sina | his
här | here
ha | have
mot | towards
alla | all
under | under (also: wonder)
någon | some etc
eller | or (else)
allt | all
mycket | much
sedan | since
ju | why
denna | this/that
själv | myself, yourself etc
detta | this/that
åt | to
utan | without
varit | was
hur | how
ingen | no
mitt | my
ni | you
bli | to be, become
blev | from bli
oss | us
din | thy
dessa | these/those
några | some etc
deras | their
blir | from bli
mina | my
samma | (the) same
vilken | who, that
er | you, your
sådan | such a
vår | our
blivit | from bli
dess | its
inom | within
mellan | between
sådant | such a
varför | why
varje | each
vilka | who, that
ditt | thy
vem | who
vilket | who, that
sitta | his
sådana | such a
vart | each
dina | thy
vars | whose
vårt | our
våra | our
ert | your
era | your
vilkas | whose
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SwedishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,48 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package th
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
)
const AnalyzerName = "th"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(TokenizerName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopThFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopThFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,119 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package th
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// tried to adapt these from the lucene tests, most of which either
// use the empty stop dictionary or the english one.
func TestThaiAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stop words
{
input: []byte("การที่ได้ต้องแสดงว่างานดี"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("แสดง"),
Position: 5,
Start: 39,
End: 51,
},
&analysis.Token{
Term: []byte("งาน"),
Position: 7,
Start: 60,
End: 69,
},
&analysis.Token{
Term: []byte("ดี"),
Position: 8,
Start: 69,
End: 75,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}
func TestThaiAnalyzerWithoutOffsets(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stop words
{
input: []byte("บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("บริษัท"),
},
&analysis.Token{
Term: []byte("ชื่อ"),
},
&analysis.Token{
Term: []byte("xy"),
},
&analysis.Token{
Term: []byte("z"),
},
&analysis.Token{
Term: []byte("คุย"),
},
&analysis.Token{
Term: []byte("xyz"),
},
&analysis.Token{
Term: []byte("demo.com"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Errorf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package th
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,143 +0,0 @@
package th
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_th"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var ThaiStopWords = []byte(`# Thai stopwords from:
# "Opinion Detection in Thai Political News Columns
# Based on Subjectivity Analysis"
# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak
ไว้
ไม่
ไป
ได้
ให้
ใน
โดย
แห่ง
แล้ว
และ
แรก
แบบ
แต่
เอง
เห็น
เลย
เริ่ม
เรา
เมื่อ
เพื่อ
เพราะ
เป็นการ
เป็น
เปิดเผย
เปิด
เนื่องจาก
เดียวกัน
เดียว
เช่น
เฉพาะ
เคย
เข้า
เขา
อีก
อาจ
อะไร
ออก
อย่าง
อยู่
อยาก
หาก
หลาย
หลังจาก
หลัง
หรือ
หนึ่ง
ส่วน
ส่ง
สุด
สําหรับ
ว่า
วัน
ลง
ร่วม
ราย
รับ
ระหว่าง
รวม
ยัง
มี
มาก
มา
พร้อม
พบ
ผ่าน
ผล
บาง
น่า
นี้
นํา
นั้น
นัก
นอกจาก
ทุก
ที่สุด
ที่
ทําให้
ทํา
ทาง
ทั้งนี้
ทั้ง
ถ้า
ถูก
ถึง
ต้อง
ต่างๆ
ต่าง
ต่อ
ตาม
ตั้งแต่
ตั้ง
ด้าน
ด้วย
ดัง
ซึ่ง
ช่วง
จึง
จาก
จัด
จะ
คือ
ความ
ครั้ง
คง
ขึ้น
ของ
ขอ
ขณะ
ก่อน
ก็
การ
กับ
กัน
กว่า
กล่าว
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ThaiStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package th
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
"github.com/blevesearch/bleve/registry"
)
const TokenizerName = "icu_th"
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
return icu.NewUnicodeWordBoundaryCustomLocaleTokenizer("th_TH"), nil
}
func init() {
registry.RegisterTokenizer(TokenizerName, TokenizerConstructor)
}

View File

@ -1,61 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package tr
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/apostrophe_filter"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/tokenizers/icu"
)
const AnalyzerName = "tr"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
icuTokenizer, err := cache.TokenizerNamed(icu.Name)
if err != nil {
return nil, err
}
aposFilter, err := cache.TokenFilterNamed(apostrophe_filter.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopTrFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerTrFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: icuTokenizer,
TokenFilters: []analysis.TokenFilter{
aposFilter,
toLowerFilter,
stopTrFilter,
stemmerTrFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -1,88 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
// +build icu full
package tr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestTurkishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("ağacı"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ağaç"),
},
},
},
{
input: []byte("ağaç"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ağaç"),
},
},
},
// stop word
{
input: []byte("dolayı"),
output: analysis.TokenStream{},
},
// apostrophes
{
input: []byte("Kıbrıs'ta"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("kıbrıs"),
},
},
},
{
input: []byte("Van Gölü'ne"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("van"),
},
&analysis.Token{
Term: []byte("göl"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package tr
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
"github.com/blevesearch/bleve/registry"
)
const StemmerName = "stemmer_tr"
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return stemmer_filter.NewStemmerFilter("tr")
}
func init() {
registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
}

View File

@ -1,28 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package tr
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
"github.com/blevesearch/bleve/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
}
func init() {
registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
}

View File

@ -1,236 +0,0 @@
package tr
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const StopName = "stop_tr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var TurkishStopWords = []byte(`# Turkish stopwords from LUCENE-559
# merged with the list from "Information Retrieval on Turkish Texts"
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
acaba
altmış
altı
ama
ancak
arada
aslında
ayrıca
bana
bazı
belki
ben
benden
beni
benim
beri
beş
bile
bin
bir
birçok
biri
birkaç
birkez
birşey
birşeyi
biz
bize
bizden
bizi
bizim
böyle
böylece
bu
buna
bunda
bundan
bunlar
bunları
bunların
bunu
bunun
burada
çok
çünkü
da
daha
dahi
de
defa
değil
diğer
diye
doksan
dokuz
dolayı
dolayısıyla
dört
edecek
eden
ederek
edilecek
ediliyor
edilmesi
ediyor
eğer
elli
en
etmesi
etti
ettiği
ettiğini
gibi
göre
halen
hangi
hatta
hem
henüz
hep
hepsi
her
herhangi
herkesin
hiç
hiçbir
için
iki
ile
ilgili
ise
işte
itibaren
itibariyle
kadar
karşın
katrilyon
kendi
kendilerine
kendini
kendisi
kendisine
kendisini
kez
ki
kim
kimden
kime
kimi
kimse
kırk
milyar
milyon
mu
mı
nasıl
ne
neden
nedenle
nerde
nerede
nereye
niye
niçin
o
olan
olarak
oldu
olduğu
olduğunu
olduklarını
olmadı
olmadığı
olmak
olması
olmayan
olmaz
olsa
olsun
olup
olur
olursa
oluyor
on
ona
ondan
onlar
onlardan
onları
onların
onu
onun
otuz
oysa
öyle
pek
rağmen
sadece
sanki
sekiz
seksen
sen
senden
seni
senin
siz
sizden
sizi
sizin
şey
şeyden
şeyi
şeyler
şöyle
şu
şuna
şunda
şundan
şunları
şunu
tarafından
trilyon
tüm
üç
üzere
var
vardı
ve
veya
ya
yani
yapacak
yapılan
yapılması
yapıyor
yapmak
yaptı
yaptığı
yaptığını
yaptıkları
yedi
yerine
yetmiş
yine
yirmi
yoksa
yüz
zaten
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(TurkishStopWords)
return rv, err
}
func init() {
registry.RegisterTokenMap(StopName, TokenMapConstructor)
}

View File

@ -1,33 +0,0 @@
# cld2 token filter
A bleve token filter which takes the text of each token and passes it to the cld2 library. The library determines which language it thinks the text most likely is, and the ISO-639 code for that language replaces the token term.
In normal usage, you use this with the "single" tokenizer, so there is only one input token. Further, you should precede it with the "to_lower" filter so that the input term is in all lower-case unicode characters.
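For concreteness, here is a minimal sketch of driving the filter directly, using the pre-refactor package paths shown in this diff. It only builds with the `cld2` or `full` build tag and with the library installed as described below; the sample text and the lower-casing step are illustrative, not part of the filter itself.
```
package main

import (
	"fmt"
	"strings"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/analysis/token_filters/cld2"
)

func main() {
	// emulate the "single" tokenizer: the entire input is one token,
	// lower-cased first as recommended above
	text := strings.ToLower("The quick brown fox")
	input := analysis.TokenStream{
		&analysis.Token{
			Term:     []byte(text),
			Start:    0,
			End:      len(text),
			Position: 1,
			Type:     analysis.AlphaNumeric,
		},
	}

	filter := cld2.NewCld2Filter()
	for _, tok := range filter.Filter(input) {
		// the term has been replaced by the detected ISO-639 code, e.g. "en"
		fmt.Println(string(tok.Term))
	}
}
```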
# Building
1. Acquire the source to cld2 in this directory.
$ svn checkout -r 167 http://cld2.googlecode.com/svn/trunk/ cld2-read-only
2. Build cld2
As dynamic library
$ cd cld2-read-only/internal/
$ ./compile_libs.sh
$ cp *.so /usr/local/lib
$ cd ../..
Or static library
$ ./compile_cld2.sh
$ cp *.a /usr/local/lib
3. Run the unit tests
$ go test -v
=== RUN TestCld2Filter
--- PASS: TestCld2Filter (0.00 seconds)
PASS
ok github.com/couchbaselabs/bleve/analysis/token_filters/cld2 0.033s

View File

@ -1,44 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
#include <cstddef>
#include <string.h>
#include <stdio.h>
#include <string>
#include "cld2_filter.h"
#include "cld2-read-only/public/compact_lang_det.h"
const char* DetectLang(const char *buffer) {
bool is_plain_text = true;
CLD2::CLDHints cldhints = {NULL, NULL, 0, CLD2::UNKNOWN_LANGUAGE};
bool allow_extended_lang = true;
int flags = 0;
CLD2::Language language3[3];
int percent3[3];
double normalized_score3[3];
CLD2::ResultChunkVector resultchunkvector;
int text_bytes;
bool is_reliable;
CLD2::Language summary_lang = CLD2::UNKNOWN_LANGUAGE;
summary_lang = CLD2::ExtDetectLanguageSummary(buffer,
strlen(buffer),
is_plain_text,
&cldhints,
flags,
language3,
percent3,
normalized_score3,
&resultchunkvector,
&text_bytes,
&is_reliable);
return CLD2::LanguageCode(summary_lang);
}

View File

@ -1,67 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build cld2 full
package cld2
// #cgo LDFLAGS: -lcld2_full
// #include "cld2_filter.h"
// #include <string.h>
// #include <stdlib.h>
import "C"
import (
"unsafe"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const Name = "detect_lang"
type Cld2Filter struct {
}
func NewCld2Filter() *Cld2Filter {
return &Cld2Filter{}
}
func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
offset := 0
for _, token := range input {
var err error
token.Term, err = f.detectLanguage(token.Term)
if err != nil {
token.Term = []byte("error")
}
token.Start = offset
token.End = token.Start + len(token.Term)
token.Type = analysis.AlphaNumeric
rv = append(rv, token)
offset = token.End + 1
}
return rv
}
func (f *Cld2Filter) detectLanguage(input []byte) ([]byte, error) {
cstr := C.CString(string(input))
defer C.free(unsafe.Pointer(cstr))
res := C.DetectLang(cstr)
return C.GoBytes(unsafe.Pointer(res), C.int(C.strlen(res))), nil
}
func Cld2FilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewCld2Filter(), nil
}
func init() {
registry.RegisterTokenFilter(Name, Cld2FilterConstructor)
}

View File

@ -1,123 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build cld2 full
package cld2
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestCld2Filter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick brown fox"),
Start: 0,
End: 19,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("en"),
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こんにちは世界"),
Start: 0,
End: 21,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ja"),
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
Start: 0,
End: 72,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("th"),
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("مرحبا، العالم!"),
Start: 0,
End: 26,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ar"),
Start: 0,
End: 2,
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
}
filter := NewCld2Filter()
for _, test := range tests {
res := filter.Filter(test.input)
if !reflect.DeepEqual(res, test.output) {
t.Errorf("expected:")
for _, token := range test.output {
t.Errorf("%#v - %s", token, token.Term)
}
t.Errorf("got:")
for _, token := range res {
t.Errorf("%#v - %s", token, token.Term)
}
}
}
}

View File

@ -1,10 +0,0 @@
#!/bin/bash
SRC="cldutil cldutil_shared compact_lang_det compact_lang_det_hint_code compact_lang_det_impl debug fixunicodevalue generated_entities generated_language generated_ulscript getonescriptspan lang_script offsetmap scoreonescriptspan tote utf8statetable cld_generated_cjk_uni_prop_80 cld2_generated_cjk_compatible cld_generated_cjk_delta_bi_32 generated_distinct_bi_0 cld2_generated_quad0122 cld2_generated_deltaocta0122 cld2_generated_distinctocta0122 cld_generated_score_quad_octa_0122";
OBJ="";
for f in ${SRC}; do
g++ -c -fPIC -O2 -m64 -o "cld2-read-only/internal/${f}.o" "cld2-read-only/internal/${f}.cc";
OBJ="${OBJ} cld2-read-only/internal/${f}.o";
done;
ar rcs libcld2_full.a ${OBJ};

View File

@ -1,18 +0,0 @@
## Languages supported
"danish",
"dutch",
"english",
"finnish",
"french",
"german",
"hungarian",
"italian",
"norwegian",
"porter",
"portuguese",
"romanian",
"russian",
"spanish",
"swedish",
"turkish"

View File

@ -1,80 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package stemmer_filter
import (
"fmt"
"bitbucket.org/tebeka/snowball"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const Name = "stem"
type StemmerFilter struct {
lang string
stemmerPool chan *snowball.Stemmer
}
func NewStemmerFilter(lang string) (*StemmerFilter, error) {
stemmerPool := make(chan *snowball.Stemmer, 4)
for i := 0; i < 4; i++ {
stemmer, err := snowball.New(lang)
if err != nil {
return nil, err
}
stemmerPool <- stemmer
}
return &StemmerFilter{
lang: lang,
stemmerPool: stemmerPool,
}, nil
}
func MustNewStemmerFilter(lang string) *StemmerFilter {
sf, err := NewStemmerFilter(lang)
if err != nil {
panic(err)
}
return sf
}
func (s *StemmerFilter) List() []string {
return snowball.LangList()
}
func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// if it is not a protected keyword, stem it
if !token.KeyWord {
stemmer := <-s.stemmerPool
stemmed := stemmer.Stem(string(token.Term))
s.stemmerPool <- stemmer
token.Term = []byte(stemmed)
}
}
return input
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
langVal, ok := config["lang"].(string)
if !ok {
return nil, fmt.Errorf("must specify stemmer language")
}
lang := langVal
return NewStemmerFilter(lang)
}
func init() {
registry.RegisterTokenFilter(Name, StemmerFilterConstructor)
}

View File

@ -1,63 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build libstemmer full
package stemmer_filter
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestStemmerFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
}
filter, err := NewStemmerFilter("en")
if err != nil {
t.Fatal(err)
}
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream[3], outputTokenStream[3])
}
}

View File

@ -1,138 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package icu
// #cgo LDFLAGS: -licuuc -licudata
// #include <stdio.h>
// #include <stdlib.h>
// #include "unicode/utypes.h"
// #include "unicode/uchar.h"
// #include "unicode/ubrk.h"
// #include "unicode/ustring.h"
import "C"
import (
"unsafe"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
const Name = "icu"
type UnicodeWordBoundaryTokenizer struct {
locale *C.char
}
func NewUnicodeWordBoundaryTokenizer() *UnicodeWordBoundaryTokenizer {
return &UnicodeWordBoundaryTokenizer{}
}
func NewUnicodeWordBoundaryCustomLocaleTokenizer(locale string) *UnicodeWordBoundaryTokenizer {
return &UnicodeWordBoundaryTokenizer{
locale: C.CString(locale),
}
}
func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream {
rv := make(analysis.TokenStream, 0)
if len(input) < 1 {
return rv
}
// works
var myUnsafePointer = unsafe.Pointer(&(input[0]))
var myCCharPointer *C.char = (*C.char)(myUnsafePointer)
var inlen C.int32_t = C.int32_t(len(input))
var buflen C.int32_t = C.int32_t(2*len(input) + 1) // worst case: each byte becomes 2
var stringToExamine []C.UChar = make([]C.UChar, buflen)
var myUnsafePointerToExamine = unsafe.Pointer(&(stringToExamine[0]))
var myUCharPointer *C.UChar = (*C.UChar)(myUnsafePointerToExamine)
C.u_uastrncpy(myUCharPointer, myCCharPointer, inlen)
var err C.UErrorCode = C.U_ZERO_ERROR
bi := C.ubrk_open(C.UBRK_WORD, t.locale, myUCharPointer, -1, &err)
if err > C.U_ZERO_ERROR {
return rv
}
defer C.ubrk_close(bi)
position := 0
var prev C.int32_t
p := C.ubrk_first(bi)
for p != C.UBRK_DONE {
q := C.ubrk_getRuleStatus(bi)
// convert boundaries back to utf8 positions
var nilCString *C.char
var indexA C.int32_t
C.u_strToUTF8(nilCString, 0, &indexA, myUCharPointer, prev, &err)
if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
return rv
} else {
err = C.U_ZERO_ERROR
}
var indexB C.int32_t
C.u_strToUTF8(nilCString, 0, &indexB, myUCharPointer, p, &err)
if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR {
return rv
} else {
err = C.U_ZERO_ERROR
}
if q != 0 {
position += 1
token := analysis.Token{
Start: int(indexA),
End: int(indexB),
Term: input[indexA:indexB],
Position: position,
Type: analysis.AlphaNumeric,
}
if q == 100 {
token.Type = analysis.Numeric
}
if q == 400 {
token.Type = analysis.Ideographic
}
rv = append(rv, &token)
}
prev = p
p = C.ubrk_next(bi)
}
return rv
}
func UnicodeWordBoundaryTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
locale := ""
localeVal, ok := config["locale"].(string)
if ok {
locale = localeVal
}
if locale == "" {
return NewUnicodeWordBoundaryTokenizer(), nil
} else {
return NewUnicodeWordBoundaryCustomLocaleTokenizer(locale), nil
}
}
func init() {
registry.RegisterTokenizer(Name, UnicodeWordBoundaryTokenizerConstructor)
}

View File

@ -1,191 +0,0 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build icu full
package icu
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
func TestBoundary(t *testing.T) {
tests := []struct {
input []byte
locale string
output analysis.TokenStream
}{
{
[]byte("Hello World"),
"en_US",
analysis.TokenStream{
{
Start: 0,
End: 5,
Term: []byte("Hello"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 6,
End: 11,
Term: []byte("World"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("steven's"),
"en_US",
analysis.TokenStream{
{
Start: 0,
End: 8,
Term: []byte("steven's"),
Position: 1,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("こんにちは世界"),
"en_US",
analysis.TokenStream{
{
Start: 0,
End: 15,
Term: []byte("こんにちは"),
Position: 1,
Type: analysis.Ideographic,
},
{
Start: 15,
End: 21,
Term: []byte("世界"),
Position: 2,
Type: analysis.Ideographic,
},
},
},
{
[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
"th_TH",
analysis.TokenStream{
{
Start: 0,
End: 9,
Term: []byte("แยก"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 9,
End: 15,
Term: []byte("คำ"),
Position: 2,
Type: analysis.AlphaNumeric,
},
{
Start: 15,
End: 27,
Term: []byte("ภาษา"),
Position: 3,
Type: analysis.AlphaNumeric,
},
{
Start: 27,
End: 36,
Term: []byte("ไทย"),
Position: 4,
Type: analysis.AlphaNumeric,
},
{
Start: 36,
End: 42,
Term: []byte("ก็"),
Position: 5,
Type: analysis.AlphaNumeric,
},
{
Start: 42,
End: 57,
Term: []byte("ทำได้"),
Position: 6,
Type: analysis.AlphaNumeric,
},
{
Start: 57,
End: 63,
Term: []byte("นะ"),
Position: 7,
Type: analysis.AlphaNumeric,
},
{
Start: 63,
End: 72,
Term: []byte("จ้ะ"),
Position: 8,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("age 25"),
"en_US",
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("age"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 4,
End: 6,
Term: []byte("25"),
Position: 2,
Type: analysis.Numeric,
},
},
},
}
for _, test := range tests {
tokenizer := NewUnicodeWordBoundaryCustomLocaleTokenizer(test.locale)
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
}
}
}
var sampleLargeInput = []byte(`There are three characteristics of liquids which are relevant to the discussion of a BLEVE:
If a liquid in a sealed container is boiled, the pressure inside the container increases. As the liquid changes to a gas it expands - this expansion in a vented container would cause the gas and liquid to take up more space. In a sealed container the gas and liquid are not able to take up more space and so the pressure rises. Pressurized vessels containing liquids can reach an equilibrium where the liquid stops boiling and the pressure stops rising. This occurs when no more heat is being added to the system (either because it has reached ambient temperature or has had a heat source removed).
The boiling temperature of a liquid is dependent on pressure - high pressures will yield high boiling temperatures, and low pressures will yield low boiling temperatures. A common simple experiment is to place a cup of water in a vacuum chamber, and then reduce the pressure in the chamber until the water boils. By reducing the pressure the water will boil even at room temperature. This works both ways - if the pressure is increased beyond normal atmospheric pressures, the boiling of hot water could be suppressed far beyond normal temperatures. The cooling system of a modern internal combustion engine is a real-world example.
When a liquid boils it turns into a gas. The resulting gas takes up far more space than the liquid did.
Typically, a BLEVE starts with a container of liquid which is held above its normal, atmospheric-pressure boiling temperature. Many substances normally stored as liquids, such as CO2, oxygen, and other similar industrial gases have boiling temperatures, at atmospheric pressure, far below room temperature. In the case of water, a BLEVE could occur if a pressurized chamber of water is heated far beyond the standard 100 °C (212 °F). That container, because the boiling water pressurizes it, is capable of holding liquid water at very high temperatures.
If the pressurized vessel, containing liquid at high temperature (which may be room temperature, depending on the substance) ruptures, the pressure which prevents the liquid from boiling is lost. If the rupture is catastrophic, where the vessel is immediately incapable of holding any pressure at all, then there suddenly exists a large mass of liquid which is at very high temperature and very low pressure. This causes the entire volume of liquid to instantaneously boil, which in turn causes an extremely rapid expansion. Depending on temperatures, pressures and the substance involved, that expansion may be so rapid that it can be classified as an explosion, fully capable of inflicting severe damage on its surroundings.`)
func BenchmarkTokenizeEnglishText(b *testing.B) {
tokenizer := NewUnicodeWordBoundaryCustomLocaleTokenizer("en_US")
b.ResetTimer()
for i := 0; i < b.N; i++ {
tokenizer.Tokenize(sampleLargeInput)
}
}

config.go
View File

@ -16,103 +16,10 @@ import (
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/boltdb"
"github.com/blevesearch/bleve/index/upside_down"
"github.com/blevesearch/bleve/registry"
// token maps
_ "github.com/blevesearch/bleve/analysis/token_map"
// fragment formatters
_ "github.com/blevesearch/bleve/search/highlight/fragment_formatters/ansi"
_ "github.com/blevesearch/bleve/search/highlight/fragment_formatters/html"
// fragmenters
_ "github.com/blevesearch/bleve/search/highlight/fragmenters/simple"
// highlighters
_ "github.com/blevesearch/bleve/search/highlight/highlighters/simple"
// char filters
_ "github.com/blevesearch/bleve/analysis/char_filters/html_char_filter"
_ "github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
_ "github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner"
// analyzers
_ "github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/simple_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
// token filters
_ "github.com/blevesearch/bleve/analysis/token_filters/apostrophe_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/compound"
_ "github.com/blevesearch/bleve/analysis/token_filters/edge_ngram_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/keyword_marker_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/length_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/ngram_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/shingle"
_ "github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/truncate_token_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
// tokenizers
_ "github.com/blevesearch/bleve/analysis/tokenizers/exception"
_ "github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
_ "github.com/blevesearch/bleve/analysis/tokenizers/single_token"
_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
_ "github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
// date time parsers
_ "github.com/blevesearch/bleve/analysis/datetime_parsers/datetime_optional"
_ "github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go"
// languages
_ "github.com/blevesearch/bleve/analysis/language/ar"
_ "github.com/blevesearch/bleve/analysis/language/bg"
_ "github.com/blevesearch/bleve/analysis/language/ca"
_ "github.com/blevesearch/bleve/analysis/language/cjk"
_ "github.com/blevesearch/bleve/analysis/language/ckb"
_ "github.com/blevesearch/bleve/analysis/language/cs"
_ "github.com/blevesearch/bleve/analysis/language/da"
_ "github.com/blevesearch/bleve/analysis/language/de"
_ "github.com/blevesearch/bleve/analysis/language/el"
_ "github.com/blevesearch/bleve/analysis/language/en"
_ "github.com/blevesearch/bleve/analysis/language/es"
_ "github.com/blevesearch/bleve/analysis/language/eu"
_ "github.com/blevesearch/bleve/analysis/language/fa"
_ "github.com/blevesearch/bleve/analysis/language/fi"
_ "github.com/blevesearch/bleve/analysis/language/fr"
_ "github.com/blevesearch/bleve/analysis/language/ga"
_ "github.com/blevesearch/bleve/analysis/language/gl"
_ "github.com/blevesearch/bleve/analysis/language/hi"
_ "github.com/blevesearch/bleve/analysis/language/hu"
_ "github.com/blevesearch/bleve/analysis/language/hy"
_ "github.com/blevesearch/bleve/analysis/language/id"
_ "github.com/blevesearch/bleve/analysis/language/in"
_ "github.com/blevesearch/bleve/analysis/language/it"
_ "github.com/blevesearch/bleve/analysis/language/nl"
_ "github.com/blevesearch/bleve/analysis/language/no"
_ "github.com/blevesearch/bleve/analysis/language/pt"
_ "github.com/blevesearch/bleve/analysis/language/ro"
_ "github.com/blevesearch/bleve/analysis/language/ru"
_ "github.com/blevesearch/bleve/analysis/language/sv"
_ "github.com/blevesearch/bleve/analysis/language/th"
_ "github.com/blevesearch/bleve/analysis/language/tr"
// kv stores
_ "github.com/blevesearch/bleve/index/store/boltdb"
_ "github.com/blevesearch/bleve/index/store/goleveldb"
_ "github.com/blevesearch/bleve/index/store/gtreap"
_ "github.com/blevesearch/bleve/index/store/inmem"
// index types
_ "github.com/blevesearch/bleve/index/upside_down"
// byte array converters
_ "github.com/blevesearch/bleve/analysis/byte_array_converters/ignore"
_ "github.com/blevesearch/bleve/analysis/byte_array_converters/json"
_ "github.com/blevesearch/bleve/analysis/byte_array_converters/string"
"github.com/blevesearch/bleve/search/highlight/highlighters/html"
)
var bleveExpVar = expvar.NewMap("bleve")
@ -146,44 +53,14 @@ func init() {
// build the default configuration
Config = newConfiguration()
_, err := Config.Cache.DefineFragmentFormatter("highlightSpanHTML",
map[string]interface{}{
"type": "html",
"before": `<mark>`,
"after": `</mark>`,
})
if err != nil {
panic(err)
}
_, err = Config.Cache.DefineHighlighter("html",
map[string]interface{}{
"type": "simple",
"fragmenter": "simple",
"formatter": "highlightSpanHTML",
})
if err != nil {
panic(err)
}
_, err = Config.Cache.DefineHighlighter("ansi",
map[string]interface{}{
"type": "simple",
"fragmenter": "simple",
"formatter": "ansi",
})
if err != nil {
panic(err)
}
// set the default highlighter
Config.DefaultHighlighter = "html"
Config.DefaultHighlighter = html.Name
// default kv store
Config.DefaultKVStore = "boltdb"
Config.DefaultKVStore = boltdb.Name
// default index
Config.DefaultIndexType = "upside_down"
Config.DefaultIndexType = upside_down.Name
bootDuration := time.Since(bootStart)
bleveExpVar.Add("bootDuration", int64(bootDuration))

config/README.md Normal file
View File

@ -0,0 +1,11 @@
# Bleve Config
**NOTE** you probably do not need this package. It is only intended for general purpose applications that want to include large parts of Bleve regardless of whether or not their code uses those parts directly.
## General Purpose Applications
A general purpose application that must let users select Bleve components at runtime can accomplish this with:
```
import _ "github.com/blevesearch/bleve/config"
```
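Once that blank import is in place, the components those packages register can be referred to by name through the normal bleve API. A minimal sketch follows; the index path, field name, and document are made up, and it assumes the `en` analyzer registered by the `analysis/language/en` package that this config package imports.
```
package main

import (
	"log"

	"github.com/blevesearch/bleve"
	_ "github.com/blevesearch/bleve/config" // registers the components listed above
)

func main() {
	mapping := bleve.NewIndexMapping()
	// "en" is available here only because the config import above registered it
	mapping.DefaultAnalyzer = "en"

	index, err := bleve.New("example.bleve", mapping)
	if err != nil {
		log.Fatal(err)
	}
	defer index.Close()

	if err := index.Index("doc1", map[string]interface{}{"body": "walking and talking"}); err != nil {
		log.Fatal(err)
	}
}
```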

config/config.go Normal file
View File

@ -0,0 +1,98 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package config
import (
// token maps
_ "github.com/blevesearch/bleve/analysis/token_map"
// fragment formatters
_ "github.com/blevesearch/bleve/search/highlight/fragment_formatters/ansi"
_ "github.com/blevesearch/bleve/search/highlight/fragment_formatters/html"
// fragmenters
_ "github.com/blevesearch/bleve/search/highlight/fragmenters/simple"
// highlighters
_ "github.com/blevesearch/bleve/search/highlight/highlighters/ansi"
_ "github.com/blevesearch/bleve/search/highlight/highlighters/html"
_ "github.com/blevesearch/bleve/search/highlight/highlighters/simple"
// char filters
_ "github.com/blevesearch/bleve/analysis/char_filters/html_char_filter"
_ "github.com/blevesearch/bleve/analysis/char_filters/regexp_char_filter"
_ "github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner"
// analyzers
_ "github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/simple_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
// token filters
_ "github.com/blevesearch/bleve/analysis/token_filters/apostrophe_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/compound"
_ "github.com/blevesearch/bleve/analysis/token_filters/edge_ngram_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/keyword_marker_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/length_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/ngram_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/shingle"
_ "github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/truncate_token_filter"
_ "github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
// tokenizers
_ "github.com/blevesearch/bleve/analysis/tokenizers/exception"
_ "github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
_ "github.com/blevesearch/bleve/analysis/tokenizers/single_token"
_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
_ "github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
// date time parsers
_ "github.com/blevesearch/bleve/analysis/datetime_parsers/datetime_optional"
_ "github.com/blevesearch/bleve/analysis/datetime_parsers/flexible_go"
// languages
_ "github.com/blevesearch/bleve/analysis/language/ar"
_ "github.com/blevesearch/bleve/analysis/language/bg"
_ "github.com/blevesearch/bleve/analysis/language/ca"
_ "github.com/blevesearch/bleve/analysis/language/cjk"
_ "github.com/blevesearch/bleve/analysis/language/ckb"
_ "github.com/blevesearch/bleve/analysis/language/cs"
_ "github.com/blevesearch/bleve/analysis/language/el"
_ "github.com/blevesearch/bleve/analysis/language/en"
_ "github.com/blevesearch/bleve/analysis/language/eu"
_ "github.com/blevesearch/bleve/analysis/language/fa"
_ "github.com/blevesearch/bleve/analysis/language/fr"
_ "github.com/blevesearch/bleve/analysis/language/ga"
_ "github.com/blevesearch/bleve/analysis/language/gl"
_ "github.com/blevesearch/bleve/analysis/language/hi"
_ "github.com/blevesearch/bleve/analysis/language/hy"
_ "github.com/blevesearch/bleve/analysis/language/id"
_ "github.com/blevesearch/bleve/analysis/language/in"
_ "github.com/blevesearch/bleve/analysis/language/it"
_ "github.com/blevesearch/bleve/analysis/language/pt"
// kv stores
_ "github.com/blevesearch/bleve/index/store/boltdb"
_ "github.com/blevesearch/bleve/index/store/goleveldb"
_ "github.com/blevesearch/bleve/index/store/gtreap"
_ "github.com/blevesearch/bleve/index/store/inmem"
// index types
_ "github.com/blevesearch/bleve/index/upside_down"
// byte array converters
_ "github.com/blevesearch/bleve/analysis/byte_array_converters/ignore"
_ "github.com/blevesearch/bleve/analysis/byte_array_converters/json"
_ "github.com/blevesearch/bleve/analysis/byte_array_converters/string"
)

View File

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
@ -6,13 +6,11 @@
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
#ifdef __cplusplus
extern "C" {
#endif
const char* DetectLang(const char *buffer);
// +build cld2 full
#ifdef __cplusplus
} /* extern "C" */
#endif
package config
import (
_ "github.com/blevesearch/blevex/detect_lang"
)

View File

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
@ -7,10 +7,10 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build go1.4
// +build cznicb full
package bleve
package config
import (
_ "github.com/blevesearch/bleve/index/store/cznicb"
_ "github.com/blevesearch/blevex/cznicb"
)

View File

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
@ -7,10 +7,10 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build forestdb
// +build forestdb full
package bleve
package config
import (
_ "github.com/blevesearch/bleve/index/store/forestdb"
_ "github.com/blevesearch/blevex/forestdb"
)

View File

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
@ -9,8 +9,8 @@
// +build icu full
package bleve
package config
import (
_ "github.com/blevesearch/bleve/analysis/tokenizers/icu"
_ "github.com/blevesearch/blevex/lang/th"
)

View File

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
@ -9,7 +9,7 @@
// +build kagome full
package bleve
package config
import (
_ "github.com/blevesearch/bleve/analysis/language/ja"

View File

@ -1,4 +1,4 @@
// Copyright (c) 2014 Couchbase, Inc.
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
@ -9,13 +9,8 @@
// +build leveldb full
package bleve
package config
import (
_ "github.com/blevesearch/bleve/index/store/leveldb"
_ "github.com/blevesearch/blevex/leveldb"
)
func init() {
// install leveldb as the default kv store
Config.DefaultKVStore = "leveldb"
}

Some files were not shown because too many files have changed in this diff.