Merge pull request #587 from mschoch/add-spanish

add a pure Go Spanish analyzer
2017-04-29 19:48:18 -04:00 · 2017-04-29 19:48:18 -04:00 · 8435ce5054
parent c0d5e75e70 b8de2df68d
commit 8435ce5054
5 changed files with 684 additions and 0 deletions
--- a/analysis/lang/es/analyzer_es.go
+++ b/analysis/lang/es/analyzer_es.go
@ -0,0 +1,58 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package es
+
+import (
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+
+	"github.com/blevesearch/bleve/analysis/token/lowercase"
+	"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
+)
+
+const AnalyzerName = "es"
+
+func AnalyzerConstructor(config map[string]interface{},
+	cache *registry.Cache) (*analysis.Analyzer, error) {
+	unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
+	if err != nil {
+		return nil, err
+	}
+	toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
+	if err != nil {
+		return nil, err
+	}
+	stopEsFilter, err := cache.TokenFilterNamed(StopName)
+	if err != nil {
+		return nil, err
+	}
+	lightStemmerEsFilter, err := cache.TokenFilterNamed(LightStemmerName)
+	if err != nil {
+		return nil, err
+	}
+	rv := analysis.Analyzer{
+		Tokenizer: unicodeTokenizer,
+		TokenFilters: []analysis.TokenFilter{
+			toLowerFilter,
+			stopEsFilter,
+			lightStemmerEsFilter,
+		},
+	}
+	return &rv, nil
+}
+
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}
--- a/analysis/lang/es/analyzer_es_test.go
+++ b/analysis/lang/es/analyzer_es_test.go
@ -0,0 +1,122 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package es
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+func TestSpanishAnalyzer(t *testing.T) {
+	tests := []struct {
+		input  []byte
+		output analysis.TokenStream
+	}{
+		// stemming
+		{
+			input: []byte("chicana"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("chican"),
+					Position: 1,
+					Start:    0,
+					End:      7,
+				},
+			},
+		},
+		{
+			input: []byte("chicano"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("chican"),
+					Position: 1,
+					Start:    0,
+					End:      7,
+				},
+			},
+		},
+		// added by marty for better coverage
+		{
+			input: []byte("yeses"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("yes"),
+					Position: 1,
+					Start:    0,
+					End:      5,
+				},
+			},
+		},
+		{
+			input: []byte("jaeces"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("jaez"),
+					Position: 1,
+					Start:    0,
+					End:      6,
+				},
+			},
+		},
+		{
+			input: []byte("arcos"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("arc"),
+					Position: 1,
+					Start:    0,
+					End:      5,
+				},
+			},
+		},
+		{
+			input: []byte("caos"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("caos"),
+					Position: 1,
+					Start:    0,
+					End:      4,
+				},
+			},
+		},
+		{
+			input: []byte("parecer"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("parecer"),
+					Position: 1,
+					Start:    0,
+					End:      7,
+				},
+			},
+		},
+	}
+
+	cache := registry.NewCache()
+	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, test := range tests {
+		actual := analyzer.Analyze(test.input)
+		if !reflect.DeepEqual(actual, test.output) {
+			t.Errorf("expected %v, got %v", test.output, actual)
+		}
+	}
+}
--- a/analysis/lang/es/light_stemmer_es.go
+++ b/analysis/lang/es/light_stemmer_es.go
@ -0,0 +1,91 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package es
+
+import (
+	"bytes"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+const LightStemmerName = "stemmer_es_light"
+
+type SpanishLightStemmerFilter struct {
+}
+
+func NewSpanishLightStemmerFilter() *SpanishLightStemmerFilter {
+	return &SpanishLightStemmerFilter{}
+}
+
+func (s *SpanishLightStemmerFilter) Filter(
+	input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		runes := bytes.Runes(token.Term)
+		runes = stem(runes)
+		token.Term = analysis.BuildTermFromRunes(runes)
+	}
+	return input
+}
+
+func stem(input []rune) []rune {
+	l := len(input)
+	if l < 5 {
+		return input
+	}
+
+	for i, r := range input {
+		switch r {
+		case 'à', 'á', 'â', 'ä':
+			input[i] = 'a'
+		case 'ò', 'ó', 'ô', 'ö':
+			input[i] = 'o'
+		case 'è', 'é', 'ê', 'ë':
+			input[i] = 'e'
+		case 'ù', 'ú', 'û', 'ü':
+			input[i] = 'u'
+		case 'ì', 'í', 'î', 'ï':
+			input[i] = 'i'
+		}
+	}
+
+	switch input[l-1] {
+	case 'o', 'a', 'e':
+		return input[:l-1]
+	case 's':
+		if input[l-2] == 'e' && input[l-3] == 's' && input[l-4] == 'e' {
+			return input[:l-2]
+		}
+		if input[l-2] == 'e' && input[l-3] == 'c' {
+			input[l-3] = 'z'
+			return input[:l-2]
+		}
+		if input[l-2] == 'o' || input[l-2] == 'a' || input[l-2] == 'e' {
+			return input[:l-2]
+		}
+	}
+
+	return input
+}
+
+func SpanishLightStemmerFilterConstructor(config map[string]interface{},
+	cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewSpanishLightStemmerFilter(), nil
+}
+
+func init() {
+	registry.RegisterTokenFilter(LightStemmerName,
+		SpanishLightStemmerFilterConstructor)
+}
--- a/analysis/lang/es/stop_filter_es.go
+++ b/analysis/lang/es/stop_filter_es.go
@ -0,0 +1,33 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package es
+
+import (
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/analysis/token/stop"
+	"github.com/blevesearch/bleve/registry"
+)
+
+func StopTokenFilterConstructor(config map[string]interface{},
+	cache *registry.Cache) (analysis.TokenFilter, error) {
+	tokenMap, err := cache.TokenMapNamed(StopName)
+	if err != nil {
+		return nil, err
+	}
+	return stop.NewStopTokensFilter(tokenMap), nil
+}
+
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}
--- a/analysis/lang/es/stop_words_es.go
+++ b/analysis/lang/es/stop_words_es.go
@ -0,0 +1,380 @@
+package es
+
+import (
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+const StopName = "stop_es"
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
+// ` was changed to ' to allow for literal string
+
+var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
+ | This file is distributed under the BSD License.
+ | See http://snowball.tartarus.org/license.php
+ | Also see http://www.opensource.org/licenses/bsd-license.html
+ |  - Encoding was converted to UTF-8.
+ |  - This notice was added.
+ |
+ | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
+
+ | A Spanish stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+
+ | The following is a ranked list (commonest to rarest) of stopwords
+ | deriving from a large sample of text.
+
+ | Extra words have been added at the end.
+
+de             |  from, of
+la             |  the, her
+que            |  who, that
+el             |  the
+en             |  in
+y              |  and
+a              |  to
+los            |  the, them
+del            |  de + el
+se             |  himself, from him etc
+las            |  the, them
+por            |  for, by, etc
+un             |  a
+para           |  for
+con            |  with
+no             |  no
+una            |  a
+su             |  his, her
+al             |  a + el
+  | es         from SER
+lo             |  him
+como           |  how
+más            |  more
+pero           |  pero
+sus            |  su plural
+le             |  to him, her
+ya             |  already
+o              |  or
+  | fue        from SER
+este           |  this
+  | ha         from HABER
+sí             |  himself etc
+porque         |  because
+esta           |  this
+  | son        from SER
+entre          |  between
+  | está     from ESTAR
+cuando         |  when
+muy            |  very
+sin            |  without
+sobre          |  on
+  | ser        from SER
+  | tiene      from TENER
+también        |  also
+me             |  me
+hasta          |  until
+hay            |  there is/are
+donde          |  where
+  | han        from HABER
+quien          |  whom, that
+  | están      from ESTAR
+  | estado     from ESTAR
+desde          |  from
+todo           |  all
+nos            |  us
+durante        |  during
+  | estados    from ESTAR
+todos          |  all
+uno            |  a
+les            |  to them
+ni             |  nor
+contra         |  against
+otros          |  other
+  | fueron     from SER
+ese            |  that
+eso            |  that
+  | había      from HABER
+ante           |  before
+ellos          |  they
+e              |  and (variant of y)
+esto           |  this
+mí             |  me
+antes          |  before
+algunos        |  some
+qué            |  what?
+unos           |  a
+yo             |  I
+otro           |  other
+otras          |  other
+otra           |  other
+él             |  he
+tanto          |  so much, many
+esa            |  that
+estos          |  these
+mucho          |  much, many
+quienes        |  who
+nada           |  nothing
+muchos         |  many
+cual           |  who
+  | sea        from SER
+poco           |  few
+ella           |  she
+estar          |  to be
+  | haber      from HABER
+estas          |  these
+  | estaba     from ESTAR
+  | estamos    from ESTAR
+algunas        |  some
+algo           |  something
+nosotros       |  we
+
+      | other forms
+
+mi             |  me
+mis            |  mi plural
+tú             |  thou
+te             |  thee
+ti             |  thee
+tu             |  thy
+tus            |  tu plural
+ellas          |  they
+nosotras       |  we
+vosotros       |  you
+vosotras       |  you
+os             |  you
+mío            |  mine
+mía            |
+míos           |
+mías           |
+tuyo           |  thine
+tuya           |
+tuyos          |
+tuyas          |
+suyo           |  his, hers, theirs
+suya           |
+suyos          |
+suyas          |
+nuestro        |  ours
+nuestra        |
+nuestros       |
+nuestras       |
+vuestro        |  yours
+vuestra        |
+vuestros       |
+vuestras       |
+esos           |  those
+esas           |  those
+
+               | forms of estar, to be (not including the infinitive):
+estoy
+estás
+está
+estamos
+estáis
+están
+esté
+estés
+estemos
+estéis
+estén
+estaré
+estarás
+estará
+estaremos
+estaréis
+estarán
+estaría
+estarías
+estaríamos
+estaríais
+estarían
+estaba
+estabas
+estábamos
+estabais
+estaban
+estuve
+estuviste
+estuvo
+estuvimos
+estuvisteis
+estuvieron
+estuviera
+estuvieras
+estuviéramos
+estuvierais
+estuvieran
+estuviese
+estuvieses
+estuviésemos
+estuvieseis
+estuviesen
+estando
+estado
+estada
+estados
+estadas
+estad
+
+               | forms of haber, to have (not including the infinitive):
+he
+has
+ha
+hemos
+habéis
+han
+haya
+hayas
+hayamos
+hayáis
+hayan
+habré
+habrás
+habrá
+habremos
+habréis
+habrán
+habría
+habrías
+habríamos
+habríais
+habrían
+había
+habías
+habíamos
+habíais
+habían
+hube
+hubiste
+hubo
+hubimos
+hubisteis
+hubieron
+hubiera
+hubieras
+hubiéramos
+hubierais
+hubieran
+hubiese
+hubieses
+hubiésemos
+hubieseis
+hubiesen
+habiendo
+habido
+habida
+habidos
+habidas
+
+               | forms of ser, to be (not including the infinitive):
+soy
+eres
+es
+somos
+sois
+son
+sea
+seas
+seamos
+seáis
+sean
+seré
+serás
+será
+seremos
+seréis
+serán
+sería
+serías
+seríamos
+seríais
+serían
+era
+eras
+éramos
+erais
+eran
+fui
+fuiste
+fue
+fuimos
+fuisteis
+fueron
+fuera
+fueras
+fuéramos
+fuerais
+fueran
+fuese
+fueses
+fuésemos
+fueseis
+fuesen
+siendo
+sido
+  |  sed also means 'thirst'
+
+               | forms of tener, to have (not including the infinitive):
+tengo
+tienes
+tiene
+tenemos
+tenéis
+tienen
+tenga
+tengas
+tengamos
+tengáis
+tengan
+tendré
+tendrás
+tendrá
+tendremos
+tendréis
+tendrán
+tendría
+tendrías
+tendríamos
+tendríais
+tendrían
+tenía
+tenías
+teníamos
+teníais
+tenían
+tuve
+tuviste
+tuvo
+tuvimos
+tuvisteis
+tuvieron
+tuviera
+tuvieras
+tuviéramos
+tuvierais
+tuvieran
+tuviese
+tuvieses
+tuviésemos
+tuvieseis
+tuviesen
+teniendo
+tenido
+tenida
+tenidos
+tenidas
+tened
+
+`)
+
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	rv := analysis.NewTokenMap()
+	err := rv.LoadBytes(SpanishStopWords)
+	return rv, err
+}
+
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}