0
0

added some godoc documentation for the en analyzer

This commit is contained in:
Ben Campbell 2015-11-18 15:28:57 +13:00
parent 7dd52a5463
commit 994f4b4d11
5 changed files with 25 additions and 1 deletions

View File

@ -7,6 +7,13 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package en implements an analyzer with reasonable defaults for processing
// English text.
//
// It strips possessive suffixes ('s), transforms tokens to lower case,
// removes stopwords from a built-in list, and applies porter stemming.
//
// The built-in stopwords list is defined in EnglishStopWords.
package en
import (

View File

@ -16,6 +16,8 @@ import (
"github.com/blevesearch/bleve/registry"
)
// PossessiveName is the name PossessiveFilter is registered as
// in the bleve registry.
const PossessiveName = "possessive_en"
const rightSingleQuotationMark = '’'
@ -24,6 +26,11 @@ const fullWidthApostrophe = '＇'
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
// PossessiveFilter implements a TokenFilter which
// strips the English possessive suffix ('s) from tokens.
// It handles a variety of apostrophe types, is case-insensitive,
// and doesn't distinguish between possessives and contractions
// (e.g. "She's So Rad" becomes "She So Rad").
type PossessiveFilter struct {
}

View File

@ -7,10 +7,11 @@ import (
const StopName = "stop_en"
// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter.
//
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// Backticks (`) were changed to apostrophes (') so that the list can be
// embedded in a raw string literal.
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php

View File

@ -7,6 +7,8 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package lower_case_filter implements a TokenFilter which converts
// tokens to lower case according to unicode rules.
package lower_case_filter
import (
@ -18,6 +20,7 @@ import (
"github.com/blevesearch/bleve/registry"
)
// Name is the name used to register LowerCaseFilter in the bleve registry.
const Name = "to_lower"
type LowerCaseFilter struct {

View File

@ -23,6 +23,9 @@ func NewTokenMap() TokenMap {
return make(TokenMap, 0)
}
// LoadFile reads in a list of tokens from a text file,
// one per line.
// Comments are supported using `#` or `|`
func (t TokenMap) LoadFile(filename string) error {
data, err := ioutil.ReadFile(filename)
if err != nil {
@ -31,6 +34,9 @@ func (t TokenMap) LoadFile(filename string) error {
return t.LoadBytes(data)
}
// LoadBytes reads in a list of tokens from memory,
// one per line.
// Comments are supported using `#` or `|`
func (t TokenMap) LoadBytes(data []byte) error {
bytesReader := bytes.NewReader(data)
bufioReader := bufio.NewReader(bytesReader)