added some godoc documentation for the en analyzer
This commit is contained in:
parent
7dd52a5463
commit
994f4b4d11
|
@ -7,6 +7,13 @@
|
|||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// Package en implements an analyzer with reasonable defaults for processing
|
||||
// English text.
|
||||
//
|
||||
// It strips possessive suffixes ('s), transforms tokens to lower case,
|
||||
// removes stopwords from a built-in list, and applies Porter stemming.
|
||||
//
|
||||
// The built-in stopwords list is defined in EnglishStopWords.
|
||||
package en
|
||||
|
||||
import (
|
||||
|
|
|
@ -16,6 +16,8 @@ import (
|
|||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// PossessiveName is the name PossessiveFilter is registered as
|
||||
// in the bleve registry.
|
||||
const PossessiveName = "possessive_en"
|
||||
|
||||
const rightSingleQuotationMark = '’'
|
||||
|
@ -24,6 +26,11 @@ const fullWidthApostrophe = '＇'
|
|||
|
||||
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
|
||||
|
||||
// PossessiveFilter implements a TokenFilter which
|
||||
// strips the English possessive suffix ('s) from tokens.
|
||||
// It handles a variety of apostrophe types, is case-insensitive
|
||||
// and doesn't distinguish between possessive and contraction.
|
||||
// (e.g. "She's So Rad" becomes "She So Rad")
|
||||
type PossessiveFilter struct {
|
||||
}
|
||||
|
||||
|
|
|
@ -7,10 +7,11 @@ import (
|
|||
|
||||
const StopName = "stop_en"
|
||||
|
||||
// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter.
|
||||
//
|
||||
// This content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// Each ` was changed to ' so the list can be embedded in a raw string literal.
|
||||
|
||||
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
|
|
|
@ -7,6 +7,8 @@
|
|||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
// Package lower_case_filter implements a TokenFilter which converts
|
||||
// tokens to lower case according to unicode rules.
|
||||
package lower_case_filter
|
||||
|
||||
import (
|
||||
|
@ -18,6 +20,7 @@ import (
|
|||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// Name is the name used to register LowerCaseFilter in the bleve registry.
|
||||
const Name = "to_lower"
|
||||
|
||||
type LowerCaseFilter struct {
|
||||
|
|
|
@ -23,6 +23,9 @@ func NewTokenMap() TokenMap {
|
|||
return make(TokenMap, 0)
|
||||
}
|
||||
|
||||
// LoadFile reads in a list of tokens from a text file,
|
||||
// one per line.
|
||||
// Comments are supported using `#` or `|`
|
||||
func (t TokenMap) LoadFile(filename string) error {
|
||||
data, err := ioutil.ReadFile(filename)
|
||||
if err != nil {
|
||||
|
@ -31,6 +34,9 @@ func (t TokenMap) LoadFile(filename string) error {
|
|||
return t.LoadBytes(data)
|
||||
}
|
||||
|
||||
// LoadBytes reads in a list of tokens from memory,
|
||||
// one per line.
|
||||
// Comments are supported using `#` or `|`
|
||||
func (t TokenMap) LoadBytes(data []byte) error {
|
||||
bytesReader := bytes.NewReader(data)
|
||||
bufioReader := bufio.NewReader(bytesReader)
|
||||
|
|
Loading…
Reference in New Issue
Block a user