From 994f4b4d114588155b8025cf38921c9124853960 Mon Sep 17 00:00:00 2001 From: Ben Campbell Date: Wed, 18 Nov 2015 15:28:57 +1300 Subject: [PATCH] added some godoc documentation for the en analyzer --- analysis/language/en/analyzer_en.go | 7 +++++++ analysis/language/en/possessive_filter_en.go | 7 +++++++ analysis/language/en/stop_words_en.go | 3 ++- .../token_filters/lower_case_filter/lower_case_filter.go | 3 +++ analysis/token_map.go | 6 ++++++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/analysis/language/en/analyzer_en.go b/analysis/language/en/analyzer_en.go index 18bd764f..171cd617 100644 --- a/analysis/language/en/analyzer_en.go +++ b/analysis/language/en/analyzer_en.go @@ -7,6 +7,13 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. +// Package en implements an analyzer with reasonable defaults for processing +// English text. +// +// It strips possessive suffixes ('s), transforms tokens to lower case, +// removes stopwords from a built-in list, and applies Porter stemming. +// +// The built-in stopwords list is defined in EnglishStopWords. package en import ( diff --git a/analysis/language/en/possessive_filter_en.go b/analysis/language/en/possessive_filter_en.go index 84a62c2f..f322c04a 100644 --- a/analysis/language/en/possessive_filter_en.go +++ b/analysis/language/en/possessive_filter_en.go @@ -16,6 +16,8 @@ import ( "github.com/blevesearch/bleve/registry" ) +// PossessiveName is the name PossessiveFilter is registered as +// in the bleve registry. const PossessiveName = "possessive_en" const rightSingleQuotationMark = '’' @@ -24,6 +26,11 @@ const fullWidthApostrophe = '＇' const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe +// PossessiveFilter implements a TokenFilter which +// strips the English possessive suffix ('s) from tokens. 
+// It handles a variety of apostrophe types, is case-insensitive +// and doesn't distinguish between possessive and contraction. +// (i.e. "She's So Rad" becomes "She So Rad") type PossessiveFilter struct { } diff --git a/analysis/language/en/stop_words_en.go b/analysis/language/en/stop_words_en.go index e19c6178..6423cf2c 100644 --- a/analysis/language/en/stop_words_en.go +++ b/analysis/language/en/stop_words_en.go @@ -7,10 +7,11 @@ import ( const StopName = "stop_en" +// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter. +// // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ // ` was changed to ' to allow for literal string - var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt | This file is distributed under the BSD License. | See http://snowball.tartarus.org/license.php diff --git a/analysis/token_filters/lower_case_filter/lower_case_filter.go b/analysis/token_filters/lower_case_filter/lower_case_filter.go index 6d63d6d3..aa7bebde 100644 --- a/analysis/token_filters/lower_case_filter/lower_case_filter.go +++ b/analysis/token_filters/lower_case_filter/lower_case_filter.go @@ -7,6 +7,8 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. +// Package lower_case_filter implements a TokenFilter which converts +// tokens to lower case according to Unicode rules. 
package lower_case_filter import ( @@ -18,6 +20,7 @@ import ( "github.com/blevesearch/bleve/registry" ) +// Name is the name used to register LowerCaseFilter in the bleve registry. const Name = "to_lower" type LowerCaseFilter struct { diff --git a/analysis/token_map.go b/analysis/token_map.go index 17a26f48..e2c23788 100644 --- a/analysis/token_map.go +++ b/analysis/token_map.go @@ -23,6 +23,9 @@ func NewTokenMap() TokenMap { return make(TokenMap, 0) } +// LoadFile reads in a list of tokens from a text file, +// one per line. +// Comments are supported using `#` or `|`. func (t TokenMap) LoadFile(filename string) error { data, err := ioutil.ReadFile(filename) if err != nil { @@ -31,6 +34,9 @@ func (t TokenMap) LoadFile(filename string) error { return t.LoadBytes(data) } +// LoadBytes reads in a list of tokens from memory, +// one per line. +// Comments are supported using `#` or `|`. func (t TokenMap) LoadBytes(data []byte) error { bytesReader := bytes.NewReader(data) bufioReader := bufio.NewReader(bytesReader)