From 8b17787a6500c426237ad327e1c476bf87461daf Mon Sep 17 00:00:00 2001 From: Patrick Mezard Date: Tue, 27 Oct 2015 16:51:54 +0100 Subject: [PATCH] analysis: document "exception" tokenizer, and Tokenizer interface --- analysis/tokenizers/exception/exception.go | 12 ++++++++++++ analysis/type.go | 2 ++ 2 files changed, 14 insertions(+) diff --git a/analysis/tokenizers/exception/exception.go b/analysis/tokenizers/exception/exception.go index 0000f4c6..51da89ac 100644 --- a/analysis/tokenizers/exception/exception.go +++ b/analysis/tokenizers/exception/exception.go @@ -7,6 +7,18 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. +// Package exception implements a Tokenizer which extracts pieces matched by a +// regular expression from the input data, delegates the rest to another +// tokenizer, then inserts the extracted parts back into the token stream. Use it to +// preserve sequences which a regular tokenizer would alter or remove. +// +// Its constructor takes the following arguments: +// +// "exceptions" ([]string): one or more Go regular expressions matching the +// sequences to preserve. Multiple expressions are combined with "|". +// +// "tokenizer" (string): the name of the tokenizer processing the data not +// matched by "exceptions". package exception import ( diff --git a/analysis/type.go b/analysis/type.go index 13759ec8..a8feeabd 100644 --- a/analysis/type.go +++ b/analysis/type.go @@ -54,6 +54,8 @@ func (t *Token) String() string { type TokenStream []*Token +// A Tokenizer splits an input string into tokens, the usual behaviour being to +// map words to tokens. type Tokenizer interface { Tokenize([]byte) TokenStream }