diff --git a/analysis/tokenizers/exception/exception.go b/analysis/tokenizers/exception/exception.go index 0000f4c6..51da89ac 100644 --- a/analysis/tokenizers/exception/exception.go +++ b/analysis/tokenizers/exception/exception.go @@ -7,6 +7,18 @@ // either express or implied. See the License for the specific language governing permissions // and limitations under the License. +// Package exception implements a Tokenizer which extracts pieces matched by a +// regular expression from the input data, delegates the rest to another +// tokenizer, then inserts the extracted parts back into the token stream. Use it to +// preserve sequences which a regular tokenizer would alter or remove. +// +// Its constructor takes the following arguments: +// +// "exceptions" ([]string): one or more Go regular expressions matching the +// sequences to preserve. Multiple expressions are combined with "|". +// +// "tokenizer" (string): the name of the tokenizer processing the data not +// matched by "exceptions". package exception import ( diff --git a/analysis/type.go b/analysis/type.go index 13759ec8..a8feeabd 100644 --- a/analysis/type.go +++ b/analysis/type.go @@ -54,6 +54,8 @@ func (t *Token) String() string { type TokenStream []*Token +// A Tokenizer splits an input string into tokens, the usual behaviour being to +// map words to tokens. type Tokenizer interface { Tokenize([]byte) TokenStream }