0
0

analysis: document "exception" tokenizer, and Tokenizer interface

This commit is contained in:
Patrick Mezard 2015-10-27 16:51:54 +01:00
parent f2b3d5698e
commit 8b17787a65
2 changed files with 14 additions and 0 deletions

View File

@ -7,6 +7,18 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package exception implements a Tokenizer which extracts pieces matched by a
// regular expression from the input data, delegates the rest to another
// tokenizer, then inserts the extracted parts back in the token stream. Use it
// to preserve sequences which a regular tokenizer would alter or remove.
//
// Its constructor takes the following arguments:
//
// "exceptions" ([]string): one or more Go regular expressions matching the
// sequence to preserve. Multiple expressions are combined with "|".
//
// "tokenizer" (string): the name of the tokenizer processing the data not
// matched by "exceptions".
package exception
import (

View File

@ -54,6 +54,8 @@ func (t *Token) String() string {
type TokenStream []*Token
// A Tokenizer splits an input string into tokens, the usual behaviour being to
// map words to tokens.
type Tokenizer interface {
	// Tokenize splits the input bytes and returns the resulting token stream.
	Tokenize([]byte) TokenStream
}