analysis: document "exception" tokenizer, and Tokenizer interface
parent f2b3d5698e
commit 8b17787a65
@@ -7,6 +7,18 @@
 // either express or implied. See the License for the specific language governing permissions
 // and limitations under the License.
 
+// Package exception implements a Tokenizer which extracts pieces matched by a
+// regular expression from the input data, delegates the rest to another
+// tokenizer, then inserts the extracted parts back into the token stream.
+// Use it to preserve sequences which a regular tokenizer would alter or remove.
+//
+// Its constructor takes the following arguments:
+//
+// "exceptions" ([]string): one or more Go regular expressions matching the
+// sequences to preserve. Multiple expressions are combined with "|".
+//
+// "tokenizer" (string): the name of the tokenizer processing the data not
+// matched by "exceptions".
 package exception
 
 import (
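The doc comment added above describes the behaviour in words: extract the exception matches, hand the remaining spans to the delegate tokenizer, then splice the matches back in input order, with multiple expressions combined into one pattern with "|". The following standalone Go sketch illustrates that flow using only the standard library; it is not code from this commit, and the example patterns and the strings.Fields word splitter standing in for the delegate tokenizer are invented for illustration.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// Hypothetical exception patterns: keep URLs and e-mail addresses intact.
	exceptions := []string{`https?://\S+`, `\S+@\S+\.\S+`}
	// As documented above, multiple expressions are combined with "|".
	re := regexp.MustCompile(strings.Join(exceptions, "|"))

	input := "write to me@example.com or visit https://example.com today"

	// Extract the matches, delegate the text between them to a plain word
	// splitter, and splice the matches back in input order.
	var tokens []string
	last := 0
	for _, m := range re.FindAllStringIndex(input, -1) {
		tokens = append(tokens, strings.Fields(input[last:m[0]])...)
		tokens = append(tokens, input[m[0]:m[1]])
		last = m[1]
	}
	tokens = append(tokens, strings.Fields(input[last:])...)

	fmt.Println(tokens)
	// [write to me@example.com or visit https://example.com today]
}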
@@ -54,6 +54,8 @@ func (t *Token) String() string {
 
 type TokenStream []*Token
 
+// A Tokenizer splits an input string into tokens, the usual behaviour being to
+// map words to tokens.
 type Tokenizer interface {
 	Tokenize([]byte) TokenStream
 }
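The Tokenizer interface documented above is small enough to implement directly. Below is a minimal, hypothetical implementation of the "usual behaviour" the comment mentions, mapping each whitespace-separated word to one token; the Token struct here is a stand-in with a single Term field, since its real definition is not shown in this hunk.

package main

import (
	"bytes"
	"fmt"
)

// Token is a stand-in for the real type; assume at least a Term field
// holding the token's bytes.
type Token struct {
	Term []byte
}

func (t *Token) String() string { return string(t.Term) }

type TokenStream []*Token

type Tokenizer interface {
	Tokenize([]byte) TokenStream
}

// whitespaceTokenizer maps each whitespace-separated word to one token.
type whitespaceTokenizer struct{}

func (whitespaceTokenizer) Tokenize(input []byte) TokenStream {
	var ts TokenStream
	for _, word := range bytes.Fields(input) {
		ts = append(ts, &Token{Term: word})
	}
	return ts
}

func main() {
	var t Tokenizer = whitespaceTokenizer{}
	for _, tok := range t.Tokenize([]byte("map words to tokens")) {
		fmt.Println(tok) // prints each word on its own line
	}
}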