bleve/analysis/tokenizers/exception/exception.go

// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.

// Package exception implements a Tokenizer which extracts pieces matched by a
// regular expression from the input data, delegates the rest to another
// tokenizer, then inserts the extracted parts back into the token stream. Use
// it to preserve sequences which a regular tokenizer would alter or remove.
//
// Its constructor takes the following arguments:
//
// "exceptions" ([]string): one or more Go regular expressions matching the
// sequence to preserve. Multiple expressions are combined with "|".
//
// "tokenizer" (string): the name of the tokenizer processing the data not
// matched by "exceptions".
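//
// For example (an illustrative sketch; the patterns and delegate are
// placeholders), a config preserving URL- and email-like sequences while
// delegating the rest to the built-in "unicode" tokenizer could look like:
//
//	map[string]interface{}{
//		"exceptions": []string{`[hH][tT][tT][pP][sS]?://\S+`, `\S+@\S+`},
//		"tokenizer":  "unicode",
//	}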
package exception

import (
	"fmt"
	"regexp"
	"strings"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/registry"
)
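
// Name is the name under which this tokenizer is registered.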
const Name = "exception"
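
// ExceptionsTokenizer extracts spans matching an exception pattern as whole
// tokens and hands the remaining input to a delegate tokenizer.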
type ExceptionsTokenizer struct {
	exception *regexp.Regexp
	remaining analysis.Tokenizer
}
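
// NewExceptionsTokenizer returns a tokenizer which preserves input matching
// exception verbatim and tokenizes everything else with remaining.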
func NewExceptionsTokenizer(exception *regexp.Regexp, remaining analysis.Tokenizer) *ExceptionsTokenizer {
	return &ExceptionsTokenizer{
		exception: exception,
		remaining: remaining,
	}
}
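
// Tokenize emits one token per exception match and interleaves the tokens
// produced by the delegate for the unmatched sections, shifting their byte
// offsets and positions so the result reads as a single stream.
//
// For example (illustrative), with an exception pattern matching email-like
// sequences and a word-splitting delegate, "contact foo@bar.com now" yields
// "contact" (position 1), "foo@bar.com" (position 2) and "now" (position 3).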
func (t *ExceptionsTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	matches := t.exception.FindAllIndex(input, -1)
	currInput := 0
	lastPos := 0
	for _, match := range matches {
		start := match[0]
		end := match[1]
		if start > currInput {
			// need to defer to remaining for unprocessed section
			intermediate := t.remaining.Tokenize(input[currInput:start])
			// add intermediate tokens to our result stream
			for _, token := range intermediate {
				// adjust token offsets
				token.Position += lastPos
				token.Start += currInput
				token.End += currInput
				rv = append(rv, token)
			}
			lastPos += len(intermediate)
			currInput = start
		}
		// create single token with this regexp match
		token := &analysis.Token{
			Term:     input[start:end],
			Start:    start,
			End:      end,
			Position: lastPos + 1,
		}
		rv = append(rv, token)
		lastPos++
		currInput = end
	}
	if currInput < len(input) {
		// need to defer to remaining for unprocessed section
		intermediate := t.remaining.Tokenize(input[currInput:])
		// add intermediate tokens to our result stream
		for _, token := range intermediate {
			// adjust token offsets
			token.Position += lastPos
			token.Start += currInput
			token.End += currInput
			rv = append(rv, token)
		}
	}
	return rv
}
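
// ExceptionsTokenizerConstructor builds the tokenizer from a config map: the
// "exceptions" entry supplies the patterns (as []interface{} of strings or
// []string, joined with "|") and "tokenizer" names the delegate, resolved
// through the registry cache.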
func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	exceptions := []string{}
	iexceptions, ok := config["exceptions"].([]interface{})
	if ok {
		for _, exception := range iexceptions {
			exception, ok := exception.(string)
			if ok {
				exceptions = append(exceptions, exception)
			}
		}
	}
	aexceptions, ok := config["exceptions"].([]string)
	if ok {
		exceptions = append(exceptions, aexceptions...)
	}
	if len(exceptions) == 0 {
		return nil, fmt.Errorf("no pattern found in 'exceptions' property")
	}
	exceptionPattern := strings.Join(exceptions, "|")
	r, err := regexp.Compile(exceptionPattern)
	if err != nil {
		return nil, fmt.Errorf("unable to build regexp tokenizer: %v", err)
	}
	remainingName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer for remaining input")
	}
	remaining, err := cache.TokenizerNamed(remainingName)
	if err != nil {
		return nil, err
	}
	return NewExceptionsTokenizer(r, remaining), nil
}

func init() {
	registry.RegisterTokenizer(Name, ExceptionsTokenizerConstructor)
}