
Merge pull request #262 from pmezard/index-and-tokenizer-doc-and-fix

Index and tokenizer doc and fix
Marty Schoch 2015-11-02 11:51:21 -05:00
commit 4791625b9b
6 changed files with 48 additions and 13 deletions

View File

@ -7,6 +7,18 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package exception implements a Tokenizer which extracts pieces matched by a
// regular expression from the input data, delegates the rest to another
// tokenizer, then inserts the extracted parts back into the token stream. Use
// it to preserve sequences which a regular tokenizer would alter or remove.
//
// Its constructor takes the following arguments:
//
// "exceptions" ([]string): one or more Go regular expressions matching the
// sequences to preserve. Multiple expressions are combined with "|".
//
// "tokenizer" (string): the name of the tokenizer processing the data not
// matched by "exceptions".
package exception
import (
@ -99,6 +111,9 @@ func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *regist
if ok {
exceptions = append(exceptions, aexceptions...)
}
if len(exceptions) == 0 {
return nil, fmt.Errorf("no pattern found in 'exceptions' property")
}
exceptionPattern := strings.Join(exceptions, "|")
r, err := regexp.Compile(exceptionPattern)
if err != nil {

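For reference, a minimal sketch of the documented configuration in use. This is not part of the commit: the 2015-era import paths, the "unicode" delegate registered by its package init, and the "email_aware" name and patterns are assumptions for illustration.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis/tokenizers/exception"
	_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode" // assumed to register the "unicode" tokenizer
	"github.com/blevesearch/bleve/registry"
)

func main() {
	cache := registry.NewCache()

	// Keep e-mail addresses and URLs intact; everything else is handed to
	// the delegate "unicode" word tokenizer.
	config := map[string]interface{}{
		"type":      exception.Name,
		"tokenizer": "unicode",
		"exceptions": []string{
			`[\w.]+@[\w.]+`,
			`https?://\S+`,
		},
	}

	tok, err := cache.DefineTokenizer("email_aware", config)
	if err != nil {
		fmt.Println(err)
		return
	}

	for _, t := range tok.Tokenize([]byte("mail jane@example.com about http://blevesearch.com")) {
		fmt.Printf("%d-%d %s\n", t.Start, t.End, t.Term)
	}
}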
View File

@ -54,6 +54,8 @@ func (t *Token) String() string {
type TokenStream []*Token
// A Tokenizer splits an input string into tokens, the usual behaviour being to
// map words to tokens.
type Tokenizer interface {
Tokenize([]byte) TokenStream
}

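As an aside, a conforming implementation can be very small. The toy sketch below is not part of the commit; it splits the input on commas and assumes only the Token fields (Term, Start, End, Position, Type) defined earlier in this file.

package example

import "github.com/blevesearch/bleve/analysis"

// commaTokenizer maps each comma-separated piece of the input to one token,
// which is all the Tokenizer interface requires.
type commaTokenizer struct{}

func (commaTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	start, position := 0, 1
	for i := 0; i <= len(input); i++ {
		if i == len(input) || input[i] == ',' {
			if i > start {
				rv = append(rv, &analysis.Token{
					Term:     input[start:i],
					Start:    start,
					End:      i,
					Position: position,
					Type:     analysis.AlphaNumeric,
				})
				position++
			}
			start = i + 1
		}
	}
	return rv
}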
View File

@ -48,8 +48,10 @@ type IndexReader interface {
TermFieldReader(term []byte, field string) (TermFieldReader, error)
// DocIDReader returns an iterator over documents whose identifiers are
// greater than or equal to start and smaller than end. The caller must
// close returned instance to release associated resources.
// greater than or equal to start and smaller than end. Set start to the
// empty string to iterate from the first document, end to the empty string
// to iterate to the last one.
// The caller must close the returned instance to release associated resources.
DocIDReader(start, end string) (DocIDReader, error)
FieldDict(field string) (FieldDict, error)
@ -88,9 +90,19 @@ type TermFieldDoc struct {
Vectors []*TermFieldVector
}
// TermFieldReader is the interface exposing the enumeration of documents
// containing a given term in a given field. Documents are returned in byte
// lexicographic order over their identifiers.
type TermFieldReader interface {
// Next returns the next document containing the term in this field, or nil
// when it reaches the end of the enumeration.
Next() (*TermFieldDoc, error)
// Advance resets the enumeration at the specified document or its immediate
// follower.
Advance(ID string) (*TermFieldDoc, error)
// Count returns the number of documents containing the term in this field.
Count() uint64
Close() error
}

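A sketch of how the documented contract might be consumed. It is not part of the commit: the reader is assumed to come from an open index, and the ID and Freq fields are assumed from the surrounding TermFieldDoc struct.

package example

import (
	"fmt"

	"github.com/blevesearch/bleve/index"
)

// printMatches walks every document containing term in field, relying only on
// the TermFieldReader contract documented above: Next returns nil at the end
// of the enumeration, and the reader must be closed.
func printMatches(reader index.IndexReader, term []byte, field string) error {
	tfr, err := reader.TermFieldReader(term, field)
	if err != nil {
		return err
	}
	defer tfr.Close()

	fmt.Printf("%d documents contain the term\n", tfr.Count())
	for {
		tfd, err := tfr.Next()
		if err != nil {
			return err
		}
		if tfd == nil {
			break // end of the enumeration
		}
		fmt.Println(tfd.ID, tfd.Freq)
	}
	return nil
}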
View File

@ -256,8 +256,9 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
}
tokDepsL1 := map[string]interface{}{
"type": exception.Name,
"tokenizer": "a",
"type": exception.Name,
"tokenizer": "a",
"exceptions": []string{".*"},
}
// this tests a 1-level dependency
@ -280,8 +281,9 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
}
tokDepsL2 := map[string]interface{}{
"type": "exception",
"tokenizer": "b",
"type": "exception",
"tokenizer": "b",
"exceptions": []string{".*"},
}
// now test a second-level dependency

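Outside the test, the same kind of dependency could be expressed through an index mapping. A hedged sketch, not part of this commit: the IndexMapping.AddCustomTokenizer helper, the registered "unicode" type, and the "base" and "dates_preserved" names and pattern are assumptions.

package example

import (
	"github.com/blevesearch/bleve"
	"github.com/blevesearch/bleve/analysis/tokenizers/exception"
)

func buildMapping() (*bleve.IndexMapping, error) {
	m := bleve.NewIndexMapping()

	// A custom tokenizer for the exception tokenizer to delegate to;
	// "unicode" is assumed to be a registered tokenizer type.
	if err := m.AddCustomTokenizer("base", map[string]interface{}{
		"type": "unicode",
	}); err != nil {
		return nil, err
	}

	// References "base" by name: the same kind of dependency the test above
	// validates. ISO dates are preserved as single tokens here.
	if err := m.AddCustomTokenizer("dates_preserved", map[string]interface{}{
		"type":       exception.Name,
		"tokenizer":  "base",
		"exceptions": []string{`\d{4}-\d{2}-\d{2}`},
	}); err != nil {
		return nil, err
	}

	return m, nil
}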
View File

@ -61,11 +61,15 @@ func NewCache() *Cache {
}
func typeFromConfig(config map[string]interface{}) (string, error) {
typ, ok := config["type"].(string)
if ok {
return typ, nil
prop, ok := config["type"]
if !ok {
return "", fmt.Errorf("'type' property is not defined")
}
return "", fmt.Errorf("unable to determine type")
typ, ok := prop.(string)
if !ok {
return "", fmt.Errorf("'type' property must be a string, not %T", prop)
}
return typ, nil
}
func (c *Cache) CharFilterNamed(name string) (analysis.CharFilter, error) {
@ -87,7 +91,7 @@ func (c *Cache) TokenizerNamed(name string) (analysis.Tokenizer, error) {
func (c *Cache) DefineTokenizer(name string, config map[string]interface{}) (analysis.Tokenizer, error) {
typ, err := typeFromConfig(config)
if err != nil {
return nil, err
return nil, fmt.Errorf("cannot resolve '%s' tokenizer type: %s", name, err)
}
return c.Tokenizers.DefineTokenizer(name, typ, config, c)
}

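A short sketch of the two error paths this change distinguishes (not part of the commit; the config maps and names are illustrative).

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/registry"
)

func main() {
	cache := registry.NewCache()

	// No "type" key: typeFromConfig now reports the missing property, and
	// DefineTokenizer prefixes the tokenizer name, yielding something like
	//   cannot resolve 'broken' tokenizer type: 'type' property is not defined
	if _, err := cache.DefineTokenizer("broken", map[string]interface{}{
		"tokenizer": "unicode",
	}); err != nil {
		fmt.Println(err)
	}

	// "type" present but not a string: the %T verb reports the actual Go type.
	if _, err := cache.DefineTokenizer("broken2", map[string]interface{}{
		"type": 42,
	}); err != nil {
		fmt.Println(err)
	}
}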
View File

@ -38,7 +38,7 @@ func (c TokenizerCache) TokenizerNamed(name string, cache *Cache) (analysis.Toke
}
tokenizer, err := tokenizerConstructor(nil, cache)
if err != nil {
return nil, fmt.Errorf("error building tokenizer: %v", err)
return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
}
c[name] = tokenizer
return tokenizer, nil
@ -55,7 +55,7 @@ func (c TokenizerCache) DefineTokenizer(name string, typ string, config map[stri
}
tokenizer, err := tokenizerConstructor(config, cache)
if err != nil {
return nil, fmt.Errorf("error building tokenizer: %v", err)
return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
}
c[name] = tokenizer
return tokenizer, nil
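A sketch tying this to the exception tokenizer fix above (not part of the commit): asking the cache for a tokenizer whose constructor rejects a nil config now produces an error that names the tokenizer being built.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis/tokenizers/exception"
	"github.com/blevesearch/bleve/registry"
)

func main() {
	cache := registry.NewCache()

	// The exception tokenizer cannot be built without at least one pattern,
	// so looking it up by name fails; the wrapped error reads something like
	//   error building tokenizer 'exception': no pattern found in 'exceptions' property
	if _, err := cache.TokenizerNamed(exception.Name); err != nil {
		fmt.Println(err)
	}
}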