
Merge pull request #262 from pmezard/index-and-tokenizer-doc-and-fix

Index and tokenizer doc and fix
Marty Schoch 2015-11-02 11:51:21 -05:00
commit 4791625b9b
6 changed files with 48 additions and 13 deletions

View File

@ -7,6 +7,18 @@
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Package exception implements a Tokenizer which extracts pieces matched by a
// regular expression from the input data, delegates the rest to another
// tokenizer, then inserts the extracted parts back into the token stream. Use it to
// preserve sequences which a regular tokenizer would alter or remove.
//
// Its constructor takes the following arguments:
//
// "exceptions" ([]string): one or more Go regular expressions matching the
// sequences to preserve. Multiple expressions are combined with "|".
//
// "tokenizer" (string): the name of the tokenizer processing the data not
// matched by "exceptions".
package exception
import (
@ -99,6 +111,9 @@ func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *regist
if ok {
exceptions = append(exceptions, aexceptions...)
}
if len(exceptions) == 0 {
return nil, fmt.Errorf("no pattern found in 'exception' property")
}
exceptionPattern := strings.Join(exceptions, "|")
r, err := regexp.Compile(exceptionPattern)
if err != nil {
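
Taken together, the new package comment and the empty-pattern check describe how the exception tokenizer is configured. The fragment below is a sketch only, assuming a *registry.Cache named cache and a delegate tokenizer registered as "unicode"; the tokenizer name "url_aware" and the URL pattern are illustrative, not part of this commit.

// Sketch: configure an exception tokenizer as described by the new package
// comment. "unicode", "url_aware" and the pattern are assumptions.
config := map[string]interface{}{
	"type":       exception.Name,
	"exceptions": []string{`https?://\S+`}, // sequences to keep as single tokens
	"tokenizer":  "unicode",                // tokenizes the remaining text
}
tokenizer, err := cache.DefineTokenizer("url_aware", config)
if err != nil {
	// An empty "exceptions" list is now rejected by the constructor.
	return err
}
_ = tokenizer
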

View File

@ -54,6 +54,8 @@ func (t *Token) String() string {
type TokenStream []*Token
// A Tokenizer splits an input string into tokens, the usual behaviour being to
// map words to tokens.
type Tokenizer interface {
Tokenize([]byte) TokenStream
}
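
The new comment pins down what a Tokenizer is expected to do. For illustration only, a minimal implementation satisfying the interface might look like the sketch below; the analysis.Token fields used here (Term, Position, Start, End) are assumptions about the surrounding package and are not shown in this diff.

// Sketch of a trivial Tokenizer: it emits the whole input as a single token.
type wholeInputTokenizer struct{}

func (t wholeInputTokenizer) Tokenize(input []byte) analysis.TokenStream {
	return analysis.TokenStream{
		&analysis.Token{
			Term:     input,
			Position: 1,
			Start:    0,
			End:      len(input),
		},
	}
}
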

View File

@ -48,8 +48,10 @@ type IndexReader interface {
TermFieldReader(term []byte, field string) (TermFieldReader, error)
// DocIDReader returns an iterator over documents whose identifiers are
// greater than or equal to start and smaller than end. Set start to the
// empty string to iterate from the first document, and end to the empty
// string to iterate to the last one.
// The caller must close the returned instance to release associated resources.
DocIDReader(start, end string) (DocIDReader, error)
FieldDict(field string) (FieldDict, error)
@ -88,9 +90,19 @@ type TermFieldDoc struct {
Vectors []*TermFieldVector
}
// TermFieldReader is the interface exposing the enumeration of documents
// containing a given term in a given field. Documents are returned in byte
// lexicographic order over their identifiers.
type TermFieldReader interface {
// Next returns the next document containing the term in this field, or nil
// when it reaches the end of the enumeration.
Next() (*TermFieldDoc, error)
// Advance resets the enumeration at the specified document or its immediate
// follower.
Advance(ID string) (*TermFieldDoc, error)
// Count returns the number of documents containing the term in this field.
Count() uint64
Close() error
}
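
The newly documented contract (Next returns nil at the end of the enumeration, IDs arrive in byte lexicographic order) suggests the usual enumeration loop. A sketch, assuming an IndexReader value named reader obtained elsewhere; the term, the field name, and the ID field on TermFieldDoc are assumptions for illustration.

// Sketch: list every document containing a term in a field, relying on the
// contract documented above.
tfr, err := reader.TermFieldReader([]byte("bleve"), "description")
if err != nil {
	return err
}
defer tfr.Close()

fmt.Printf("%d matching documents\n", tfr.Count())
for {
	tfd, err := tfr.Next()
	if err != nil {
		return err
	}
	if tfd == nil {
		break // end of the enumeration
	}
	fmt.Println(tfd.ID) // IDs arrive in byte lexicographic order
}
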

View File

@ -256,8 +256,9 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
}
tokDepsL1 := map[string]interface{}{
"type": exception.Name,
"tokenizer": "a",
"exceptions": []string{".*"},
}
// this tests a 1-level dependency
@ -280,8 +281,9 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
}
tokDepsL2 := map[string]interface{}{
"type": "exception",
"tokenizer": "b",
"exceptions": []string{".*"},
}
// now test a second-level dependency

View File

@ -61,11 +61,15 @@ func NewCache() *Cache {
}
func typeFromConfig(config map[string]interface{}) (string, error) {
prop, ok := config["type"]
if !ok {
return "", fmt.Errorf("'type' property is not defined")
}
typ, ok := prop.(string)
if !ok {
return "", fmt.Errorf("'type' property must be a string, not %T", prop)
}
return typ, nil
}
func (c *Cache) CharFilterNamed(name string) (analysis.CharFilter, error) {
@ -87,7 +91,7 @@ func (c *Cache) TokenizerNamed(name string) (analysis.Tokenizer, error) {
func (c *Cache) DefineTokenizer(name string, config map[string]interface{}) (analysis.Tokenizer, error) {
typ, err := typeFromConfig(config)
if err != nil {
return nil, fmt.Errorf("cannot resolve '%s' tokenizer type: %s", name, err)
}
return c.Tokenizers.DefineTokenizer(name, typ, config, c)
}
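
The stricter typeFromConfig and the wrapped error in DefineTokenizer make misconfigurations easier to diagnose. A sketch of what a caller now sees, again assuming a *registry.Cache named cache; the tokenizer name "broken" and its config are illustrative.

// Sketch: defining a tokenizer whose config lacks a "type" key.
_, err := cache.DefineTokenizer("broken", map[string]interface{}{
	"tokenizer": "unicode", // note: no "type" property
})
if err != nil {
	// The error now names both the tokenizer and the root cause, roughly:
	// cannot resolve 'broken' tokenizer type: 'type' property is not defined
	fmt.Println(err)
}
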

View File

@ -38,7 +38,7 @@ func (c TokenizerCache) TokenizerNamed(name string, cache *Cache) (analysis.Toke
}
tokenizer, err := tokenizerConstructor(nil, cache)
if err != nil {
return nil, fmt.Errorf("error building tokenizer: %v", err) return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
}
c[name] = tokenizer
return tokenizer, nil
@ -55,7 +55,7 @@ func (c TokenizerCache) DefineTokenizer(name string, typ string, config map[stri
}
tokenizer, err := tokenizerConstructor(config, cache)
if err != nil {
return nil, fmt.Errorf("error building tokenizer: %v", err) return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
}
c[name] = tokenizer
return tokenizer, nil