Merge pull request #262 from pmezard/index-and-tokenizer-doc-and-fix
Index and tokenizer doc and fix
This commit is contained in:
commit
4791625b9b
|
@ -7,6 +7,18 @@
|
||||||
// either express or implied. See the License for the specific language governing permissions
|
// either express or implied. See the License for the specific language governing permissions
|
||||||
// and limitations under the License.
|
// and limitations under the License.
|
||||||
|
|
||||||
|
// Package exception implements a Tokenizer which extracts pieces matched by a
|
||||||
|
// regular expression from the input data, delegates the rest to another
|
||||||
|
// tokenizer, then inserts the extracted parts back into the token stream. Use it to
|
||||||
|
// preserve sequences which a regular tokenizer would alter or remove.
|
||||||
|
//
|
||||||
|
// Its constructor takes the following arguments:
|
||||||
|
//
|
||||||
|
// "exceptions" ([]string): one or more Go regular expressions matching the
|
||||||
|
// sequence to preserve. Multiple expressions are combined with "|".
|
||||||
|
//
|
||||||
|
// "tokenizer" (string): the name of the tokenizer processing the data not
|
||||||
|
// matched by "exceptions".
|
||||||
package exception
|
package exception
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
@ -99,6 +111,9 @@ func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *regist
|
||||||
if ok {
|
if ok {
|
||||||
exceptions = append(exceptions, aexceptions...)
|
exceptions = append(exceptions, aexceptions...)
|
||||||
}
|
}
|
||||||
|
if len(exceptions) == 0 {
|
||||||
|
return nil, fmt.Errorf("no pattern found in 'exception' property")
|
||||||
|
}
|
||||||
exceptionPattern := strings.Join(exceptions, "|")
|
exceptionPattern := strings.Join(exceptions, "|")
|
||||||
r, err := regexp.Compile(exceptionPattern)
|
r, err := regexp.Compile(exceptionPattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -54,6 +54,8 @@ func (t *Token) String() string {
|
||||||
|
|
||||||
type TokenStream []*Token
|
type TokenStream []*Token
|
||||||
|
|
||||||
|
// A Tokenizer splits an input string into tokens, the usual behaviour being to
|
||||||
|
// map words to tokens.
|
||||||
type Tokenizer interface {
|
type Tokenizer interface {
|
||||||
Tokenize([]byte) TokenStream
|
Tokenize([]byte) TokenStream
|
||||||
}
|
}
|
||||||
|
|
|
@ -48,8 +48,10 @@ type IndexReader interface {
|
||||||
TermFieldReader(term []byte, field string) (TermFieldReader, error)
|
TermFieldReader(term []byte, field string) (TermFieldReader, error)
|
||||||
|
|
||||||
// DocIDReader returns an iterator over documents whose identifiers are
|
// DocIDReader returns an iterator over documents whose identifiers are
|
||||||
// greater than or equal to start and smaller than end. The caller must
|
// greater than or equal to start and smaller than end. Set start to the
|
||||||
// close returned instance to release associated resources.
|
// empty string to iterate from the first document, end to the empty string
|
||||||
|
// to iterate to the last one.
|
||||||
|
// The caller must close the returned instance to release associated resources.
|
||||||
DocIDReader(start, end string) (DocIDReader, error)
|
DocIDReader(start, end string) (DocIDReader, error)
|
||||||
|
|
||||||
FieldDict(field string) (FieldDict, error)
|
FieldDict(field string) (FieldDict, error)
|
||||||
|
@ -88,9 +90,19 @@ type TermFieldDoc struct {
|
||||||
Vectors []*TermFieldVector
|
Vectors []*TermFieldVector
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TermFieldReader is the interface exposing the enumeration of documents
|
||||||
|
// containing a given term in a given field. Documents are returned in byte
|
||||||
|
// lexicographic order over their identifiers.
|
||||||
type TermFieldReader interface {
|
type TermFieldReader interface {
|
||||||
|
// Next returns the next document containing the term in this field, or nil
|
||||||
|
// when it reaches the end of the enumeration.
|
||||||
Next() (*TermFieldDoc, error)
|
Next() (*TermFieldDoc, error)
|
||||||
|
|
||||||
|
// Advance resets the enumeration at the specified document or its immediate
|
||||||
|
// follower.
|
||||||
Advance(ID string) (*TermFieldDoc, error)
|
Advance(ID string) (*TermFieldDoc, error)
|
||||||
|
|
||||||
|
// Count returns the number of documents containing the term in this field.
|
||||||
Count() uint64
|
Count() uint64
|
||||||
Close() error
|
Close() error
|
||||||
}
|
}
|
||||||
|
|
|
@ -258,6 +258,7 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
|
||||||
tokDepsL1 := map[string]interface{}{
|
tokDepsL1 := map[string]interface{}{
|
||||||
"type": exception.Name,
|
"type": exception.Name,
|
||||||
"tokenizer": "a",
|
"tokenizer": "a",
|
||||||
|
"exceptions": []string{".*"},
|
||||||
}
|
}
|
||||||
|
|
||||||
// this tests a 1-level dependency
|
// this tests a 1-level dependency
|
||||||
|
@ -282,6 +283,7 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
|
||||||
tokDepsL2 := map[string]interface{}{
|
tokDepsL2 := map[string]interface{}{
|
||||||
"type": "exception",
|
"type": "exception",
|
||||||
"tokenizer": "b",
|
"tokenizer": "b",
|
||||||
|
"exceptions": []string{".*"},
|
||||||
}
|
}
|
||||||
|
|
||||||
// now test a second-level dependency
|
// now test a second-level dependency
|
||||||
|
|
|
@ -61,11 +61,15 @@ func NewCache() *Cache {
|
||||||
}
|
}
|
||||||
|
|
||||||
func typeFromConfig(config map[string]interface{}) (string, error) {
|
func typeFromConfig(config map[string]interface{}) (string, error) {
|
||||||
typ, ok := config["type"].(string)
|
prop, ok := config["type"]
|
||||||
if ok {
|
if !ok {
|
||||||
return typ, nil
|
return "", fmt.Errorf("'type' property is not defined")
|
||||||
}
|
}
|
||||||
return "", fmt.Errorf("unable to determine type")
|
typ, ok := prop.(string)
|
||||||
|
if !ok {
|
||||||
|
return "", fmt.Errorf("'type' property must be a string, not %T", prop)
|
||||||
|
}
|
||||||
|
return typ, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cache) CharFilterNamed(name string) (analysis.CharFilter, error) {
|
func (c *Cache) CharFilterNamed(name string) (analysis.CharFilter, error) {
|
||||||
|
@ -87,7 +91,7 @@ func (c *Cache) TokenizerNamed(name string) (analysis.Tokenizer, error) {
|
||||||
func (c *Cache) DefineTokenizer(name string, config map[string]interface{}) (analysis.Tokenizer, error) {
|
func (c *Cache) DefineTokenizer(name string, config map[string]interface{}) (analysis.Tokenizer, error) {
|
||||||
typ, err := typeFromConfig(config)
|
typ, err := typeFromConfig(config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, fmt.Errorf("cannot resolve '%s' tokenizer type: %s", name, err)
|
||||||
}
|
}
|
||||||
return c.Tokenizers.DefineTokenizer(name, typ, config, c)
|
return c.Tokenizers.DefineTokenizer(name, typ, config, c)
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ func (c TokenizerCache) TokenizerNamed(name string, cache *Cache) (analysis.Toke
|
||||||
}
|
}
|
||||||
tokenizer, err := tokenizerConstructor(nil, cache)
|
tokenizer, err := tokenizerConstructor(nil, cache)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("error building tokenizer: %v", err)
|
return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
|
||||||
}
|
}
|
||||||
c[name] = tokenizer
|
c[name] = tokenizer
|
||||||
return tokenizer, nil
|
return tokenizer, nil
|
||||||
|
@ -55,7 +55,7 @@ func (c TokenizerCache) DefineTokenizer(name string, typ string, config map[stri
|
||||||
}
|
}
|
||||||
tokenizer, err := tokenizerConstructor(config, cache)
|
tokenizer, err := tokenizerConstructor(config, cache)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("error building tokenizer: %v", err)
|
return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
|
||||||
}
|
}
|
||||||
c[name] = tokenizer
|
c[name] = tokenizer
|
||||||
return tokenizer, nil
|
return tokenizer, nil
|
||||||
|
|
Loading…
Reference in New Issue
Block a user