From 3df789d2580d46e61337e5005f6c9034cf739027 Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Fri, 23 Oct 2015 18:03:31 +0200
Subject: [PATCH 1/6] index: document empty strings behaviour when calling
 DocIDReader()

---
 index/index.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/index/index.go b/index/index.go
index b763ead7..03f2f340 100644
--- a/index/index.go
+++ b/index/index.go
@@ -48,8 +48,10 @@ type IndexReader interface {
 	TermFieldReader(term []byte, field string) (TermFieldReader, error)
 
 	// DocIDReader returns an iterator over documents which identifiers are
-	// greater than or equal to start and smaller than end. The caller must
-	// close returned instance to release associated resources.
+	// greater than or equal to start and smaller than end. Set start to the
+	// empty string to iterate from the first document, and end to the empty
+	// string to iterate to the last one.
+	// The caller must close the returned instance to release associated resources.
 	DocIDReader(start, end string) (DocIDReader, error)
 
 	FieldDict(field string) (FieldDict, error)

From f2b3d5698e7434334328c3c8dbafb518d78e676e Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 14:44:28 +0100
Subject: [PATCH 2/6] index: document TermFieldReader interface

---
 index/index.go | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/index/index.go b/index/index.go
index 03f2f340..04626e4b 100644
--- a/index/index.go
+++ b/index/index.go
@@ -90,9 +90,19 @@ type TermFieldDoc struct {
 	Vectors []*TermFieldVector
 }
 
+// TermFieldReader is the interface exposing the enumeration of documents
+// containing a given term in a given field. Documents are returned in byte
+// lexicographic order over their identifiers.
 type TermFieldReader interface {
+	// Next returns the next document containing the term in this field, or
+	// nil when it reaches the end of the enumeration.
 	Next() (*TermFieldDoc, error)
+
+	// Advance resets the enumeration to the specified document or its
+	// immediate follower.
 	Advance(ID string) (*TermFieldDoc, error)
+
+	// Count returns the number of documents containing the term in this field.
 	Count() uint64
 	Close() error
 }
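
Taken together, the two patches above pin down the iteration contract of
IndexReader. The following sketch (not part of the patch series) shows how the
documented calls fit together; it assumes an already-open index.IndexReader,
an illustrative term/field pair, and that DocIDReader's Next, mirroring
TermFieldReader's nil, returns an empty identifier once the range is
exhausted:

    package example

    import (
    	"fmt"

    	"github.com/blevesearch/bleve/index"
    )

    // enumerate walks the postings of one term/field pair, then every
    // document identifier in the index. The term "bleve" and the field
    // "description" are illustrative.
    func enumerate(reader index.IndexReader) error {
    	tfr, err := reader.TermFieldReader([]byte("bleve"), "description")
    	if err != nil {
    		return err
    	}
    	defer tfr.Close() // release associated resources

    	fmt.Printf("%d documents contain the term\n", tfr.Count())
    	for {
    		tfd, err := tfr.Next()
    		if err != nil {
    			return err
    		}
    		if tfd == nil { // end of the enumeration
    			break
    		}
    		fmt.Println(tfd.ID) // byte lexicographic order over identifiers
    	}

    	docIDs, err := reader.DocIDReader("", "") // empty bounds: first to last
    	if err != nil {
    		return err
    	}
    	defer docIDs.Close()
    	for {
    		id, err := docIDs.Next()
    		if err != nil {
    			return err
    		}
    		if id == "" { // assumed end-of-range sentinel
    			break
    		}
    		fmt.Println(id)
    	}
    	return nil
    }

Advance can be used instead of Next on either reader to skip directly to a
given identifier (or its immediate follower).
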
From 8b17787a6500c426237ad327e1c476bf87461daf Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 16:51:54 +0100
Subject: [PATCH 3/6] analysis: document "exception" tokenizer, and Tokenizer
 interface

---
 analysis/tokenizers/exception/exception.go | 12 ++++++++++++
 analysis/type.go                           |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/analysis/tokenizers/exception/exception.go b/analysis/tokenizers/exception/exception.go
index 0000f4c6..51da89ac 100644
--- a/analysis/tokenizers/exception/exception.go
+++ b/analysis/tokenizers/exception/exception.go
@@ -7,6 +7,18 @@
 // either express or implied. See the License for the specific language governing permissions
 // and limitations under the License.
 
+// Package exception implements a Tokenizer which extracts pieces matched by a
+// regular expression from the input data, delegates the rest to another
+// tokenizer, then inserts the extracted parts back into the token stream. Use
+// it to preserve sequences which a regular tokenizer would alter or remove.
+//
+// Its constructor takes the following arguments:
+//
+// "exceptions" ([]string): one or more Go regular expressions matching the
+// sequences to preserve. Multiple expressions are combined with "|".
+//
+// "tokenizer" (string): the name of the tokenizer processing the data not
+// matched by "exceptions".
 package exception
 
 import (
diff --git a/analysis/type.go b/analysis/type.go
index 13759ec8..a8feeabd 100644
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -54,6 +54,8 @@ func (t *Token) String() string {
 
 type TokenStream []*Token
 
+// A Tokenizer splits an input string into tokens, the usual behaviour being
+// to map words to tokens.
 type Tokenizer interface {
 	Tokenize([]byte) TokenStream
 }
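
For illustration only (not part of the patch), here is how the documented
constructor arguments could be exercised end to end. The registry cache usage
is taken from the patches below; the "unicode" base tokenizer and the
Twitter-handle pattern are assumptions of this sketch:

    package main

    import (
    	"fmt"

    	"github.com/blevesearch/bleve/analysis/tokenizers/exception"
    	_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode" // registers "unicode"
    	"github.com/blevesearch/bleve/registry"
    )

    func main() {
    	cache := registry.NewCache()
    	// "exceptions" patterns are kept whole; everything else goes through
    	// the tokenizer named by the "tokenizer" property.
    	tokenizer, err := cache.DefineTokenizer("handle_aware", map[string]interface{}{
    		"type":       exception.Name,
    		"tokenizer":  "unicode",
    		"exceptions": []string{`@\w+`},
    	})
    	if err != nil {
    		panic(err)
    	}
    	// "@pmezard" should survive as a single token instead of being
    	// split and stripped down to "pmezard".
    	for _, token := range tokenizer.Tokenize([]byte("ping @pmezard about bleve")) {
    		fmt.Println(token)
    	}
    }
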
From 54a85fa96a35dc099b469aeb0cead58b4c61cd21 Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 16:52:26 +0100
Subject: [PATCH 4/6] registry: improve error message upon forgotten "type"
 property

Registering a custom tokenizer while forgetting its "type" used to
return:

  error: unable to determine type

It now says:

  error: cannot resolve 'foo' tokenizer type: 'type' property is not defined
---
 registry/registry.go | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/registry/registry.go b/registry/registry.go
index d9d254ef..cdbe1916 100644
--- a/registry/registry.go
+++ b/registry/registry.go
@@ -61,11 +61,15 @@ func NewCache() *Cache {
 }
 
 func typeFromConfig(config map[string]interface{}) (string, error) {
-	typ, ok := config["type"].(string)
-	if ok {
-		return typ, nil
+	prop, ok := config["type"]
+	if !ok {
+		return "", fmt.Errorf("'type' property is not defined")
 	}
-	return "", fmt.Errorf("unable to determine type")
+	typ, ok := prop.(string)
+	if !ok {
+		return "", fmt.Errorf("'type' property must be a string, not %T", prop)
+	}
+	return typ, nil
 }
 
 func (c *Cache) CharFilterNamed(name string) (analysis.CharFilter, error) {
@@ -87,7 +91,7 @@ func (c *Cache) TokenizerNamed(name string) (analysis.Tokenizer, error) {
 func (c *Cache) DefineTokenizer(name string, config map[string]interface{}) (analysis.Tokenizer, error) {
 	typ, err := typeFromConfig(config)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("cannot resolve '%s' tokenizer type: %s", name, err)
 	}
 	return c.Tokenizers.DefineTokenizer(name, typ, config, c)
 }

From f95f1d29a0b06270b6caf83ffc9fda21c425a2bb Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 17:42:02 +0100
Subject: [PATCH 5/6] exception: fail if pattern is empty, name tokenizer in
 error

---
 analysis/tokenizers/exception/exception.go | 3 +++
 registry/tokenizer.go                      | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/analysis/tokenizers/exception/exception.go b/analysis/tokenizers/exception/exception.go
index 51da89ac..63f9897c 100644
--- a/analysis/tokenizers/exception/exception.go
+++ b/analysis/tokenizers/exception/exception.go
@@ -111,6 +111,9 @@ func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *regist
 	if ok {
 		exceptions = append(exceptions, aexceptions...)
 	}
+	if len(exceptions) == 0 {
+		return nil, fmt.Errorf("no pattern found in 'exceptions' property")
+	}
 	exceptionPattern := strings.Join(exceptions, "|")
 	r, err := regexp.Compile(exceptionPattern)
 	if err != nil {
diff --git a/registry/tokenizer.go b/registry/tokenizer.go
index efd8339a..24b5bf93 100644
--- a/registry/tokenizer.go
+++ b/registry/tokenizer.go
@@ -38,7 +38,7 @@ func (c TokenizerCache) TokenizerNamed(name string, cache *Cache) (analysis.Toke
 	}
 	tokenizer, err := tokenizerConstructor(nil, cache)
 	if err != nil {
-		return nil, fmt.Errorf("error building tokenizer: %v", err)
+		return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
 	}
 	c[name] = tokenizer
 	return tokenizer, nil
@@ -55,7 +55,7 @@ func (c TokenizerCache) DefineTokenizer(name string, typ string, config map[stri
 	}
 	tokenizer, err := tokenizerConstructor(config, cache)
 	if err != nil {
-		return nil, fmt.Errorf("error building tokenizer: %v", err)
+		return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
 	}
 	c[name] = tokenizer
 	return tokenizer, nil

From 0579d58263a0a531dff3465f7cab03d90c3836ff Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Thu, 29 Oct 2015 19:57:12 +0100
Subject: [PATCH 6/6] mapping_test: fix TestMappingWithTokenizerDeps now that
 patterns are required

---
 mapping_test.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mapping_test.go b/mapping_test.go
index 63cab4de..0cd7adb4 100644
--- a/mapping_test.go
+++ b/mapping_test.go
@@ -256,8 +256,9 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
 	}
 
 	tokDepsL1 := map[string]interface{}{
-		"type":      exception.Name,
-		"tokenizer": "a",
+		"type":       exception.Name,
+		"tokenizer":  "a",
+		"exceptions": []string{".*"},
 	}
 
 	// this tests a 1-level dependency
@@ -280,8 +281,9 @@
 	}
 
 	tokDepsL2 := map[string]interface{}{
-		"type":      "exception",
-		"tokenizer": "b",
+		"type":       "exception",
+		"tokenizer":  "b",
+		"exceptions": []string{".*"},
 	}
 
 	// now test a second-level dependency
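
A closing sketch of the failure modes patches 4 and 5 address; the tokenizer
name "foo" is hypothetical and the expected messages in the comments are
quoted from the patches above, so the exact wording is an assumption of this
sketch:

    package main

    import (
    	"fmt"

    	"github.com/blevesearch/bleve/analysis/tokenizers/exception"
    	"github.com/blevesearch/bleve/registry"
    )

    func main() {
    	cache := registry.NewCache()

    	// Forgotten "type" property: patch 4 now names both the missing
    	// property and the offending tokenizer.
    	_, err := cache.DefineTokenizer("foo", map[string]interface{}{
    		"tokenizer": "unicode",
    	})
    	fmt.Println(err)
    	// cannot resolve 'foo' tokenizer type: 'type' property is not defined

    	// Empty pattern list: patch 5 rejects it in the constructor, and the
    	// registry wraps the error with the tokenizer name.
    	_, err = cache.DefineTokenizer("foo", map[string]interface{}{
    		"type":      exception.Name,
    		"tokenizer": "unicode",
    	})
    	fmt.Println(err)
    	// error building tokenizer 'foo': no pattern found in 'exceptions' property
    }
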