From 3df789d2580d46e61337e5005f6c9034cf739027 Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Fri, 23 Oct 2015 18:03:31 +0200
Subject: [PATCH 1/6] index: document empty strings behaviour when calling
 DocIDReader()

---
 index/index.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/index/index.go b/index/index.go
index b763ead7..03f2f340 100644
--- a/index/index.go
+++ b/index/index.go
@@ -48,8 +48,10 @@ type IndexReader interface {
 	TermFieldReader(term []byte, field string) (TermFieldReader, error)
 
 	// DocIDReader returns an iterator over documents which identifiers are
-	// greater than or equal to start and smaller than end. The caller must
-	// close returned instance to release associated resources.
+	// greater than or equal to start and smaller than end. Set start to the
+	// empty string to iterate from the first document, and end to the empty
+	// string to iterate to the last one.
+	// The caller must close the returned instance to release associated resources.
 	DocIDReader(start, end string) (DocIDReader, error)
 
 	FieldDict(field string) (FieldDict, error)

From f2b3d5698e7434334328c3c8dbafb518d78e676e Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 14:44:28 +0100
Subject: [PATCH 2/6] index: document TermFieldReader interface

---
 index/index.go | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/index/index.go b/index/index.go
index 03f2f340..04626e4b 100644
--- a/index/index.go
+++ b/index/index.go
@@ -90,9 +90,19 @@ type TermFieldDoc struct {
 	Vectors []*TermFieldVector
 }
 
+// TermFieldReader is the interface exposing the enumeration of documents
+// containing a given term in a given field. Documents are returned in byte
+// lexicographic order over their identifiers.
 type TermFieldReader interface {
+	// Next returns the next document containing the term in this field, or
+	// nil when it reaches the end of the enumeration.
 	Next() (*TermFieldDoc, error)
+
+	// Advance resets the enumeration to the specified document or its
+	// immediate follower.
 	Advance(ID string) (*TermFieldDoc, error)
+
+	// Count returns the number of documents containing the term in this field.
 	Count() uint64
 	Close() error
 }
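
Taken together, the two patches above pin down the iteration contract of
IndexReader. The following sketch (not part of the patch series) shows how the
documented calls fit together; it assumes an already-open index.IndexReader,
an illustrative term/field pair, and that DocIDReader's Next, mirroring
TermFieldReader's nil, returns an empty identifier once the range is
exhausted:

    package example

    import (
    	"fmt"

    	"github.com/blevesearch/bleve/index"
    )

    // enumerate walks the postings of one term/field pair, then every
    // document identifier in the index. The term "bleve" and the field
    // "description" are illustrative.
    func enumerate(reader index.IndexReader) error {
    	tfr, err := reader.TermFieldReader([]byte("bleve"), "description")
    	if err != nil {
    		return err
    	}
    	defer tfr.Close() // release associated resources

    	fmt.Printf("%d documents contain the term\n", tfr.Count())
    	for {
    		tfd, err := tfr.Next()
    		if err != nil {
    			return err
    		}
    		if tfd == nil { // end of the enumeration
    			break
    		}
    		fmt.Println(tfd.ID) // byte lexicographic order over identifiers
    	}

    	docIDs, err := reader.DocIDReader("", "") // empty bounds: first to last
    	if err != nil {
    		return err
    	}
    	defer docIDs.Close()
    	for {
    		id, err := docIDs.Next()
    		if err != nil {
    			return err
    		}
    		if id == "" { // assumed end-of-range sentinel
    			break
    		}
    		fmt.Println(id)
    	}
    	return nil
    }

Advance can be used instead of Next on either reader to skip directly to a
given identifier (or its immediate follower).
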
From 8b17787a6500c426237ad327e1c476bf87461daf Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 16:51:54 +0100
Subject: [PATCH 3/6] analysis: document "exception" tokenizer, and Tokenizer
 interface

---
 analysis/tokenizers/exception/exception.go | 12 ++++++++++++
 analysis/type.go                           |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/analysis/tokenizers/exception/exception.go b/analysis/tokenizers/exception/exception.go
index 0000f4c6..51da89ac 100644
--- a/analysis/tokenizers/exception/exception.go
+++ b/analysis/tokenizers/exception/exception.go
@@ -7,6 +7,18 @@
 // either express or implied. See the License for the specific language governing permissions
 // and limitations under the License.
 
+// Package exception implements a Tokenizer which extracts pieces matched by a
+// regular expression from the input data, delegates the rest to another
+// tokenizer, then inserts the extracted parts back into the token stream. Use
+// it to preserve sequences which a regular tokenizer would alter or remove.
+//
+// Its constructor takes the following arguments:
+//
+// "exceptions" ([]string): one or more Go regular expressions matching the
+// sequences to preserve. Multiple expressions are combined with "|".
+//
+// "tokenizer" (string): the name of the tokenizer processing the data not
+// matched by "exceptions".
 package exception
 
 import (
diff --git a/analysis/type.go b/analysis/type.go
index 13759ec8..a8feeabd 100644
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -54,6 +54,8 @@ func (t *Token) String() string {
 
 type TokenStream []*Token
 
+// A Tokenizer splits an input string into tokens, the usual behaviour being
+// to map words to tokens.
 type Tokenizer interface {
 	Tokenize([]byte) TokenStream
 }
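
For illustration only (not part of the patch), here is how the documented
constructor arguments could be exercised end to end. The registry cache usage
is taken from the patches below; the "unicode" base tokenizer and the
Twitter-handle pattern are assumptions of this sketch:

    package main

    import (
    	"fmt"

    	"github.com/blevesearch/bleve/analysis/tokenizers/exception"
    	_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode" // registers "unicode"
    	"github.com/blevesearch/bleve/registry"
    )

    func main() {
    	cache := registry.NewCache()
    	// "exceptions" patterns are kept whole; everything else goes through
    	// the tokenizer named by the "tokenizer" property.
    	tokenizer, err := cache.DefineTokenizer("handle_aware", map[string]interface{}{
    		"type":       exception.Name,
    		"tokenizer":  "unicode",
    		"exceptions": []string{`@\w+`},
    	})
    	if err != nil {
    		panic(err)
    	}
    	// "@pmezard" should survive as a single token instead of being
    	// split and stripped down to "pmezard".
    	for _, token := range tokenizer.Tokenize([]byte("ping @pmezard about bleve")) {
    		fmt.Println(token)
    	}
    }
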
From 54a85fa96a35dc099b469aeb0cead58b4c61cd21 Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 16:52:26 +0100
Subject: [PATCH 4/6] registry: improve error message upon forgotten "type"
 property

Registering a custom tokenizer while forgetting its "type" used to
return:

  error: unable to determine type

It now says:

  error: cannot resolve 'foo' tokenizer type: 'type' property is not defined
---
 registry/registry.go | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/registry/registry.go b/registry/registry.go
index d9d254ef..cdbe1916 100644
--- a/registry/registry.go
+++ b/registry/registry.go
@@ -61,11 +61,15 @@ func NewCache() *Cache {
 }
 
 func typeFromConfig(config map[string]interface{}) (string, error) {
-	typ, ok := config["type"].(string)
-	if ok {
-		return typ, nil
+	prop, ok := config["type"]
+	if !ok {
+		return "", fmt.Errorf("'type' property is not defined")
 	}
-	return "", fmt.Errorf("unable to determine type")
+	typ, ok := prop.(string)
+	if !ok {
+		return "", fmt.Errorf("'type' property must be a string, not %T", prop)
+	}
+	return typ, nil
 }
 
 func (c *Cache) CharFilterNamed(name string) (analysis.CharFilter, error) {
@@ -87,7 +91,7 @@ func (c *Cache) TokenizerNamed(name string) (analysis.Tokenizer, error) {
 func (c *Cache) DefineTokenizer(name string, config map[string]interface{}) (analysis.Tokenizer, error) {
 	typ, err := typeFromConfig(config)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("cannot resolve '%s' tokenizer type: %s", name, err)
 	}
 	return c.Tokenizers.DefineTokenizer(name, typ, config, c)
 }

From f95f1d29a0b06270b6caf83ffc9fda21c425a2bb Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Tue, 27 Oct 2015 17:42:02 +0100
Subject: [PATCH 5/6] exception: fail if pattern is empty, name tokenizer in
 error

---
 analysis/tokenizers/exception/exception.go | 3 +++
 registry/tokenizer.go                      | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/analysis/tokenizers/exception/exception.go b/analysis/tokenizers/exception/exception.go
index 51da89ac..63f9897c 100644
--- a/analysis/tokenizers/exception/exception.go
+++ b/analysis/tokenizers/exception/exception.go
@@ -111,6 +111,9 @@ func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *regist
 	if ok {
 		exceptions = append(exceptions, aexceptions...)
 	}
+	if len(exceptions) == 0 {
+		return nil, fmt.Errorf("no pattern found in 'exceptions' property")
+	}
 	exceptionPattern := strings.Join(exceptions, "|")
 	r, err := regexp.Compile(exceptionPattern)
 	if err != nil {
diff --git a/registry/tokenizer.go b/registry/tokenizer.go
index efd8339a..24b5bf93 100644
--- a/registry/tokenizer.go
+++ b/registry/tokenizer.go
@@ -38,7 +38,7 @@ func (c TokenizerCache) TokenizerNamed(name string, cache *Cache) (analysis.Toke
 	}
 	tokenizer, err := tokenizerConstructor(nil, cache)
 	if err != nil {
-		return nil, fmt.Errorf("error building tokenizer: %v", err)
+		return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
 	}
 	c[name] = tokenizer
 	return tokenizer, nil
@@ -55,7 +55,7 @@ func (c TokenizerCache) DefineTokenizer(name string, typ string, config map[stri
 	}
 	tokenizer, err := tokenizerConstructor(config, cache)
 	if err != nil {
-		return nil, fmt.Errorf("error building tokenizer: %v", err)
+		return nil, fmt.Errorf("error building tokenizer '%s': %v", name, err)
 	}
 	c[name] = tokenizer
 	return tokenizer, nil

From 0579d58263a0a531dff3465f7cab03d90c3836ff Mon Sep 17 00:00:00 2001
From: Patrick Mezard
Date: Thu, 29 Oct 2015 19:57:12 +0100
Subject: [PATCH 6/6] mapping_test: fix TestMappingWithTokenizerDeps now that
 patterns are required

---
 mapping_test.go | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mapping_test.go b/mapping_test.go
index 63cab4de..0cd7adb4 100644
--- a/mapping_test.go
+++ b/mapping_test.go
@@ -256,8 +256,9 @@ func TestMappingWithTokenizerDeps(t *testing.T) {
 	}
 
 	tokDepsL1 := map[string]interface{}{
-		"type":      exception.Name,
-		"tokenizer": "a",
+		"type":       exception.Name,
+		"tokenizer":  "a",
+		"exceptions": []string{".*"},
 	}
 
 	// this tests a 1-level dependency
@@ -280,8 +281,9 @@
 	}
 
 	tokDepsL2 := map[string]interface{}{
-		"type":      "exception",
-		"tokenizer": "b",
+		"type":       "exception",
+		"tokenizer":  "b",
+		"exceptions": []string{".*"},
 	}
 
 	// now test a second-level dependency
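
A closing sketch of the failure modes patches 4 and 5 address; the tokenizer
name "foo" is hypothetical and the expected messages in the comments are
quoted from the patches above, so the exact wording is an assumption of this
sketch:

    package main

    import (
    	"fmt"

    	"github.com/blevesearch/bleve/analysis/tokenizers/exception"
    	"github.com/blevesearch/bleve/registry"
    )

    func main() {
    	cache := registry.NewCache()

    	// Forgotten "type" property: patch 4 now names both the missing
    	// property and the offending tokenizer.
    	_, err := cache.DefineTokenizer("foo", map[string]interface{}{
    		"tokenizer": "unicode",
    	})
    	fmt.Println(err)
    	// cannot resolve 'foo' tokenizer type: 'type' property is not defined

    	// Empty pattern list: patch 5 rejects it in the constructor, and the
    	// registry wraps the error with the tokenizer name.
    	_, err = cache.DefineTokenizer("foo", map[string]interface{}{
    		"type":      exception.Name,
    		"tokenizer": "unicode",
    	})
    	fmt.Println(err)
    	// error building tokenizer 'foo': no pattern found in 'exceptions' property
    }
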