0
0

added ngram and edge ngram token filters

closes #46 and closes #47
This commit is contained in:
Marty Schoch 2014-08-06 22:11:42 -04:00
parent 9a777aaa80
commit c19270108c
4 changed files with 429 additions and 0 deletions

View File

@ -0,0 +1,91 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package edge_ngram_filter
import (
"bytes"
"unicode/utf8"
"github.com/couchbaselabs/bleve/analysis"
)
// Side selects which edge of a token edge-ngrams are anchored to.
type Side bool

const (
	// BACK anchors ngrams at the end of the token.
	BACK Side = true
	// FRONT anchors ngrams at the start of the token.
	FRONT Side = false
)

// EdgeNgramFilter emits ngrams anchored at one edge of each input token,
// for every length between minLength and maxLength (counted in runes).
type EdgeNgramFilter struct {
	back      Side
	minLength int
	maxLength int
}

// NewEdgeNgramFilter returns an EdgeNgramFilter that produces ngrams of
// minLength up to maxLength runes, anchored at the given side.
func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
	f := EdgeNgramFilter{
		back:      side,
		minLength: minLength,
		maxLength: maxLength,
	}
	return &f
}
// Filter emits edge ngrams for each token in the input stream. For each
// token it produces one ngram per size in [minLength, maxLength] that fits
// within the token, taken from the front or the back depending on how the
// filter was configured. Emitted tokens copy the source token's Position,
// Start, End, and Type.
//
// Fix: the original back-side condition was `i-ngramSize > 0`, which
// silently dropped the ngram whose length equals the whole token — the
// front side (`i+ngramSize <= runeCount`) did emit it. Both sides now
// agree and emit ngrams up to and including the full token length.
func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	for _, token := range input {
		runeCount := utf8.RuneCount(token.Term)
		runes := bytes.Runes(token.Term)
		for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
			// skip sizes the token is too short to provide
			if ngramSize > runeCount {
				continue
			}
			var ngramRunes []rune
			if s.back {
				ngramRunes = runes[runeCount-ngramSize:]
			} else {
				ngramRunes = runes[:ngramSize]
			}
			ngramToken := analysis.Token{
				Position: token.Position,
				Start:    token.Start,
				End:      token.End,
				Type:     token.Type,
				Term:     buildTermFromRunes(ngramRunes),
			}
			rv = append(rv, &ngramToken)
		}
	}
	return rv
}
// buildTermFromRunes encodes the given runes back into a UTF-8 byte slice.
func buildTermFromRunes(runes []rune) []byte {
	term := make([]byte, 0, len(runes)*utf8.UTFMax)
	for _, r := range runes {
		var buf [utf8.UTFMax]byte
		n := utf8.EncodeRune(buf[:], r)
		term = append(term, buf[:n]...)
	}
	return term
}

View File

@ -0,0 +1,141 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package edge_ngram_filter
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
)
// TestEdgeNgramFilter exercises the filter from both sides, with single
// and ranged ngram sizes, and with multi-token input.
func TestEdgeNgramFilter(t *testing.T) {
	// stream builds a TokenStream with one bare token per term.
	stream := func(terms ...string) analysis.TokenStream {
		ts := make(analysis.TokenStream, 0, len(terms))
		for _, term := range terms {
			ts = append(ts, &analysis.Token{Term: []byte(term)})
		}
		return ts
	}

	testCases := []struct {
		side     Side
		min, max int
		input    analysis.TokenStream
		output   analysis.TokenStream
	}{
		{side: FRONT, min: 1, max: 1, input: stream("abcde"), output: stream("a")},
		{side: BACK, min: 1, max: 1, input: stream("abcde"), output: stream("e")},
		{side: FRONT, min: 1, max: 3, input: stream("abcde"), output: stream("a", "ab", "abc")},
		{side: BACK, min: 1, max: 3, input: stream("abcde"), output: stream("e", "de", "cde")},
		{
			side:   FRONT,
			min:    1,
			max:    3,
			input:  stream("abcde", "vwxyz"),
			output: stream("a", "ab", "abc", "v", "vw", "vwx"),
		},
	}

	for _, tc := range testCases {
		edgeNgramFilter := NewEdgeNgramFilter(tc.side, tc.min, tc.max)
		actual := edgeNgramFilter.Filter(tc.input)
		if !reflect.DeepEqual(actual, tc.output) {
			t.Errorf("expected %s, got %s", tc.output, actual)
		}
	}
}

View File

@ -0,0 +1,66 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ngram_filter
import (
"bytes"
"unicode/utf8"
"github.com/couchbaselabs/bleve/analysis"
)
// NgramFilter emits every ngram of each input token whose length (in
// runes) falls between minLength and maxLength, inclusive.
type NgramFilter struct {
	minLength int
	maxLength int
}

// NewNgramFilter returns an NgramFilter producing ngrams of minLength up
// to maxLength runes.
func NewNgramFilter(minLength, maxLength int) *NgramFilter {
	f := NgramFilter{
		minLength: minLength,
		maxLength: maxLength,
	}
	return &f
}
// Filter emits, for each token in the input stream, every ngram starting
// at each rune position with a size in [minLength, maxLength] that fits
// within the token. Emitted tokens copy the source token's Position,
// Start, End, and Type. Output order is by start position, then size.
func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	for _, token := range input {
		runeCount := utf8.RuneCount(token.Term)
		runes := bytes.Runes(token.Term)
		for start := 0; start < runeCount; start++ {
			for size := s.minLength; size <= s.maxLength; size++ {
				end := start + size
				// sizes are ascending, so once one overruns the
				// token, all larger sizes do too
				if end > runeCount {
					break
				}
				ngramToken := analysis.Token{
					Position: token.Position,
					Start:    token.Start,
					End:      token.End,
					Type:     token.Type,
					Term:     buildTermFromRunes(runes[start:end]),
				}
				rv = append(rv, &ngramToken)
			}
		}
	}
	return rv
}
// buildTermFromRunes encodes the given runes back into a UTF-8 byte slice.
func buildTermFromRunes(runes []rune) []byte {
	term := make([]byte, 0, len(runes)*utf8.UTFMax)
	for _, r := range runes {
		var buf [utf8.UTFMax]byte
		n := utf8.EncodeRune(buf[:], r)
		term = append(term, buf[:n]...)
	}
	return term
}

View File

@ -0,0 +1,131 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package ngram_filter
import (
"reflect"
"testing"
"github.com/couchbaselabs/bleve/analysis"
)
// TestNgramFilter exercises single-size and ranged ngram generation over
// a simple token, checking full output order (by position, then size).
func TestNgramFilter(t *testing.T) {
	// stream builds a TokenStream with one bare token per term.
	stream := func(terms ...string) analysis.TokenStream {
		ts := make(analysis.TokenStream, 0, len(terms))
		for _, term := range terms {
			ts = append(ts, &analysis.Token{Term: []byte(term)})
		}
		return ts
	}

	testCases := []struct {
		min, max int
		input    analysis.TokenStream
		output   analysis.TokenStream
	}{
		{min: 1, max: 1, input: stream("abcde"), output: stream("a", "b", "c", "d", "e")},
		{min: 2, max: 2, input: stream("abcde"), output: stream("ab", "bc", "cd", "de")},
		{
			min:   1,
			max:   3,
			input: stream("abcde"),
			output: stream(
				"a", "ab", "abc",
				"b", "bc", "bcd",
				"c", "cd", "cde",
				"d", "de",
				"e",
			),
		},
	}

	for _, tc := range testCases {
		ngramFilter := NewNgramFilter(tc.min, tc.max)
		actual := ngramFilter.Filter(tc.input)
		if !reflect.DeepEqual(actual, tc.output) {
			t.Errorf("expected %s, got %s", tc.output, actual)
		}
	}
}