0
0
Fork 0

make shingle token filter stateless

the previous implementation was incorrectly stateful, which
violates the contract for token filters

fixes #431
This commit is contained in:
Marty Schoch 2016-09-15 08:59:43 -04:00
parent ffee3c3764
commit c5159251a9
2 changed files with 96 additions and 17 deletions

View File

@ -16,8 +16,6 @@ type ShingleFilter struct {
outputOriginal bool
tokenSeparator string
fill string
ring *ring.Ring
itemsInRing int
}
func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
@ -27,13 +25,14 @@ func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *Shin
outputOriginal: outputOriginal,
tokenSeparator: sep,
fill: fill,
ring: ring.New(max),
}
}
func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
ring := ring.New(s.max)
itemsInRing := 0
currentPosition := 0
for _, token := range input {
if s.outputOriginal {
@ -50,35 +49,34 @@ func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream
Type: analysis.AlphaNumeric,
Term: []byte(s.fill),
}
s.ring.Value = &fillerToken
if s.itemsInRing < s.max {
s.itemsInRing++
ring.Value = &fillerToken
if itemsInRing < s.max {
itemsInRing++
}
rv = append(rv, s.shingleCurrentRingState()...)
s.ring = s.ring.Next()
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
ring = ring.Next()
offset--
}
currentPosition = token.Position
s.ring.Value = token
if s.itemsInRing < s.max {
s.itemsInRing++
ring.Value = token
if itemsInRing < s.max {
itemsInRing++
}
rv = append(rv, s.shingleCurrentRingState()...)
s.ring = s.ring.Next()
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
ring = ring.Next()
}
return rv
}
func (s *ShingleFilter) shingleCurrentRingState() analysis.TokenStream {
func (s *ShingleFilter) shingleCurrentRingState(ring *ring.Ring, itemsInRing int) analysis.TokenStream {
rv := make(analysis.TokenStream, 0)
for shingleN := s.min; shingleN <= s.max; shingleN++ {
// if there are enough items in the ring
// to produce a shingle of this size
if s.itemsInRing >= shingleN {
thisShingleRing := s.ring.Move(-(shingleN - 1))
if itemsInRing >= shingleN {
thisShingleRing := ring.Move(-(shingleN - 1))
shingledBytes := make([]byte, 0)
pos := 0
start := -1

View File

@ -328,3 +328,84 @@ func TestShingleFilter(t *testing.T) {
}
}
}
// TestShingleFilterBug431 tests that the shingle filter is in fact stateless
// by using the same filter instance twice and ensuring we do not get
// contaminated output
func TestShingleFilterBug431(t *testing.T) {
	// tokensOf builds a stream of bare tokens that carry only the given terms.
	tokensOf := func(terms ...string) analysis.TokenStream {
		stream := make(analysis.TokenStream, 0, len(terms))
		for _, term := range terms {
			stream = append(stream, &analysis.Token{Term: []byte(term)})
		}
		return stream
	}
	// shinglesOf builds an expected stream: the given terms, typed as shingles.
	shinglesOf := func(terms ...string) analysis.TokenStream {
		stream := tokensOf(terms...)
		for _, tok := range stream {
			tok.Type = analysis.Shingle
		}
		return stream
	}

	cases := []struct {
		in   analysis.TokenStream
		want analysis.TokenStream
	}{
		{
			in:   tokensOf("the", "quick", "brown", "fox"),
			want: shinglesOf("the quick", "quick brown", "brown fox"),
		},
		{
			in:   tokensOf("a", "sad", "dirty", "sock"),
			want: shinglesOf("a sad", "sad dirty", "dirty sock"),
		},
	}

	// A single filter (min=2, max=2, originals not emitted) is deliberately
	// reused for every case: if any state leaked between Filter calls, the
	// second case would see tokens left over from the first.
	filter := NewShingleFilter(2, 2, false, " ", "_")
	for i := range cases {
		got := filter.Filter(cases[i].in)
		if reflect.DeepEqual(got, cases[i].want) {
			continue
		}
		t.Errorf("expected %s, got %s", cases[i].want, got)
	}
}