make shingle token filter stateless
the previous implementation was incorrectly stateful, which violates the contract for token filters. fixes #431
This commit is contained in:
parent
ffee3c3764
commit
c5159251a9
|
@ -16,8 +16,6 @@ type ShingleFilter struct {
|
|||
outputOriginal bool
|
||||
tokenSeparator string
|
||||
fill string
|
||||
ring *ring.Ring
|
||||
itemsInRing int
|
||||
}
|
||||
|
||||
func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
|
||||
|
@ -27,13 +25,14 @@ func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *Shin
|
|||
outputOriginal: outputOriginal,
|
||||
tokenSeparator: sep,
|
||||
fill: fill,
|
||||
ring: ring.New(max),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0, len(input))
|
||||
|
||||
ring := ring.New(s.max)
|
||||
itemsInRing := 0
|
||||
currentPosition := 0
|
||||
for _, token := range input {
|
||||
if s.outputOriginal {
|
||||
|
@ -50,35 +49,34 @@ func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream
|
|||
Type: analysis.AlphaNumeric,
|
||||
Term: []byte(s.fill),
|
||||
}
|
||||
s.ring.Value = &fillerToken
|
||||
if s.itemsInRing < s.max {
|
||||
s.itemsInRing++
|
||||
ring.Value = &fillerToken
|
||||
if itemsInRing < s.max {
|
||||
itemsInRing++
|
||||
}
|
||||
rv = append(rv, s.shingleCurrentRingState()...)
|
||||
s.ring = s.ring.Next()
|
||||
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
|
||||
ring = ring.Next()
|
||||
offset--
|
||||
}
|
||||
currentPosition = token.Position
|
||||
|
||||
s.ring.Value = token
|
||||
if s.itemsInRing < s.max {
|
||||
s.itemsInRing++
|
||||
ring.Value = token
|
||||
if itemsInRing < s.max {
|
||||
itemsInRing++
|
||||
}
|
||||
rv = append(rv, s.shingleCurrentRingState()...)
|
||||
s.ring = s.ring.Next()
|
||||
|
||||
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
|
||||
ring = ring.Next()
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *ShingleFilter) shingleCurrentRingState() analysis.TokenStream {
|
||||
func (s *ShingleFilter) shingleCurrentRingState(ring *ring.Ring, itemsInRing int) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
for shingleN := s.min; shingleN <= s.max; shingleN++ {
|
||||
// if there are enough items in the ring
|
||||
// to produce a shingle of this size
|
||||
if s.itemsInRing >= shingleN {
|
||||
thisShingleRing := s.ring.Move(-(shingleN - 1))
|
||||
if itemsInRing >= shingleN {
|
||||
thisShingleRing := ring.Move(-(shingleN - 1))
|
||||
shingledBytes := make([]byte, 0)
|
||||
pos := 0
|
||||
start := -1
|
||||
|
|
|
@ -328,3 +328,84 @@ func TestShingleFilter(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestShingleFilterBug431 tests that the shingle filter is in fact stateless
|
||||
// by using the same filter instance twice and ensuring we do not get
|
||||
// contaminated output
|
||||
func TestShingleFilterBug431(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("sad"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("dirty"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("sock"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a sad"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("sad dirty"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("dirty sock"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
shingleFilter := NewShingleFilter(2, 2, false, " ", "_")
|
||||
for _, test := range tests {
|
||||
actual := shingleFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue