0
0
Fork 0

add support for phrase slop to internals of phrase searcher

phrase slop is not yet supported on the frontend
added lots of tests around slop
This commit is contained in:
Marty Schoch 2017-02-09 15:59:51 -05:00
parent f82638c117
commit 232fc80dad
3 changed files with 311 additions and 23 deletions

View File

@ -21,27 +21,27 @@ import (
"github.com/blevesearch/bleve/index"
)
type Location struct {
Pos float64 `json:"pos"`
Start float64 `json:"start"`
End float64 `json:"end"`
ArrayPositions []float64 `json:"array_positions"`
}
type ArrayPositions []float64
// SameArrayElement returns true if two locations point to
// the same array element
func (l *Location) SameArrayElement(other *Location) bool {
if len(l.ArrayPositions) != len(other.ArrayPositions) {
func (ap ArrayPositions) Equals(other ArrayPositions) bool {
if len(ap) != len(other) {
return false
}
for i, elem := range l.ArrayPositions {
if other.ArrayPositions[i] != elem {
for i := range ap {
if ap[i] != other[i] {
return false
}
}
return true
}
// Location describes a single occurrence of a term. Each field is
// serialized to JSON under the key named in its struct tag.
type Location struct {
// Pos is the term position; the phrase searcher compares the Pos values
// of consecutive phrase terms to measure how far apart they are.
Pos float64 `json:"pos"`
// Start and End presumably delimit the occurrence within the field text;
// their units are not visible here -- TODO confirm.
Start float64 `json:"start"`
End float64 `json:"end"`
// ArrayPositions is compared with ArrayPositions.Equals by the phrase
// searcher; a location whose array positions differ from the previous
// phrase term's is not accepted as the next term of that phrase.
ArrayPositions ArrayPositions `json:"array_positions"`
}
// Locations is a list of pointers to Location values.
type Locations []*Location
// TermLocationMap associates a term with the Locations recorded for it.
type TermLocationMap map[string]Locations

View File

@ -151,8 +151,8 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D
// satisfied, and these locations are returned. otherwise 0 and either
// a nil or empty TermLocationMap
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) (int, search.TermLocationMap) {
paths := findPhrasePaths(nil, s.terms, tlm, 1, nil)
rv := make(search.TermLocationMap, 0)
paths := findPhrasePaths(0, nil, s.terms, tlm, nil, 0)
rv := make(search.TermLocationMap, len(s.terms))
for _, p := range paths {
p.MergeInto(rv)
}
@ -185,7 +185,7 @@ func (p phrasePath) MergeInto(in search.TermLocationMap) {
// this is the primary state being built during the traversal
//
// returns slice of paths, or nil if invocation did not find any successful paths
func findPhrasePaths(prev *search.Location, phraseTerms []string, tlm search.TermLocationMap, offset int, p phrasePath) []phrasePath {
func findPhrasePaths(prevPos float64, ap search.ArrayPositions, phraseTerms []string, tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath {
// no more terms
if len(phraseTerms) < 1 {
@ -195,25 +195,51 @@ func findPhrasePaths(prev *search.Location, phraseTerms []string, tlm search.Ter
car := phraseTerms[0]
cdr := phraseTerms[1:]
// empty term is treated as match (continue), but offset now +1
// empty term is treated as match (continue)
if car == "" {
return findPhrasePaths(prev, cdr, tlm, offset+1, p)
nextPos := prevPos + 1.0
if prevPos == 0.0 {
// if prevPos was 0.0, don't set it to 1 (as that's not a real abs pos)
nextPos = 0.0 // don't advance nextPos if prevPos was 0
}
return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop)
}
// locations for this term
locations := tlm[car]
var rv []phrasePath
for _, loc := range locations {
// check each location against prev (nil treated as match, the initial case)
if prev == nil || (prev.Pos+float64(offset) == loc.Pos && prev.SameArrayElement(loc)) {
if prevPos != 0.0 && !loc.ArrayPositions.Equals(ap) {
// if the array positions are wrong, can't match, try next location
continue
}
// compute distance from previous phrase term
dist := 0
if prevPos != 0.0 {
dist = editDistance(prevPos+1.0, loc.Pos)
}
// if enough slop remaining, continue recursively
if prevPos == 0.0 || (remainingSlop-dist) >= 0 {
// this location works, add it to the path (but not for empty term)
px := append(p, &phrasePart{term: car, loc: loc})
rv = append(rv, findPhrasePaths(loc, cdr, tlm, 1, px)...)
rv = append(rv, findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist)...)
}
}
return rv
}
// editDistance returns the absolute difference between two positions.
// Each position is truncated to an int before the subtraction, so any
// fractional part is discarded rather than rounded.
func editDistance(p1, p2 float64) int {
	delta := int(p1) - int(p2)
	if delta >= 0 {
		return delta
	}
	return -delta
}
func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) {
if !s.initialized {
err := s.initSearchers(ctx)

View File

@ -233,12 +233,274 @@ func TestFindPhrasePaths(t *testing.T) {
},
},
},
// match with leading ""
{
phrase: []string{"", "cat", "dog"},
tlm: search.TermLocationMap{
"cat": search.Locations{
&search.Location{
Pos: 2,
},
},
"dog": search.Locations{
&search.Location{
Pos: 3,
},
},
},
paths: []phrasePath{
phrasePath{
&phrasePart{"cat", &search.Location{Pos: 2}},
&phrasePart{"dog", &search.Location{Pos: 3}},
},
},
},
// match with trailing ""
{
phrase: []string{"cat", "dog", ""},
tlm: search.TermLocationMap{
"cat": search.Locations{
&search.Location{
Pos: 2,
},
},
"dog": search.Locations{
&search.Location{
Pos: 3,
},
},
},
paths: []phrasePath{
phrasePath{
&phrasePart{"cat", &search.Location{Pos: 2}},
&phrasePart{"dog", &search.Location{Pos: 3}},
},
},
},
}
for _, test := range tests {
actualPaths := findPhrasePaths(nil, test.phrase, test.tlm, 1, nil)
for i, test := range tests {
actualPaths := findPhrasePaths(0, nil, test.phrase, test.tlm, nil, 0)
if !reflect.DeepEqual(actualPaths, test.paths) {
t.Fatalf("expected: %v got %v", test.paths, actualPaths)
t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i)
}
}
}
// TestFindPhrasePathsSloppy runs findPhrasePaths with various slop budgets
// against a term location map in which "one" through "five" sit at
// positions 1 through 5. A case with no paths set expects no match (nil).
func TestFindPhrasePathsSloppy(t *testing.T) {
	// at builds the location list for a term found at a single position
	at := func(pos float64) search.Locations {
		return search.Locations{&search.Location{Pos: pos}}
	}
	// part builds the expected path element for a term at a position
	part := func(term string, pos float64) *phrasePart {
		return &phrasePart{term: term, loc: &search.Location{Pos: pos}}
	}

	tlm := search.TermLocationMap{
		"one":   at(1),
		"two":   at(2),
		"three": at(3),
		"four":  at(4),
		"five":  at(5),
	}

	cases := []struct {
		phrase []string
		paths  []phrasePath
		slop   int
	}{
		// "five" is 3 positions past where it would directly follow "one",
		// so a budget of 2 is not enough
		{
			phrase: []string{"one", "five"},
			slop:   2,
		},
		// a budget of 3 is exactly enough
		{
			phrase: []string{"one", "five"},
			slop:   3,
			paths:  []phrasePath{{part("one", 1), part("five", 5)}},
		},
		// adjacent terms in order need no slop at all
		{
			phrase: []string{"four", "five"},
			slop:   0,
			paths:  []phrasePath{{part("four", 4), part("five", 5)}},
		},
		// adjacent terms in reversed order do not match without slop
		{
			phrase: []string{"two", "one"},
			slop:   0,
		},
		// unused slop does not prevent an exact match
		{
			phrase: []string{"one", "two"},
			slop:   1,
			paths:  []phrasePath{{part("one", 1), part("two", 2)}},
		},
		// reversed adjacent terms are 2 away from the expected position,
		// so a budget of 1 still fails
		{
			phrase: []string{"two", "one"},
			slop:   1,
		},
		// and a budget of 2 finds the reversed pair
		{
			phrase: []string{"two", "one"},
			slop:   2,
			paths:  []phrasePath{{part("two", 2), part("one", 1)}},
		},
		// "one" is 3 away from the position following "three"
		{
			phrase: []string{"three", "one"},
			slop:   2,
		},
		// slop is consumed cumulatively: 1 for each of the two gaps
		{
			phrase: []string{"one", "three", "five"},
			slop:   2,
			paths: []phrasePath{
				{part("one", 1), part("three", 3), part("five", 5)},
			},
		},
		// reversed, each of the two steps costs 3, so 5 is one short
		{
			phrase: []string{"five", "three", "one"},
			slop:   5,
		},
		// and 6 is exactly enough
		{
			phrase: []string{"five", "three", "one"},
			slop:   6,
			paths: []phrasePath{
				{part("five", 5), part("three", 3), part("one", 1)},
			},
		},
	}

	for idx, tc := range cases {
		got := findPhrasePaths(0, nil, tc.phrase, tlm, nil, tc.slop)
		if reflect.DeepEqual(got, tc.paths) {
			continue
		}
		t.Fatalf("expected: %v got %v for test %d", tc.paths, got, idx)
	}
}
// TestFindPhrasePathsSloppyPalyndrome runs findPhrasePaths against a
// palindromic term layout: "one" at positions 1 and 5, "two" at 2 and 4,
// and "three" at 3. With enough slop a phrase can be satisfied by the
// occurrence on either side of the middle term.
func TestFindPhrasePathsSloppyPalyndrome(t *testing.T) {
	// at builds the location list for a term found at the given positions
	at := func(positions ...float64) search.Locations {
		locs := make(search.Locations, 0, len(positions))
		for _, pos := range positions {
			locs = append(locs, &search.Location{Pos: pos})
		}
		return locs
	}
	// part builds the expected path element for a term at a position
	part := func(term string, pos float64) *phrasePart {
		return &phrasePart{term: term, loc: &search.Location{Pos: pos}}
	}

	tlm := search.TermLocationMap{
		"one":   at(1, 5),
		"two":   at(2, 4),
		"three": at(3),
	}

	cases := []struct {
		phrase []string
		paths  []phrasePath
		slop   int
	}{
		// without slop only the in-order occurrence of "two" matches
		{
			phrase: []string{"two", "three"},
			slop:   0,
			paths: []phrasePath{
				{part("two", 2), part("three", 3)},
			},
		},
		// with slop 2 the phrase is found once per occurrence of "two"
		{
			phrase: []string{"two", "three"},
			slop:   2,
			paths: []phrasePath{
				{part("two", 2), part("three", 3)},
				{part("two", 4), part("three", 3)},
			},
		},
		// the reversed phrase likewise reaches both occurrences of "two"
		{
			phrase: []string{"three", "two"},
			slop:   2,
			paths: []phrasePath{
				{part("three", 3), part("two", 2)},
				{part("three", 3), part("two", 4)},
			},
		},
	}

	for idx, tc := range cases {
		got := findPhrasePaths(0, nil, tc.phrase, tlm, nil, tc.slop)
		if reflect.DeepEqual(got, tc.paths) {
			continue
		}
		t.Fatalf("expected: %v got %v for test %d", tc.paths, got, idx)
	}
}