Merge pull request #527 from mschoch/recursive_phrase
refactor phrase search to be recursive
This commit is contained in:
commit
0c87b7bff1
|
@ -21,27 +21,27 @@ import (
|
|||
"github.com/blevesearch/bleve/index"
|
||||
)
|
||||
|
||||
type Location struct {
|
||||
Pos float64 `json:"pos"`
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
ArrayPositions []float64 `json:"array_positions"`
|
||||
}
|
||||
// ArrayPositions is a list of array position values. It is the type of the
// ArrayPositions field of Location.
type ArrayPositions []float64

// Equals reports whether ap and other have the same length and hold the same
// value at every index. A nil and an empty ArrayPositions are equal to each
// other, since both have length zero.
func (ap ArrayPositions) Equals(other ArrayPositions) bool {
	if len(ap) != len(other) {
		return false
	}
	for i := range ap {
		if ap[i] != other[i] {
			return false
		}
	}
	return true
}
|
||||
|
||||
type Location struct {
|
||||
Pos float64 `json:"pos"`
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
ArrayPositions ArrayPositions `json:"array_positions"`
|
||||
}
|
||||
|
||||
type Locations []*Location
|
||||
|
||||
type TermLocationMap map[string]Locations
|
||||
|
|
|
@ -126,9 +126,9 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D
|
|||
// one field, but we allow for this to not be the case
|
||||
// but, we note that phrase constraints can only be satisfied within
|
||||
// a single field, so we can check them each independently
|
||||
for field := range s.currMust.Locations {
|
||||
for field, tlm := range s.currMust.Locations {
|
||||
|
||||
f, rvtlm := s.checkCurrMustMatchField(ctx, field)
|
||||
f, rvtlm := s.checkCurrMustMatchField(ctx, tlm)
|
||||
if f > 0 {
|
||||
freq += f
|
||||
rvftlm[field] = rvtlm
|
||||
|
@ -150,49 +150,94 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D
|
|||
// constraints (possibly more than once). if so, the number of times it was
|
||||
// satisfied, and these locations are returned. otherwise 0 and either
|
||||
// a nil or empty TermLocationMap
|
||||
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, field string) (int, search.TermLocationMap) {
|
||||
firstTerm := s.terms[0]
|
||||
freq := 0
|
||||
termLocMap := s.currMust.Locations[field]
|
||||
locations, ok := termLocMap[firstTerm]
|
||||
if !ok {
|
||||
return 0, nil
|
||||
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) (int, search.TermLocationMap) {
|
||||
paths := findPhrasePaths(0, nil, s.terms, tlm, nil, 0)
|
||||
rv := make(search.TermLocationMap, len(s.terms))
|
||||
for _, p := range paths {
|
||||
p.MergeInto(rv)
|
||||
}
|
||||
return len(paths), rv
|
||||
}
|
||||
|
||||
type phrasePart struct {
|
||||
term string
|
||||
loc *search.Location
|
||||
}
|
||||
|
||||
type phrasePath []*phrasePart
|
||||
|
||||
func (p phrasePath) MergeInto(in search.TermLocationMap) {
|
||||
for _, pp := range p {
|
||||
in[pp.term] = append(in[pp.term], pp.loc)
|
||||
}
|
||||
}
|
||||
|
||||
// findPhrasePaths is a function to identify phase matches from a set of known
|
||||
// term locations. the implementation is recursive, so care must be taken
|
||||
// with arguments and return values.
|
||||
//
|
||||
// prev - the previous location, nil on first invocation
|
||||
// phraseTerms - slice containing the phrase terms themselves
|
||||
// may contain empty string as placeholder (don't care)
|
||||
// tlm - the Term Location Map containing all relevant term locations
|
||||
// offset - the offset from the previous that this next term must match
|
||||
// p - the current path being explored (appended to in recursive calls)
|
||||
// this is the primary state being built during the traversal
|
||||
//
|
||||
// returns slice of paths, or nil if invocation did not find any successul paths
|
||||
func findPhrasePaths(prevPos float64, ap search.ArrayPositions, phraseTerms []string, tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath {
|
||||
|
||||
// no more terms
|
||||
if len(phraseTerms) < 1 {
|
||||
return []phrasePath{p}
|
||||
}
|
||||
|
||||
rvtlm := make(search.TermLocationMap, 0)
|
||||
car := phraseTerms[0]
|
||||
cdr := phraseTerms[1:]
|
||||
|
||||
OUTER:
|
||||
for _, location := range locations {
|
||||
crvtlm := make(search.TermLocationMap, 0)
|
||||
crvtlm.AddLocation(firstTerm, location)
|
||||
INNER:
|
||||
for i := 1; i < len(s.terms); i++ {
|
||||
nextTerm := s.terms[i]
|
||||
if nextTerm == "" {
|
||||
continue
|
||||
}
|
||||
// look through all these term locations
|
||||
// to try and find the correct offsets
|
||||
nextLocations, ok := termLocMap[nextTerm]
|
||||
if !ok {
|
||||
continue OUTER
|
||||
}
|
||||
for _, nextLocation := range nextLocations {
|
||||
if nextLocation.Pos == location.Pos+float64(i) && nextLocation.SameArrayElement(location) {
|
||||
// found a location match for this term
|
||||
crvtlm.AddLocation(nextTerm, nextLocation)
|
||||
continue INNER
|
||||
}
|
||||
}
|
||||
// if we got here we didn't find a location match for this term
|
||||
continue OUTER
|
||||
// empty term is treated as match (continue)
|
||||
if car == "" {
|
||||
nextPos := prevPos + 1.0
|
||||
if prevPos == 0.0 {
|
||||
// if prevPos was 0.0, don't set it to 1 (as thats not a real abs pos)
|
||||
nextPos = 0.0 // don't advance nextPos if prevPos was 0
|
||||
}
|
||||
// if we got here all the terms matched
|
||||
freq++
|
||||
search.MergeTermLocationMaps(rvtlm, crvtlm)
|
||||
return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop)
|
||||
}
|
||||
|
||||
return freq, rvtlm
|
||||
// locations for this term
|
||||
locations := tlm[car]
|
||||
var rv []phrasePath
|
||||
for _, loc := range locations {
|
||||
if prevPos != 0.0 && !loc.ArrayPositions.Equals(ap) {
|
||||
// if the array positions are wrong, can't match, try next location
|
||||
continue
|
||||
}
|
||||
|
||||
// compute distance from previous phrase term
|
||||
dist := 0
|
||||
if prevPos != 0.0 {
|
||||
dist = editDistance(prevPos+1.0, loc.Pos)
|
||||
}
|
||||
|
||||
// if enough slop reamining, continue recursively
|
||||
if prevPos == 0.0 || (remainingSlop-dist) >= 0 {
|
||||
// this location works, add it to the path (but not for empty term)
|
||||
px := append(p, &phrasePart{term: car, loc: loc})
|
||||
rv = append(rv, findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist)...)
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// editDistance returns the absolute difference between two positions after
// each of them has been truncated to an int.
func editDistance(p1, p2 float64) int {
	delta := int(p1) - int(p2)
	if delta >= 0 {
		return delta
	}
	return -delta
}
|
||||
|
||||
func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||
|
|
|
@ -121,3 +121,386 @@ func TestPhraseSearch(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindPhrasePaths(t *testing.T) {
|
||||
tests := []struct {
|
||||
phrase []string
|
||||
tlm search.TermLocationMap
|
||||
paths []phrasePath
|
||||
}{
|
||||
// simplest matching case
|
||||
{
|
||||
phrase: []string{"cat", "dog"},
|
||||
tlm: search.TermLocationMap{
|
||||
"cat": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 1,
|
||||
},
|
||||
},
|
||||
"dog": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"cat", &search.Location{Pos: 1}},
|
||||
&phrasePart{"dog", &search.Location{Pos: 2}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// second term missing, no match
|
||||
{
|
||||
phrase: []string{"cat", "dog"},
|
||||
tlm: search.TermLocationMap{
|
||||
"cat": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: nil,
|
||||
},
|
||||
// second term exists but in wrong position
|
||||
{
|
||||
phrase: []string{"cat", "dog"},
|
||||
tlm: search.TermLocationMap{
|
||||
"cat": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 1,
|
||||
},
|
||||
},
|
||||
"dog": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: nil,
|
||||
},
|
||||
// matches multiple times
|
||||
{
|
||||
phrase: []string{"cat", "dog"},
|
||||
tlm: search.TermLocationMap{
|
||||
"cat": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 1,
|
||||
},
|
||||
&search.Location{
|
||||
Pos: 8,
|
||||
},
|
||||
},
|
||||
"dog": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 2,
|
||||
},
|
||||
&search.Location{
|
||||
Pos: 9,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"cat", &search.Location{Pos: 1}},
|
||||
&phrasePart{"dog", &search.Location{Pos: 2}},
|
||||
},
|
||||
phrasePath{
|
||||
&phrasePart{"cat", &search.Location{Pos: 8}},
|
||||
&phrasePart{"dog", &search.Location{Pos: 9}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// match over gaps
|
||||
{
|
||||
phrase: []string{"cat", "", "dog"},
|
||||
tlm: search.TermLocationMap{
|
||||
"cat": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 1,
|
||||
},
|
||||
},
|
||||
"dog": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"cat", &search.Location{Pos: 1}},
|
||||
&phrasePart{"dog", &search.Location{Pos: 3}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// match with leading ""
|
||||
{
|
||||
phrase: []string{"", "cat", "dog"},
|
||||
tlm: search.TermLocationMap{
|
||||
"cat": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 2,
|
||||
},
|
||||
},
|
||||
"dog": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"cat", &search.Location{Pos: 2}},
|
||||
&phrasePart{"dog", &search.Location{Pos: 3}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// match with trailing ""
|
||||
{
|
||||
phrase: []string{"cat", "dog", ""},
|
||||
tlm: search.TermLocationMap{
|
||||
"cat": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 2,
|
||||
},
|
||||
},
|
||||
"dog": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"cat", &search.Location{Pos: 2}},
|
||||
&phrasePart{"dog", &search.Location{Pos: 3}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, test := range tests {
|
||||
actualPaths := findPhrasePaths(0, nil, test.phrase, test.tlm, nil, 0)
|
||||
if !reflect.DeepEqual(actualPaths, test.paths) {
|
||||
t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindPhrasePathsSloppy(t *testing.T) {
|
||||
tlm := search.TermLocationMap{
|
||||
"one": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 1,
|
||||
},
|
||||
},
|
||||
"two": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 2,
|
||||
},
|
||||
},
|
||||
"three": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 3,
|
||||
},
|
||||
},
|
||||
"four": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 4,
|
||||
},
|
||||
},
|
||||
"five": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 5,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
phrase []string
|
||||
paths []phrasePath
|
||||
slop int
|
||||
}{
|
||||
// no match
|
||||
{
|
||||
phrase: []string{"one", "five"},
|
||||
slop: 2,
|
||||
},
|
||||
// should match
|
||||
{
|
||||
phrase: []string{"one", "five"},
|
||||
slop: 3,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"one", &search.Location{Pos: 1}},
|
||||
&phrasePart{"five", &search.Location{Pos: 5}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// slop 0 finds exact match
|
||||
{
|
||||
phrase: []string{"four", "five"},
|
||||
slop: 0,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"four", &search.Location{Pos: 4}},
|
||||
&phrasePart{"five", &search.Location{Pos: 5}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// slop 0 does not find exact match (reversed)
|
||||
{
|
||||
phrase: []string{"two", "one"},
|
||||
slop: 0,
|
||||
},
|
||||
// slop 1 finds exact match
|
||||
{
|
||||
phrase: []string{"one", "two"},
|
||||
slop: 1,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"one", &search.Location{Pos: 1}},
|
||||
&phrasePart{"two", &search.Location{Pos: 2}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// slop 1 *still* does not find exact match (reversed) requires at least 2
|
||||
{
|
||||
phrase: []string{"two", "one"},
|
||||
slop: 1,
|
||||
},
|
||||
// slop 2 does finds exact match reversed
|
||||
{
|
||||
phrase: []string{"two", "one"},
|
||||
slop: 2,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"two", &search.Location{Pos: 2}},
|
||||
&phrasePart{"one", &search.Location{Pos: 1}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// slop 2 not enough for this
|
||||
{
|
||||
phrase: []string{"three", "one"},
|
||||
slop: 2,
|
||||
},
|
||||
// slop should be cumulative
|
||||
{
|
||||
phrase: []string{"one", "three", "five"},
|
||||
slop: 2,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"one", &search.Location{Pos: 1}},
|
||||
&phrasePart{"three", &search.Location{Pos: 3}},
|
||||
&phrasePart{"five", &search.Location{Pos: 5}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// should require 6
|
||||
{
|
||||
phrase: []string{"five", "three", "one"},
|
||||
slop: 5,
|
||||
},
|
||||
// so lets try 6
|
||||
{
|
||||
phrase: []string{"five", "three", "one"},
|
||||
slop: 6,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"five", &search.Location{Pos: 5}},
|
||||
&phrasePart{"three", &search.Location{Pos: 3}},
|
||||
&phrasePart{"one", &search.Location{Pos: 1}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, test := range tests {
|
||||
actualPaths := findPhrasePaths(0, nil, test.phrase, tlm, nil, test.slop)
|
||||
if !reflect.DeepEqual(actualPaths, test.paths) {
|
||||
t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindPhrasePathsSloppyPalyndrome(t *testing.T) {
|
||||
tlm := search.TermLocationMap{
|
||||
"one": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 1,
|
||||
},
|
||||
&search.Location{
|
||||
Pos: 5,
|
||||
},
|
||||
},
|
||||
"two": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 2,
|
||||
},
|
||||
&search.Location{
|
||||
Pos: 4,
|
||||
},
|
||||
},
|
||||
"three": search.Locations{
|
||||
&search.Location{
|
||||
Pos: 3,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
phrase []string
|
||||
paths []phrasePath
|
||||
slop int
|
||||
}{
|
||||
// search non palyndrone, exact match
|
||||
{
|
||||
phrase: []string{"two", "three"},
|
||||
slop: 0,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"two", &search.Location{Pos: 2}},
|
||||
&phrasePart{"three", &search.Location{Pos: 3}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// same with slop 2 (not required) (find it twice)
|
||||
{
|
||||
phrase: []string{"two", "three"},
|
||||
slop: 2,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"two", &search.Location{Pos: 2}},
|
||||
&phrasePart{"three", &search.Location{Pos: 3}},
|
||||
},
|
||||
phrasePath{
|
||||
&phrasePart{"two", &search.Location{Pos: 4}},
|
||||
&phrasePart{"three", &search.Location{Pos: 3}},
|
||||
},
|
||||
},
|
||||
},
|
||||
// palyndrone reversed
|
||||
{
|
||||
phrase: []string{"three", "two"},
|
||||
slop: 2,
|
||||
paths: []phrasePath{
|
||||
phrasePath{
|
||||
&phrasePart{"three", &search.Location{Pos: 3}},
|
||||
&phrasePart{"two", &search.Location{Pos: 2}},
|
||||
},
|
||||
phrasePath{
|
||||
&phrasePart{"three", &search.Location{Pos: 3}},
|
||||
&phrasePart{"two", &search.Location{Pos: 4}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for i, test := range tests {
|
||||
actualPaths := findPhrasePaths(0, nil, test.phrase, tlm, nil, test.slop)
|
||||
if !reflect.DeepEqual(actualPaths, test.paths) {
|
||||
t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue