0
0
Fork 0

refactor phrase search to be recursive

a more correct solution that will enable us to extend in two
important ways:

1) support slop
2) support multi-phrase
This commit is contained in:
Marty Schoch 2017-02-03 16:05:21 -05:00
parent 12a7257b5f
commit f82638c117
2 changed files with 180 additions and 40 deletions

View File

@ -126,9 +126,9 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D
// one field, but we allow for this to not be the case
// but, we note that phrase constraints can only be satisfied within
// a single field, so we can check them each independently
for field := range s.currMust.Locations {
for field, tlm := range s.currMust.Locations {
f, rvtlm := s.checkCurrMustMatchField(ctx, field)
f, rvtlm := s.checkCurrMustMatchField(ctx, tlm)
if f > 0 {
freq += f
rvftlm[field] = rvtlm
@ -150,49 +150,68 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D
// constraints (possibly more than once). if so, the number of times it was
// satisfied, and these locations are returned. otherwise 0 and either
// a nil or empty TermLocationMap
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, field string) (int, search.TermLocationMap) {
firstTerm := s.terms[0]
freq := 0
termLocMap := s.currMust.Locations[field]
locations, ok := termLocMap[firstTerm]
if !ok {
return 0, nil
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) (int, search.TermLocationMap) {
	// every path returned is one complete occurrence of the phrase
	matches := findPhrasePaths(nil, s.terms, tlm, 1, nil)

	// fold the locations used by all occurrences into a single map
	merged := search.TermLocationMap{}
	for i := range matches {
		matches[i].MergeInto(merged)
	}

	// the number of occurrences is the phrase frequency for this field
	return len(matches), merged
}
// phrasePart is one step of a phrase match: a phrase term paired with the
// location selected for that term.
type phrasePart struct {
// term is the phrase term this step stands for
term string
// loc is the location of term used by this step
loc *search.Location
}
// phrasePath is an ordered sequence of phrase parts, one entry for each
// term/location pair added while a phrase match was being built.
type phrasePath []*phrasePart

// MergeInto appends the location of every part in the path to the entry
// for that part's term in the supplied TermLocationMap. The map is modified
// in place, so it must not be nil.
func (p phrasePath) MergeInto(in search.TermLocationMap) {
	for i := range p {
		part := p[i]
		in[part.term] = append(in[part.term], part.loc)
	}
}
// findPhrasePaths is a function to identify phase matches from a set of known
// term locations. the implementation is recursive, so care must be taken
// with arguments and return values.
//
// prev - the previous location, nil on first invocation
// phraseTerms - slice containing the phrase terms themselves
// may contain empty string as placeholder (don't care)
// tlm - the Term Location Map containing all relevant term locations
// offset - the offset from the previous that this next term must match
// p - the current path being explored (appended to in recursive calls)
// this is the primary state being built during the traversal
//
// returns slice of paths, or nil if invocation did not find any successul paths
func findPhrasePaths(prev *search.Location, phraseTerms []string, tlm search.TermLocationMap, offset int, p phrasePath) []phrasePath {
// no more terms
if len(phraseTerms) < 1 {
return []phrasePath{p}
}
rvtlm := make(search.TermLocationMap, 0)
car := phraseTerms[0]
cdr := phraseTerms[1:]
OUTER:
for _, location := range locations {
crvtlm := make(search.TermLocationMap, 0)
crvtlm.AddLocation(firstTerm, location)
INNER:
for i := 1; i < len(s.terms); i++ {
nextTerm := s.terms[i]
if nextTerm == "" {
continue
}
// look through all these term locations
// to try and find the correct offsets
nextLocations, ok := termLocMap[nextTerm]
if !ok {
continue OUTER
}
for _, nextLocation := range nextLocations {
if nextLocation.Pos == location.Pos+float64(i) && nextLocation.SameArrayElement(location) {
// found a location match for this term
crvtlm.AddLocation(nextTerm, nextLocation)
continue INNER
}
}
// if we got here we didn't find a location match for this term
continue OUTER
// empty term is treated as match (continue), but offset now +1
if car == "" {
return findPhrasePaths(prev, cdr, tlm, offset+1, p)
}
// locations for this term
locations := tlm[car]
var rv []phrasePath
for _, loc := range locations {
// check each location against prev (nil treated as match, the initial case)
if prev == nil || (prev.Pos+float64(offset) == loc.Pos && prev.SameArrayElement(loc)) {
// this location works, add it to the path (but not for empty term)
px := append(p, &phrasePart{term: car, loc: loc})
rv = append(rv, findPhrasePaths(loc, cdr, tlm, 1, px)...)
}
// if we got here all the terms matched
freq++
search.MergeTermLocationMaps(rvtlm, crvtlm)
}
return freq, rvtlm
return rv
}
func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) {

View File

@ -121,3 +121,124 @@ func TestPhraseSearch(t *testing.T) {
}
}
}
// TestFindPhrasePaths runs findPhrasePaths over a table of small,
// hand-built term location maps and compares the returned paths with the
// expected ones using reflect.DeepEqual.
func TestFindPhrasePaths(t *testing.T) {
	// at builds a location that only carries a position
	at := func(pos float64) *search.Location {
		return &search.Location{Pos: pos}
	}
	// step builds one element of an expected path
	step := func(term string, pos float64) *phrasePart {
		return &phrasePart{term: term, loc: at(pos)}
	}

	cases := []struct {
		phrase []string
		tlm    search.TermLocationMap
		paths  []phrasePath
	}{
		// simplest matching case
		{
			phrase: []string{"cat", "dog"},
			tlm: search.TermLocationMap{
				"cat": search.Locations{at(1)},
				"dog": search.Locations{at(2)},
			},
			paths: []phrasePath{
				phrasePath{step("cat", 1), step("dog", 2)},
			},
		},
		// second term missing, no match
		{
			phrase: []string{"cat", "dog"},
			tlm: search.TermLocationMap{
				"cat": search.Locations{at(1)},
			},
			paths: nil,
		},
		// second term exists but in wrong position
		{
			phrase: []string{"cat", "dog"},
			tlm: search.TermLocationMap{
				"cat": search.Locations{at(1)},
				"dog": search.Locations{at(3)},
			},
			paths: nil,
		},
		// matches multiple times
		{
			phrase: []string{"cat", "dog"},
			tlm: search.TermLocationMap{
				"cat": search.Locations{at(1), at(8)},
				"dog": search.Locations{at(2), at(9)},
			},
			paths: []phrasePath{
				phrasePath{step("cat", 1), step("dog", 2)},
				phrasePath{step("cat", 8), step("dog", 9)},
			},
		},
		// match over gaps
		{
			phrase: []string{"cat", "", "dog"},
			tlm: search.TermLocationMap{
				"cat": search.Locations{at(1)},
				"dog": search.Locations{at(3)},
			},
			paths: []phrasePath{
				phrasePath{step("cat", 1), step("dog", 3)},
			},
		},
	}

	for _, tc := range cases {
		got := findPhrasePaths(nil, tc.phrase, tc.tlm, 1, nil)
		if !reflect.DeepEqual(got, tc.paths) {
			t.Fatalf("expected: %v got %v", tc.paths, got)
		}
	}
}