// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collector
import (
"testing"
"golang.org/x/net/context"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/search"
)
func TestTop10Scores(t *testing.T) {
// a stub search with more than 10 matches
// the top-10 scores are > 10
// everything else is less than 10
searcher := &stubSearcher{
matches: []*search.DocumentMatch{
{
IndexInternalID: index.IndexInternalID("a"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("b"),
Score: 9,
},
{
IndexInternalID: index.IndexInternalID("c"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("d"),
Score: 9,
},
{
IndexInternalID: index.IndexInternalID("e"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("f"),
Score: 9,
},
{
IndexInternalID: index.IndexInternalID("g"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("h"),
Score: 9,
},
{
IndexInternalID: index.IndexInternalID("i"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("j"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("k"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("l"),
Score: 99,
},
{
IndexInternalID: index.IndexInternalID("m"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("n"),
Score: 11,
},
},
}
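// collect the top 10 hits with no skip, sorted by score descending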
collector := NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
err := collector.Collect(context.Background(), searcher, &stubReader{})
if err != nil {
t.Fatal(err)
}
maxScore := collector.MaxScore()
if maxScore != 99.0 {
t.Errorf("expected max score 99.0, got %f", maxScore)
}
total := collector.Total()
if total != 14 {
t.Errorf("expected 14 total results, got %d", total)
}
results := collector.Results()
if len(results) != 10 {
t.Logf("results: %v", results)
t.Fatalf("expected 10 results, got %d", len(results))
}
if results[0].ID != "l" {
t.Errorf("expected first result to have ID 'l', got %s", results[0].ID)
}
if results[0].Score != 99.0 {
t.Errorf("expected highest score to be 99.0, got %f", results[0].Score)
}
minScore := 1000.0
for _, result := range results {
if result.Score < minScore {
minScore = result.Score
}
}
if minScore < 10 {
t.Errorf("expected minimum score to be higher than 10, got %f", minScore)
}
}
func TestTop10ScoresSkip10(t *testing.T) {
// a stub search with more than 10 matches
// the top-10 scores are > 10
// everything else is less than 10
searcher := &stubSearcher{
matches: []*search.DocumentMatch{
{
IndexInternalID: index.IndexInternalID("a"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("b"),
Score: 9.5,
},
{
IndexInternalID: index.IndexInternalID("c"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("d"),
Score: 9,
},
{
IndexInternalID: index.IndexInternalID("e"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("f"),
Score: 9,
},
{
IndexInternalID: index.IndexInternalID("g"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("h"),
Score: 9,
},
{
IndexInternalID: index.IndexInternalID("i"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("j"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("k"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("l"),
Score: 99,
},
{
IndexInternalID: index.IndexInternalID("m"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("n"),
Score: 11,
},
},
}
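// collect the top 10 hits after skipping the first 10, sorted by score descending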
collector := NewTopNCollector(10, 10, search.SortOrder{&search.SortScore{Desc: true}})
err := collector.Collect(context.Background(), searcher, &stubReader{})
if err != nil {
t.Fatal(err)
}
maxScore := collector.MaxScore()
if maxScore != 99.0 {
t.Errorf("expected max score 99.0, got %f", maxScore)
}
total := collector.Total()
if total != 14 {
t.Errorf("expected 14 total results, got %d", total)
}
results := collector.Results()
if len(results) != 4 {
t.Fatalf("expected 4 results, got %d", len(results))
}
if results[0].ID != "b" {
t.Errorf("expected first result to have ID 'b', got %s", results[0].ID)
}
if results[0].Score != 9.5 {
t.Errorf("expected highest score to be 9.5, got %f", results[0].Score)
}
}
func TestTop10ScoresSkip10Only9Hits(t *testing.T) {
// a stub search with only 9 matches
searcher := &stubSearcher{
matches: []*search.DocumentMatch{
{
IndexInternalID: index.IndexInternalID("a"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("c"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("e"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("g"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("i"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("j"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("k"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("m"),
Score: 11,
},
{
IndexInternalID: index.IndexInternalID("n"),
Score: 11,
},
},
}
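// skipping 10 hits when only 9 exist should yield no results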
collector := NewTopNCollector(10, 10, search.SortOrder{&search.SortScore{Desc: true}})
err := collector.Collect(context.Background(), searcher, &stubReader{})
if err != nil {
t.Fatal(err)
}
total := collector.Total()
if total != 9 {
t.Errorf("expected 9 total results, got %d", total)
}
results := collector.Results()
if len(results) != 0 {
t.Fatalf("expected 0 results, got %d", len(results))
}
}
func TestPaginationSameScores(t *testing.T) {
// a stub search with more than 10 matches
// all documents have the same score
searcher := &stubSearcher{
matches: []*search.DocumentMatch{
{
IndexInternalID: index.IndexInternalID("a"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("b"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("c"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("d"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("e"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("f"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("g"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("h"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("i"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("j"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("k"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("l"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("m"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("n"),
Score: 5,
},
},
}
// first get first 5 hits
collector := NewTopNCollector(5, 0, search.SortOrder{&search.SortScore{Desc: true}})
err := collector.Collect(context.Background(), searcher, &stubReader{})
if err != nil {
t.Fatal(err)
}
total := collector.Total()
if total != 14 {
t.Errorf("expected 14 total results, got %d", total)
}
results := collector.Results()
if len(results) != 5 {
t.Fatalf("expected 5 results, got %d", len(results))
}
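// remember the IDs returned in the first page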
firstResults := make(map[string]struct{})
for _, hit := range results {
firstResults[hit.ID] = struct{}{}
}
// a stub search with more than 10 matches
// all documents have the same score
searcher = &stubSearcher{
matches: []*search.DocumentMatch{
{
IndexInternalID: index.IndexInternalID("a"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("b"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("c"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("d"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("e"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("f"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("g"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("h"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("i"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("j"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("k"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("l"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("m"),
Score: 5,
},
{
IndexInternalID: index.IndexInternalID("n"),
Score: 5,
},
},
}
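// all of these hits share the same score, so stable pagination depends on
// hits arriving in ascending ID order and on ties keeping that order (see the
// pagination fix for issue #378); the second page must not repeat the first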
// now get the next 5 hits, skipping the first 5
collector = NewTopNCollector(5, 5, search.SortOrder{&search.SortScore{Desc: true}})
err = collector.Collect(context.Background(), searcher, &stubReader{})
if err != nil {
t.Fatal(err)
}
total = collector.Total()
if total != 14 {
t.Errorf("expected 14 total results, got %d", total)
}
results = collector.Results()
if len(results) != 5 {
t.Fatalf("expected 5 results, got %d", len(results))
}
// make sure that none of these hits repeat ones we saw in the top 5
for _, hit := range results {
if _, ok := firstResults[hit.ID]; ok {
t.Errorf("doc ID %s is in top 5 and next 5 result sets", hit.ID)
}
}
}
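// The benchmarks below are named BenchmarkTop<N>of<M>Scores: collect the top N
// hits, sorted by descending score, from a stream of M matches generated by
// benchHelper (a shared helper defined elsewhere in this test package).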
func BenchmarkTop10of0Scores(b *testing.B) {
benchHelper(0, func() search.Collector {
return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10of3Scores(b *testing.B) {
benchHelper(3, func() search.Collector {
return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10of10Scores(b *testing.B) {
benchHelper(10, func() search.Collector {
return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10of25Scores(b *testing.B) {
benchHelper(25, func() search.Collector {
return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10of50Scores(b *testing.B) {
benchHelper(50, func() search.Collector {
return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10of10000Scores(b *testing.B) {
benchHelper(10000, func() search.Collector {
return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop100of0Scores(b *testing.B) {
benchHelper(0, func() search.Collector {
return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop100of3Scores(b *testing.B) {
benchHelper(3, func() search.Collector {
return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop100of10Scores(b *testing.B) {
benchHelper(10, func() search.Collector {
return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop100of25Scores(b *testing.B) {
benchHelper(25, func() search.Collector {
return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop100of50Scores(b *testing.B) {
benchHelper(50, func() search.Collector {
return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop100of10000Scores(b *testing.B) {
benchHelper(10000, func() search.Collector {
return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
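// Larger-scale variants: deeper result sets collected from much bigger streams of matches.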
func BenchmarkTop1000of10000Scores(b *testing.B) {
benchHelper(10000, func() search.Collector {
return NewTopNCollector(1000, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10000of100000Scores(b *testing.B) {
benchHelper(100000, func() search.Collector {
return NewTopNCollector(10000, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10of100000Scores(b *testing.B) {
benchHelper(100000, func() search.Collector {
return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop100of100000Scores(b *testing.B) {
benchHelper(100000, func() search.Collector {
return NewTopNCollector(100, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop1000of100000Scores(b *testing.B) {
benchHelper(100000, func() search.Collector {
return NewTopNCollector(1000, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}
func BenchmarkTop10000of1000000Scores(b *testing.B) {
benchHelper(1000000, func() search.Collector {
return NewTopNCollector(10000, 0, search.SortOrder{&search.SortScore{Desc: true}})
}, b)
}