// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
2014-09-02 16:54:50 +02:00
|
|
|
|
2016-09-30 17:30:17 +02:00
|
|
|
package upsidedown
|
2014-04-17 22:55:53 +02:00
|
|
|
|
|
|
|
import (
|
2015-05-06 16:04:02 +02:00
|
|
|
"math"
|
2014-04-17 22:55:53 +02:00
|
|
|
"reflect"
|
|
|
|
"testing"
|
2014-08-19 14:58:26 +02:00
|
|
|
|
2014-12-09 23:24:59 +01:00
|
|
|
"github.com/golang/protobuf/proto"
|
2014-04-17 22:55:53 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
func TestRows(t *testing.T) {
|
|
|
|
tests := []struct {
|
|
|
|
input UpsideDownCouchRow
|
|
|
|
outKey []byte
|
|
|
|
outVal []byte
|
|
|
|
}{
|
|
|
|
{
|
|
|
|
NewVersionRow(1),
|
|
|
|
[]byte{'v'},
|
|
|
|
[]byte{0x1},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
NewFieldRow(0, "name"),
|
|
|
|
[]byte{'f', 0, 0},
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'n', 'a', 'm', 'e', ByteSeparator},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
NewFieldRow(1, "desc"),
|
|
|
|
[]byte{'f', 1, 0},
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'d', 'e', 's', 'c', ByteSeparator},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
|
|
|
{
|
|
|
|
NewFieldRow(513, "style"),
|
|
|
|
[]byte{'f', 1, 2},
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'s', 't', 'y', 'l', 'e', ByteSeparator},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
2015-03-10 21:22:19 +01:00
|
|
|
{
|
|
|
|
NewDictionaryRow([]byte{'b', 'e', 'e', 'r'}, 0, 27),
|
|
|
|
[]byte{'d', 0, 0, 'b', 'e', 'e', 'r'},
|
|
|
|
[]byte{27},
|
|
|
|
},
|
2014-04-17 22:55:53 +02:00
|
|
|
{
|
2016-01-07 08:38:02 +01:00
|
|
|
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("catz"), 3, 3.14),
|
2015-04-03 22:50:48 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'c', 'a', 't', 'z'},
|
2015-03-06 19:00:53 +01:00
|
|
|
[]byte{3, 195, 235, 163, 130, 4},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
|
|
|
{
|
2016-01-07 08:38:02 +01:00
|
|
|
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14),
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-03-06 19:00:53 +01:00
|
|
|
[]byte{3, 195, 235, 163, 130, 4},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
|
|
|
{
|
2016-04-03 03:54:33 +02:00
|
|
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14, []*TermVector{{field: 0, pos: 1, start: 3, end: 11}, {field: 0, pos: 2, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-05-17 07:07:14 +02:00
|
|
|
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
|
2015-03-06 19:00:53 +01:00
|
|
|
},
|
|
|
|
// test larger varints
|
|
|
|
{
|
2016-04-03 03:54:33 +02:00
|
|
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11}, {field: 0, pos: 2198, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
|
2015-03-06 19:00:53 +01:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-05-17 07:07:14 +02:00
|
|
|
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
|
|
|
|
},
|
|
|
|
// test vectors with arrayPositions
|
|
|
|
{
|
2016-04-03 03:54:33 +02:00
|
|
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, {field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, {field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
|
2015-05-17 07:07:14 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
|
|
|
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
|
|
|
{
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
NewBackIndexRow([]byte("budweiser"), []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}}, nil),
|
2014-04-17 22:55:53 +02:00
|
|
|
[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r'},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
|
|
|
{
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
NewBackIndexRow([]byte("budweiser"), []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}, {Field: proto.Uint32(1), Terms: []string{"beat"}}}, nil),
|
2014-04-17 22:55:53 +02:00
|
|
|
[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't'},
|
2014-04-17 22:55:53 +02:00
|
|
|
},
|
2014-06-26 17:43:13 +02:00
|
|
|
{
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
NewBackIndexRow([]byte("budweiser"), []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}, {Field: proto.Uint32(1), Terms: []string{"beat"}}}, []*BackIndexStoreEntry{{Field: proto.Uint32(3)}, {Field: proto.Uint32(4)}, {Field: proto.Uint32(5)}}),
|
2014-06-26 17:43:13 +02:00
|
|
|
[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't', 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
|
2014-06-26 17:43:13 +02:00
|
|
|
},
|
|
|
|
{
|
2016-01-07 08:38:02 +01:00
|
|
|
NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer")),
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0},
|
2014-08-06 19:52:20 +02:00
|
|
|
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
2014-06-26 17:43:13 +02:00
|
|
|
},
|
2015-03-11 20:12:13 +01:00
|
|
|
{
|
2016-01-07 08:38:02 +01:00
|
|
|
NewStoredRow([]byte("budweiser"), 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
|
2015-03-11 20:12:13 +01:00
|
|
|
[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
|
|
|
|
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
|
|
|
},
|
2014-08-14 03:14:47 +02:00
|
|
|
{
|
|
|
|
NewInternalRow([]byte("mapping"), []byte(`{"mapping":"json content"}`)),
|
|
|
|
[]byte{'i', 'm', 'a', 'p', 'p', 'i', 'n', 'g'},
|
|
|
|
[]byte{'{', '"', 'm', 'a', 'p', 'p', 'i', 'n', 'g', '"', ':', '"', 'j', 's', 'o', 'n', ' ', 'c', 'o', 'n', 't', 'e', 'n', 't', '"', '}'},
|
|
|
|
},
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// test going from struct to k/v bytes
|
2015-03-06 19:00:53 +01:00
|
|
|
for i, test := range tests {
|
2014-04-18 22:09:34 +02:00
|
|
|
rk := test.input.Key()
|
2014-04-17 22:55:53 +02:00
|
|
|
if !reflect.DeepEqual(rk, test.outKey) {
|
|
|
|
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
|
|
|
|
}
|
2014-04-18 22:09:34 +02:00
|
|
|
rv := test.input.Value()
|
2014-04-17 22:55:53 +02:00
|
|
|
if !reflect.DeepEqual(rv, test.outVal) {
|
2015-03-06 19:00:53 +01:00
|
|
|
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// now test going back from k/v bytes to struct
|
2014-08-19 14:58:26 +02:00
|
|
|
for i, test := range tests {
|
2014-04-19 03:07:41 +02:00
|
|
|
row, err := ParseFromKeyValue(test.outKey, test.outVal)
|
|
|
|
if err != nil {
|
2015-03-06 19:00:53 +01:00
|
|
|
t.Errorf("error parsking key/value: %v", err)
|
2014-04-19 03:07:41 +02:00
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
if !reflect.DeepEqual(row, test.input) {
|
2014-08-19 14:58:26 +02:00
|
|
|
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
2014-04-19 04:31:13 +02:00
|
|
|
|
|
|
|
func TestInvalidRows(t *testing.T) {
|
|
|
|
tests := []struct {
|
|
|
|
key []byte
|
|
|
|
val []byte
|
|
|
|
}{
|
|
|
|
// empty key
|
|
|
|
{
|
|
|
|
[]byte{},
|
|
|
|
[]byte{},
|
|
|
|
},
|
|
|
|
// no such type q
|
|
|
|
{
|
|
|
|
[]byte{'q'},
|
|
|
|
[]byte{},
|
|
|
|
},
|
|
|
|
// type v, invalid empty value
|
|
|
|
{
|
|
|
|
[]byte{'v'},
|
|
|
|
[]byte{},
|
|
|
|
},
|
|
|
|
// type f, invalid key
|
|
|
|
{
|
|
|
|
[]byte{'f'},
|
|
|
|
[]byte{},
|
|
|
|
},
|
|
|
|
// type f, valid key, invalid value
|
|
|
|
{
|
|
|
|
[]byte{'f', 0, 0},
|
|
|
|
[]byte{},
|
|
|
|
},
|
2014-08-07 15:39:04 +02:00
|
|
|
// type t, invalid key (missing field)
|
2014-04-19 04:31:13 +02:00
|
|
|
{
|
|
|
|
[]byte{'t'},
|
|
|
|
[]byte{},
|
|
|
|
},
|
2014-08-07 15:39:04 +02:00
|
|
|
// type t, invalid key (missing term)
|
2014-04-19 04:31:13 +02:00
|
|
|
{
|
2014-08-07 15:39:04 +02:00
|
|
|
[]byte{'t', 0, 0},
|
2014-04-19 04:31:13 +02:00
|
|
|
[]byte{},
|
|
|
|
},
|
|
|
|
// type t, invalid key (missing id)
|
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator},
|
2014-04-19 04:31:13 +02:00
|
|
|
[]byte{},
|
|
|
|
},
|
2014-12-18 18:43:12 +01:00
|
|
|
// type t, invalid val (missing freq)
|
2014-04-19 04:31:13 +02:00
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2014-04-19 04:31:13 +02:00
|
|
|
[]byte{},
|
|
|
|
},
|
|
|
|
// type t, invalid val (missing norm)
|
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-04-03 22:50:48 +02:00
|
|
|
[]byte{3},
|
2014-04-19 04:31:13 +02:00
|
|
|
},
|
|
|
|
// type t, invalid val (half missing tv field, full missing is valid (no term vectors))
|
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-04-03 22:50:48 +02:00
|
|
|
[]byte{3, 25, 255},
|
2014-04-19 04:31:13 +02:00
|
|
|
},
|
|
|
|
// type t, invalid val (missing tv pos)
|
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-04-03 22:50:48 +02:00
|
|
|
[]byte{3, 25, 0},
|
2014-04-19 04:31:13 +02:00
|
|
|
},
|
|
|
|
// type t, invalid val (missing tv start)
|
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-04-03 22:50:48 +02:00
|
|
|
[]byte{3, 25, 0, 0},
|
2014-04-19 04:31:13 +02:00
|
|
|
},
|
|
|
|
// type t, invalid val (missing tv end)
|
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2015-04-03 22:50:48 +02:00
|
|
|
[]byte{3, 25, 0, 0, 0},
|
2014-04-19 04:31:13 +02:00
|
|
|
},
|
|
|
|
// type b, invalid key (missing id)
|
|
|
|
{
|
|
|
|
[]byte{'b'},
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'b', 'e', 'e', 'r', ByteSeparator, 0, 0},
|
2014-04-19 04:31:13 +02:00
|
|
|
},
|
|
|
|
// type b, invalid val (missing field)
|
|
|
|
{
|
|
|
|
[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
2014-08-19 14:58:26 +02:00
|
|
|
[]byte{'g', 'a', 'r', 'b', 'a', 'g', 'e'},
|
2014-04-19 04:31:13 +02:00
|
|
|
},
|
2014-06-26 17:43:13 +02:00
|
|
|
// type s, invalid key (missing id)
|
|
|
|
{
|
|
|
|
[]byte{'s'},
|
2014-08-06 19:52:20 +02:00
|
|
|
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
2014-06-26 17:43:13 +02:00
|
|
|
},
|
|
|
|
// type b, invalid val (missing field)
|
|
|
|
{
|
2014-09-03 23:48:40 +02:00
|
|
|
[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator},
|
2014-08-06 19:52:20 +02:00
|
|
|
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
2014-06-26 17:43:13 +02:00
|
|
|
},
|
2014-04-19 04:31:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, test := range tests {
|
|
|
|
_, err := ParseFromKeyValue(test.key, test.val)
|
|
|
|
if err == nil {
|
|
|
|
t.Errorf("expected error, got nil")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-03-06 19:00:53 +01:00
|
|
|
|
2015-05-06 16:04:02 +02:00
|
|
|
func TestDictionaryRowValueBug197(t *testing.T) {
|
|
|
|
// this was the smallest value that would trigger a crash
|
|
|
|
dr := &DictionaryRow{
|
|
|
|
field: 0,
|
|
|
|
term: []byte("marty"),
|
|
|
|
count: 72057594037927936,
|
|
|
|
}
|
|
|
|
dr.Value()
|
|
|
|
// this is the maximum possible value
|
|
|
|
dr = &DictionaryRow{
|
|
|
|
field: 0,
|
|
|
|
term: []byte("marty"),
|
|
|
|
count: math.MaxUint64,
|
|
|
|
}
|
|
|
|
dr.Value()
|
|
|
|
// neither of these should panic
|
|
|
|
}
|
|
|
|
|
2015-03-06 19:00:53 +01:00
|
|
|
func BenchmarkTermFrequencyRowEncode(b *testing.B) {
|
2015-10-13 03:06:38 +02:00
|
|
|
row := NewTermFrequencyRowWithTermVectors(
|
|
|
|
[]byte{'b', 'e', 'e', 'r'},
|
|
|
|
0,
|
2016-01-07 08:38:02 +01:00
|
|
|
[]byte("budweiser"),
|
2015-10-13 03:06:38 +02:00
|
|
|
3,
|
|
|
|
3.14,
|
|
|
|
[]*TermVector{
|
2016-04-03 03:54:33 +02:00
|
|
|
{
|
2015-10-13 03:06:38 +02:00
|
|
|
field: 0,
|
|
|
|
pos: 1,
|
|
|
|
start: 3,
|
|
|
|
end: 11,
|
|
|
|
},
|
2016-04-03 03:54:33 +02:00
|
|
|
{
|
2015-10-13 03:06:38 +02:00
|
|
|
field: 0,
|
|
|
|
pos: 2,
|
|
|
|
start: 23,
|
|
|
|
end: 31,
|
|
|
|
},
|
2016-04-03 03:54:33 +02:00
|
|
|
{
|
2015-10-13 03:06:38 +02:00
|
|
|
field: 0,
|
|
|
|
pos: 3,
|
|
|
|
start: 43,
|
|
|
|
end: 51,
|
|
|
|
},
|
|
|
|
})
|
|
|
|
b.ResetTimer()
|
2015-03-06 19:00:53 +01:00
|
|
|
for i := 0; i < b.N; i++ {
|
|
|
|
row.Key()
|
|
|
|
row.Value()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func BenchmarkTermFrequencyRowDecode(b *testing.B) {
|
2015-10-13 03:06:38 +02:00
|
|
|
k := []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}
|
|
|
|
v := []byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0}
|
|
|
|
b.ResetTimer()
|
2015-03-06 19:00:53 +01:00
|
|
|
for i := 0; i < b.N; i++ {
|
2015-04-07 20:52:00 +02:00
|
|
|
_, err := NewTermFrequencyRowKV(k, v)
|
|
|
|
if err != nil {
|
|
|
|
b.Fatal(err)
|
|
|
|
}
|
2015-03-06 19:00:53 +01:00
|
|
|
}
|
|
|
|
}
|
2015-03-11 20:12:13 +01:00
|
|
|
|
|
|
|
func BenchmarkBackIndexRowEncode(b *testing.B) {
|
|
|
|
field := uint32(1)
|
|
|
|
t1 := "term1"
|
2016-01-07 08:38:02 +01:00
|
|
|
row := NewBackIndexRow([]byte("beername"),
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
[]*BackIndexTermsEntry{
|
2016-04-03 03:54:33 +02:00
|
|
|
{
|
2015-10-13 03:06:38 +02:00
|
|
|
Field: &field,
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
Terms: []string{t1},
|
2015-03-11 20:12:13 +01:00
|
|
|
},
|
2015-10-13 03:06:38 +02:00
|
|
|
},
|
|
|
|
[]*BackIndexStoreEntry{
|
2016-04-03 03:54:33 +02:00
|
|
|
{
|
2015-10-13 03:06:38 +02:00
|
|
|
Field: &field,
|
|
|
|
},
|
|
|
|
})
|
|
|
|
b.ResetTimer()
|
|
|
|
for i := 0; i < b.N; i++ {
|
2015-03-11 20:12:13 +01:00
|
|
|
row.Key()
|
|
|
|
row.Value()
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
b.Logf("%#v", row.Value())
|
2015-03-11 20:12:13 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func BenchmarkBackIndexRowDecode(b *testing.B) {
|
2015-10-13 03:06:38 +02:00
|
|
|
k := []byte{0x62, 0x62, 0x65, 0x65, 0x72, 0x6e, 0x61, 0x6d, 0x65}
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
v := []byte{0xa, 0x9, 0x8, 0x1, 0x12, 0x5, 0x74, 0x65, 0x72, 0x6d, 0x31, 0x12, 0x2, 0x8, 0x1}
|
2015-10-13 03:06:38 +02:00
|
|
|
b.ResetTimer()
|
2015-03-11 20:12:13 +01:00
|
|
|
for i := 0; i < b.N; i++ {
|
2015-04-07 20:52:00 +02:00
|
|
|
_, err := NewBackIndexRowKV(k, v)
|
|
|
|
if err != nil {
|
|
|
|
b.Fatal(err)
|
|
|
|
}
|
2015-03-11 20:12:13 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func BenchmarkStoredRowEncode(b *testing.B) {
|
2016-01-07 08:38:02 +01:00
|
|
|
row := NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer"))
|
2015-10-13 03:06:38 +02:00
|
|
|
b.ResetTimer()
|
2015-03-11 20:12:13 +01:00
|
|
|
for i := 0; i < b.N; i++ {
|
|
|
|
row.Key()
|
|
|
|
row.Value()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func BenchmarkStoredRowDecode(b *testing.B) {
|
2015-10-13 03:06:38 +02:00
|
|
|
k := []byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0}
|
|
|
|
v := []byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'}
|
|
|
|
b.ResetTimer()
|
2015-03-11 20:12:13 +01:00
|
|
|
for i := 0; i < b.N; i++ {
|
2015-04-07 20:52:00 +02:00
|
|
|
_, err := NewStoredRowKV(k, v)
|
|
|
|
if err != nil {
|
|
|
|
b.Fatal(err)
|
|
|
|
}
|
2015-03-11 20:12:13 +01:00
|
|
|
}
|
|
|
|
}
|