0
0
Fork 0

make backindex rows more compact, fix bug counting docs on start

This commit is contained in:
Marty Schoch 2016-09-09 11:04:11 -04:00
parent d3ca5424e2
commit 1b68c4ec5b
19 changed files with 316 additions and 265 deletions

View File

@ -63,12 +63,9 @@ type AsyncIndex interface {
type IndexReader interface {
TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)
// DocIDReader returns an iterator over documents which identifiers are
// greater than or equal to start and smaller than end. Set start to the
// empty string to iterate from the first document, end to the empty string
// to iterate to the last one.
// DocIDReader returns an iterator over all doc ids
// The caller must close returned instance to release associated resources.
DocIDReader(start, end string) (DocIDReader, error)
DocIDReaderAll() (DocIDReader, error)
DocIDReaderOnly(ids []string) (DocIDReader, error)

View File

@ -101,7 +101,7 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)
backIndexTermEntries := make([]*BackIndexTermEntry, 0, rowsCapNeeded)
backIndexTermsEntries := make([]*BackIndexTermsEntry, 0, len(fieldTermFreqs))
// walk through the collated information and process
// once for each indexed field (unique name)
@ -110,11 +110,11 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
// encode this field
rv.Rows, backIndexTermEntries = udc.indexField(d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermEntries)
rv.Rows, backIndexTermsEntries = udc.indexField(d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
}
// build the back index row
backIndexRow := NewBackIndexRow(d.Number, backIndexTermEntries, backIndexStoredEntries)
backIndexRow := NewBackIndexRow(d.Number, backIndexTermsEntries, backIndexStoredEntries)
rv.Rows = append(rv.Rows, backIndexRow)
return rv

View File

@ -163,9 +163,11 @@ func (udc *SmolderingCouch) DumpDoc(id string) chan interface{} {
}
// build sorted list of term keys
keys := make(keyset, 0)
for _, entry := range back.termEntries {
tfrk := TermFrequencyRowStart([]byte(*entry.Term), uint16(*entry.Field), back.docNumber)
keys = append(keys, tfrk)
for _, entry := range back.termsEntries {
for i := range entry.Terms {
tfrk := TermFrequencyRowStart([]byte(entry.Terms[i]), uint16(*entry.Field), back.docNumber)
keys = append(keys, tfrk)
}
}
sort.Sort(keys)

View File

@ -92,7 +92,8 @@ func TestDump(t *testing.T) {
// 16 date terms
// 3 stored fields
// 1 id term row
expectedDocRowCount := int(1+(2*(64/document.DefaultPrecisionStep))+3) + 1
// 1 id stored row
expectedDocRowCount := int(1+(2*(64/document.DefaultPrecisionStep))+3) + 1 + 1
docRowCount := 0
docRows := idx.DumpDoc("1")
for range docRows {

View File

@ -47,8 +47,8 @@ func (i *IndexReader) FieldDictPrefix(fieldName string, termPrefix []byte) (inde
return i.FieldDictRange(fieldName, termPrefix, termPrefix)
}
func (i *IndexReader) DocIDReader(start, end string) (index.DocIDReader, error) {
return newSmolderingCouchDocIDReader(i, start, end)
func (i *IndexReader) DocIDReaderAll() (index.DocIDReader, error) {
return newSmolderingCouchDocIDReader(i)
}
func (i *IndexReader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
@ -106,15 +106,10 @@ func (i *IndexReader) DocumentFieldTerms(id index.IndexInternalID) (index.FieldT
if back == nil {
return nil, nil
}
rv := make(index.FieldTerms, len(back.termEntries))
for _, entry := range back.termEntries {
rv := make(index.FieldTerms, len(back.termsEntries))
for _, entry := range back.termsEntries {
fieldName := i.index.fieldCache.FieldIndexed(uint16(*entry.Field))
terms, ok := rv[fieldName]
if !ok {
terms = make([]string, 0)
}
terms = append(terms, *entry.Term)
rv[fieldName] = terms
rv[fieldName] = entry.Terms
}
return rv, nil
}
@ -133,14 +128,9 @@ func (i *IndexReader) DocumentFieldTermsForFields(id index.IndexInternalID, fiel
}
fieldsMap[id] = f
}
for _, entry := range back.termEntries {
for _, entry := range back.termsEntries {
if field, ok := fieldsMap[uint16(*entry.Field)]; ok {
terms, ok := rv[field]
if !ok {
terms = make([]string, 0)
}
terms = append(terms, *entry.Term)
rv[field] = terms
rv[field] = entry.Terms
}
}
return rv, nil

View File

@ -135,15 +135,10 @@ type SmolderingCouchDocIDReader struct {
onlyMode bool
}
func newSmolderingCouchDocIDReader(indexReader *IndexReader, start, end string) (*SmolderingCouchDocIDReader, error) {
startBytes := []byte(start)
if start == "" {
startBytes = []byte{0x0}
}
endBytes := []byte(end)
if end == "" {
endBytes = []byte{0xff}
}
func newSmolderingCouchDocIDReader(indexReader *IndexReader) (*SmolderingCouchDocIDReader, error) {
startBytes := []byte{0x0}
endBytes := []byte{0xff}
bisrk := BackIndexRowKey(startBytes)
bierk := BackIndexRowKey(endBytes)
it := indexReader.kvreader.RangeIterator(bisrk, bierk)
@ -187,12 +182,14 @@ func (r *SmolderingCouchDocIDReader) Next() (index.IndexInternalID, error) {
}
// find doc id
for _, te := range br.termEntries {
for _, te := range br.termsEntries {
if te.GetField() == 0 {
if _, ok := r.only[te.GetTerm()]; ok {
rv = append([]byte(nil), br.docNumber...)
r.iterator.Next()
return rv, nil
for i := range te.Terms {
if _, ok := r.only[te.Terms[i]]; ok {
rv = append([]byte(nil), br.docNumber...)
r.iterator.Next()
return rv, nil
}
}
break
}
@ -228,12 +225,14 @@ func (r *SmolderingCouchDocIDReader) Advance(docID index.IndexInternalID) (index
}
// find doc id
for _, te := range br.termEntries {
for _, te := range br.termsEntries {
if te.GetField() == 0 {
if _, ok := r.only[te.GetTerm()]; ok {
rv = append([]byte(nil), br.docNumber...)
r.iterator.Next()
return rv, nil
for i := range te.Terms {
if _, ok := r.only[te.Terms[i]]; ok {
rv = append([]byte(nil), br.docNumber...)
r.iterator.Next()
return rv, nil
}
}
break
}

View File

@ -249,7 +249,7 @@ func TestIndexDocIdReader(t *testing.T) {
}()
// first get all doc ids
reader, err := indexReader.DocIDReader("", "")
reader, err := indexReader.DocIDReaderAll()
if err != nil {
t.Errorf("Error accessing doc id reader: %v", err)
}
@ -271,7 +271,7 @@ func TestIndexDocIdReader(t *testing.T) {
}
// try it again, but jump to the second doc this time
reader2, err := indexReader.DocIDReader("", "")
reader2, err := indexReader.DocIDReaderAll()
if err != nil {
t.Errorf("Error accessing doc id reader: %v", err)
}

View File

@ -493,6 +493,13 @@ func TermFrequencyRowStart(term []byte, field uint16, docNum []byte) []byte {
return tfr.Key()
}
// TermFrequencyRowStartField returns the 3-byte key prefix shared by every
// term frequency row belonging to the given field: the 't' row-type marker
// followed by the field number encoded little-endian. The result is suitable
// for use as a prefix-iterator start key.
func TermFrequencyRowStartField(field uint16) []byte {
	prefix := []byte{'t', 0, 0}
	binary.LittleEndian.PutUint16(prefix[1:], field)
	return prefix
}
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docNum uint64, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
return &TermFrequencyRow{
term: term,
@ -641,28 +648,21 @@ func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) {
type BackIndexRow struct {
docNumber []byte
termEntries []*BackIndexTermEntry
termsEntries []*BackIndexTermsEntry
storedEntries []*BackIndexStoreEntry
}
func (br *BackIndexRow) FindExternalID() string {
for _, te := range br.termEntries {
if te.GetField() == 0 {
return te.GetTerm()
}
}
return ""
}
func (br *BackIndexRow) AllTermKeys() [][]byte {
if br == nil {
return nil
}
rv := make([][]byte, len(br.termEntries))
for i, termEntry := range br.termEntries {
termRowK := TermFrequencyRowStart([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), br.docNumber)
rv[i] = termRowK
rv := make([][]byte, 0, len(br.termsEntries)) // FIXME this underestimates severely
for _, termsEntry := range br.termsEntries {
for i := range termsEntry.Terms {
termRowK := TermFrequencyRowStart([]byte(termsEntry.Terms[i]), uint16(termsEntry.GetField()), br.docNumber)
rv = append(rv, termRowK)
}
}
return rv
}
@ -703,7 +703,7 @@ func (br *BackIndexRow) Value() []byte {
func (br *BackIndexRow) ValueSize() int {
birv := &BackIndexRowValue{
TermEntries: br.termEntries,
TermsEntries: br.termsEntries,
StoredEntries: br.storedEntries,
}
return birv.Size()
@ -711,7 +711,7 @@ func (br *BackIndexRow) ValueSize() int {
func (br *BackIndexRow) ValueTo(buf []byte) (int, error) {
birv := &BackIndexRowValue{
TermEntries: br.termEntries,
TermsEntries: br.termsEntries,
StoredEntries: br.storedEntries,
}
return birv.MarshalTo(buf)
@ -719,13 +719,13 @@ func (br *BackIndexRow) ValueTo(buf []byte) (int, error) {
func (br *BackIndexRow) String() string {
_, dn, _ := DecodeUvarintAscending(br.docNumber)
return fmt.Sprintf("Backindex Document: %d Term Entries: %v, Stored Entries: %v", dn, br.termEntries, br.storedEntries)
return fmt.Sprintf("Backindex Document: %d Terms Entries: %v, Stored Entries: %v", dn, br.termsEntries, br.storedEntries)
}
func NewBackIndexRow(docNum uint64, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
func NewBackIndexRow(docNum uint64, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
return &BackIndexRow{
docNumber: EncodeUvarintAscending(nil, docNum),
termEntries: entries,
termsEntries: entries,
storedEntries: storedFields,
}
}
@ -745,7 +745,7 @@ func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) {
if err != nil {
return nil, err
}
rv.termEntries = birv.TermEntries
rv.termsEntries = birv.TermsEntries
rv.storedEntries = birv.StoredEntries
return &rv, nil

View File

@ -76,19 +76,50 @@ func TestRows(t *testing.T) {
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
},
{
NewBackIndexRow(1, []*BackIndexTermEntry{{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
NewBackIndexRow(1, []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}}, nil),
[]byte{'b', 137},
[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0},
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r'},
},
{
NewBackIndexRow(1, []*BackIndexTermEntry{{Term: proto.String("beer"), Field: proto.Uint32(0)}, {Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
NewBackIndexRow(1, []*BackIndexTermsEntry{
{
Field: proto.Uint32(0),
Terms: []string{"beer"},
},
{
Field: proto.Uint32(1),
Terms: []string{"beat"},
},
}, nil),
[]byte{'b', 137},
[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1},
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't'},
},
{
NewBackIndexRow(1, []*BackIndexTermEntry{{Term: proto.String("beer"), Field: proto.Uint32(0)}, {Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{{Field: proto.Uint32(3)}, {Field: proto.Uint32(4)}, {Field: proto.Uint32(5)}}),
NewBackIndexRow(1,
[]*BackIndexTermsEntry{
{
Field: proto.Uint32(0),
Terms: []string{"beer"},
},
{
Field: proto.Uint32(1),
Terms: []string{"beat"},
},
},
[]*BackIndexStoreEntry{
{
Field: proto.Uint32(3),
},
{
Field: proto.Uint32(4),
},
{
Field: proto.Uint32(5),
},
},
),
[]byte{'b', 137},
[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1, 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't', 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
},
{
NewStoredRow(1, 0, []uint64{}, byte('t'), []byte("an american beer")),
@ -305,10 +336,10 @@ func BenchmarkBackIndexRowEncode(b *testing.B) {
field := uint32(1)
t1 := "term1"
row := NewBackIndexRow(1,
[]*BackIndexTermEntry{
[]*BackIndexTermsEntry{
{
Term: &t1,
Field: &field,
Terms: []string{t1},
},
},
[]*BackIndexStoreEntry{

View File

@ -3,15 +3,15 @@
// DO NOT EDIT!
/*
Package smolder is a generated protocol buffer package.
Package smolder is a generated protocol buffer package.
It is generated from these files:
smolder.proto
It is generated from these files:
smolder.proto
It has these top-level messages:
BackIndexTermEntry
BackIndexStoreEntry
BackIndexRowValue
It has these top-level messages:
BackIndexTermsEntry
BackIndexStoreEntry
BackIndexRowValue
*/
package smolder
@ -26,30 +26,30 @@ import github_com_golang_protobuf_proto "github.com/golang/protobuf/proto"
var _ = proto.Marshal
var _ = math.Inf
type BackIndexTermEntry struct {
Term *string `protobuf:"bytes,1,req,name=term" json:"term,omitempty"`
Field *uint32 `protobuf:"varint,2,req,name=field" json:"field,omitempty"`
XXX_unrecognized []byte `json:"-"`
type BackIndexTermsEntry struct {
Field *uint32 `protobuf:"varint,1,req,name=field" json:"field,omitempty"`
Terms []string `protobuf:"bytes,2,rep,name=terms" json:"terms,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *BackIndexTermEntry) Reset() { *m = BackIndexTermEntry{} }
func (m *BackIndexTermEntry) String() string { return proto.CompactTextString(m) }
func (*BackIndexTermEntry) ProtoMessage() {}
func (m *BackIndexTermsEntry) Reset() { *m = BackIndexTermsEntry{} }
func (m *BackIndexTermsEntry) String() string { return proto.CompactTextString(m) }
func (*BackIndexTermsEntry) ProtoMessage() {}
func (m *BackIndexTermEntry) GetTerm() string {
if m != nil && m.Term != nil {
return *m.Term
}
return ""
}
func (m *BackIndexTermEntry) GetField() uint32 {
func (m *BackIndexTermsEntry) GetField() uint32 {
if m != nil && m.Field != nil {
return *m.Field
}
return 0
}
func (m *BackIndexTermsEntry) GetTerms() []string {
if m != nil {
return m.Terms
}
return nil
}
type BackIndexStoreEntry struct {
Field *uint32 `protobuf:"varint,1,req,name=field" json:"field,omitempty"`
ArrayPositions []uint64 `protobuf:"varint,2,rep,name=arrayPositions" json:"arrayPositions,omitempty"`
@ -75,7 +75,7 @@ func (m *BackIndexStoreEntry) GetArrayPositions() []uint64 {
}
type BackIndexRowValue struct {
TermEntries []*BackIndexTermEntry `protobuf:"bytes,1,rep,name=termEntries" json:"termEntries,omitempty"`
TermsEntries []*BackIndexTermsEntry `protobuf:"bytes,1,rep,name=termsEntries" json:"termsEntries,omitempty"`
StoredEntries []*BackIndexStoreEntry `protobuf:"bytes,2,rep,name=storedEntries" json:"storedEntries,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
@ -84,9 +84,9 @@ func (m *BackIndexRowValue) Reset() { *m = BackIndexRowValue{} }
func (m *BackIndexRowValue) String() string { return proto.CompactTextString(m) }
func (*BackIndexRowValue) ProtoMessage() {}
func (m *BackIndexRowValue) GetTermEntries() []*BackIndexTermEntry {
func (m *BackIndexRowValue) GetTermsEntries() []*BackIndexTermsEntry {
if m != nil {
return m.TermEntries
return m.TermsEntries
}
return nil
}
@ -98,7 +98,7 @@ func (m *BackIndexRowValue) GetStoredEntries() []*BackIndexStoreEntry {
return nil
}
func (m *BackIndexTermEntry) Unmarshal(data []byte) error {
func (m *BackIndexTermsEntry) Unmarshal(data []byte) error {
var hasFields [1]uint64
l := len(data)
iNdEx := 0
@ -119,8 +119,26 @@ func (m *BackIndexTermEntry) Unmarshal(data []byte) error {
wireType := int(wire & 0x7)
switch fieldNum {
case 1:
if wireType != 0 {
return fmt.Errorf("proto: wrong wireType = %d for field Field", wireType)
}
var v uint32
for shift := uint(0); ; shift += 7 {
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := data[iNdEx]
iNdEx++
v |= (uint32(b) & 0x7F) << shift
if b < 0x80 {
break
}
}
m.Field = &v
hasFields[0] |= uint64(0x00000001)
case 2:
if wireType != 2 {
return fmt.Errorf("proto: wrong wireType = %d for field Term", wireType)
return fmt.Errorf("proto: wrong wireType = %d for field Terms", wireType)
}
var stringLen uint64
for shift := uint(0); ; shift += 7 {
@ -138,28 +156,8 @@ func (m *BackIndexTermEntry) Unmarshal(data []byte) error {
if postIndex > l {
return io.ErrUnexpectedEOF
}
s := string(data[iNdEx:postIndex])
m.Term = &s
m.Terms = append(m.Terms, string(data[iNdEx:postIndex]))
iNdEx = postIndex
hasFields[0] |= uint64(0x00000001)
case 2:
if wireType != 0 {
return fmt.Errorf("proto: wrong wireType = %d for field Field", wireType)
}
var v uint32
for shift := uint(0); ; shift += 7 {
if iNdEx >= l {
return io.ErrUnexpectedEOF
}
b := data[iNdEx]
iNdEx++
v |= (uint32(b) & 0x7F) << shift
if b < 0x80 {
break
}
}
m.Field = &v
hasFields[0] |= uint64(0x00000002)
default:
var sizeOfWire int
for {
@ -187,9 +185,6 @@ func (m *BackIndexTermEntry) Unmarshal(data []byte) error {
if hasFields[0]&uint64(0x00000001) == 0 {
return new(github_com_golang_protobuf_proto.RequiredNotSetError)
}
if hasFields[0]&uint64(0x00000002) == 0 {
return new(github_com_golang_protobuf_proto.RequiredNotSetError)
}
return nil
}
@ -299,7 +294,7 @@ func (m *BackIndexRowValue) Unmarshal(data []byte) error {
switch fieldNum {
case 1:
if wireType != 2 {
return fmt.Errorf("proto: wrong wireType = %d for field TermEntries", wireType)
return fmt.Errorf("proto: wrong wireType = %d for field TermsEntries", wireType)
}
var msglen int
for shift := uint(0); ; shift += 7 {
@ -320,8 +315,8 @@ func (m *BackIndexRowValue) Unmarshal(data []byte) error {
if postIndex > l {
return io.ErrUnexpectedEOF
}
m.TermEntries = append(m.TermEntries, &BackIndexTermEntry{})
if err := m.TermEntries[len(m.TermEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
m.TermsEntries = append(m.TermsEntries, &BackIndexTermsEntry{})
if err := m.TermsEntries[len(m.TermsEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
return err
}
iNdEx = postIndex
@ -472,16 +467,18 @@ var (
ErrInvalidLengthSmolder = fmt.Errorf("proto: negative length found during unmarshaling")
)
func (m *BackIndexTermEntry) Size() (n int) {
func (m *BackIndexTermsEntry) Size() (n int) {
var l int
_ = l
if m.Term != nil {
l = len(*m.Term)
n += 1 + l + sovSmolder(uint64(l))
}
if m.Field != nil {
n += 1 + sovSmolder(uint64(*m.Field))
}
if len(m.Terms) > 0 {
for _, s := range m.Terms {
l = len(s)
n += 1 + l + sovSmolder(uint64(l))
}
}
if m.XXX_unrecognized != nil {
n += len(m.XXX_unrecognized)
}
@ -508,8 +505,8 @@ func (m *BackIndexStoreEntry) Size() (n int) {
func (m *BackIndexRowValue) Size() (n int) {
var l int
_ = l
if len(m.TermEntries) > 0 {
for _, e := range m.TermEntries {
if len(m.TermsEntries) > 0 {
for _, e := range m.TermsEntries {
l = e.Size()
n += 1 + l + sovSmolder(uint64(l))
}
@ -539,7 +536,7 @@ func sovSmolder(x uint64) (n int) {
func sozSmolder(x uint64) (n int) {
return sovSmolder(uint64((x << 1) ^ uint64((int64(x) >> 63))))
}
func (m *BackIndexTermEntry) Marshal() (data []byte, err error) {
func (m *BackIndexTermsEntry) Marshal() (data []byte, err error) {
size := m.Size()
data = make([]byte, size)
n, err := m.MarshalTo(data)
@ -549,26 +546,33 @@ func (m *BackIndexTermEntry) Marshal() (data []byte, err error) {
return data[:n], nil
}
func (m *BackIndexTermEntry) MarshalTo(data []byte) (n int, err error) {
func (m *BackIndexTermsEntry) MarshalTo(data []byte) (n int, err error) {
var i int
_ = i
var l int
_ = l
if m.Term == nil {
return 0, new(github_com_golang_protobuf_proto.RequiredNotSetError)
} else {
data[i] = 0xa
i++
i = encodeVarintSmolder(data, i, uint64(len(*m.Term)))
i += copy(data[i:], *m.Term)
}
if m.Field == nil {
return 0, new(github_com_golang_protobuf_proto.RequiredNotSetError)
} else {
data[i] = 0x10
data[i] = 0x8
i++
i = encodeVarintSmolder(data, i, uint64(*m.Field))
}
if len(m.Terms) > 0 {
for _, s := range m.Terms {
data[i] = 0x12
i++
l = len(s)
for l >= 1<<7 {
data[i] = uint8(uint64(l)&0x7f | 0x80)
l >>= 7
i++
}
data[i] = uint8(l)
i++
i += copy(data[i:], s)
}
}
if m.XXX_unrecognized != nil {
i += copy(data[i:], m.XXX_unrecognized)
}
@ -625,8 +629,8 @@ func (m *BackIndexRowValue) MarshalTo(data []byte) (n int, err error) {
_ = i
var l int
_ = l
if len(m.TermEntries) > 0 {
for _, msg := range m.TermEntries {
if len(m.TermsEntries) > 0 {
for _, msg := range m.TermsEntries {
data[i] = 0xa
i++
i = encodeVarintSmolder(data, i, uint64(msg.Size()))

View File

@ -1,6 +1,6 @@
message BackIndexTermEntry {
required string term = 1;
required uint32 field = 2;
message BackIndexTermsEntry {
required uint32 field = 1;
repeated string terms = 2;
}
message BackIndexStoreEntry {
@ -9,6 +9,6 @@ message BackIndexStoreEntry {
}
message BackIndexRowValue {
repeated BackIndexTermEntry termEntries = 1;
repeated BackIndexTermsEntry termsEntries = 1;
repeated BackIndexStoreEntry storedEntries = 2;
}
}

View File

@ -41,7 +41,7 @@ const RowBufferSize = 4 * 1024
var VersionKey = []byte{'v'}
const Version uint8 = 5
const Version uint8 = 6
var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)
@ -374,7 +374,7 @@ func (udc *SmolderingCouch) Open() (err error) {
}
func (udc *SmolderingCouch) countDocs(kvreader store.KVReader) (count, highDocNum uint64, err error) {
k := TermFrequencyRowStart(nil, 0, nil)
k := TermFrequencyRowStartField(0)
it := kvreader.PrefixIterator(k)
defer func() {
if cerr := it.Close(); err == nil && cerr != nil {
@ -552,9 +552,11 @@ func (udc *SmolderingCouch) mergeOldAndNew(externalDocId string, backIndexRow *B
if backIndexRow != nil {
row.docNumber = backIndexRow.docNumber
// look through the backindex and update the term entry for _id
for _, te := range row.termEntries {
for _, te := range row.termsEntries {
if *te.Field == 0 {
te.Term = &externalDocId
for i := range te.Terms {
te.Terms[i] = externalDocId
}
}
}
}
@ -641,9 +643,10 @@ func encodeFieldType(f document.Field) byte {
return fieldType
}
func (udc *SmolderingCouch) indexField(docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermEntries []*BackIndexTermEntry) ([]index.IndexRow, []*BackIndexTermEntry) {
func (udc *SmolderingCouch) indexField(docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermsEntries []*BackIndexTermsEntry) ([]index.IndexRow, []*BackIndexTermsEntry) {
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
terms := make([]string, 0, len(tokenFreqs))
for k, tf := range tokenFreqs {
var termFreqRow *TermFrequencyRow
if includeTermVectors {
@ -655,13 +658,14 @@ func (udc *SmolderingCouch) indexField(docNum uint64, includeTermVectors bool, f
}
// record the back index entry
backIndexTermEntry := BackIndexTermEntry{Term: proto.String(k), Field: proto.Uint32(uint32(fieldIndex))}
backIndexTermEntries = append(backIndexTermEntries, &backIndexTermEntry)
terms = append(terms, k)
rows = append(rows, termFreqRow)
}
return rows, backIndexTermEntries
backIndexTermsEntry := BackIndexTermsEntry{Field: proto.Uint32(uint32(fieldIndex)), Terms: terms}
backIndexTermsEntries = append(backIndexTermsEntries, &backIndexTermsEntry)
return rows, backIndexTermsEntries
}
func (udc *SmolderingCouch) Delete(id string) (err error) {
@ -731,9 +735,11 @@ func (udc *SmolderingCouch) Delete(id string) (err error) {
}
func (udc *SmolderingCouch) deleteSingle(id index.IndexInternalID, backIndexRow *BackIndexRow, deleteRows []SmolderingCouchRow) []SmolderingCouchRow {
for _, backIndexEntry := range backIndexRow.termEntries {
tfr := TermFrequencyRowDocNumBytes([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id)
deleteRows = append(deleteRows, tfr)
for _, backIndexEntry := range backIndexRow.termsEntries {
for i := range backIndexEntry.Terms {
tfr := TermFrequencyRowDocNumBytes([]byte(backIndexEntry.Terms[i]), uint16(*backIndexEntry.Field), id)
deleteRows = append(deleteRows, tfr)
}
}
for _, se := range backIndexRow.storedEntries {
sf := NewStoredRowDocBytes(id, uint16(*se.Field), se.ArrayPositions, 'x', nil)

View File

@ -142,9 +142,10 @@ func TestIndexInsert(t *testing.T) {
// should have 4 rows (1 for version, 1 for schema field, and 1 for single term, and 1 for the term count, and 1 for the back index entry)
// +1 for id term
// +1 for id stored
// +1 for id term dictionary
// +1 for id field def
expectedLength := uint64(1 + 1 + 1 + 1 + 1 + 1 + 1 + 1)
expectedLength := uint64(1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1)
rowCount, err := idx.(*SmolderingCouch).rowCount()
if err != nil {
t.Error(err)
@ -294,8 +295,8 @@ func TestIndexInsertThenUpdate(t *testing.T) {
t.Errorf("Error deleting entry from index: %v", err)
}
// should have 2 rows (1 for version, 2 for schema field, and 3 for the two term, and 3 for the term counts, and 1 for the back index entry)
expectedLength := uint64(1 + 2 + 3 + 3 + 1)
// should have 2 rows (1 for version, 2 for schema field, and 3 for the two terms, and 3 for the term counts, and 1 for the back index entry, and 1 for stored id)
expectedLength := uint64(1 + 2 + 3 + 3 + 1 + 1)
rowCount, err := idx.(*SmolderingCouch).rowCount()
if err != nil {
t.Error(err)
@ -316,8 +317,8 @@ func TestIndexInsertThenUpdate(t *testing.T) {
t.Errorf("Error deleting entry from index: %v", err)
}
// should have 2 rows (1 for version, 2 for schema field, and 2 for the remaining terms, and 2 for the term dictionary, and 1 for the back index entry)
expectedLength = uint64(1 + 2 + 2 + 3 + 1)
// should have 2 rows (1 for version, 2 for schema field, and 2 for the remaining terms, and 2 for the term dictionary, and 1 for the back index entry, and 1 for stored id)
expectedLength = uint64(1 + 2 + 2 + 3 + 1 + 1)
rowCount, err = idx.(*SmolderingCouch).rowCount()
if err != nil {
t.Error(err)
@ -367,8 +368,8 @@ func TestIndexInsertMultiple(t *testing.T) {
}
expectedCount++
// should have 4 rows (1 for version, 1 for schema field, and 4 for terms, and 3 for the term count, and 2 for the back index entries)
expectedLength := uint64(1 + 2 + 4 + 3 + 2)
// should have 4 rows (1 for version, 1 for schema field, and 4 for terms, and 3 for the term count, and 2 for the back index entries, and 2 for stored ids)
expectedLength := uint64(1 + 2 + 4 + 3 + 2 + 2)
rowCount, err := idx.(*SmolderingCouch).rowCount()
if err != nil {
t.Error(err)
@ -464,8 +465,8 @@ func TestIndexInsertWithStore(t *testing.T) {
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
}
// should have 6 rows (1 for version, 2 for schema field, and 2 for terms, and 1 for the stored field and 2 for the term counts, and 1 for the back index entry)
expectedLength := uint64(1 + 2 + 2 + 1 + 2 + 1)
// should have 6 rows (1 for version, 2 for schema field, and 2 for terms, and 2 for the stored field and 2 for the term counts, and 1 for the back index entry)
expectedLength := uint64(1 + 2 + 2 + 2 + 2 + 1)
rowCount, err := idx.(*SmolderingCouch).rowCount()
if err != nil {
t.Error(err)
@ -490,15 +491,19 @@ func TestIndexInsertWithStore(t *testing.T) {
t.Error(err)
}
if len(storedDoc.Fields) != 1 {
if len(storedDoc.Fields) != 2 {
t.Errorf("expected 1 stored field, got %d", len(storedDoc.Fields))
}
textField, ok := storedDoc.Fields[0].(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "test" {
t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
for _, f := range storedDoc.Fields {
if f.Name() == "name" {
textField, ok := f.(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "test" {
t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
}
}
}
}
@ -675,7 +680,7 @@ func TestIndexBatch(t *testing.T) {
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
}
docIDReader, err := indexReader.DocIDReader("", "")
docIDReader, err := indexReader.DocIDReaderAll()
if err != nil {
t.Error(err)
}
@ -753,20 +758,20 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) {
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
}
// should have 77 rows
// should have 78 rows
// 1 for version
// 4 for schema fields
// 1 for id term
// 1 for text term
// 16 for numeric terms
// 16 for date terms
// 3 for the stored field
// 4 for the stored field
// 1 for id term count
// 1 for the text term count
// 16 for numeric term counts
// 16 for date term counts
// 1 for the back index entry
expectedLength := uint64(1 + 4 + 1 + 1 + (64 / document.DefaultPrecisionStep) + (64 / document.DefaultPrecisionStep) + 3 + 1 + 1 + (64 / document.DefaultPrecisionStep) + (64 / document.DefaultPrecisionStep) + 1)
expectedLength := uint64(1 + 4 + 1 + 1 + (64 / document.DefaultPrecisionStep) + (64 / document.DefaultPrecisionStep) + 4 + 1 + 1 + (64 / document.DefaultPrecisionStep) + (64 / document.DefaultPrecisionStep) + 1)
rowCount, err := idx.(*SmolderingCouch).rowCount()
if err != nil {
t.Error(err)
@ -790,38 +795,48 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) {
t.Error(err)
}
if len(storedDoc.Fields) != 3 {
t.Errorf("expected 3 stored field, got %d", len(storedDoc.Fields))
if len(storedDoc.Fields) != 4 {
t.Errorf("expected 4 stored field, got %d", len(storedDoc.Fields))
}
textField, ok := storedDoc.Fields[0].(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "test" {
t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
}
numField, ok := storedDoc.Fields[1].(*document.NumericField)
if !ok {
t.Errorf("expected numeric field")
}
numFieldNumer, err := numField.Number()
if err != nil {
t.Error(err)
} else {
if numFieldNumer != 35.99 {
t.Errorf("expeted numeric value 35.99, got %f", numFieldNumer)
for _, f := range storedDoc.Fields {
if f.Name() == "name" {
textField, ok := f.(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "test" {
t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
}
}
}
dateField, ok := storedDoc.Fields[2].(*document.DateTimeField)
if !ok {
t.Errorf("expected date field")
}
dateFieldDate, err := dateField.DateTime()
if err != nil {
t.Error(err)
} else {
if dateFieldDate != time.Unix(0, 0).UTC() {
t.Errorf("expected date value unix epoch, got %v", dateFieldDate)
if f.Name() == "age" {
numField, ok := f.(*document.NumericField)
if !ok {
t.Errorf("expected numeric field")
}
numFieldNumer, err := numField.Number()
if err != nil {
t.Error(err)
} else {
if numFieldNumer != 35.99 {
t.Errorf("expeted numeric value 35.99, got %f", numFieldNumer)
}
}
}
if f.Name() == "unixEpoch" {
dateField, ok := f.(*document.DateTimeField)
if !ok {
t.Errorf("expected date field")
}
dateFieldDate, err := dateField.DateTime()
if err != nil {
t.Error(err)
} else {
if dateFieldDate != time.Unix(0, 0).UTC() {
t.Errorf("expected date value unix epoch, got %v", dateFieldDate)
}
}
}
}
@ -856,26 +871,32 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) {
t.Error(err)
}
if len(storedDoc.Fields) != 2 {
if len(storedDoc.Fields) != 3 {
t.Errorf("expected 3 stored field, got %d", len(storedDoc.Fields))
}
textField, ok = storedDoc.Fields[0].(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "testup" {
t.Errorf("expected field content 'testup', got '%s'", string(textField.Value()))
}
numField, ok = storedDoc.Fields[1].(*document.NumericField)
if !ok {
t.Errorf("expected numeric field")
}
numFieldNumer, err = numField.Number()
if err != nil {
t.Error(err)
} else {
if numFieldNumer != 36.99 {
t.Errorf("expeted numeric value 36.99, got %f", numFieldNumer)
for _, f := range storedDoc.Fields {
if f.Name() == "name" {
textField, ok := f.(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "testup" {
t.Errorf("expected field content 'testup', got '%s'", string(textField.Value()))
}
}
if f.Name() == "age" {
numField, ok := f.(*document.NumericField)
if !ok {
t.Errorf("expected numeric field")
}
numFieldNumer, err := numField.Number()
if err != nil {
t.Error(err)
} else {
if numFieldNumer != 36.99 {
t.Errorf("expeted numeric value 36.99, got %f", numFieldNumer)
}
}
}
}
@ -989,10 +1010,10 @@ func TestIndexUpdateComposites(t *testing.T) {
// 1 for version
// 4 for schema fields
// 5 for text term
// 2 for the stored field
// 3 for the stored field
// 5 for the text term count
// 1 for the back index entry
expectedLength := uint64(1 + 4 + 5 + 2 + 5 + 1)
expectedLength := uint64(1 + 4 + 5 + 3 + 5 + 1)
rowCount, err := idx.(*SmolderingCouch).rowCount()
if err != nil {
t.Error(err)
@ -1027,15 +1048,19 @@ func TestIndexUpdateComposites(t *testing.T) {
if err != nil {
t.Error(err)
}
if len(storedDoc.Fields) != 2 {
t.Errorf("expected 2 stored field, got %d", len(storedDoc.Fields))
if len(storedDoc.Fields) != 3 {
t.Errorf("expected 3 stored field, got %d", len(storedDoc.Fields))
}
textField, ok := storedDoc.Fields[0].(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "testupdated" {
t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
for _, f := range storedDoc.Fields {
if f.Name() == "name" {
textField, ok := f.(*document.TextField)
if !ok {
t.Errorf("expected text field")
}
if string(textField.Value()) != "testupdated" {
t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
}
}
}
// should have the same row count as before, plus 4 term dictionary garbage rows
@ -1305,8 +1330,8 @@ func TestConcurrentUpdate(t *testing.T) {
log.Fatal(err)
}
if len(doc.Fields) > 1 {
t.Errorf("expected single field, found %d", len(doc.Fields))
if len(doc.Fields) > 2 {
t.Errorf("expected two fields, found %d", len(doc.Fields))
}
}

View File

@ -45,8 +45,8 @@ func (i *IndexReader) FieldDictPrefix(fieldName string, termPrefix []byte) (inde
return i.FieldDictRange(fieldName, termPrefix, termPrefix)
}
func (i *IndexReader) DocIDReader(start, end string) (index.DocIDReader, error) {
return newUpsideDownCouchDocIDReader(i, start, end)
func (i *IndexReader) DocIDReaderAll() (index.DocIDReader, error) {
return newUpsideDownCouchDocIDReader(i)
}
func (i *IndexReader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {

View File

@ -138,15 +138,11 @@ type UpsideDownCouchDocIDReader struct {
onlyMode bool
}
func newUpsideDownCouchDocIDReader(indexReader *IndexReader, start, end string) (*UpsideDownCouchDocIDReader, error) {
startBytes := []byte(start)
if start == "" {
startBytes = []byte{0x0}
}
endBytes := []byte(end)
if end == "" {
endBytes = []byte{0xff}
}
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
startBytes := []byte{0x0}
endBytes := []byte{0xff}
bisr := NewBackIndexRow(startBytes, nil, nil)
bier := NewBackIndexRow(endBytes, nil, nil)
it := indexReader.kvreader.RangeIterator(bisr.Key(), bier.Key())

View File

@ -247,7 +247,7 @@ func TestIndexDocIdReader(t *testing.T) {
}()
// first get all doc ids
reader, err := indexReader.DocIDReader("", "")
reader, err := indexReader.DocIDReaderAll()
if err != nil {
t.Errorf("Error accessing doc id reader: %v", err)
}
@ -269,7 +269,7 @@ func TestIndexDocIdReader(t *testing.T) {
}
// try it again, but jump to the second doc this time
reader2, err := indexReader.DocIDReader("", "")
reader2, err := indexReader.DocIDReaderAll()
if err != nil {
t.Errorf("Error accessing doc id reader: %v", err)
}

View File

@ -659,7 +659,7 @@ func TestIndexBatch(t *testing.T) {
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
}
docIDReader, err := indexReader.DocIDReader("", "")
docIDReader, err := indexReader.DocIDReaderAll()
if err != nil {
t.Error(err)
}

View File

@ -75,7 +75,7 @@ func (sr *stubReader) TermFieldReader(term []byte, field string, includeFreq, in
return nil, nil
}
func (sr *stubReader) DocIDReader(start, end string) (index.DocIDReader, error) {
func (sr *stubReader) DocIDReaderAll() (index.DocIDReader, error) {
return nil, nil
}

View File

@ -22,7 +22,7 @@ type MatchAllSearcher struct {
}
func NewMatchAllSearcher(indexReader index.IndexReader, boost float64, explain bool) (*MatchAllSearcher, error) {
reader, err := indexReader.DocIDReader("", "")
reader, err := indexReader.DocIDReaderAll()
if err != nil {
return nil, err
}