0
0
Fork 0

upside_down analysis converts to docIDBytes once

This commit is contained in:
Steve Yen 2016-01-06 23:38:02 -08:00
parent d6a997d8c1
commit 82b8b3468e
7 changed files with 48 additions and 41 deletions

View File

@ -21,6 +21,8 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
Rows: make([]index.IndexRow, 0, 100),
}
docIDBytes := []byte(d.ID)
// track our back index entries
backIndexTermEntries := make([]*BackIndexTermEntry, 0)
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
@ -56,7 +58,7 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
}
if field.Options().IsStored() {
storeRows, indexBackIndexStoreEntries := udc.storeField(d.ID, field, fieldIndex)
storeRows, indexBackIndexStoreEntries := udc.storeField(docIDBytes, field, fieldIndex)
rv.Rows = append(rv.Rows, storeRows...)
backIndexStoredEntries = append(backIndexStoredEntries, indexBackIndexStoreEntries...)
}
@ -75,7 +77,7 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
}
// encode this field
indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
indexRows, indexBackIndexTermEntries := udc.indexField(docIDBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
}
@ -89,14 +91,14 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
if compositeField.Options().IsIndexed() {
fieldLength, tokenFreqs := compositeField.Analyze()
// encode this field
indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
indexRows, indexBackIndexTermEntries := udc.indexField(docIDBytes, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
}
}
// build the back index row
backIndexRow := NewBackIndexRow(d.ID, backIndexTermEntries, backIndexStoredEntries)
backIndexRow := NewBackIndexRow(docIDBytes, backIndexTermEntries, backIndexStoredEntries)
rv.Rows = append(rv.Rows, backIndexRow)
return rv

View File

@ -131,6 +131,8 @@ func (k keyset) Less(i, j int) bool { return bytes.Compare(k[i], k[j]) < 0 }
// DumpDoc returns all rows in the index related to this doc id
func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
idBytes := []byte(id)
rv := make(chan interface{})
go func() {
@ -162,14 +164,14 @@ func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
// build sorted list of term keys
keys := make(keyset, 0)
for _, entry := range back.termEntries {
tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), id, 0, 0)
tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), idBytes, 0, 0)
key := tfr.Key()
keys = append(keys, key)
}
sort.Sort(keys)
// first add all the stored rows
storedRowPrefix := NewStoredRow(id, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
storedRowPrefix := NewStoredRow(idBytes, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
udc.dumpPrefix(kvreader, rv, storedRowPrefix)
// now walk term keys in order and add them as well

View File

@ -60,7 +60,7 @@ func (i *IndexReader) Document(id string) (doc *document.Document, err error) {
return
}
doc = document.NewDocument(id)
storedRow := NewStoredRow(id, 0, []uint64{}, 'x', nil)
storedRow := NewStoredRow([]byte(id), 0, []uint64{}, 'x', nil)
storedRowScanPrefix := storedRow.ScanPrefixForDoc()
it := i.kvreader.PrefixIterator(storedRowScanPrefix)
defer func() {

View File

@ -41,7 +41,7 @@ func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, fi
return nil, err
}
tfr := NewTermFrequencyRow(term, field, "", 0, 0)
tfr := NewTermFrequencyRow(term, field, []byte{}, 0, 0)
it := indexReader.kvreader.PrefixIterator(tfr.Key())
return &UpsideDownCouchTermFieldReader{
@ -80,7 +80,7 @@ func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) {
func (r *UpsideDownCouchTermFieldReader) Advance(docID string) (*index.TermFieldDoc, error) {
if r.iterator != nil {
tfr := NewTermFrequencyRow(r.term, r.field, docID, 0, 0)
tfr := NewTermFrequencyRow(r.term, r.field, []byte(docID), 0, 0)
r.iterator.Seek(tfr.Key())
key, val, valid := r.iterator.Current()
if valid {
@ -114,14 +114,16 @@ type UpsideDownCouchDocIDReader struct {
}
func newUpsideDownCouchDocIDReader(indexReader *IndexReader, start, end string) (*UpsideDownCouchDocIDReader, error) {
startBytes := []byte(start)
if start == "" {
start = string([]byte{0x0})
startBytes = []byte{0x0}
}
endBytes := []byte(end)
if end == "" {
end = string([]byte{0xff})
endBytes = []byte{0xff}
}
bisr := NewBackIndexRow(start, nil, nil)
bier := NewBackIndexRow(end, nil, nil)
bisr := NewBackIndexRow(startBytes, nil, nil)
bier := NewBackIndexRow(endBytes, nil, nil)
it := indexReader.kvreader.RangeIterator(bisr.Key(), bier.Key())
return &UpsideDownCouchDocIDReader{
@ -145,7 +147,7 @@ func (r *UpsideDownCouchDocIDReader) Next() (string, error) {
}
func (r *UpsideDownCouchDocIDReader) Advance(docID string) (string, error) {
bir := NewBackIndexRow(docID, nil, nil)
bir := NewBackIndexRow([]byte(docID), nil, nil)
r.iterator.Seek(bir.Key())
key, val, valid := r.iterator.Current()
if valid {

View File

@ -459,21 +459,21 @@ func (tfr *TermFrequencyRow) String() string {
return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
}
func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
func NewTermFrequencyRow(term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
return &TermFrequencyRow{
term: term,
field: field,
doc: []byte(doc),
doc: docID,
freq: freq,
norm: norm,
}
}
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docID []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
return &TermFrequencyRow{
term: term,
field: field,
doc: []byte(doc),
doc: docID,
freq: freq,
norm: norm,
vectors: vectors,
@ -605,7 +605,7 @@ func (br *BackIndexRow) AllTermKeys() [][]byte {
}
rv := make([][]byte, len(br.termEntries))
for i, termEntry := range br.termEntries {
termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), string(br.doc), 0, 0)
termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), br.doc, 0, 0)
rv[i] = termRow.Key()
}
return rv
@ -617,7 +617,7 @@ func (br *BackIndexRow) AllStoredKeys() [][]byte {
}
rv := make([][]byte, len(br.storedEntries))
for i, storedEntry := range br.storedEntries {
storedRow := NewStoredRow(string(br.doc), uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
storedRow := NewStoredRow(br.doc, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
rv[i] = storedRow.Key()
}
return rv
@ -665,9 +665,9 @@ func (br *BackIndexRow) String() string {
return fmt.Sprintf("Backindex DocId: `%s` Term Entries: %v, Stored Entries: %v", string(br.doc), br.termEntries, br.storedEntries)
}
func NewBackIndexRow(doc string, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
func NewBackIndexRow(docID []byte, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
return &BackIndexRow{
doc: []byte(doc),
doc: docID,
termEntries: entries,
storedEntries: storedFields,
}
@ -766,9 +766,9 @@ func (s *StoredRow) ScanPrefixForDoc() []byte {
return buf
}
func NewStoredRow(doc string, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
func NewStoredRow(docID []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
return &StoredRow{
doc: []byte(doc),
doc: docID,
field: field,
arrayPositions: arrayPositions,
typ: typ,

View File

@ -49,54 +49,54 @@ func TestRows(t *testing.T) {
[]byte{27},
},
{
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "catz", 3, 3.14),
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("catz"), 3, 3.14),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'c', 'a', 't', 'z'},
[]byte{3, 195, 235, 163, 130, 4},
},
{
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{3, 195, 235, 163, 130, 4},
},
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
},
// test larger varints
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
},
// test vectors with arrayPositions
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
},
{
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0},
},
{
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1},
},
{
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1, 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
},
{
NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer")),
NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer")),
[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0},
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
},
{
NewStoredRow("budweiser", 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
NewStoredRow([]byte("budweiser"), 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
},
@ -259,7 +259,7 @@ func BenchmarkTermFrequencyRowEncode(b *testing.B) {
row := NewTermFrequencyRowWithTermVectors(
[]byte{'b', 'e', 'e', 'r'},
0,
"budweiser",
[]byte("budweiser"),
3,
3.14,
[]*TermVector{
@ -304,7 +304,7 @@ func BenchmarkTermFrequencyRowDecode(b *testing.B) {
func BenchmarkBackIndexRowEncode(b *testing.B) {
field := uint32(1)
t1 := "term1"
row := NewBackIndexRow("beername",
row := NewBackIndexRow([]byte("beername"),
[]*BackIndexTermEntry{
&BackIndexTermEntry{
Term: &t1,
@ -336,7 +336,7 @@ func BenchmarkBackIndexRowDecode(b *testing.B) {
}
func BenchmarkStoredRowEncode(b *testing.B) {
row := NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer"))
row := NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer"))
b.ResetTimer()
for i := 0; i < b.N; i++ {
row.Key()

View File

@ -473,7 +473,7 @@ func (udc *UpsideDownCouch) mergeOldAndNew(backIndexRow *BackIndexRow, rows []in
return addRows, updateRows, deleteRows
}
func (udc *UpsideDownCouch) storeField(docID string, field document.Field, fieldIndex uint16) ([]index.IndexRow, []*BackIndexStoreEntry) {
func (udc *UpsideDownCouch) storeField(docID []byte, field document.Field, fieldIndex uint16) ([]index.IndexRow, []*BackIndexStoreEntry) {
rows := make([]index.IndexRow, 0, 100)
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
fieldType := encodeFieldType(field)
@ -502,7 +502,7 @@ func encodeFieldType(f document.Field) byte {
return fieldType
}
func (udc *UpsideDownCouch) indexField(docID string, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
func (udc *UpsideDownCouch) indexField(docID []byte, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
rows := make([]index.IndexRow, 0, 100)
backIndexTermEntries := make([]*BackIndexTermEntry, 0, len(tokenFreqs))
@ -592,13 +592,14 @@ func (udc *UpsideDownCouch) Delete(id string) (err error) {
}
func (udc *UpsideDownCouch) deleteSingle(id string, backIndexRow *BackIndexRow, deleteRows []UpsideDownCouchRow) []UpsideDownCouchRow {
idBytes := []byte(id)
for _, backIndexEntry := range backIndexRow.termEntries {
tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id, 0, 0)
tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), idBytes, 0, 0)
deleteRows = append(deleteRows, tfr)
}
for _, se := range backIndexRow.storedEntries {
sf := NewStoredRow(id, uint16(*se.Field), se.ArrayPositions, 'x', nil)
sf := NewStoredRow(idBytes, uint16(*se.Field), se.ArrayPositions, 'x', nil)
deleteRows = append(deleteRows, sf)
}