
Merge branch 'WIP-perf-20160106' of https://github.com/steveyen/bleve into steveyen-WIP-perf-20160106

Marty Schoch 2016-01-07 15:40:29 -05:00
commit 48fcd5a7d5
14 changed files with 159 additions and 138 deletions

View File

@@ -44,10 +44,16 @@ type Snapshot struct {
 // returns which doc number is valid
 // if none, then 0
 func (s *Snapshot) Which(docID []byte, docNumList DocNumberList) uint64 {
-	sort.Sort(docNumList)
-	highestValidDocNum := docNumList.HighestValid(s.maxRead)
-	if highestValidDocNum > 0 && s.Valid(docID, highestValidDocNum) {
-		return highestValidDocNum
+	inFlightVal := s.inFlight.Get(&InFlightItem{docID: docID})
+	sort.Sort(docNumList) // Descending ordering.
+	for _, docNum := range docNumList {
+		if docNum > 0 && docNum <= s.maxRead &&
+			(inFlightVal == nil || inFlightVal.(*InFlightItem).docNum == docNum) &&
+			!s.deletedDocNumbers.Test(uint(docNum)) {
+			return docNum
+		}
 	}
 	return 0
 }
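The rewrite scans the candidate doc numbers once, highest first, and returns the first one that is within the snapshot's read horizon (maxRead), consistent with any in-flight mutation for that docID, and not in the deleted set. A minimal, self-contained sketch of that selection pattern — here the valid callback stands in for the in-flight and deleted-bitmap tests, and all names are illustrative rather than bleve's API:

```go
package main

import (
	"fmt"
	"sort"
)

// pickHighestValid mirrors the shape of Snapshot.Which: sort candidates
// descending, return the first one that passes every check, else 0.
func pickHighestValid(candidates []uint64, maxRead uint64, valid func(uint64) bool) uint64 {
	sort.Slice(candidates, func(i, j int) bool { return candidates[i] > candidates[j] })
	for _, docNum := range candidates {
		if docNum > 0 && docNum <= maxRead && valid(docNum) {
			return docNum
		}
	}
	return 0
}

func main() {
	deleted := map[uint64]bool{5: true}
	got := pickHighestValid([]uint64{3, 9, 5}, 8, func(n uint64) bool { return !deleted[n] })
	fmt.Println(got) // 3: 9 exceeds maxRead, 5 is deleted
}
```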

View File

@@ -146,7 +146,7 @@ func (f *Firestorm) Update(doc *document.Document) (err error) {
 	aw := index.NewAnalysisWork(f, doc, resultChan)
 	// put the work on the queue
-	go f.analysisQueue.Queue(aw)
+	f.analysisQueue.Queue(aw)
 	// wait for the result
 	result := <-resultChan
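Dropping the `go` removes one goroutine spawn per update: Update blocks on resultChan immediately afterward, so nothing is gained by enqueueing asynchronously, as long as the queue hands work to its own long-lived workers. A minimal sketch of that worker-pool shape (a simplified stand-in, not bleve's actual AnalysisQueue):

```go
package main

type AnalysisWork struct{ result chan string }

type AnalysisQueue struct{ queue chan *AnalysisWork }

// Queue only sends on a channel; the long-lived workers do the real work,
// so callers can enqueue synchronously.
func (q *AnalysisQueue) Queue(w *AnalysisWork) { q.queue <- w }

func NewAnalysisQueue(numWorkers int) *AnalysisQueue {
	q := &AnalysisQueue{queue: make(chan *AnalysisWork)}
	for i := 0; i < numWorkers; i++ {
		go func() {
			for w := range q.queue {
				w.result <- "analyzed" // stand-in for real analysis
			}
		}()
	}
	return q
}

func main() {
	q := NewAnalysisQueue(4)
	resultChan := make(chan string)
	q.Queue(&AnalysisWork{result: resultChan}) // no extra goroutine needed
	<-resultChan                               // the caller blocks here regardless
}
```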

View File

@@ -135,9 +135,10 @@ func (gc *GarbageCollector) cleanup() {
 	termFreqStart := TermFreqIteratorStart(0, []byte{ByteSeparator})
 	termFreqEnd := TermFreqIteratorStart(math.MaxUint16, []byte{ByteSeparator})
+	var tfr TermFreqRow
 	dictionaryDeltas := make(map[string]int64)
 	err = visitRange(reader, termFreqStart, termFreqEnd, func(key, val []byte) (bool, error) {
-		tfr, err := NewTermFreqRowKV(key, val)
+		err := tfr.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
@@ -158,8 +159,9 @@ func (gc *GarbageCollector) cleanup() {
 	}
 	// walk all the stored rows
+	var sr StoredRow
 	err = visitPrefix(reader, StoredKeyPrefix, func(key, val []byte) (bool, error) {
-		sr, err := NewStoredRowKV(key, val)
+		err := sr.ParseKey(key)
 		if err != nil {
 			return false, err
 		}

View File

@@ -91,10 +91,11 @@ func (l *Lookuper) lookup(item *InFlightItem) {
 	prefix := TermFreqPrefixFieldTermDocId(0, nil, item.docID)
 	logger.Printf("lookuper prefix - % x", prefix)
+	var tfk TermFreqRow
 	docNums := make(DocNumberList, 0)
 	err = visitPrefix(reader, prefix, func(key, val []byte) (bool, error) {
 		logger.Printf("lookuper sees key % x", key)
-		tfk, err := NewTermFreqRowKV(key, val)
+		err := tfk.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
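The change here and in the garbage collector above is one pattern: hoist a single reusable row out of the visitor callback and re-parse into it with ParseKey, so the hot loop no longer allocates a fresh row per key. A minimal sketch of the idiom under illustrative names (Row and visit are not bleve's API):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

type Row struct{ docNum uint64 }

// ParseKey fills the receiver in place, so the hot loop allocates nothing.
func (r *Row) ParseKey(key []byte) error {
	n, cnt := binary.Uvarint(key)
	if cnt <= 0 {
		return fmt.Errorf("short key")
	}
	r.docNum = n
	return nil
}

// visit stands in for visitPrefix/visitRange: it calls f for each key.
func visit(keys [][]byte, f func(key []byte) (bool, error)) error {
	for _, k := range keys {
		if cont, err := f(k); err != nil || !cont {
			return err
		}
	}
	return nil
}

func main() {
	keys := [][]byte{{7}, {42}}
	var r Row // declared once, reused for every key
	_ = visit(keys, func(key []byte) (bool, error) {
		if err := r.ParseKey(key); err != nil {
			return false, err
		}
		fmt.Println(r.docNum)
		return true, nil
	})
}
```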

View File

@@ -41,46 +41,52 @@ func NewStoredRow(docID []byte, docNum uint64, field uint16, arrayPositions []ui
 func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
 	rv := StoredRow{}
-	buf := bytes.NewBuffer(key)
-	_, err := buf.ReadByte() // type
+	err := rv.ParseKey(key)
 	if err != nil {
 		return nil, err
 	}
-	rv.docID, err = buf.ReadBytes(ByteSeparator)
-	if len(rv.docID) < 2 { // 1 for min doc id length, 1 for separator
-		err = fmt.Errorf("invalid doc length 0")
-		return nil, err
-	}
-	rv.docID = rv.docID[:len(rv.docID)-1] // trim off separator byte
-	rv.docNum, err = binary.ReadUvarint(buf)
-	if err != nil {
-		return nil, err
-	}
-	err = binary.Read(buf, binary.LittleEndian, &rv.field)
-	if err != nil {
-		return nil, err
-	}
-	rv.arrayPositions = make([]uint64, 0)
-	nextArrayPos, err := binary.ReadUvarint(buf)
-	for err == nil {
-		rv.arrayPositions = append(rv.arrayPositions, nextArrayPos)
-		nextArrayPos, err = binary.ReadUvarint(buf)
-	}
 	err = rv.value.Unmarshal(value)
 	if err != nil {
 		return nil, err
 	}
 	return &rv, nil
 }
+
+func (sr *StoredRow) ParseKey(key []byte) error {
+	buf := bytes.NewBuffer(key)
+	_, err := buf.ReadByte() // type
+	if err != nil {
+		return err
+	}
+	sr.docID, err = buf.ReadBytes(ByteSeparator)
+	if len(sr.docID) < 2 { // 1 for min doc id length, 1 for separator
+		err = fmt.Errorf("invalid doc length 0")
+		return err
+	}
+	sr.docID = sr.docID[:len(sr.docID)-1] // trim off separator byte
+	sr.docNum, err = binary.ReadUvarint(buf)
+	if err != nil {
+		return err
+	}
+	err = binary.Read(buf, binary.LittleEndian, &sr.field)
+	if err != nil {
+		return err
+	}
+	sr.arrayPositions = make([]uint64, 0)
+	nextArrayPos, err := binary.ReadUvarint(buf)
+	for err == nil {
+		sr.arrayPositions = append(sr.arrayPositions, nextArrayPos)
+		nextArrayPos, err = binary.ReadUvarint(buf)
+	}
+	return nil
+}
+
 func (sr *StoredRow) KeySize() int {
 	return 1 + len(sr.docID) + 1 + binary.MaxVarintLen64 + 2 + (binary.MaxVarintLen64 * len(sr.arrayPositions))
 }

View File

@@ -62,32 +62,39 @@ func InitTermFreqRow(tfr *TermFreqRow, field uint16, term []byte, docID []byte,
 func NewTermFreqRowKV(key, value []byte) (*TermFreqRow, error) {
 	rv := TermFreqRow{}
+	err := rv.ParseKey(key)
+	if err != nil {
+		return nil, err
+	}
+	err = rv.value.Unmarshal(value)
+	if err != nil {
+		return nil, err
+	}
+	return &rv, nil
+}
+
+func (tfr *TermFreqRow) ParseKey(key []byte) error {
 	keyLen := len(key)
 	if keyLen < 3 {
-		return nil, fmt.Errorf("invalid term frequency key, no valid field")
+		return fmt.Errorf("invalid term frequency key, no valid field")
 	}
-	rv.field = binary.LittleEndian.Uint16(key[1:3])
+	tfr.field = binary.LittleEndian.Uint16(key[1:3])
 	termStartPos := 3
 	termEndPos := bytes.IndexByte(key[termStartPos:], ByteSeparator)
 	if termEndPos < 0 {
-		return nil, fmt.Errorf("invalid term frequency key, no byte separator terminating term")
+		return fmt.Errorf("invalid term frequency key, no byte separator terminating term")
 	}
-	rv.term = key[termStartPos : termStartPos+termEndPos]
+	tfr.term = key[termStartPos : termStartPos+termEndPos]
 	docStartPos := termStartPos + termEndPos + 1
 	docEndPos := bytes.IndexByte(key[docStartPos:], ByteSeparator)
-	rv.docID = key[docStartPos : docStartPos+docEndPos]
+	tfr.docID = key[docStartPos : docStartPos+docEndPos]
 	docNumPos := docStartPos + docEndPos + 1
-	rv.docNum, _ = binary.Uvarint(key[docNumPos:])
+	tfr.docNum, _ = binary.Uvarint(key[docNumPos:])
-	err := rv.value.Unmarshal(value)
-	if err != nil {
-		return nil, err
-	}
-	return &rv, nil
+	return nil
 }
 func (tfr *TermFreqRow) KeySize() int {

View File

@@ -67,10 +67,11 @@ func (f *Firestorm) warmup(reader store.KVReader) error {
 	tfkPrefix := TermFreqIteratorStart(idField, nil)
+	var tfk TermFreqRow
 	var lastDocId []byte
 	lastDocNumbers := make(DocNumberList, 1)
 	err = visitPrefix(reader, tfkPrefix, func(key, val []byte) (bool, error) {
-		tfk, err := NewTermFreqRowKV(key, val)
+		err := tfk.ParseKey(key)
 		if err != nil {
 			return false, err
 		}

View File

@@ -21,8 +21,9 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 		Rows: make([]index.IndexRow, 0, 100),
 	}
+	docIDBytes := []byte(d.ID)
+
 	// track our back index entries
-	backIndexTermEntries := make([]*BackIndexTermEntry, 0)
 	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
 	// information we collate as we merge fields with same name
@@ -31,11 +32,7 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 	fieldIncludeTermVectors := make(map[uint16]bool)
 	fieldNames := make(map[uint16]string)
-	// walk all the fields, record stored fields now
-	// place information about indexed fields into map
-	// this collates information across fields with
-	// same names (arrays)
-	for _, field := range d.Fields {
+	analyzeField := func(field document.Field, storable bool) {
 		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
 		if newFieldRow != nil {
 			rv.Rows = append(rv.Rows, newFieldRow)
@@ -55,48 +52,51 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
 		}
-		if field.Options().IsStored() {
-			storeRows, indexBackIndexStoreEntries := udc.storeField(d.ID, field, fieldIndex)
-			rv.Rows = append(rv.Rows, storeRows...)
-			backIndexStoredEntries = append(backIndexStoredEntries, indexBackIndexStoreEntries...)
+		if storable && field.Options().IsStored() {
+			rv.Rows, backIndexStoredEntries = udc.storeField(docIDBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
 		}
 	}
+	// walk all the fields, record stored fields now
+	// place information about indexed fields into map
+	// this collates information across fields with
+	// same names (arrays)
+	for _, field := range d.Fields {
+		analyzeField(field, true)
+	}
+	for fieldIndex, tokenFreqs := range fieldTermFreqs {
+		// see if any of the composite fields need this
+		for _, compositeField := range d.CompositeFields {
+			compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
+		}
+	}
+	for _, compositeField := range d.CompositeFields {
+		analyzeField(compositeField, false)
+	}
+	rowsCapNeeded := len(rv.Rows) + 1
+	for _, tokenFreqs := range fieldTermFreqs {
+		rowsCapNeeded += len(tokenFreqs)
+	}
+	rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)
+	backIndexTermEntries := make([]*BackIndexTermEntry, 0, rowsCapNeeded)
 	// walk through the collated information and proccess
 	// once for each indexed field (unique name)
 	for fieldIndex, tokenFreqs := range fieldTermFreqs {
 		fieldLength := fieldLengths[fieldIndex]
 		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
-		// see if any of the composite fields need this
-		for _, compositeField := range d.CompositeFields {
-			compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
-		}
 		// encode this field
-		indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
-		rv.Rows = append(rv.Rows, indexRows...)
-		backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
-	}
-	// now index the composite fields
-	for _, compositeField := range d.CompositeFields {
-		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(compositeField.Name())
-		if newFieldRow != nil {
-			rv.Rows = append(rv.Rows, newFieldRow)
-		}
-		if compositeField.Options().IsIndexed() {
-			fieldLength, tokenFreqs := compositeField.Analyze()
-			// encode this field
-			indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
-			rv.Rows = append(rv.Rows, indexRows...)
-			backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
-		}
+		rv.Rows, backIndexTermEntries = udc.indexField(docIDBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermEntries)
 	}
 	// build the back index row
-	backIndexRow := NewBackIndexRow(d.ID, backIndexTermEntries, backIndexStoredEntries)
+	backIndexRow := NewBackIndexRow(docIDBytes, backIndexTermEntries, backIndexStoredEntries)
 	rv.Rows = append(rv.Rows, backIndexRow)
 	return rv
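The rowsCapNeeded arithmetic above is a grow-once idiom: count what the later loops will append (plus one for the back index row), rebuild rv.Rows with exactly that capacity, and the appends that follow never reallocate. A minimal sketch of the same idiom on illustrative data:

```go
package main

import "fmt"

func main() {
	rows := []string{"field:title"} // rows gathered so far
	perField := map[string][]string{
		"title": {"t1", "t2"},
		"body":  {"t3"},
	}

	capNeeded := len(rows) + 1 // +1 for the back-index-style row appended at the end
	for _, terms := range perField {
		capNeeded += len(terms)
	}

	// Copy the existing rows into a slice that already has room for the rest.
	rows = append(make([]string, 0, capNeeded), rows...)

	for field, terms := range perField {
		for _, t := range terms {
			rows = append(rows, field+":"+t) // no reallocation here
		}
	}
	rows = append(rows, "backindex")
	fmt.Println(len(rows), cap(rows)) // 5 5: one allocation served every append
}
```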

View File

@@ -131,6 +131,8 @@ func (k keyset) Less(i, j int) bool { return bytes.Compare(k[i], k[j]) < 0 }
 // DumpDoc returns all rows in the index related to this doc id
 func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
+	idBytes := []byte(id)
+
 	rv := make(chan interface{})
 	go func() {
@@ -162,14 +164,14 @@ func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
 	// build sorted list of term keys
 	keys := make(keyset, 0)
 	for _, entry := range back.termEntries {
-		tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), id, 0, 0)
+		tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), idBytes, 0, 0)
 		key := tfr.Key()
 		keys = append(keys, key)
 	}
 	sort.Sort(keys)
 	// first add all the stored rows
-	storedRowPrefix := NewStoredRow(id, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
+	storedRowPrefix := NewStoredRow(idBytes, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
 	udc.dumpPrefix(kvreader, rv, storedRowPrefix)
 	// now walk term keys in order and add them as well

View File

@@ -60,7 +60,7 @@ func (i *IndexReader) Document(id string) (doc *document.Document, err error) {
 		return
 	}
 	doc = document.NewDocument(id)
-	storedRow := NewStoredRow(id, 0, []uint64{}, 'x', nil)
+	storedRow := NewStoredRow([]byte(id), 0, []uint64{}, 'x', nil)
 	storedRowScanPrefix := storedRow.ScanPrefixForDoc()
 	it := i.kvreader.PrefixIterator(storedRowScanPrefix)
 	defer func() {

View File

@@ -41,7 +41,7 @@ func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, fi
 		return nil, err
 	}
-	tfr := NewTermFrequencyRow(term, field, "", 0, 0)
+	tfr := NewTermFrequencyRow(term, field, []byte{}, 0, 0)
 	it := indexReader.kvreader.PrefixIterator(tfr.Key())
 	return &UpsideDownCouchTermFieldReader{
@@ -80,7 +80,7 @@ func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) {
 func (r *UpsideDownCouchTermFieldReader) Advance(docID string) (*index.TermFieldDoc, error) {
 	if r.iterator != nil {
-		tfr := NewTermFrequencyRow(r.term, r.field, docID, 0, 0)
+		tfr := NewTermFrequencyRow(r.term, r.field, []byte(docID), 0, 0)
 		r.iterator.Seek(tfr.Key())
 		key, val, valid := r.iterator.Current()
 		if valid {
@@ -114,14 +114,16 @@ type UpsideDownCouchDocIDReader struct {
 }
 func newUpsideDownCouchDocIDReader(indexReader *IndexReader, start, end string) (*UpsideDownCouchDocIDReader, error) {
+	startBytes := []byte(start)
 	if start == "" {
-		start = string([]byte{0x0})
+		startBytes = []byte{0x0}
 	}
+	endBytes := []byte(end)
 	if end == "" {
-		end = string([]byte{0xff})
+		endBytes = []byte{0xff}
 	}
-	bisr := NewBackIndexRow(start, nil, nil)
-	bier := NewBackIndexRow(end, nil, nil)
+	bisr := NewBackIndexRow(startBytes, nil, nil)
+	bier := NewBackIndexRow(endBytes, nil, nil)
 	it := indexReader.kvreader.RangeIterator(bisr.Key(), bier.Key())
 	return &UpsideDownCouchDocIDReader{
@@ -145,7 +147,7 @@ func (r *UpsideDownCouchDocIDReader) Next() (string, error) {
 }
 func (r *UpsideDownCouchDocIDReader) Advance(docID string) (string, error) {
-	bir := NewBackIndexRow(docID, nil, nil)
+	bir := NewBackIndexRow([]byte(docID), nil, nil)
 	r.iterator.Seek(bir.Key())
 	key, val, valid := r.iterator.Current()
 	if valid {
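The startBytes/endBytes change keeps the empty-string defaulting but produces []byte bounds directly, since NewBackIndexRow now takes the doc ID as bytes; 0x00 and 0xff act as sentinels that sort before and after any ordinary textual doc ID in the back-index key range. A minimal sketch of that defaulting, with illustrative names:

```go
package main

import "fmt"

// docIDRange converts optional string bounds to byte-slice bounds,
// substituting sentinels that bracket all ordinary doc IDs.
func docIDRange(start, end string) (startBytes, endBytes []byte) {
	startBytes = []byte(start)
	if start == "" {
		startBytes = []byte{0x00} // sorts before any real doc ID
	}
	endBytes = []byte(end)
	if end == "" {
		endBytes = []byte{0xff} // sorts after any ASCII/UTF-8 text doc ID
	}
	return startBytes, endBytes
}

func main() {
	s, e := docIDRange("", "")
	fmt.Printf("% x .. % x\n", s, e) // 00 .. ff
}
```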

View File

@@ -459,21 +459,21 @@ func (tfr *TermFrequencyRow) String() string {
 	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
 }
-func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
+func NewTermFrequencyRow(term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:  term,
 		field: field,
-		doc:   []byte(doc),
+		doc:   docID,
 		freq:  freq,
 		norm:  norm,
 	}
 }
-func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
+func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docID []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:    term,
 		field:   field,
-		doc:     []byte(doc),
+		doc:     docID,
 		freq:    freq,
 		norm:    norm,
 		vectors: vectors,
@@ -605,7 +605,7 @@ func (br *BackIndexRow) AllTermKeys() [][]byte {
 	}
 	rv := make([][]byte, len(br.termEntries))
 	for i, termEntry := range br.termEntries {
-		termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), string(br.doc), 0, 0)
+		termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), br.doc, 0, 0)
 		rv[i] = termRow.Key()
 	}
 	return rv
@@ -617,7 +617,7 @@ func (br *BackIndexRow) AllStoredKeys() [][]byte {
 	}
 	rv := make([][]byte, len(br.storedEntries))
 	for i, storedEntry := range br.storedEntries {
-		storedRow := NewStoredRow(string(br.doc), uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
+		storedRow := NewStoredRow(br.doc, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
 		rv[i] = storedRow.Key()
 	}
 	return rv
@@ -665,9 +665,9 @@ func (br *BackIndexRow) String() string {
 	return fmt.Sprintf("Backindex DocId: `%s` Term Entries: %v, Stored Entries: %v", string(br.doc), br.termEntries, br.storedEntries)
 }
-func NewBackIndexRow(doc string, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
+func NewBackIndexRow(docID []byte, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
 	return &BackIndexRow{
-		doc:           []byte(doc),
+		doc:           docID,
 		termEntries:   entries,
 		storedEntries: storedFields,
 	}
@@ -766,9 +766,9 @@ func (s *StoredRow) ScanPrefixForDoc() []byte {
 	return buf
 }
-func NewStoredRow(doc string, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
+func NewStoredRow(docID []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
 	return &StoredRow{
-		doc:            []byte(doc),
+		doc:            docID,
 		field:          field,
 		arrayPositions: arrayPositions,
 		typ:            typ,
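The common thread in this file: every constructor used to take the doc ID as a string and do its own []byte(doc) conversion, which copies; taking docID []byte lets a caller convert once per document and share the same bytes across every row it builds. A minimal sketch of the cost difference, with illustrative types:

```go
package main

import "fmt"

type row struct{ doc []byte }

// newRow stores the caller's slice directly, with no per-row copy --
// the shape the constructors above now have.
func newRow(docID []byte) *row { return &row{doc: docID} }

func main() {
	id := "budweiser"
	docIDBytes := []byte(id) // one string-to-bytes copy per document...
	rows := make([]*row, 0, 3)
	for i := 0; i < 3; i++ {
		rows = append(rows, newRow(docIDBytes)) // ...shared by every row
	}
	fmt.Println(len(rows), string(rows[0].doc))
}
```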

View File

@@ -49,54 +49,54 @@ func TestRows(t *testing.T) {
 			[]byte{27},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "catz", 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("catz"), 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'c', 'a', 't', 'z'},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test larger varints
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test vectors with arrayPositions
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1, 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
 		},
 		{
-			NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer")),
+			NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
 		{
-			NewStoredRow("budweiser", 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
+			NewStoredRow([]byte("budweiser"), 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
@@ -259,7 +259,7 @@ func BenchmarkTermFrequencyRowEncode(b *testing.B) {
 	row := NewTermFrequencyRowWithTermVectors(
 		[]byte{'b', 'e', 'e', 'r'},
 		0,
-		"budweiser",
+		[]byte("budweiser"),
 		3,
 		3.14,
 		[]*TermVector{
@@ -304,7 +304,7 @@ func BenchmarkTermFrequencyRowDecode(b *testing.B) {
 func BenchmarkBackIndexRowEncode(b *testing.B) {
 	field := uint32(1)
 	t1 := "term1"
-	row := NewBackIndexRow("beername",
+	row := NewBackIndexRow([]byte("beername"),
 		[]*BackIndexTermEntry{
 			&BackIndexTermEntry{
 				Term: &t1,
@@ -336,7 +336,7 @@ func BenchmarkBackIndexRowDecode(b *testing.B) {
 }
 func BenchmarkStoredRowEncode(b *testing.B) {
-	row := NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer"))
+	row := NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer"))
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		row.Key()

View File

@@ -29,7 +29,7 @@ import (
 const Name = "upside_down"
 // RowBufferSize should ideally this is sized to be the smallest
-// size that can cotain an index row key and its corresponding
+// size that can contain an index row key and its corresponding
 // value. It is not a limit, if need be a larger buffer is
 // allocated, but performance will be more optimal if *most*
 // rows fit this size.
@@ -344,6 +344,7 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
 	analysisStart := time.Now()
 	resultChan := make(chan *index.AnalysisResult)
 	aw := index.NewAnalysisWork(udc, doc, resultChan)
+
 	// put the work on the queue
 	udc.analysisQueue.Queue(aw)
@@ -473,18 +474,14 @@ func (udc *UpsideDownCouch) mergeOldAndNew(backIndexRow *BackIndexRow, rows []in
 	return addRows, updateRows, deleteRows
 }
-func (udc *UpsideDownCouch) storeField(docID string, field document.Field, fieldIndex uint16) ([]index.IndexRow, []*BackIndexStoreEntry) {
-	rows := make([]index.IndexRow, 0, 100)
-	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
+func (udc *UpsideDownCouch) storeField(docID []byte, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) {
 	fieldType := encodeFieldType(field)
 	storedRow := NewStoredRow(docID, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
 	// record the back index entry
 	backIndexStoredEntry := BackIndexStoreEntry{Field: proto.Uint32(uint32(fieldIndex)), ArrayPositions: field.ArrayPositions()}
-	backIndexStoredEntries = append(backIndexStoredEntries, &backIndexStoredEntry)
-	rows = append(rows, storedRow)
-	return rows, backIndexStoredEntries
+	return append(rows, storedRow), append(backIndexStoredEntries, &backIndexStoredEntry)
 }
@@ -502,17 +499,14 @@ func encodeFieldType(f document.Field) byte {
 	return fieldType
 }
-func (udc *UpsideDownCouch) indexField(docID string, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
-	rows := make([]index.IndexRow, 0, 100)
-	backIndexTermEntries := make([]*BackIndexTermEntry, 0, len(tokenFreqs))
+func (udc *UpsideDownCouch) indexField(docID []byte, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermEntries []*BackIndexTermEntry) ([]index.IndexRow, []*BackIndexTermEntry) {
 	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
 	for k, tf := range tokenFreqs {
 		var termFreqRow *TermFrequencyRow
 		if includeTermVectors {
-			tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
-			rows = append(rows, newFieldRows...)
+			var tv []*TermVector
+			tv, rows = udc.termVectorsFromTokenFreq(fieldIndex, tf, rows)
 			termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
 		} else {
 			termFreqRow = NewTermFrequencyRow(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
@@ -592,13 +586,14 @@ func (udc *UpsideDownCouch) Delete(id string) (err error) {
 }
 func (udc *UpsideDownCouch) deleteSingle(id string, backIndexRow *BackIndexRow, deleteRows []UpsideDownCouchRow) []UpsideDownCouchRow {
+	idBytes := []byte(id)
 	for _, backIndexEntry := range backIndexRow.termEntries {
-		tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id, 0, 0)
+		tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), idBytes, 0, 0)
 		deleteRows = append(deleteRows, tfr)
 	}
 	for _, se := range backIndexRow.storedEntries {
-		sf := NewStoredRow(id, uint16(*se.Field), se.ArrayPositions, 'x', nil)
+		sf := NewStoredRow(idBytes, uint16(*se.Field), se.ArrayPositions, 'x', nil)
 		deleteRows = append(deleteRows, sf)
 	}
@@ -667,9 +662,8 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
 	return tf.Frequency()
 }
-func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
+func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
 	rv := make([]*TermVector, len(tf.Locations))
-	newFieldRows := make([]index.IndexRow, 0)
 	for i, l := range tf.Locations {
 		var newFieldRow *FieldRow
@@ -678,7 +672,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
 			// lookup correct field
 			fieldIndex, newFieldRow = udc.fieldIndexOrNewRow(l.Field)
 			if newFieldRow != nil {
-				newFieldRows = append(newFieldRows, newFieldRow)
+				rows = append(rows, newFieldRow)
 			}
 		}
 		tv := TermVector{
@@ -691,7 +685,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
 		rv[i] = &tv
 	}
-	return rv, newFieldRows
+	return rv, rows
 }
 func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
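storeField, indexField, and termVectorsFromTokenFreq all changed shape the same way: instead of allocating private slices that the caller then merges with append(dst, src...), the caller's slices are passed in, appended to, and returned, so one backing array serves the whole analysis pass. A minimal sketch of the threading pattern (names illustrative):

```go
package main

import "fmt"

type indexRow string

// storeField appends to the caller-supplied slice and returns it --
// the post-change shape. Before, a helper like this allocated its own
// slice and the caller paid for a second copy when merging.
func storeField(docID []byte, value string, rows []indexRow) []indexRow {
	return append(rows, indexRow(string(docID)+"="+value))
}

func main() {
	rows := make([]indexRow, 0, 4)
	docID := []byte("doc1")
	rows = storeField(docID, "a", rows)
	rows = storeField(docID, "b", rows)
	fmt.Println(rows) // [doc1=a doc1=b] -- one backing array throughout
}
```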