
commit 48fcd5a7d5
Author: Marty Schoch
Date:   2016-01-07 15:40:29 -05:00

    Merge branch 'WIP-perf-20160106' of https://github.com/steveyen/bleve into steveyen-WIP-perf-20160106

14 changed files with 159 additions and 138 deletions

View File

@@ -44,10 +44,16 @@ type Snapshot struct {
 // returns which doc number is valid
 // if none, then 0
 func (s *Snapshot) Which(docID []byte, docNumList DocNumberList) uint64 {
-	sort.Sort(docNumList)
-	highestValidDocNum := docNumList.HighestValid(s.maxRead)
-	if highestValidDocNum > 0 && s.Valid(docID, highestValidDocNum) {
-		return highestValidDocNum
+	inFlightVal := s.inFlight.Get(&InFlightItem{docID: docID})
+
+	sort.Sort(docNumList) // Descending ordering.
+
+	for _, docNum := range docNumList {
+		if docNum > 0 && docNum <= s.maxRead &&
+			(inFlightVal == nil || inFlightVal.(*InFlightItem).docNum == docNum) &&
+			!s.deletedDocNumbers.Test(uint(docNum)) {
+			return docNum
+		}
 	}
 	return 0
 }
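The rewritten Which folds the in-flight lookup, the maxRead bound, and the deleted-set test into a single pass over the descending-sorted list, so the first match is by construction the highest valid doc number and the old HighestValid pre-scan goes away. A minimal self-contained sketch of the same scan, with a map standing in for the deleted-docs set and a plain uint64 for the in-flight entry (illustrative names, not the firestorm types):

package main

import (
	"fmt"
	"sort"
)

// highestValid mirrors the loop in Snapshot.Which: scan doc numbers in
// descending order and return the first one that is in range, matches the
// in-flight entry (if any), and is not deleted. Zero means "none valid".
func highestValid(docNums []uint64, maxRead uint64, inFlight uint64, deleted map[uint64]bool) uint64 {
	sort.Slice(docNums, func(i, j int) bool { return docNums[i] > docNums[j] })
	for _, n := range docNums {
		if n > 0 && n <= maxRead &&
			(inFlight == 0 || inFlight == n) &&
			!deleted[n] {
			return n
		}
	}
	return 0
}

func main() {
	// 9 exceeds maxRead, 5 is deleted, so 3 wins.
	fmt.Println(highestValid([]uint64{3, 9, 5}, 8, 0, map[uint64]bool{5: true}))
}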

View File

@@ -146,7 +146,7 @@ func (f *Firestorm) Update(doc *document.Document) (err error) {
 	aw := index.NewAnalysisWork(f, doc, resultChan)
 	// put the work on the queue
-	go f.analysisQueue.Queue(aw)
+	f.analysisQueue.Queue(aw)
 	// wait for the result
 	result := <-resultChan
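Dropping the `go` is safe because Queue should only hand the work item to the queue's worker goroutines (in bleve it is essentially a channel send), so the per-update goroutine bought nothing but scheduler overhead; the caller still blocks on resultChan either way. A rough sketch of that shape, assuming a channel-backed worker pool (hypothetical types):

package main

import "fmt"

type analysisWork struct {
	doc    string
	result chan string
}

type analysisQueue struct{ queue chan *analysisWork }

// Queue is just a channel send; the worker pool behind the channel does the
// real work, so callers need not wrap this call in a goroutine of their own.
func (q *analysisQueue) Queue(w *analysisWork) { q.queue <- w }

func newAnalysisQueue(workers int) *analysisQueue {
	q := &analysisQueue{queue: make(chan *analysisWork)}
	for i := 0; i < workers; i++ {
		go func() {
			for w := range q.queue {
				w.result <- "analyzed:" + w.doc
			}
		}()
	}
	return q
}

func main() {
	q := newAnalysisQueue(4)
	resultChan := make(chan string)
	q.Queue(&analysisWork{doc: "doc1", result: resultChan}) // no `go` needed
	fmt.Println(<-resultChan)                               // wait for the result
}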

View File

@@ -135,9 +135,10 @@ func (gc *GarbageCollector) cleanup() {
 	termFreqStart := TermFreqIteratorStart(0, []byte{ByteSeparator})
 	termFreqEnd := TermFreqIteratorStart(math.MaxUint16, []byte{ByteSeparator})
 
+	var tfr TermFreqRow
 	dictionaryDeltas := make(map[string]int64)
 	err = visitRange(reader, termFreqStart, termFreqEnd, func(key, val []byte) (bool, error) {
-		tfr, err := NewTermFreqRowKV(key, val)
+		err := tfr.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
@@ -158,8 +159,9 @@ func (gc *GarbageCollector) cleanup() {
 	}
 
 	// walk all the stored rows
+	var sr StoredRow
 	err = visitPrefix(reader, StoredKeyPrefix, func(key, val []byte) (bool, error) {
-		sr, err := NewStoredRowKV(key, val)
+		err := sr.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
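These hunks, and the matching ones in the lookuper and warmup code below, all apply the same trick: declare one row value outside the visitor closure and repopulate it with ParseKey on each key, where NewTermFreqRowKV/NewStoredRowKV heap-allocated a fresh row per visited key. (The ParseKey methods themselves are introduced in the row files further down.) A small sketch of the reuse pattern with a toy key format:

package main

import (
	"bytes"
	"fmt"
)

type row struct{ docID []byte }

// ParseKey repopulates the receiver in place instead of allocating a new row.
func (r *row) ParseKey(key []byte) error {
	i := bytes.IndexByte(key, ':')
	if i < 0 {
		return fmt.Errorf("invalid key %q", key)
	}
	r.docID = key[i+1:]
	return nil
}

func main() {
	keys := [][]byte{[]byte("t:doc1"), []byte("t:doc2")}

	var r row // one value reused across the whole scan
	for _, key := range keys {
		if err := r.ParseKey(key); err != nil {
			panic(err)
		}
		fmt.Printf("%s\n", r.docID)
	}
}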

View File

@@ -91,10 +91,11 @@ func (l *Lookuper) lookup(item *InFlightItem) {
 	prefix := TermFreqPrefixFieldTermDocId(0, nil, item.docID)
 	logger.Printf("lookuper prefix - % x", prefix)
 
+	var tfk TermFreqRow
 	docNums := make(DocNumberList, 0)
 	err = visitPrefix(reader, prefix, func(key, val []byte) (bool, error) {
 		logger.Printf("lookuper sees key % x", key)
-		tfk, err := NewTermFreqRowKV(key, val)
+		err := tfk.ParseKey(key)
 		if err != nil {
 			return false, err
 		}

View File

@@ -41,46 +41,52 @@ func NewStoredRow(docID []byte, docNum uint64, field uint16, arrayPositions []ui
 func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
 	rv := StoredRow{}
-	buf := bytes.NewBuffer(key)
-	_, err := buf.ReadByte() // type
+	err := rv.ParseKey(key)
 	if err != nil {
 		return nil, err
 	}
-	rv.docID, err = buf.ReadBytes(ByteSeparator)
-	if len(rv.docID) < 2 { // 1 for min doc id length, 1 for separator
-		err = fmt.Errorf("invalid doc length 0")
-		return nil, err
-	}
-	rv.docID = rv.docID[:len(rv.docID)-1] // trim off separator byte
-	rv.docNum, err = binary.ReadUvarint(buf)
-	if err != nil {
-		return nil, err
-	}
-	err = binary.Read(buf, binary.LittleEndian, &rv.field)
-	if err != nil {
-		return nil, err
-	}
-	rv.arrayPositions = make([]uint64, 0)
-	nextArrayPos, err := binary.ReadUvarint(buf)
-	for err == nil {
-		rv.arrayPositions = append(rv.arrayPositions, nextArrayPos)
-		nextArrayPos, err = binary.ReadUvarint(buf)
-	}
 	err = rv.value.Unmarshal(value)
 	if err != nil {
 		return nil, err
 	}
 	return &rv, nil
 }
 
+func (sr *StoredRow) ParseKey(key []byte) error {
+	buf := bytes.NewBuffer(key)
+	_, err := buf.ReadByte() // type
+	if err != nil {
+		return err
+	}
+	sr.docID, err = buf.ReadBytes(ByteSeparator)
+	if len(sr.docID) < 2 { // 1 for min doc id length, 1 for separator
+		err = fmt.Errorf("invalid doc length 0")
+		return err
+	}
+	sr.docID = sr.docID[:len(sr.docID)-1] // trim off separator byte
+	sr.docNum, err = binary.ReadUvarint(buf)
+	if err != nil {
+		return err
+	}
+	err = binary.Read(buf, binary.LittleEndian, &sr.field)
+	if err != nil {
+		return err
+	}
+	sr.arrayPositions = make([]uint64, 0)
+	nextArrayPos, err := binary.ReadUvarint(buf)
+	for err == nil {
+		sr.arrayPositions = append(sr.arrayPositions, nextArrayPos)
+		nextArrayPos, err = binary.ReadUvarint(buf)
+	}
+	return nil
+}
+
 func (sr *StoredRow) KeySize() int {
 	return 1 + len(sr.docID) + 1 + binary.MaxVarintLen64 + 2 + (binary.MaxVarintLen64 * len(sr.arrayPositions))
 }

View File

@@ -62,32 +62,39 @@ func InitTermFreqRow(tfr *TermFreqRow, field uint16, term []byte, docID []byte,
 func NewTermFreqRowKV(key, value []byte) (*TermFreqRow, error) {
 	rv := TermFreqRow{}
+	err := rv.ParseKey(key)
+	if err != nil {
+		return nil, err
+	}
+	err = rv.value.Unmarshal(value)
+	if err != nil {
+		return nil, err
+	}
+	return &rv, nil
+}
+
+func (tfr *TermFreqRow) ParseKey(key []byte) error {
 	keyLen := len(key)
 	if keyLen < 3 {
-		return nil, fmt.Errorf("invalid term frequency key, no valid field")
+		return fmt.Errorf("invalid term frequency key, no valid field")
 	}
-	rv.field = binary.LittleEndian.Uint16(key[1:3])
+	tfr.field = binary.LittleEndian.Uint16(key[1:3])
 
 	termStartPos := 3
 	termEndPos := bytes.IndexByte(key[termStartPos:], ByteSeparator)
 	if termEndPos < 0 {
-		return nil, fmt.Errorf("invalid term frequency key, no byte separator terminating term")
+		return fmt.Errorf("invalid term frequency key, no byte separator terminating term")
 	}
-	rv.term = key[termStartPos : termStartPos+termEndPos]
+	tfr.term = key[termStartPos : termStartPos+termEndPos]
 
 	docStartPos := termStartPos + termEndPos + 1
 	docEndPos := bytes.IndexByte(key[docStartPos:], ByteSeparator)
-	rv.docID = key[docStartPos : docStartPos+docEndPos]
+	tfr.docID = key[docStartPos : docStartPos+docEndPos]
 
 	docNumPos := docStartPos + docEndPos + 1
-	rv.docNum, _ = binary.Uvarint(key[docNumPos:])
-	err := rv.value.Unmarshal(value)
-	if err != nil {
-		return nil, err
-	}
-	return &rv, nil
+	tfr.docNum, _ = binary.Uvarint(key[docNumPos:])
+	return nil
 }
 
 func (tfr *TermFreqRow) KeySize() int {

View File

@@ -67,10 +67,11 @@ func (f *Firestorm) warmup(reader store.KVReader) error {
 	tfkPrefix := TermFreqIteratorStart(idField, nil)
 
+	var tfk TermFreqRow
 	var lastDocId []byte
 	lastDocNumbers := make(DocNumberList, 1)
 
 	err = visitPrefix(reader, tfkPrefix, func(key, val []byte) (bool, error) {
-		tfk, err := NewTermFreqRowKV(key, val)
+		err := tfk.ParseKey(key)
 		if err != nil {
 			return false, err
 		}

View File

@@ -21,8 +21,9 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 		Rows: make([]index.IndexRow, 0, 100),
 	}
 
+	docIDBytes := []byte(d.ID)
+
 	// track our back index entries
-	backIndexTermEntries := make([]*BackIndexTermEntry, 0)
 	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
 
 	// information we collate as we merge fields with same name
@@ -31,11 +32,7 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 	fieldIncludeTermVectors := make(map[uint16]bool)
 	fieldNames := make(map[uint16]string)
 
-	// walk all the fields, record stored fields now
-	// place information about indexed fields into map
-	// this collates information across fields with
-	// same names (arrays)
-	for _, field := range d.Fields {
+	analyzeField := func(field document.Field, storable bool) {
 		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
 		if newFieldRow != nil {
 			rv.Rows = append(rv.Rows, newFieldRow)
@@ -55,48 +52,51 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
 		}
 
-		if field.Options().IsStored() {
-			storeRows, indexBackIndexStoreEntries := udc.storeField(d.ID, field, fieldIndex)
-			rv.Rows = append(rv.Rows, storeRows...)
-			backIndexStoredEntries = append(backIndexStoredEntries, indexBackIndexStoreEntries...)
+		if storable && field.Options().IsStored() {
+			rv.Rows, backIndexStoredEntries = udc.storeField(docIDBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
 		}
 	}
 
+	// walk all the fields, record stored fields now
+	// place information about indexed fields into map
+	// this collates information across fields with
+	// same names (arrays)
+	for _, field := range d.Fields {
+		analyzeField(field, true)
+	}
+
+	for fieldIndex, tokenFreqs := range fieldTermFreqs {
+		// see if any of the composite fields need this
+		for _, compositeField := range d.CompositeFields {
+			compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
+		}
+	}
+
+	for _, compositeField := range d.CompositeFields {
+		analyzeField(compositeField, false)
+	}
+
+	rowsCapNeeded := len(rv.Rows) + 1
+	for _, tokenFreqs := range fieldTermFreqs {
+		rowsCapNeeded += len(tokenFreqs)
+	}
+
+	rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)
+
+	backIndexTermEntries := make([]*BackIndexTermEntry, 0, rowsCapNeeded)
+
 	// walk through the collated information and proccess
 	// once for each indexed field (unique name)
 	for fieldIndex, tokenFreqs := range fieldTermFreqs {
 		fieldLength := fieldLengths[fieldIndex]
 		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
 
-		// see if any of the composite fields need this
-		for _, compositeField := range d.CompositeFields {
-			compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
-		}
-
 		// encode this field
-		indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
-		rv.Rows = append(rv.Rows, indexRows...)
-		backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
-	}
-
-	// now index the composite fields
-	for _, compositeField := range d.CompositeFields {
-		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(compositeField.Name())
-		if newFieldRow != nil {
-			rv.Rows = append(rv.Rows, newFieldRow)
-		}
-		if compositeField.Options().IsIndexed() {
-			fieldLength, tokenFreqs := compositeField.Analyze()
-			// encode this field
-			indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
-			rv.Rows = append(rv.Rows, indexRows...)
-			backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
-		}
+		rv.Rows, backIndexTermEntries = udc.indexField(docIDBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermEntries)
 	}
 
 	// build the back index row
-	backIndexRow := NewBackIndexRow(d.ID, backIndexTermEntries, backIndexStoredEntries)
+	backIndexRow := NewBackIndexRow(docIDBytes, backIndexTermEntries, backIndexStoredEntries)
 	rv.Rows = append(rv.Rows, backIndexRow)
 
 	return rv
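Net effect of this refactor: a single analyzeField closure now covers both regular and composite fields (composites are fed via Compose and then analyzed with storable=false), and storeField/indexField append into caller-owned slices, with rv.Rows regrown once to rowsCapNeeded instead of per-call 100-cap slices that were appended and discarded. A sketch of the slice-threading idiom with stand-in types:

package main

import "fmt"

type indexRow struct{ term string }

// indexField-style helper: it appends into the slice the caller passes in and
// returns the (possibly regrown) slice, instead of building and merging a
// private slice per call.
func indexTerms(docID string, terms []string, rows []indexRow) []indexRow {
	for _, t := range terms {
		rows = append(rows, indexRow{term: t + ":" + docID})
	}
	return rows
}

func main() {
	// Pre-size once for everything we expect to append.
	docs := map[string][]string{"doc1": {"beer"}, "doc2": {"beer", "ale"}}
	capNeeded := 0
	for _, terms := range docs {
		capNeeded += len(terms)
	}
	rows := make([]indexRow, 0, capNeeded)
	for id, terms := range docs {
		rows = indexTerms(id, terms, rows)
	}
	fmt.Println(len(rows), cap(rows)) // 3 3 — append never had to reallocate
}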

View File

@@ -131,6 +131,8 @@ func (k keyset) Less(i, j int) bool { return bytes.Compare(k[i], k[j]) < 0 }
 
 // DumpDoc returns all rows in the index related to this doc id
 func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
+	idBytes := []byte(id)
+
 	rv := make(chan interface{})
 
 	go func() {
@@ -162,14 +164,14 @@ func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
 		// build sorted list of term keys
 		keys := make(keyset, 0)
 		for _, entry := range back.termEntries {
-			tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), id, 0, 0)
+			tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), idBytes, 0, 0)
 			key := tfr.Key()
 			keys = append(keys, key)
 		}
 		sort.Sort(keys)
 
 		// first add all the stored rows
-		storedRowPrefix := NewStoredRow(id, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
+		storedRowPrefix := NewStoredRow(idBytes, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
 		udc.dumpPrefix(kvreader, rv, storedRowPrefix)
 
 		// now walk term keys in order and add them as well

View File

@@ -60,7 +60,7 @@ func (i *IndexReader) Document(id string) (doc *document.Document, err error) {
 		return
 	}
 	doc = document.NewDocument(id)
-	storedRow := NewStoredRow(id, 0, []uint64{}, 'x', nil)
+	storedRow := NewStoredRow([]byte(id), 0, []uint64{}, 'x', nil)
 	storedRowScanPrefix := storedRow.ScanPrefixForDoc()
 	it := i.kvreader.PrefixIterator(storedRowScanPrefix)
 	defer func() {

View File

@@ -41,7 +41,7 @@ func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, fi
 		return nil, err
 	}
 
-	tfr := NewTermFrequencyRow(term, field, "", 0, 0)
+	tfr := NewTermFrequencyRow(term, field, []byte{}, 0, 0)
 	it := indexReader.kvreader.PrefixIterator(tfr.Key())
 
 	return &UpsideDownCouchTermFieldReader{
@@ -80,7 +80,7 @@ func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) {
 
 func (r *UpsideDownCouchTermFieldReader) Advance(docID string) (*index.TermFieldDoc, error) {
 	if r.iterator != nil {
-		tfr := NewTermFrequencyRow(r.term, r.field, docID, 0, 0)
+		tfr := NewTermFrequencyRow(r.term, r.field, []byte(docID), 0, 0)
 		r.iterator.Seek(tfr.Key())
 		key, val, valid := r.iterator.Current()
 		if valid {
@@ -114,14 +114,16 @@ type UpsideDownCouchDocIDReader struct {
 }
 
 func newUpsideDownCouchDocIDReader(indexReader *IndexReader, start, end string) (*UpsideDownCouchDocIDReader, error) {
+	startBytes := []byte(start)
 	if start == "" {
-		start = string([]byte{0x0})
+		startBytes = []byte{0x0}
 	}
+	endBytes := []byte(end)
 	if end == "" {
-		end = string([]byte{0xff})
+		endBytes = []byte{0xff}
 	}
-	bisr := NewBackIndexRow(start, nil, nil)
-	bier := NewBackIndexRow(end, nil, nil)
+	bisr := NewBackIndexRow(startBytes, nil, nil)
+	bier := NewBackIndexRow(endBytes, nil, nil)
 	it := indexReader.kvreader.RangeIterator(bisr.Key(), bier.Key())
 
 	return &UpsideDownCouchDocIDReader{
@@ -145,7 +147,7 @@ func (r *UpsideDownCouchDocIDReader) Next() (string, error) {
 }
 
 func (r *UpsideDownCouchDocIDReader) Advance(docID string) (string, error) {
-	bir := NewBackIndexRow(docID, nil, nil)
+	bir := NewBackIndexRow([]byte(docID), nil, nil)
 	r.iterator.Seek(bir.Key())
 	key, val, valid := r.iterator.Current()
 	if valid {

View File

@@ -459,21 +459,21 @@ func (tfr *TermFrequencyRow) String() string {
 	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
 }
 
-func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
+func NewTermFrequencyRow(term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:  term,
 		field: field,
-		doc:   []byte(doc),
+		doc:   docID,
 		freq:  freq,
 		norm:  norm,
 	}
 }
 
-func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
+func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docID []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:    term,
 		field:   field,
-		doc:     []byte(doc),
+		doc:     docID,
 		freq:    freq,
 		norm:    norm,
 		vectors: vectors,
@@ -605,7 +605,7 @@ func (br *BackIndexRow) AllTermKeys() [][]byte {
 	}
 	rv := make([][]byte, len(br.termEntries))
 	for i, termEntry := range br.termEntries {
-		termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), string(br.doc), 0, 0)
+		termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), br.doc, 0, 0)
 		rv[i] = termRow.Key()
 	}
 	return rv
@@ -617,7 +617,7 @@ func (br *BackIndexRow) AllStoredKeys() [][]byte {
 	}
 	rv := make([][]byte, len(br.storedEntries))
 	for i, storedEntry := range br.storedEntries {
-		storedRow := NewStoredRow(string(br.doc), uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
+		storedRow := NewStoredRow(br.doc, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
 		rv[i] = storedRow.Key()
 	}
 	return rv
@@ -665,9 +665,9 @@ func (br *BackIndexRow) String() string {
 	return fmt.Sprintf("Backindex DocId: `%s` Term Entries: %v, Stored Entries: %v", string(br.doc), br.termEntries, br.storedEntries)
 }
 
-func NewBackIndexRow(doc string, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
+func NewBackIndexRow(docID []byte, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
 	return &BackIndexRow{
-		doc:           []byte(doc),
+		doc:           docID,
 		termEntries:   entries,
 		storedEntries: storedFields,
 	}
@@ -766,9 +766,9 @@ func (s *StoredRow) ScanPrefixForDoc() []byte {
 	return buf
 }
 
-func NewStoredRow(doc string, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
+func NewStoredRow(docID []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
 	return &StoredRow{
-		doc:            []byte(doc),
+		doc:            docID,
 		field:          field,
 		arrayPositions: arrayPositions,
 		typ:            typ,
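These signature changes are the root of most edits in this commit: row constructors take the document ID as []byte instead of string, so callers that already hold bytes (back index rows, iterator keys) stop paying for string↔[]byte round trips, each of which copies the data in Go. A quick standalone illustration of the copies being avoided (not from the commit):

package main

import "fmt"

func main() {
	id := []byte("budweiser")

	// Each conversion copies the bytes: string(id) makes copy #1,
	// []byte(s) makes copy #2. Passing []byte through avoids both.
	s := string(id)
	roundTripped := []byte(s)

	roundTripped[0] = 'B'                         // mutating the copy...
	fmt.Println(string(id), string(roundTripped)) // ...leaves the original intact: budweiser Budweiser
}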

View File

@@ -49,54 +49,54 @@ func TestRows(t *testing.T) {
 			[]byte{27},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "catz", 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("catz"), 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'c', 'a', 't', 'z'},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test larger varints
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test vectors with arrayPositions
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1, 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
 		},
 		{
-			NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer")),
+			NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
 		{
-			NewStoredRow("budweiser", 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
+			NewStoredRow([]byte("budweiser"), 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
@@ -259,7 +259,7 @@ func BenchmarkTermFrequencyRowEncode(b *testing.B) {
 	row := NewTermFrequencyRowWithTermVectors(
 		[]byte{'b', 'e', 'e', 'r'},
 		0,
-		"budweiser",
+		[]byte("budweiser"),
 		3,
 		3.14,
 		[]*TermVector{
@@ -304,7 +304,7 @@ func BenchmarkTermFrequencyRowDecode(b *testing.B) {
 func BenchmarkBackIndexRowEncode(b *testing.B) {
 	field := uint32(1)
 	t1 := "term1"
-	row := NewBackIndexRow("beername",
+	row := NewBackIndexRow([]byte("beername"),
 		[]*BackIndexTermEntry{
 			&BackIndexTermEntry{
 				Term:  &t1,
@@ -336,7 +336,7 @@ func BenchmarkBackIndexRowDecode(b *testing.B) {
 }
 
 func BenchmarkStoredRowEncode(b *testing.B) {
-	row := NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer"))
+	row := NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer"))
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		row.Key()

View File

@@ -29,7 +29,7 @@ import (
 const Name = "upside_down"
 
 // RowBufferSize should ideally this is sized to be the smallest
-// size that can cotain an index row key and its corresponding
+// size that can contain an index row key and its corresponding
 // value. It is not a limit, if need be a larger buffer is
 // allocated, but performance will be more optimal if *most*
 // rows fit this size.
@@ -344,6 +344,7 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
 	analysisStart := time.Now()
 	resultChan := make(chan *index.AnalysisResult)
 	aw := index.NewAnalysisWork(udc, doc, resultChan)
+
 	// put the work on the queue
 	udc.analysisQueue.Queue(aw)
@@ -473,18 +474,14 @@ func (udc *UpsideDownCouch) mergeOldAndNew(backIndexRow *BackIndexRow, rows []in
 	return addRows, updateRows, deleteRows
 }
 
-func (udc *UpsideDownCouch) storeField(docID string, field document.Field, fieldIndex uint16) ([]index.IndexRow, []*BackIndexStoreEntry) {
-	rows := make([]index.IndexRow, 0, 100)
-	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
+func (udc *UpsideDownCouch) storeField(docID []byte, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) {
 	fieldType := encodeFieldType(field)
 	storedRow := NewStoredRow(docID, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
 
 	// record the back index entry
 	backIndexStoredEntry := BackIndexStoreEntry{Field: proto.Uint32(uint32(fieldIndex)), ArrayPositions: field.ArrayPositions()}
-	backIndexStoredEntries = append(backIndexStoredEntries, &backIndexStoredEntry)
 
-	rows = append(rows, storedRow)
-	return rows, backIndexStoredEntries
+	return append(rows, storedRow), append(backIndexStoredEntries, &backIndexStoredEntry)
 }
 
 func encodeFieldType(f document.Field) byte {
@@ -502,17 +499,14 @@ func encodeFieldType(f document.Field) byte {
 	return fieldType
 }
 
-func (udc *UpsideDownCouch) indexField(docID string, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
-	rows := make([]index.IndexRow, 0, 100)
-	backIndexTermEntries := make([]*BackIndexTermEntry, 0, len(tokenFreqs))
+func (udc *UpsideDownCouch) indexField(docID []byte, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermEntries []*BackIndexTermEntry) ([]index.IndexRow, []*BackIndexTermEntry) {
 	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
 
 	for k, tf := range tokenFreqs {
 		var termFreqRow *TermFrequencyRow
 		if includeTermVectors {
-			tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
-			rows = append(rows, newFieldRows...)
+			var tv []*TermVector
+			tv, rows = udc.termVectorsFromTokenFreq(fieldIndex, tf, rows)
 			termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
 		} else {
 			termFreqRow = NewTermFrequencyRow(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
@@ -592,13 +586,14 @@ func (udc *UpsideDownCouch) Delete(id string) (err error) {
 }
 
 func (udc *UpsideDownCouch) deleteSingle(id string, backIndexRow *BackIndexRow, deleteRows []UpsideDownCouchRow) []UpsideDownCouchRow {
+	idBytes := []byte(id)
+
 	for _, backIndexEntry := range backIndexRow.termEntries {
-		tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id, 0, 0)
+		tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), idBytes, 0, 0)
 		deleteRows = append(deleteRows, tfr)
 	}
 	for _, se := range backIndexRow.storedEntries {
-		sf := NewStoredRow(id, uint16(*se.Field), se.ArrayPositions, 'x', nil)
+		sf := NewStoredRow(idBytes, uint16(*se.Field), se.ArrayPositions, 'x', nil)
 		deleteRows = append(deleteRows, sf)
 	}
@@ -667,9 +662,8 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
 	return tf.Frequency()
 }
 
-func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
+func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
 	rv := make([]*TermVector, len(tf.Locations))
-	newFieldRows := make([]index.IndexRow, 0)
 
 	for i, l := range tf.Locations {
 		var newFieldRow *FieldRow
@@ -678,7 +672,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
 			// lookup correct field
 			fieldIndex, newFieldRow = udc.fieldIndexOrNewRow(l.Field)
 			if newFieldRow != nil {
-				newFieldRows = append(newFieldRows, newFieldRow)
+				rows = append(rows, newFieldRow)
 			}
 		}
 		tv := TermVector{
@@ -691,7 +685,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
 		rv[i] = &tv
 	}
 
-	return rv, newFieldRows
+	return rv, rows
 }
 
 func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector { func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {