diff --git a/index/firestorm/comp.go b/index/firestorm/comp.go
index 5fad468d..b4115630 100644
--- a/index/firestorm/comp.go
+++ b/index/firestorm/comp.go
@@ -44,10 +44,16 @@ type Snapshot struct {
 // returns which doc number is valid
 // if none, then 0
 func (s *Snapshot) Which(docID []byte, docNumList DocNumberList) uint64 {
-	sort.Sort(docNumList)
-	highestValidDocNum := docNumList.HighestValid(s.maxRead)
-	if highestValidDocNum > 0 && s.Valid(docID, highestValidDocNum) {
-		return highestValidDocNum
+	inFlightVal := s.inFlight.Get(&InFlightItem{docID: docID})
+
+	sort.Sort(docNumList) // Descending ordering.
+
+	for _, docNum := range docNumList {
+		if docNum > 0 && docNum <= s.maxRead &&
+			(inFlightVal == nil || inFlightVal.(*InFlightItem).docNum == docNum) &&
+			!s.deletedDocNumbers.Test(uint(docNum)) {
+			return docNum
+		}
 	}
 	return 0
 }
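
The rewrite of Which folds the old HighestValid/Valid pair into a single pass over the candidates, which arrive sorted descending, so the first doc number that survives every check is by construction the highest valid one. A minimal standalone sketch of that selection rule, with the snapshot state reduced to plain values: maxRead, inFlight, and deleted are stand-ins for the Snapshot fields, and inFlight == 0 stands in for "no in-flight entry".

    package main

    import (
        "fmt"
        "sort"
    )

    // pickValid returns the first candidate, scanning from highest to lowest,
    // that is visible: non-zero, at or below the read horizon, equal to the
    // in-flight doc number when one exists, and not deleted.
    func pickValid(candidates []uint64, maxRead, inFlight uint64, deleted map[uint64]bool) uint64 {
        sort.Slice(candidates, func(i, j int) bool { return candidates[i] > candidates[j] })
        for _, n := range candidates {
            if n > 0 && n <= maxRead &&
                (inFlight == 0 || inFlight == n) &&
                !deleted[n] {
                return n
            }
        }
        return 0
    }

    func main() {
        // 9 is beyond the read horizon and 7 is deleted, so 5 wins
        fmt.Println(pickValid([]uint64{5, 9, 7}, 8, 0, map[uint64]bool{7: true}))
    }
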
diff --git a/index/firestorm/firestorm.go b/index/firestorm/firestorm.go
index c76b445f..ea721334 100644
--- a/index/firestorm/firestorm.go
+++ b/index/firestorm/firestorm.go
@@ -146,7 +146,7 @@ func (f *Firestorm) Update(doc *document.Document) (err error) {
 	aw := index.NewAnalysisWork(f, doc, resultChan)
 	// put the work on the queue
-	go f.analysisQueue.Queue(aw)
+	f.analysisQueue.Queue(aw)
 
 	// wait for the result
 	result := <-resultChan
diff --git a/index/firestorm/garbage.go b/index/firestorm/garbage.go
index d70d5036..7a2abe40 100644
--- a/index/firestorm/garbage.go
+++ b/index/firestorm/garbage.go
@@ -135,9 +135,10 @@ func (gc *GarbageCollector) cleanup() {
 	termFreqStart := TermFreqIteratorStart(0, []byte{ByteSeparator})
 	termFreqEnd := TermFreqIteratorStart(math.MaxUint16, []byte{ByteSeparator})
 
+	var tfr TermFreqRow
 	dictionaryDeltas := make(map[string]int64)
 	err = visitRange(reader, termFreqStart, termFreqEnd, func(key, val []byte) (bool, error) {
-		tfr, err := NewTermFreqRowKV(key, val)
+		err := tfr.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
@@ -158,8 +159,9 @@ func (gc *GarbageCollector) cleanup() {
 	}
 
 	// walk all the stored rows
+	var sr StoredRow
 	err = visitPrefix(reader, StoredKeyPrefix, func(key, val []byte) (bool, error) {
-		sr, err := NewStoredRowKV(key, val)
+		err := sr.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
diff --git a/index/firestorm/lookup.go b/index/firestorm/lookup.go
index 0964f29d..d58640e6 100644
--- a/index/firestorm/lookup.go
+++ b/index/firestorm/lookup.go
@@ -91,10 +91,11 @@ func (l *Lookuper) lookup(item *InFlightItem) {
 	prefix := TermFreqPrefixFieldTermDocId(0, nil, item.docID)
 	logger.Printf("lookuper prefix - % x", prefix)
 
+	var tfk TermFreqRow
 	docNums := make(DocNumberList, 0)
 	err = visitPrefix(reader, prefix, func(key, val []byte) (bool, error) {
 		logger.Printf("lookuper sees key % x", key)
-		tfk, err := NewTermFreqRowKV(key, val)
+		err := tfk.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
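
The garbage.go and lookup.go hunks above, like the warmup.go hunk below, share one pattern: the row value is hoisted out of the visitor callback and re-parsed in place with ParseKey, so a scan over N keys costs one row value instead of N heap-allocated rows, and the value is no longer unmarshaled at all on these key-only paths. The firestorm.go change is related housekeeping: Update now queues the analysis work synchronously, presumably because the caller blocks on resultChan immediately afterward, so the per-update goroutine bought nothing. A rough sketch of the reuse pattern, with row, parse, and visit invented for illustration:

    package main

    import "fmt"

    type row struct{ docID []byte }

    func (r *row) parse(key []byte) error {
        r.docID = key // real parsing would decode the packed key here
        return nil
    }

    func visit(keys [][]byte, f func(key []byte) error) error {
        for _, k := range keys {
            if err := f(k); err != nil {
                return err
            }
        }
        return nil
    }

    func main() {
        keys := [][]byte{[]byte("a"), []byte("b")}

        var r row // one value for the whole scan, reused by every callback
        _ = visit(keys, func(key []byte) error {
            if err := r.parse(key); err != nil {
                return err
            }
            fmt.Printf("%s\n", r.docID)
            return nil
        })
    }

The caveat is that the reused value is only good until the next callback; the visitors here respect that by extracting plain values (doc numbers, map keys) before the next key arrives.
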
diff --git a/index/firestorm/stored.go b/index/firestorm/stored.go
index ad7c8519..a8a5917e 100644
--- a/index/firestorm/stored.go
+++ b/index/firestorm/stored.go
@@ -41,46 +41,52 @@ func NewStoredRow(docID []byte, docNum uint64, field uint16, arrayPositions []ui
 
 func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
 	rv := StoredRow{}
-
-	buf := bytes.NewBuffer(key)
-	_, err := buf.ReadByte() // type
+	err := rv.ParseKey(key)
 	if err != nil {
 		return nil, err
 	}
-
-	rv.docID, err = buf.ReadBytes(ByteSeparator)
-	if len(rv.docID) < 2 { // 1 for min doc id length, 1 for separator
-		err = fmt.Errorf("invalid doc length 0")
-		return nil, err
-	}
-
-	rv.docID = rv.docID[:len(rv.docID)-1] // trim off separator byte
-
-	rv.docNum, err = binary.ReadUvarint(buf)
-	if err != nil {
-		return nil, err
-	}
-
-	err = binary.Read(buf, binary.LittleEndian, &rv.field)
-	if err != nil {
-		return nil, err
-	}
-
-	rv.arrayPositions = make([]uint64, 0)
-	nextArrayPos, err := binary.ReadUvarint(buf)
-	for err == nil {
-		rv.arrayPositions = append(rv.arrayPositions, nextArrayPos)
-		nextArrayPos, err = binary.ReadUvarint(buf)
-	}
-
 	err = rv.value.Unmarshal(value)
 	if err != nil {
 		return nil, err
 	}
-
 	return &rv, nil
 }
 
+func (sr *StoredRow) ParseKey(key []byte) error {
+	buf := bytes.NewBuffer(key)
+	_, err := buf.ReadByte() // type
+	if err != nil {
+		return err
+	}
+
+	sr.docID, err = buf.ReadBytes(ByteSeparator)
+	if len(sr.docID) < 2 { // 1 for min doc id length, 1 for separator
+		err = fmt.Errorf("invalid doc length 0")
+		return err
+	}
+
+	sr.docID = sr.docID[:len(sr.docID)-1] // trim off separator byte
+
+	sr.docNum, err = binary.ReadUvarint(buf)
+	if err != nil {
+		return err
+	}
+
+	err = binary.Read(buf, binary.LittleEndian, &sr.field)
+	if err != nil {
+		return err
+	}
+
+	sr.arrayPositions = make([]uint64, 0)
+	nextArrayPos, err := binary.ReadUvarint(buf)
+	for err == nil {
+		sr.arrayPositions = append(sr.arrayPositions, nextArrayPos)
+		nextArrayPos, err = binary.ReadUvarint(buf)
+	}
+
+	return nil
+}
+
 func (sr *StoredRow) KeySize() int {
 	return 1 + len(sr.docID) + 1 + binary.MaxVarintLen64 + 2 + (binary.MaxVarintLen64 * len(sr.arrayPositions))
 }
diff --git a/index/firestorm/termfreq.go b/index/firestorm/termfreq.go
index 1d36ec26..6ba6078d 100644
--- a/index/firestorm/termfreq.go
+++ b/index/firestorm/termfreq.go
@@ -62,32 +62,39 @@ func InitTermFreqRow(tfr *TermFreqRow, field uint16, term []byte, docID []byte,
 
 func NewTermFreqRowKV(key, value []byte) (*TermFreqRow, error) {
 	rv := TermFreqRow{}
+	err := rv.ParseKey(key)
+	if err != nil {
+		return nil, err
+	}
 
+	err = rv.value.Unmarshal(value)
+	if err != nil {
+		return nil, err
+	}
+
+	return &rv, nil
+}
+
+func (tfr *TermFreqRow) ParseKey(key []byte) error {
 	keyLen := len(key)
 	if keyLen < 3 {
-		return nil, fmt.Errorf("invalid term frequency key, no valid field")
+		return fmt.Errorf("invalid term frequency key, no valid field")
 	}
-	rv.field = binary.LittleEndian.Uint16(key[1:3])
+	tfr.field = binary.LittleEndian.Uint16(key[1:3])
 
 	termStartPos := 3
 	termEndPos := bytes.IndexByte(key[termStartPos:], ByteSeparator)
 	if termEndPos < 0 {
-		return nil, fmt.Errorf("invalid term frequency key, no byte separator terminating term")
+		return fmt.Errorf("invalid term frequency key, no byte separator terminating term")
 	}
-	rv.term = key[termStartPos : termStartPos+termEndPos]
+	tfr.term = key[termStartPos : termStartPos+termEndPos]
 
 	docStartPos := termStartPos + termEndPos + 1
 	docEndPos := bytes.IndexByte(key[docStartPos:], ByteSeparator)
-	rv.docID = key[docStartPos : docStartPos+docEndPos]
+	tfr.docID = key[docStartPos : docStartPos+docEndPos]
 
 	docNumPos := docStartPos + docEndPos + 1
-	rv.docNum, _ = binary.Uvarint(key[docNumPos:])
+	tfr.docNum, _ = binary.Uvarint(key[docNumPos:])
 
-	err := rv.value.Unmarshal(value)
-	if err != nil {
-		return nil, err
-	}
-
-	return &rv, nil
+	return nil
 }
 
 func (tfr *TermFreqRow) KeySize() int {
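
Both new ParseKey methods decode a packed key in place instead of building a fresh row; for TermFreqRow the layout the code above walks is [type byte][field uint16 LE][term][ByteSeparator][docID][ByteSeparator][docNum uvarint]. A self-contained round-trip of that layout; the sep value and the makeKey helper are local to this sketch, not the firestorm API:

    package main

    import (
        "bytes"
        "encoding/binary"
        "fmt"
    )

    const sep byte = 0xff

    func makeKey(field uint16, term, docID []byte, docNum uint64) []byte {
        buf := make([]byte, 0, 3+len(term)+1+len(docID)+1+binary.MaxVarintLen64)
        buf = append(buf, 't')
        buf = binary.LittleEndian.AppendUint16(buf, field)
        buf = append(buf, term...)
        buf = append(buf, sep)
        buf = append(buf, docID...)
        buf = append(buf, sep)
        buf = binary.AppendUvarint(buf, docNum)
        return buf
    }

    func main() {
        key := makeKey(2, []byte("beer"), []byte("budweiser"), 7)

        // the same positional walk ParseKey performs
        field := binary.LittleEndian.Uint16(key[1:3])
        termEnd := 3 + bytes.IndexByte(key[3:], sep)
        term := key[3:termEnd]
        docStart := termEnd + 1
        docEnd := docStart + bytes.IndexByte(key[docStart:], sep)
        docID := key[docStart:docEnd]
        docNum, _ := binary.Uvarint(key[docEnd+1:])

        fmt.Println(field, string(term), string(docID), docNum) // 2 beer budweiser 7
    }
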
diff --git a/index/firestorm/warmup.go b/index/firestorm/warmup.go
index 299f6fb4..02e3b21a 100644
--- a/index/firestorm/warmup.go
+++ b/index/firestorm/warmup.go
@@ -67,10 +67,11 @@ func (f *Firestorm) warmup(reader store.KVReader) error {
 
 	tfkPrefix := TermFreqIteratorStart(idField, nil)
 
+	var tfk TermFreqRow
 	var lastDocId []byte
 	lastDocNumbers := make(DocNumberList, 1)
 	err = visitPrefix(reader, tfkPrefix, func(key, val []byte) (bool, error) {
-		tfk, err := NewTermFreqRowKV(key, val)
+		err := tfk.ParseKey(key)
 		if err != nil {
 			return false, err
 		}
diff --git a/index/upside_down/analysis.go b/index/upside_down/analysis.go
index b02ce9cd..dbb732b7 100644
--- a/index/upside_down/analysis.go
+++ b/index/upside_down/analysis.go
@@ -21,8 +21,9 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 		Rows: make([]index.IndexRow, 0, 100),
 	}
 
+	docIDBytes := []byte(d.ID)
+
 	// track our back index entries
-	backIndexTermEntries := make([]*BackIndexTermEntry, 0)
 	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
 
 	// information we collate as we merge fields with same name
@@ -31,11 +32,7 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 	fieldIncludeTermVectors := make(map[uint16]bool)
 	fieldNames := make(map[uint16]string)
 
-	// walk all the fields, record stored fields now
-	// place information about indexed fields into map
-	// this collates information across fields with
-	// same names (arrays)
-	for _, field := range d.Fields {
+	analyzeField := func(field document.Field, storable bool) {
 		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
 		if newFieldRow != nil {
 			rv.Rows = append(rv.Rows, newFieldRow)
@@ -55,48 +52,51 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
 		}
 
-		if field.Options().IsStored() {
-			storeRows, indexBackIndexStoreEntries := udc.storeField(d.ID, field, fieldIndex)
-			rv.Rows = append(rv.Rows, storeRows...)
-			backIndexStoredEntries = append(backIndexStoredEntries, indexBackIndexStoreEntries...)
+		if storable && field.Options().IsStored() {
+			rv.Rows, backIndexStoredEntries = udc.storeField(docIDBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
 		}
 	}
 
+	// walk all the fields, record stored fields now
+	// place information about indexed fields into map
+	// this collates information across fields with
+	// same names (arrays)
+	for _, field := range d.Fields {
+		analyzeField(field, true)
+	}
+
+	for fieldIndex, tokenFreqs := range fieldTermFreqs {
+		// see if any of the composite fields need this
+		for _, compositeField := range d.CompositeFields {
+			compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
+		}
+	}
+
+	for _, compositeField := range d.CompositeFields {
+		analyzeField(compositeField, false)
+	}
+
+	rowsCapNeeded := len(rv.Rows) + 1
+	for _, tokenFreqs := range fieldTermFreqs {
+		rowsCapNeeded += len(tokenFreqs)
+	}
+
+	rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)
+
+	backIndexTermEntries := make([]*BackIndexTermEntry, 0, rowsCapNeeded)
+
 	// walk through the collated information and proccess
 	// once for each indexed field (unique name)
 	for fieldIndex, tokenFreqs := range fieldTermFreqs {
 		fieldLength := fieldLengths[fieldIndex]
 		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
 
-		// see if any of the composite fields need this
-		for _, compositeField := range d.CompositeFields {
-			compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
-		}
-
 		// encode this field
-		indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
-		rv.Rows = append(rv.Rows, indexRows...)
-		backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
-	}
-
-	// now index the composite fields
-	for _, compositeField := range d.CompositeFields {
-		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(compositeField.Name())
-		if newFieldRow != nil {
-			rv.Rows = append(rv.Rows, newFieldRow)
-		}
-		if compositeField.Options().IsIndexed() {
-			fieldLength, tokenFreqs := compositeField.Analyze()
-			// encode this field
-			indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
-			rv.Rows = append(rv.Rows, indexRows...)
-			backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
-		}
+		rv.Rows, backIndexTermEntries = udc.indexField(docIDBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermEntries)
 	}
 
 	// build the back index row
-	backIndexRow := NewBackIndexRow(d.ID, backIndexTermEntries, backIndexStoredEntries)
+	backIndexRow := NewBackIndexRow(docIDBytes, backIndexTermEntries, backIndexStoredEntries)
 	rv.Rows = append(rv.Rows, backIndexRow)
 
 	return rv
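
The rowsCapNeeded arithmetic is the heart of the analysis.go change: once every field (composites included) has been analyzed, the number of rows still to come is known exactly, one term row per unique token per field plus one back index row, so rv.Rows is copied once into a slice of the final capacity and the indexField appends that follow never reallocate. A toy version of the same accounting, with the field names and counts invented:

    package main

    import "fmt"

    func main() {
        fieldTermCounts := map[string]int{"name": 4, "body": 120}

        rows := []string{"field-row-name", "field-row-body", "stored-row"}

        // one slot per term row still to come, plus one for the back index row
        capNeeded := len(rows) + 1
        for _, n := range fieldTermCounts {
            capNeeded += n
        }

        // copy what we have into a slice with the final capacity
        rows = append(make([]string, 0, capNeeded), rows...)

        before := cap(rows)
        for field, n := range fieldTermCounts {
            for i := 0; i < n; i++ {
                rows = append(rows, fmt.Sprintf("term-row-%s-%d", field, i))
            }
        }
        rows = append(rows, "back-index-row")

        fmt.Println(len(rows), cap(rows) == before) // 128 true: no growth after the copy
    }
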
diff --git a/index/upside_down/dump.go b/index/upside_down/dump.go
index 69b7012b..023ae458 100644
--- a/index/upside_down/dump.go
+++ b/index/upside_down/dump.go
@@ -131,6 +131,8 @@ func (k keyset) Less(i, j int) bool { return bytes.Compare(k[i], k[j]) < 0 }
 
 // DumpDoc returns all rows in the index related to this doc id
 func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
+	idBytes := []byte(id)
+
 	rv := make(chan interface{})
 
 	go func() {
@@ -162,14 +164,14 @@ func (udc *UpsideDownCouch) DumpDoc(id string) chan interface{} {
 		// build sorted list of term keys
 		keys := make(keyset, 0)
 		for _, entry := range back.termEntries {
-			tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), id, 0, 0)
+			tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), idBytes, 0, 0)
 			key := tfr.Key()
 			keys = append(keys, key)
 		}
 		sort.Sort(keys)
 
 		// first add all the stored rows
-		storedRowPrefix := NewStoredRow(id, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
+		storedRowPrefix := NewStoredRow(idBytes, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
 		udc.dumpPrefix(kvreader, rv, storedRowPrefix)
 
 		// now walk term keys in order and add them as well
diff --git a/index/upside_down/index_reader.go b/index/upside_down/index_reader.go
index 1f905adb..fb43a86e 100644
--- a/index/upside_down/index_reader.go
+++ b/index/upside_down/index_reader.go
@@ -60,7 +60,7 @@ func (i *IndexReader) Document(id string) (doc *document.Document, err error) {
 		return
 	}
 	doc = document.NewDocument(id)
-	storedRow := NewStoredRow(id, 0, []uint64{}, 'x', nil)
+	storedRow := NewStoredRow([]byte(id), 0, []uint64{}, 'x', nil)
 	storedRowScanPrefix := storedRow.ScanPrefixForDoc()
 	it := i.kvreader.PrefixIterator(storedRowScanPrefix)
 	defer func() {
diff --git a/index/upside_down/reader.go b/index/upside_down/reader.go
index 773d4ed4..2fea9114 100644
--- a/index/upside_down/reader.go
+++ b/index/upside_down/reader.go
@@ -41,7 +41,7 @@ func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, fi
 		return nil, err
 	}
 
-	tfr := NewTermFrequencyRow(term, field, "", 0, 0)
+	tfr := NewTermFrequencyRow(term, field, []byte{}, 0, 0)
 	it := indexReader.kvreader.PrefixIterator(tfr.Key())
 
 	return &UpsideDownCouchTermFieldReader{
@@ -80,7 +80,7 @@ func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) {
 
 func (r *UpsideDownCouchTermFieldReader) Advance(docID string) (*index.TermFieldDoc, error) {
 	if r.iterator != nil {
-		tfr := NewTermFrequencyRow(r.term, r.field, docID, 0, 0)
+		tfr := NewTermFrequencyRow(r.term, r.field, []byte(docID), 0, 0)
 		r.iterator.Seek(tfr.Key())
 		key, val, valid := r.iterator.Current()
 		if valid {
@@ -114,14 +114,16 @@ type UpsideDownCouchDocIDReader struct {
 }
 
 func newUpsideDownCouchDocIDReader(indexReader *IndexReader, start, end string) (*UpsideDownCouchDocIDReader, error) {
+	startBytes := []byte(start)
 	if start == "" {
-		start = string([]byte{0x0})
+		startBytes = []byte{0x0}
 	}
+	endBytes := []byte(end)
 	if end == "" {
-		end = string([]byte{0xff})
+		endBytes = []byte{0xff}
 	}
-	bisr := NewBackIndexRow(start, nil, nil)
-	bier := NewBackIndexRow(end, nil, nil)
+	bisr := NewBackIndexRow(startBytes, nil, nil)
+	bier := NewBackIndexRow(endBytes, nil, nil)
 	it := indexReader.kvreader.RangeIterator(bisr.Key(), bier.Key())
 
 	return &UpsideDownCouchDocIDReader{
@@ -145,7 +147,7 @@ func (r *UpsideDownCouchDocIDReader) Next() (string, error) {
 }
 
 func (r *UpsideDownCouchDocIDReader) Advance(docID string) (string, error) {
-	bir := NewBackIndexRow(docID, nil, nil)
+	bir := NewBackIndexRow([]byte(docID), nil, nil)
 	r.iterator.Seek(bir.Key())
 	key, val, valid := r.iterator.Current()
 	if valid {
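
These reader-side hunks keep doc IDs as []byte end to end; the one piece of logic worth isolating is the sentinel widening in newUpsideDownCouchDocIDReader, where empty bounds become 0x00 and 0xff, bytes that sort before and after any UTF-8 doc ID. A small sketch of just that, under the assumption that doc IDs are valid UTF-8:

    package main

    import "fmt"

    func docIDBounds(start, end string) ([]byte, []byte) {
        startBytes := []byte(start)
        if start == "" {
            startBytes = []byte{0x0} // sorts before any real doc ID
        }
        endBytes := []byte(end)
        if end == "" {
            endBytes = []byte{0xff} // 0xff never occurs in UTF-8, so it sorts after
        }
        return startBytes, endBytes
    }

    func main() {
        s, e := docIDBounds("", "budweiser")
        fmt.Printf("% x -> % x\n", s, e)
    }
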
diff --git a/index/upside_down/row.go b/index/upside_down/row.go
index 84bcd35f..e6663bf4 100644
--- a/index/upside_down/row.go
+++ b/index/upside_down/row.go
@@ -459,21 +459,21 @@ func (tfr *TermFrequencyRow) String() string {
 	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
 }
 
-func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
+func NewTermFrequencyRow(term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:  term,
 		field: field,
-		doc:   []byte(doc),
+		doc:   docID,
 		freq:  freq,
 		norm:  norm,
 	}
 }
 
-func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
+func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docID []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:    term,
 		field:   field,
-		doc:     []byte(doc),
+		doc:     docID,
 		freq:    freq,
 		norm:    norm,
 		vectors: vectors,
@@ -605,7 +605,7 @@ func (br *BackIndexRow) AllTermKeys() [][]byte {
 	}
 	rv := make([][]byte, len(br.termEntries))
 	for i, termEntry := range br.termEntries {
-		termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), string(br.doc), 0, 0)
+		termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), br.doc, 0, 0)
 		rv[i] = termRow.Key()
 	}
 	return rv
@@ -617,7 +617,7 @@ func (br *BackIndexRow) AllStoredKeys() [][]byte {
 	}
 	rv := make([][]byte, len(br.storedEntries))
 	for i, storedEntry := range br.storedEntries {
-		storedRow := NewStoredRow(string(br.doc), uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
+		storedRow := NewStoredRow(br.doc, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
 		rv[i] = storedRow.Key()
 	}
 	return rv
@@ -665,9 +665,9 @@ func (br *BackIndexRow) String() string {
 	return fmt.Sprintf("Backindex DocId: `%s` Term Entries: %v, Stored Entries: %v", string(br.doc), br.termEntries, br.storedEntries)
 }
 
-func NewBackIndexRow(doc string, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
+func NewBackIndexRow(docID []byte, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
 	return &BackIndexRow{
-		doc:           []byte(doc),
+		doc:           docID,
 		termEntries:   entries,
 		storedEntries: storedFields,
 	}
@@ -766,9 +766,9 @@ func (s *StoredRow) ScanPrefixForDoc() []byte {
 	return buf
 }
 
-func NewStoredRow(doc string, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
+func NewStoredRow(docID []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
 	return &StoredRow{
-		doc:            []byte(doc),
+		doc:            docID,
 		field:          field,
 		arrayPositions: arrayPositions,
 		typ:            typ,
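
What the []byte signatures in row.go buy is zero-copy construction: AllTermKeys and AllStoredKeys can now hand br.doc straight to the row constructors, where the old string parameters forced an allocation and copy in each direction. A tiny demonstration that a constructor taking []byte shares the caller's backing array; the types here are stand-ins, not the real rows:

    package main

    import "fmt"

    type backIndexRow struct{ doc []byte }

    type storedRow struct{ doc []byte }

    func newStoredRow(docID []byte) *storedRow {
        return &storedRow{doc: docID} // keeps the caller's bytes, no copy
    }

    func main() {
        br := backIndexRow{doc: []byte("budweiser")}
        sr := newStoredRow(br.doc) // shares br.doc's backing array
        fmt.Println(&br.doc[0] == &sr.doc[0]) // true
    }
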
diff --git a/index/upside_down/row_test.go b/index/upside_down/row_test.go
index b99207ae..690a91dc 100644
--- a/index/upside_down/row_test.go
+++ b/index/upside_down/row_test.go
@@ -49,54 +49,54 @@ func TestRows(t *testing.T) {
 			[]byte{27},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "catz", 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("catz"), 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'c', 'a', 't', 'z'},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test larger varints
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test vectors with arrayPositions
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, []byte("budweiser"), 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1},
 		},
 		{
-			NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
+			NewBackIndexRow([]byte("budweiser"), []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}, &BackIndexTermEntry{Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{&BackIndexStoreEntry{Field: proto.Uint32(3)}, &BackIndexStoreEntry{Field: proto.Uint32(4)}, &BackIndexStoreEntry{Field: proto.Uint32(5)}}),
 			[]byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
 			[]byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1, 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
 		},
 		{
-			NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer")),
+			NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
 		{
-			NewStoredRow("budweiser", 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
+			NewStoredRow([]byte("budweiser"), 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
@@ -259,7 +259,7 @@ func BenchmarkTermFrequencyRowEncode(b *testing.B) {
 	row := NewTermFrequencyRowWithTermVectors(
 		[]byte{'b', 'e', 'e', 'r'},
 		0,
-		"budweiser",
+		[]byte("budweiser"),
 		3,
 		3.14,
 		[]*TermVector{
@@ -304,7 +304,7 @@ func BenchmarkTermFrequencyRowDecode(b *testing.B) {
 func BenchmarkBackIndexRowEncode(b *testing.B) {
 	field := uint32(1)
 	t1 := "term1"
-	row := NewBackIndexRow("beername",
+	row := NewBackIndexRow([]byte("beername"),
 		[]*BackIndexTermEntry{
 			&BackIndexTermEntry{
 				Term:  &t1,
@@ -336,7 +336,7 @@ func BenchmarkBackIndexRowDecode(b *testing.B) {
 }
 
 func BenchmarkStoredRowEncode(b *testing.B) {
-	row := NewStoredRow("budweiser", 0, []uint64{}, byte('t'), []byte("an american beer"))
+	row := NewStoredRow([]byte("budweiser"), 0, []uint64{}, byte('t'), []byte("an american beer"))
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		row.Key()
diff --git a/index/upside_down/upside_down.go b/index/upside_down/upside_down.go
index 139bcdc1..1b57f990 100644
--- a/index/upside_down/upside_down.go
+++ b/index/upside_down/upside_down.go
@@ -29,7 +29,7 @@ import (
 const Name = "upside_down"
 
 // RowBufferSize should ideally this is sized to be the smallest
-// size that can cotain an index row key and its corresponding
+// size that can contain an index row key and its corresponding
 // value. It is not a limit, if need be a larger buffer is
 // allocated, but performance will be more optimal if *most*
 // rows fit this size.
@@ -344,6 +344,7 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) {
 	analysisStart := time.Now()
 	resultChan := make(chan *index.AnalysisResult)
 	aw := index.NewAnalysisWork(udc, doc, resultChan)
+
 	// put the work on the queue
 	udc.analysisQueue.Queue(aw)
 
@@ -473,18 +474,14 @@ func (udc *UpsideDownCouch) mergeOldAndNew(backIndexRow *BackIndexRow, rows []in
 	return addRows, updateRows, deleteRows
 }
 
-func (udc *UpsideDownCouch) storeField(docID string, field document.Field, fieldIndex uint16) ([]index.IndexRow, []*BackIndexStoreEntry) {
-	rows := make([]index.IndexRow, 0, 100)
-	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
+func (udc *UpsideDownCouch) storeField(docID []byte, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) {
 	fieldType := encodeFieldType(field)
 	storedRow := NewStoredRow(docID, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
 
 	// record the back index entry
 	backIndexStoredEntry := BackIndexStoreEntry{Field: proto.Uint32(uint32(fieldIndex)), ArrayPositions: field.ArrayPositions()}
-	backIndexStoredEntries = append(backIndexStoredEntries, &backIndexStoredEntry)
 
-	rows = append(rows, storedRow)
-	return rows, backIndexStoredEntries
+	return append(rows, storedRow), append(backIndexStoredEntries, &backIndexStoredEntry)
 }
 
 func encodeFieldType(f document.Field) byte {
@@ -502,17 +499,14 @@ func encodeFieldType(f document.Field) byte {
 	return fieldType
 }
 
-func (udc *UpsideDownCouch) indexField(docID string, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
-
-	rows := make([]index.IndexRow, 0, 100)
-	backIndexTermEntries := make([]*BackIndexTermEntry, 0, len(tokenFreqs))
+func (udc *UpsideDownCouch) indexField(docID []byte, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermEntries []*BackIndexTermEntry) ([]index.IndexRow, []*BackIndexTermEntry) {
 	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
 
 	for k, tf := range tokenFreqs {
 		var termFreqRow *TermFrequencyRow
 		if includeTermVectors {
-			tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
-			rows = append(rows, newFieldRows...)
+			var tv []*TermVector
+			tv, rows = udc.termVectorsFromTokenFreq(fieldIndex, tf, rows)
 			termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
 		} else {
 			termFreqRow = NewTermFrequencyRow(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
@@ -592,13 +586,14 @@ func (udc *UpsideDownCouch) Delete(id string) (err error) {
 }
 
 func (udc *UpsideDownCouch) deleteSingle(id string, backIndexRow *BackIndexRow, deleteRows []UpsideDownCouchRow) []UpsideDownCouchRow {
+	idBytes := []byte(id)
 
 	for _, backIndexEntry := range backIndexRow.termEntries {
-		tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id, 0, 0)
+		tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), idBytes, 0, 0)
 		deleteRows = append(deleteRows, tfr)
 	}
 	for _, se := range backIndexRow.storedEntries {
-		sf := NewStoredRow(id, uint16(*se.Field), se.ArrayPositions, 'x', nil)
+		sf := NewStoredRow(idBytes, uint16(*se.Field), se.ArrayPositions, 'x', nil)
 		deleteRows = append(deleteRows, sf)
 	}
@@ -667,9 +662,8 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
 	return tf.Frequency()
 }
 
-func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
+func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
 	rv := make([]*TermVector, len(tf.Locations))
-	newFieldRows := make([]index.IndexRow, 0)
 
 	for i, l := range tf.Locations {
 		var newFieldRow *FieldRow
@@ -678,7 +672,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
 			// lookup correct field
 			fieldIndex, newFieldRow = udc.fieldIndexOrNewRow(l.Field)
 			if newFieldRow != nil {
-				newFieldRows = append(newFieldRows, newFieldRow)
+				rows = append(rows, newFieldRow)
 			}
 		}
 		tv := TermVector{
@@ -691,7 +685,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
 		rv[i] = &tv
 	}
 
-	return rv, newFieldRows
+	return rv, rows
 }
 
 func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
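
Taken together, storeField, indexField, and termVectorsFromTokenFreq now follow a single idiom: the caller's accumulator slices are threaded in and returned, so one backing array collects every row instead of each helper allocating a temporary slice that the caller copies out of and discards. The idiom in isolation, with addStored and addTerms as invented stand-ins for the real helpers:

    package main

    import "fmt"

    type row string

    // addStored appends the stored row for a field; compare storeField above.
    func addStored(rows []row, field string) []row {
        return append(rows, row("stored:"+field))
    }

    // addTerms appends one row per term; compare indexField above.
    func addTerms(rows []row, terms ...string) []row {
        for _, t := range terms {
            rows = append(rows, row("term:"+t))
        }
        return rows
    }

    func main() {
        rows := make([]row, 0, 8) // sized once, shared by every helper
        rows = addStored(rows, "name")
        rows = addTerms(rows, "american", "beer")
        fmt.Println(rows)
    }
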