From b2412424653ff873b5896dc9c5e11d2135720864 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 31 Dec 2015 18:13:54 -0800 Subject: [PATCH] firestorm.Analyze() preallocs rows, with analyzeField() func The new analyzeField() helper func is used for both regular fields and for composite fields. With this change, all analysis is done up front, for both regular fields and composite fields. After analysis, this change counts up all the row capacity needed and extends the AnalysisResult.Rows in one shot, as opposed to the previous approach of dynamically growing the array as needed during append()'s. Also, in this change, the TermFreqRow for _id is added first, which seems more correct. --- index/firestorm/analysis.go | 73 ++++++++++++++++---------------- index/firestorm/analysis_test.go | 2 +- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/index/firestorm/analysis.go b/index/firestorm/analysis.go index 58c6e2c5..58f105e3 100644 --- a/index/firestorm/analysis.go +++ b/index/firestorm/analysis.go @@ -26,22 +26,22 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult { docIDBytes := []byte(d.ID) + // add the _id row + rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil)) + // information we collate as we merge fields with same name fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies) fieldLengths := make(map[uint16]int) fieldIncludeTermVectors := make(map[uint16]bool) fieldNames := make(map[uint16]string) - for _, field := range d.Fields { + analyzeField := func(field document.Field, storable bool) { fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name()) if newFieldRow != nil { rv.Rows = append(rv.Rows, newFieldRow) } fieldNames[fieldIndex] = field.Name() - // add the _id row - rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil)) - if field.Options().IsIndexed() { fieldLength, tokenFreqs := field.Analyze() existingFreqs := fieldTermFreqs[fieldIndex] @@ -55,75 +55,74 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult { fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors() } - if field.Options().IsStored() { + if storable && field.Options().IsStored() { storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex) rv.Rows = append(rv.Rows, storeRow) } } + for _, field := range d.Fields { + analyzeField(field, true) + } + + for fieldIndex, tokenFreqs := range fieldTermFreqs { + // see if any of the composite fields need this + for _, compositeField := range d.CompositeFields { + compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs) + } + } + + for _, compositeField := range d.CompositeFields { + analyzeField(compositeField, false) + } + + rowsCapNeeded := len(rv.Rows) + for _, tokenFreqs := range fieldTermFreqs { + rowsCapNeeded += len(tokenFreqs) + } + + rows := make([]index.IndexRow, 0, rowsCapNeeded) + rv.Rows = append(rows, rv.Rows...) + // walk through the collated information and proccess // once for each indexed field (unique name) for fieldIndex, tokenFreqs := range fieldTermFreqs { fieldLength := fieldLengths[fieldIndex] includeTermVectors := fieldIncludeTermVectors[fieldIndex] - // see if any of the composite fields need this - for _, compositeField := range d.CompositeFields { - compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs) - } - - // encode this field - indexRows := f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs) - rv.Rows = append(rv.Rows, indexRows...) - } - - // now index the composite fields - for _, compositeField := range d.CompositeFields { - fieldIndex, newFieldRow := f.fieldIndexOrNewRow(compositeField.Name()) - if newFieldRow != nil { - rv.Rows = append(rv.Rows, newFieldRow) - } - if compositeField.Options().IsIndexed() { - fieldLength, tokenFreqs := compositeField.Analyze() - // encode this field - indexRows := f.indexField(docIDBytes, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs) - rv.Rows = append(rv.Rows, indexRows...) - } + rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows) } return rv } -func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow { +func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow { tfrs := make([]TermFreqRow, len(tokenFreqs)) fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength))) if !includeTermVectors { - rows := make([]index.IndexRow, len(tokenFreqs)) i := 0 for _, tf := range tokenFreqs { - rows[i] = InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil) + rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil)) i++ } return rows } - rows := make([]index.IndexRow, 0, len(tokenFreqs)) i := 0 for _, tf := range tokenFreqs { - tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf) - rows = append(rows, newFieldRows...) + var tv []*TermVector + tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows) rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv)) i++ } return rows } -func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) { +func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) { rv := make([]*TermVector, len(tf.Locations)) - newFieldRows := make([]index.IndexRow, 0) for i, l := range tf.Locations { var newFieldRow *FieldRow @@ -132,14 +131,14 @@ func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFre // lookup correct field fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field) if newFieldRow != nil { - newFieldRows = append(newFieldRows, newFieldRow) + rows = append(rows, newFieldRow) } } tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions) rv[i] = tv } - return rv, newFieldRows + return rv, rows } func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow { diff --git a/index/firestorm/analysis_test.go b/index/firestorm/analysis_test.go index 8d975ee3..4fe0c775 100644 --- a/index/firestorm/analysis_test.go +++ b/index/firestorm/analysis_test.go @@ -78,8 +78,8 @@ func TestAnalysis(t *testing.T) { r: &index.AnalysisResult{ DocID: "a", Rows: []index.IndexRow{ - NewFieldRow(1, "name"), NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil), + NewFieldRow(1, "name"), NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")), NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}), },