diff --git a/index/firestorm/analysis.go b/index/firestorm/analysis.go index 58c6e2c5..58f105e3 100644 --- a/index/firestorm/analysis.go +++ b/index/firestorm/analysis.go @@ -26,22 +26,22 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult { docIDBytes := []byte(d.ID) + // add the _id row + rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil)) + // information we collate as we merge fields with same name fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies) fieldLengths := make(map[uint16]int) fieldIncludeTermVectors := make(map[uint16]bool) fieldNames := make(map[uint16]string) - for _, field := range d.Fields { + analyzeField := func(field document.Field, storable bool) { fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name()) if newFieldRow != nil { rv.Rows = append(rv.Rows, newFieldRow) } fieldNames[fieldIndex] = field.Name() - // add the _id row - rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil)) - if field.Options().IsIndexed() { fieldLength, tokenFreqs := field.Analyze() existingFreqs := fieldTermFreqs[fieldIndex] @@ -55,75 +55,74 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult { fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors() } - if field.Options().IsStored() { + if storable && field.Options().IsStored() { storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex) rv.Rows = append(rv.Rows, storeRow) } } + for _, field := range d.Fields { + analyzeField(field, true) + } + + for fieldIndex, tokenFreqs := range fieldTermFreqs { + // see if any of the composite fields need this + for _, compositeField := range d.CompositeFields { + compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs) + } + } + + for _, compositeField := range d.CompositeFields { + analyzeField(compositeField, false) + } + + rowsCapNeeded := len(rv.Rows) + for _, tokenFreqs := range fieldTermFreqs { + rowsCapNeeded += len(tokenFreqs) + } + + rows := make([]index.IndexRow, 0, rowsCapNeeded) + rv.Rows = append(rows, rv.Rows...) + // walk through the collated information and proccess // once for each indexed field (unique name) for fieldIndex, tokenFreqs := range fieldTermFreqs { fieldLength := fieldLengths[fieldIndex] includeTermVectors := fieldIncludeTermVectors[fieldIndex] - // see if any of the composite fields need this - for _, compositeField := range d.CompositeFields { - compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs) - } - - // encode this field - indexRows := f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs) - rv.Rows = append(rv.Rows, indexRows...) - } - - // now index the composite fields - for _, compositeField := range d.CompositeFields { - fieldIndex, newFieldRow := f.fieldIndexOrNewRow(compositeField.Name()) - if newFieldRow != nil { - rv.Rows = append(rv.Rows, newFieldRow) - } - if compositeField.Options().IsIndexed() { - fieldLength, tokenFreqs := compositeField.Analyze() - // encode this field - indexRows := f.indexField(docIDBytes, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs) - rv.Rows = append(rv.Rows, indexRows...) - } + rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows) } return rv } -func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow { +func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow { tfrs := make([]TermFreqRow, len(tokenFreqs)) fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength))) if !includeTermVectors { - rows := make([]index.IndexRow, len(tokenFreqs)) i := 0 for _, tf := range tokenFreqs { - rows[i] = InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil) + rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil)) i++ } return rows } - rows := make([]index.IndexRow, 0, len(tokenFreqs)) i := 0 for _, tf := range tokenFreqs { - tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf) - rows = append(rows, newFieldRows...) + var tv []*TermVector + tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows) rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv)) i++ } return rows } -func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) { +func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) { rv := make([]*TermVector, len(tf.Locations)) - newFieldRows := make([]index.IndexRow, 0) for i, l := range tf.Locations { var newFieldRow *FieldRow @@ -132,14 +131,14 @@ func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFre // lookup correct field fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field) if newFieldRow != nil { - newFieldRows = append(newFieldRows, newFieldRow) + rows = append(rows, newFieldRow) } } tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions) rv[i] = tv } - return rv, newFieldRows + return rv, rows } func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow { diff --git a/index/firestorm/analysis_test.go b/index/firestorm/analysis_test.go index 8d975ee3..4fe0c775 100644 --- a/index/firestorm/analysis_test.go +++ b/index/firestorm/analysis_test.go @@ -78,8 +78,8 @@ func TestAnalysis(t *testing.T) { r: &index.AnalysisResult{ DocID: "a", Rows: []index.IndexRow{ - NewFieldRow(1, "name"), NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil), + NewFieldRow(1, "name"), NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")), NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}), },