
firestorm.Analyze() preallocs rows, with analyzeField() func

The new analyzeField() helper func is used for both regular fields and
for composite fields.

With this change, all analysis is done up front, for both regular
fields and composite fields.
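
As a condensed sketch, the helper is a closure over the collation maps,
so one code path serves regular and composite fields alike. The Field
interface and string "rows" below are stand-ins for illustration, not
the real document.Field or index.IndexRow types:

    package main

    import "fmt"

    // Field is a minimal stand-in for bleve's document.Field interface.
    type Field interface {
        Name() string
        Indexed() bool
        Stored() bool
    }

    type textField struct {
        name   string
        stored bool
    }

    func (f textField) Name() string  { return f.name }
    func (f textField) Indexed() bool { return true }
    func (f textField) Stored() bool  { return f.stored }

    func main() {
        fieldLengths := map[string]int{} // info collated across fields
        var rows []string                // analysis output rows

        // analyzeField mirrors the commit's closure: one code path for
        // regular and composite fields; storable is false for
        // composites, which have no stored form of their own.
        analyzeField := func(field Field, storable bool) {
            if field.Indexed() {
                fieldLengths[field.Name()]++
            }
            if storable && field.Stored() {
                rows = append(rows, "stored:"+field.Name())
            }
        }

        for _, f := range []Field{textField{"name", true}, textField{"desc", false}} {
            analyzeField(f, true)
        }
        analyzeField(textField{"_all", false}, false) // composite field

        fmt.Println(rows, fieldLengths)
    }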

After analysis, this change counts up the total row capacity needed
and extends AnalysisResult.Rows in one shot, as opposed to the previous
approach of dynamically growing the slice with repeated append() calls.
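
The one-shot extension boils down to the following pattern. This is a
runnable sketch with invented row values; only the count-then-grow
steps mirror the actual change:

    package main

    import "fmt"

    func main() {
        rows := []string{"_id", "field:name"} // rows produced so far
        freqsPerField := map[string]int{"name": 3, "desc": 5}

        // count the capacity needed, then grow the slice once, instead
        // of letting repeated append()s reallocate it as they go
        rowsCapNeeded := len(rows)
        for _, n := range freqsPerField {
            rowsCapNeeded += n
        }
        grown := make([]string, 0, rowsCapNeeded)
        rows = append(grown, rows...)

        fmt.Println(len(rows), cap(rows)) // 2 10
    }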

Also, in this change, the TermFreqRow for _id is added first, which
seems more correct.

Steve Yen 2015-12-31 18:13:54 -08:00
parent 325a616993
commit b241242465
2 changed files with 37 additions and 38 deletions

index/firestorm/analysis.go

@@ -26,22 +26,22 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
     docIDBytes := []byte(d.ID)
+    // add the _id row
+    rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
     // information we collate as we merge fields with same name
     fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
     fieldLengths := make(map[uint16]int)
     fieldIncludeTermVectors := make(map[uint16]bool)
     fieldNames := make(map[uint16]string)
-    for _, field := range d.Fields {
+    analyzeField := func(field document.Field, storable bool) {
         fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
         if newFieldRow != nil {
             rv.Rows = append(rv.Rows, newFieldRow)
         }
         fieldNames[fieldIndex] = field.Name()
-        // add the _id row
-        rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
         if field.Options().IsIndexed() {
             fieldLength, tokenFreqs := field.Analyze()
             existingFreqs := fieldTermFreqs[fieldIndex]
@@ -55,75 +55,74 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
             fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
         }
-        if field.Options().IsStored() {
+        if storable && field.Options().IsStored() {
             storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
             rv.Rows = append(rv.Rows, storeRow)
         }
     }
+    for _, field := range d.Fields {
+        analyzeField(field, true)
+    }
+    for fieldIndex, tokenFreqs := range fieldTermFreqs {
+        // see if any of the composite fields need this
+        for _, compositeField := range d.CompositeFields {
+            compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
+        }
+    }
+    for _, compositeField := range d.CompositeFields {
+        analyzeField(compositeField, false)
+    }
+    rowsCapNeeded := len(rv.Rows)
+    for _, tokenFreqs := range fieldTermFreqs {
+        rowsCapNeeded += len(tokenFreqs)
+    }
+    rows := make([]index.IndexRow, 0, rowsCapNeeded)
+    rv.Rows = append(rows, rv.Rows...)
     // walk through the collated information and proccess
     // once for each indexed field (unique name)
     for fieldIndex, tokenFreqs := range fieldTermFreqs {
         fieldLength := fieldLengths[fieldIndex]
         includeTermVectors := fieldIncludeTermVectors[fieldIndex]
-        // see if any of the composite fields need this
-        for _, compositeField := range d.CompositeFields {
-            compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
-        }
-        // encode this field
-        indexRows := f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
-        rv.Rows = append(rv.Rows, indexRows...)
-    }
-    // now index the composite fields
-    for _, compositeField := range d.CompositeFields {
-        fieldIndex, newFieldRow := f.fieldIndexOrNewRow(compositeField.Name())
-        if newFieldRow != nil {
-            rv.Rows = append(rv.Rows, newFieldRow)
-        }
-        if compositeField.Options().IsIndexed() {
-            fieldLength, tokenFreqs := compositeField.Analyze()
-            // encode this field
-            indexRows := f.indexField(docIDBytes, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
-            rv.Rows = append(rv.Rows, indexRows...)
-        }
+        rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
     }
     return rv
 }
 
-func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {
+func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
     tfrs := make([]TermFreqRow, len(tokenFreqs))
     fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
     if !includeTermVectors {
-        rows := make([]index.IndexRow, len(tokenFreqs))
         i := 0
         for _, tf := range tokenFreqs {
-            rows[i] = InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil)
+            rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
             i++
         }
         return rows
     }
-    rows := make([]index.IndexRow, 0, len(tokenFreqs))
     i := 0
     for _, tf := range tokenFreqs {
-        tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf)
-        rows = append(rows, newFieldRows...)
+        var tv []*TermVector
+        tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
         rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
         i++
     }
     return rows
 }
 
-func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
+func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
     rv := make([]*TermVector, len(tf.Locations))
-    newFieldRows := make([]index.IndexRow, 0)
     for i, l := range tf.Locations {
         var newFieldRow *FieldRow
@@ -132,14 +131,14 @@ func (f *Firestorm) termVectorsFromTokenFreq
             // lookup correct field
             fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
             if newFieldRow != nil {
-                newFieldRows = append(newFieldRows, newFieldRow)
+                rows = append(rows, newFieldRow)
             }
         }
         tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
         rv[i] = tv
     }
-    return rv, newFieldRows
+    return rv, rows
 }
 
 func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
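
The indexField() and termVectorsFromTokenFreq() signature changes above
apply a common Go accumulator pattern: the caller's slice is threaded
through each helper and returned, so new rows land in the preallocated
backing array rather than in per-call allocations that must be appended
afterward. A minimal sketch of the idea (appendSquares is illustrative,
not part of the firestorm API):

    package main

    import "fmt"

    // appendSquares threads the caller's slice through, appending into
    // its existing backing array instead of allocating its own slice.
    func appendSquares(n int, rows []int) []int {
        for i := 1; i <= n; i++ {
            rows = append(rows, i*i)
        }
        return rows
    }

    func main() {
        rows := make([]int, 0, 8) // preallocated once by the caller
        rows = appendSquares(2, rows)
        rows = appendSquares(3, rows)
        fmt.Println(rows) // [1 4 1 4 9]
    }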

index/firestorm/analysis_test.go

@@ -78,8 +78,8 @@ func TestAnalysis(t *testing.T) {
             r: &index.AnalysisResult{
                 DocID: "a",
                 Rows: []index.IndexRow{
-                    NewFieldRow(1, "name"),
                     NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
+                    NewFieldRow(1, "name"),
                     NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
                     NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
                 },