From b2412424653ff873b5896dc9c5e11d2135720864 Mon Sep 17 00:00:00 2001
From: Steve Yen <steve.yen@gmail.com>
Date: Thu, 31 Dec 2015 18:13:54 -0800
Subject: [PATCH] firestorm.Analyze() preallocs rows, with analyzeField() func

The new analyzeField() helper func is used for both regular fields and
for composite fields.

With this change, all analysis is done up front, for both regular
fields and composite fields.

After analysis, this change counts the row capacity needed (the rows
collected so far plus one TermFreqRow per analyzed term) and allocates
AnalysisResult.Rows in one shot, as opposed to the previous approach
of growing the slice dynamically during append() calls.

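Also, in this change, the TermFreqRow for _id is added exactly once,
as the first row. Previously it was appended inside the per-field
loop, so it was emitted once per field rather than once per document.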
---
 index/firestorm/analysis.go      | 73 ++++++++++++++++----------------
 index/firestorm/analysis_test.go |  2 +-
 2 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/index/firestorm/analysis.go b/index/firestorm/analysis.go
index 58c6e2c5..58f105e3 100644
--- a/index/firestorm/analysis.go
+++ b/index/firestorm/analysis.go
@@ -26,22 +26,22 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
 
 	docIDBytes := []byte(d.ID)
 
+	// add the _id row
+	rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
+
 	// information we collate as we merge fields with same name
 	fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
 	fieldLengths := make(map[uint16]int)
 	fieldIncludeTermVectors := make(map[uint16]bool)
 	fieldNames := make(map[uint16]string)
 
-	for _, field := range d.Fields {
+	analyzeField := func(field document.Field, storable bool) {
 		fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
 		if newFieldRow != nil {
 			rv.Rows = append(rv.Rows, newFieldRow)
 		}
 		fieldNames[fieldIndex] = field.Name()
 
-		// add the _id row
-		rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
-
 		if field.Options().IsIndexed() {
 			fieldLength, tokenFreqs := field.Analyze()
 			existingFreqs := fieldTermFreqs[fieldIndex]
@@ -55,75 +55,74 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
 			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
 		}
 
-		if field.Options().IsStored() {
+		if storable && field.Options().IsStored() {
 			storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
 			rv.Rows = append(rv.Rows, storeRow)
 		}
 	}
 
+	for _, field := range d.Fields {
+		analyzeField(field, true)
+	}
+
+	for fieldIndex, tokenFreqs := range fieldTermFreqs {
+		// see if any of the composite fields need this
+		for _, compositeField := range d.CompositeFields {
+			compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
+		}
+	}
+
+	for _, compositeField := range d.CompositeFields {
+		analyzeField(compositeField, false)
+	}
+
+	rowsCapNeeded := len(rv.Rows)
+	for _, tokenFreqs := range fieldTermFreqs {
+		rowsCapNeeded += len(tokenFreqs)
+	}
+
+	rows := make([]index.IndexRow, 0, rowsCapNeeded)
+	rv.Rows = append(rows, rv.Rows...)
+
 	// walk through the collated information and proccess
 	// once for each indexed field (unique name)
 	for fieldIndex, tokenFreqs := range fieldTermFreqs {
 		fieldLength := fieldLengths[fieldIndex]
 		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
 
-		// see if any of the composite fields need this
-		for _, compositeField := range d.CompositeFields {
-			compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
-		}
-
-		// encode this field
-		indexRows := f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
-		rv.Rows = append(rv.Rows, indexRows...)
-	}
-
-	// now index the composite fields
-	for _, compositeField := range d.CompositeFields {
-		fieldIndex, newFieldRow := f.fieldIndexOrNewRow(compositeField.Name())
-		if newFieldRow != nil {
-			rv.Rows = append(rv.Rows, newFieldRow)
-		}
-		if compositeField.Options().IsIndexed() {
-			fieldLength, tokenFreqs := compositeField.Analyze()
-			// encode this field
-			indexRows := f.indexField(docIDBytes, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
-			rv.Rows = append(rv.Rows, indexRows...)
-		}
+		rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
 	}
 
 	return rv
 }
 
-func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {
+func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
 
 	tfrs := make([]TermFreqRow, len(tokenFreqs))
 
 	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
 
 	if !includeTermVectors {
-		rows := make([]index.IndexRow, len(tokenFreqs))
 		i := 0
 		for _, tf := range tokenFreqs {
-			rows[i] = InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil)
+			rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
 			i++
 		}
 		return rows
 	}
 
-	rows := make([]index.IndexRow, 0, len(tokenFreqs))
 	i := 0
 	for _, tf := range tokenFreqs {
-		tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf)
-		rows = append(rows, newFieldRows...)
+		var tv []*TermVector
+		tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
 		rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
 		i++
 	}
 	return rows
 }
 
-func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
+func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
 	rv := make([]*TermVector, len(tf.Locations))
-	newFieldRows := make([]index.IndexRow, 0)
 
 	for i, l := range tf.Locations {
 		var newFieldRow *FieldRow
@@ -132,14 +131,14 @@ func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFre
 			// lookup correct field
 			fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
 			if newFieldRow != nil {
-				newFieldRows = append(newFieldRows, newFieldRow)
+				rows = append(rows, newFieldRow)
 			}
 		}
 		tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
 		rv[i] = tv
 	}
 
-	return rv, newFieldRows
+	return rv, rows
 }
 
 func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
diff --git a/index/firestorm/analysis_test.go b/index/firestorm/analysis_test.go
index 8d975ee3..4fe0c775 100644
--- a/index/firestorm/analysis_test.go
+++ b/index/firestorm/analysis_test.go
@@ -78,8 +78,8 @@ func TestAnalysis(t *testing.T) {
 			r: &index.AnalysisResult{
 				DocID: "a",
 				Rows: []index.IndexRow{
-					NewFieldRow(1, "name"),
 					NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
+					NewFieldRow(1, "name"),
 					NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
 					NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
 				},