0
0

firestorm.Analyze() preallocs rows, with analyzeField() func

The new analyzeField() helper func is used for both regular fields and
for composite fields.

With this change, all analysis is done up front, for both regular
fields and composite fields.

After analysis, this change counts up all the row capacity needed and
extends the AnalysisResult.Rows in one shot, as opposed to the
previous approach of dynamically growing the array as needed during
each call to append().

Also, in this change, the TermFreqRow for _id is added first, which
seems more correct.
This commit is contained in:
Steve Yen 2015-12-31 18:13:54 -08:00
parent 325a616993
commit b241242465
2 changed files with 37 additions and 38 deletions

View File

@ -26,22 +26,22 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
docIDBytes := []byte(d.ID) docIDBytes := []byte(d.ID)
// add the _id row
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
// information we collate as we merge fields with same name // information we collate as we merge fields with same name
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies) fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
fieldLengths := make(map[uint16]int) fieldLengths := make(map[uint16]int)
fieldIncludeTermVectors := make(map[uint16]bool) fieldIncludeTermVectors := make(map[uint16]bool)
fieldNames := make(map[uint16]string) fieldNames := make(map[uint16]string)
for _, field := range d.Fields { analyzeField := func(field document.Field, storable bool) {
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name()) fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
if newFieldRow != nil { if newFieldRow != nil {
rv.Rows = append(rv.Rows, newFieldRow) rv.Rows = append(rv.Rows, newFieldRow)
} }
fieldNames[fieldIndex] = field.Name() fieldNames[fieldIndex] = field.Name()
// add the _id row
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
if field.Options().IsIndexed() { if field.Options().IsIndexed() {
fieldLength, tokenFreqs := field.Analyze() fieldLength, tokenFreqs := field.Analyze()
existingFreqs := fieldTermFreqs[fieldIndex] existingFreqs := fieldTermFreqs[fieldIndex]
@ -55,75 +55,74 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors() fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
} }
if field.Options().IsStored() { if storable && field.Options().IsStored() {
storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex) storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
rv.Rows = append(rv.Rows, storeRow) rv.Rows = append(rv.Rows, storeRow)
} }
} }
for _, field := range d.Fields {
analyzeField(field, true)
}
for fieldIndex, tokenFreqs := range fieldTermFreqs {
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
}
}
for _, compositeField := range d.CompositeFields {
analyzeField(compositeField, false)
}
rowsCapNeeded := len(rv.Rows)
for _, tokenFreqs := range fieldTermFreqs {
rowsCapNeeded += len(tokenFreqs)
}
rows := make([]index.IndexRow, 0, rowsCapNeeded)
rv.Rows = append(rows, rv.Rows...)
// walk through the collated information and process // walk through the collated information and process
// once for each indexed field (unique name) // once for each indexed field (unique name)
for fieldIndex, tokenFreqs := range fieldTermFreqs { for fieldIndex, tokenFreqs := range fieldTermFreqs {
fieldLength := fieldLengths[fieldIndex] fieldLength := fieldLengths[fieldIndex]
includeTermVectors := fieldIncludeTermVectors[fieldIndex] includeTermVectors := fieldIncludeTermVectors[fieldIndex]
// see if any of the composite fields need this rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
for _, compositeField := range d.CompositeFields {
compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
}
// encode this field
indexRows := f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
}
// now index the composite fields
for _, compositeField := range d.CompositeFields {
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(compositeField.Name())
if newFieldRow != nil {
rv.Rows = append(rv.Rows, newFieldRow)
}
if compositeField.Options().IsIndexed() {
fieldLength, tokenFreqs := compositeField.Analyze()
// encode this field
indexRows := f.indexField(docIDBytes, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
}
} }
return rv return rv
} }
func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow { func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
tfrs := make([]TermFreqRow, len(tokenFreqs)) tfrs := make([]TermFreqRow, len(tokenFreqs))
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength))) fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
if !includeTermVectors { if !includeTermVectors {
rows := make([]index.IndexRow, len(tokenFreqs))
i := 0 i := 0
for _, tf := range tokenFreqs { for _, tf := range tokenFreqs {
rows[i] = InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil) rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
i++ i++
} }
return rows return rows
} }
rows := make([]index.IndexRow, 0, len(tokenFreqs))
i := 0 i := 0
for _, tf := range tokenFreqs { for _, tf := range tokenFreqs {
tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf) var tv []*TermVector
rows = append(rows, newFieldRows...) tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv)) rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
i++ i++
} }
return rows return rows
} }
func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) { func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
rv := make([]*TermVector, len(tf.Locations)) rv := make([]*TermVector, len(tf.Locations))
newFieldRows := make([]index.IndexRow, 0)
for i, l := range tf.Locations { for i, l := range tf.Locations {
var newFieldRow *FieldRow var newFieldRow *FieldRow
@ -132,14 +131,14 @@ func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFre
// lookup correct field // lookup correct field
fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field) fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
if newFieldRow != nil { if newFieldRow != nil {
newFieldRows = append(newFieldRows, newFieldRow) rows = append(rows, newFieldRow)
} }
} }
tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions) tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
rv[i] = tv rv[i] = tv
} }
return rv, newFieldRows return rv, rows
} }
func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow { func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {

View File

@ -78,8 +78,8 @@ func TestAnalysis(t *testing.T) {
r: &index.AnalysisResult{ r: &index.AnalysisResult{
DocID: "a", DocID: "a",
Rows: []index.IndexRow{ Rows: []index.IndexRow{
NewFieldRow(1, "name"),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil), NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewFieldRow(1, "name"),
NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")), NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}), NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
}, },