
firestorm.Analyze() preallocs rows, with analyzeField() func

The new analyzeField() helper func is used for both regular fields and
for composite fields.

With this change, all analysis is done up front, for both regular
fields and composite fields.
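
As a condensed sketch, the helper is a closure over the collation maps,
so one code path serves regular and composite fields alike. The Field
interface and string "rows" below are stand-ins for illustration, not
the real document.Field or index.IndexRow types:

    package main

    import "fmt"

    // Field is a minimal stand-in for bleve's document.Field interface.
    type Field interface {
        Name() string
        Indexed() bool
        Stored() bool
    }

    type textField struct {
        name   string
        stored bool
    }

    func (f textField) Name() string  { return f.name }
    func (f textField) Indexed() bool { return true }
    func (f textField) Stored() bool  { return f.stored }

    func main() {
        fieldLengths := map[string]int{} // info collated across fields
        var rows []string                // analysis output rows

        // analyzeField mirrors the commit's closure: one code path for
        // regular and composite fields; storable is false for
        // composites, which have no stored form of their own.
        analyzeField := func(field Field, storable bool) {
            if field.Indexed() {
                fieldLengths[field.Name()]++
            }
            if storable && field.Stored() {
                rows = append(rows, "stored:"+field.Name())
            }
        }

        for _, f := range []Field{textField{"name", true}, textField{"desc", false}} {
            analyzeField(f, true)
        }
        analyzeField(textField{"_all", false}, false) // composite field

        fmt.Println(rows, fieldLengths)
    }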

After analysis, this change counts up the total row capacity needed
and extends AnalysisResult.Rows in one shot, as opposed to the previous
approach of dynamically growing the slice with repeated append() calls.
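
The one-shot extension boils down to the following pattern. This is a
runnable sketch with invented row values; only the count-then-grow
steps mirror the actual change:

    package main

    import "fmt"

    func main() {
        rows := []string{"_id", "field:name"} // rows produced so far
        freqsPerField := map[string]int{"name": 3, "desc": 5}

        // count the capacity needed, then grow the slice once, instead
        // of letting repeated append()s reallocate it as they go
        rowsCapNeeded := len(rows)
        for _, n := range freqsPerField {
            rowsCapNeeded += n
        }
        grown := make([]string, 0, rowsCapNeeded)
        rows = append(grown, rows...)

        fmt.Println(len(rows), cap(rows)) // 2 10
    }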

Also, in this change, the TermFreqRow for _id is added first, which
seems more correct.

Steve Yen 2015-12-31 18:13:54 -08:00
parent 325a616993
commit b241242465
2 changed files with 37 additions and 38 deletions

index/firestorm/analysis.go

@@ -26,22 +26,22 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
     docIDBytes := []byte(d.ID)
+    // add the _id row
+    rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
     // information we collate as we merge fields with same name
     fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
     fieldLengths := make(map[uint16]int)
     fieldIncludeTermVectors := make(map[uint16]bool)
     fieldNames := make(map[uint16]string)
-    for _, field := range d.Fields {
+    analyzeField := func(field document.Field, storable bool) {
         fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
         if newFieldRow != nil {
             rv.Rows = append(rv.Rows, newFieldRow)
         }
         fieldNames[fieldIndex] = field.Name()
-        // add the _id row
-        rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
         if field.Options().IsIndexed() {
             fieldLength, tokenFreqs := field.Analyze()
             existingFreqs := fieldTermFreqs[fieldIndex]
@@ -55,75 +55,74 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
             fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
         }
-        if field.Options().IsStored() {
+        if storable && field.Options().IsStored() {
             storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
             rv.Rows = append(rv.Rows, storeRow)
         }
     }
+    for _, field := range d.Fields {
+        analyzeField(field, true)
+    }
+    for fieldIndex, tokenFreqs := range fieldTermFreqs {
+        // see if any of the composite fields need this
+        for _, compositeField := range d.CompositeFields {
+            compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
+        }
+    }
+    for _, compositeField := range d.CompositeFields {
+        analyzeField(compositeField, false)
+    }
+    rowsCapNeeded := len(rv.Rows)
+    for _, tokenFreqs := range fieldTermFreqs {
+        rowsCapNeeded += len(tokenFreqs)
+    }
+    rows := make([]index.IndexRow, 0, rowsCapNeeded)
+    rv.Rows = append(rows, rv.Rows...)
     // walk through the collated information and proccess
     // once for each indexed field (unique name)
     for fieldIndex, tokenFreqs := range fieldTermFreqs {
         fieldLength := fieldLengths[fieldIndex]
         includeTermVectors := fieldIncludeTermVectors[fieldIndex]
-        // see if any of the composite fields need this
-        for _, compositeField := range d.CompositeFields {
-            compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
-        }
-        // encode this field
-        indexRows := f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
-        rv.Rows = append(rv.Rows, indexRows...)
-    }
-    // now index the composite fields
-    for _, compositeField := range d.CompositeFields {
-        fieldIndex, newFieldRow := f.fieldIndexOrNewRow(compositeField.Name())
-        if newFieldRow != nil {
-            rv.Rows = append(rv.Rows, newFieldRow)
-        }
-        if compositeField.Options().IsIndexed() {
-            fieldLength, tokenFreqs := compositeField.Analyze()
-            // encode this field
-            indexRows := f.indexField(docIDBytes, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
-            rv.Rows = append(rv.Rows, indexRows...)
-        }
+        rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
     }
     return rv
 }
 
-func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {
+func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
     tfrs := make([]TermFreqRow, len(tokenFreqs))
     fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
     if !includeTermVectors {
-        rows := make([]index.IndexRow, len(tokenFreqs))
         i := 0
         for _, tf := range tokenFreqs {
-            rows[i] = InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil)
+            rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
             i++
         }
         return rows
     }
-    rows := make([]index.IndexRow, 0, len(tokenFreqs))
     i := 0
     for _, tf := range tokenFreqs {
-        tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf)
-        rows = append(rows, newFieldRows...)
+        var tv []*TermVector
+        tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
         rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
         i++
     }
     return rows
 }
 
-func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
+func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
     rv := make([]*TermVector, len(tf.Locations))
-    newFieldRows := make([]index.IndexRow, 0)
     for i, l := range tf.Locations {
         var newFieldRow *FieldRow
@@ -132,14 +131,14 @@ func (f *Firestorm) termVectorsFromTokenFreq
             // lookup correct field
             fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
             if newFieldRow != nil {
-                newFieldRows = append(newFieldRows, newFieldRow)
+                rows = append(rows, newFieldRow)
             }
         }
         tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
         rv[i] = tv
     }
-    return rv, newFieldRows
+    return rv, rows
 }
 
 func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
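
The indexField() and termVectorsFromTokenFreq() signature changes above
apply a common Go accumulator pattern: the caller's slice is threaded
through each helper and returned, so new rows land in the preallocated
backing array rather than in per-call allocations that must be appended
afterward. A minimal sketch of the idea (appendSquares is illustrative,
not part of the firestorm API):

    package main

    import "fmt"

    // appendSquares threads the caller's slice through, appending into
    // its existing backing array instead of allocating its own slice.
    func appendSquares(n int, rows []int) []int {
        for i := 1; i <= n; i++ {
            rows = append(rows, i*i)
        }
        return rows
    }

    func main() {
        rows := make([]int, 0, 8) // preallocated once by the caller
        rows = appendSquares(2, rows)
        rows = appendSquares(3, rows)
        fmt.Println(rows) // [1 4 1 4 9]
    }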

index/firestorm/analysis_test.go

@@ -78,8 +78,8 @@ func TestAnalysis(t *testing.T) {
             r: &index.AnalysisResult{
                 DocID: "a",
                 Rows: []index.IndexRow{
-                    NewFieldRow(1, "name"),
                     NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
+                    NewFieldRow(1, "name"),
                     NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
                     NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
                 },