firestorm.Analyze() preallocs rows, with analyzeField() func
The new analyzeField() helper func is used for both regular fields and for composite fields. With this change, all analysis is done up front, for both regular fields and composite fields. After analysis, this change counts up all the row capacity needed and extends AnalysisResult.Rows in one shot, as opposed to the previous approach of dynamically growing the array as needed during appends. Also, in this change, the TermFreqRow for _id is added first, which seems more correct.
This commit is contained in:
parent
325a616993
commit
b241242465
|
@ -26,22 +26,22 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
|
|||
|
||||
docIDBytes := []byte(d.ID)
|
||||
|
||||
// add the _id row
|
||||
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
|
||||
|
||||
// information we collate as we merge fields with same name
|
||||
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
|
||||
fieldLengths := make(map[uint16]int)
|
||||
fieldIncludeTermVectors := make(map[uint16]bool)
|
||||
fieldNames := make(map[uint16]string)
|
||||
|
||||
for _, field := range d.Fields {
|
||||
analyzeField := func(field document.Field, storable bool) {
|
||||
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
|
||||
if newFieldRow != nil {
|
||||
rv.Rows = append(rv.Rows, newFieldRow)
|
||||
}
|
||||
fieldNames[fieldIndex] = field.Name()
|
||||
|
||||
// add the _id row
|
||||
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
|
||||
|
||||
if field.Options().IsIndexed() {
|
||||
fieldLength, tokenFreqs := field.Analyze()
|
||||
existingFreqs := fieldTermFreqs[fieldIndex]
|
||||
|
@ -55,75 +55,74 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
|
|||
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
|
||||
}
|
||||
|
||||
if field.Options().IsStored() {
|
||||
if storable && field.Options().IsStored() {
|
||||
storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
|
||||
rv.Rows = append(rv.Rows, storeRow)
|
||||
}
|
||||
}
|
||||
|
||||
for _, field := range d.Fields {
|
||||
analyzeField(field, true)
|
||||
}
|
||||
|
||||
for fieldIndex, tokenFreqs := range fieldTermFreqs {
|
||||
// see if any of the composite fields need this
|
||||
for _, compositeField := range d.CompositeFields {
|
||||
compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
|
||||
}
|
||||
}
|
||||
|
||||
for _, compositeField := range d.CompositeFields {
|
||||
analyzeField(compositeField, false)
|
||||
}
|
||||
|
||||
rowsCapNeeded := len(rv.Rows)
|
||||
for _, tokenFreqs := range fieldTermFreqs {
|
||||
rowsCapNeeded += len(tokenFreqs)
|
||||
}
|
||||
|
||||
rows := make([]index.IndexRow, 0, rowsCapNeeded)
|
||||
rv.Rows = append(rows, rv.Rows...)
|
||||
|
||||
// walk through the collated information and proccess
|
||||
// once for each indexed field (unique name)
|
||||
for fieldIndex, tokenFreqs := range fieldTermFreqs {
|
||||
fieldLength := fieldLengths[fieldIndex]
|
||||
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
|
||||
|
||||
// see if any of the composite fields need this
|
||||
for _, compositeField := range d.CompositeFields {
|
||||
compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
|
||||
}
|
||||
|
||||
// encode this field
|
||||
indexRows := f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
|
||||
rv.Rows = append(rv.Rows, indexRows...)
|
||||
}
|
||||
|
||||
// now index the composite fields
|
||||
for _, compositeField := range d.CompositeFields {
|
||||
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(compositeField.Name())
|
||||
if newFieldRow != nil {
|
||||
rv.Rows = append(rv.Rows, newFieldRow)
|
||||
}
|
||||
if compositeField.Options().IsIndexed() {
|
||||
fieldLength, tokenFreqs := compositeField.Analyze()
|
||||
// encode this field
|
||||
indexRows := f.indexField(docIDBytes, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
|
||||
rv.Rows = append(rv.Rows, indexRows...)
|
||||
}
|
||||
rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {
|
||||
func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
|
||||
|
||||
tfrs := make([]TermFreqRow, len(tokenFreqs))
|
||||
|
||||
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
|
||||
|
||||
if !includeTermVectors {
|
||||
rows := make([]index.IndexRow, len(tokenFreqs))
|
||||
i := 0
|
||||
for _, tf := range tokenFreqs {
|
||||
rows[i] = InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil)
|
||||
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
|
||||
i++
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
rows := make([]index.IndexRow, 0, len(tokenFreqs))
|
||||
i := 0
|
||||
for _, tf := range tokenFreqs {
|
||||
tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf)
|
||||
rows = append(rows, newFieldRows...)
|
||||
var tv []*TermVector
|
||||
tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
|
||||
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
|
||||
i++
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
|
||||
func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
|
||||
rv := make([]*TermVector, len(tf.Locations))
|
||||
newFieldRows := make([]index.IndexRow, 0)
|
||||
|
||||
for i, l := range tf.Locations {
|
||||
var newFieldRow *FieldRow
|
||||
|
@ -132,14 +131,14 @@ func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFre
|
|||
// lookup correct field
|
||||
fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
|
||||
if newFieldRow != nil {
|
||||
newFieldRows = append(newFieldRows, newFieldRow)
|
||||
rows = append(rows, newFieldRow)
|
||||
}
|
||||
}
|
||||
tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
|
||||
rv[i] = tv
|
||||
}
|
||||
|
||||
return rv, newFieldRows
|
||||
return rv, rows
|
||||
}
|
||||
|
||||
func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
|
||||
|
|
|
@ -78,8 +78,8 @@ func TestAnalysis(t *testing.T) {
|
|||
r: &index.AnalysisResult{
|
||||
DocID: "a",
|
||||
Rows: []index.IndexRow{
|
||||
NewFieldRow(1, "name"),
|
||||
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
|
||||
NewFieldRow(1, "name"),
|
||||
NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
|
||||
NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue
Block a user