2015-08-25 20:52:42 +02:00
|
|
|
// Copyright (c) 2015 Couchbase, Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
|
|
// except in compliance with the License. You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
|
|
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
|
|
// either express or implied. See the License for the specific language governing permissions
|
|
|
|
// and limitations under the License.
|
|
|
|
|
|
|
|
package firestorm
|
|
|
|
|
|
|
|
import (
|
|
|
|
"math"
|
|
|
|
|
|
|
|
"github.com/blevesearch/bleve/analysis"
|
|
|
|
"github.com/blevesearch/bleve/document"
|
|
|
|
"github.com/blevesearch/bleve/index"
|
|
|
|
)
// Analyze runs analysis over every field of the document d and returns
// the complete set of index rows (term frequency rows, field rows,
// stored rows) needed to index it. It performs no writes itself.
func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
	rv := &index.AnalysisResult{
		DocID: d.ID,
		Rows:  make([]index.IndexRow, 0, 100),
	}

	docIDBytes := []byte(d.ID)

	// add the _id row
	// (field index 0 with a nil term is the special back-index row
	// recording that this doc number exists for this doc ID)
	rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))

	// information we collate as we merge fields with same name
	fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
	fieldLengths := make(map[uint16]int)
	fieldIncludeTermVectors := make(map[uint16]bool)
	fieldNames := make(map[uint16]string)

	// analyzeField analyzes a single field, merging its token frequencies
	// into the per-field-index collation maps above. storable is false for
	// composite fields, which are never stored even if marked IsStored.
	analyzeField := func(field document.Field, storable bool) {
		fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
		if newFieldRow != nil {
			// first time we've seen this field name; emit its FieldRow
			rv.Rows = append(rv.Rows, newFieldRow)
		}
		fieldNames[fieldIndex] = field.Name()

		if field.Options().IsIndexed() {
			fieldLength, tokenFreqs := field.Analyze()
			existingFreqs := fieldTermFreqs[fieldIndex]
			if existingFreqs == nil {
				fieldTermFreqs[fieldIndex] = tokenFreqs
			} else {
				// same field name seen before (e.g. array of values):
				// merge frequencies rather than overwrite
				existingFreqs.MergeAll(field.Name(), tokenFreqs)
				fieldTermFreqs[fieldIndex] = existingFreqs
			}
			fieldLengths[fieldIndex] += fieldLength
			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
		}

		if storable && field.Options().IsStored() {
			storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
			rv.Rows = append(rv.Rows, storeRow)
		}
	}

	for _, field := range d.Fields {
		analyzeField(field, true)
	}

	for fieldIndex, tokenFreqs := range fieldTermFreqs {
		// see if any of the composite fields need this
		for _, compositeField := range d.CompositeFields {
			compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
		}
	}

	// composite fields are analyzed after Compose has fed them the
	// sub-field content; storable=false because they are never stored
	for _, compositeField := range d.CompositeFields {
		analyzeField(compositeField, false)
	}

	// grow rv.Rows once to its final capacity (one row per unique term
	// per field) so indexField's appends below never reallocate
	rowsCapNeeded := len(rv.Rows)
	for _, tokenFreqs := range fieldTermFreqs {
		rowsCapNeeded += len(tokenFreqs)
	}

	rows := make([]index.IndexRow, 0, rowsCapNeeded)
	rv.Rows = append(rows, rv.Rows...)

	// walk through the collated information and process
	// once for each indexed field (unique name)
	for fieldIndex, tokenFreqs := range fieldTermFreqs {
		fieldLength := fieldLengths[fieldIndex]
		includeTermVectors := fieldIncludeTermVectors[fieldIndex]

		rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
	}

	return rv
}
|
|
|
|
|
2016-01-01 03:13:54 +01:00
|
|
|
func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
|
2015-08-25 20:52:42 +02:00
|
|
|
|
2015-12-31 06:15:24 +01:00
|
|
|
tfrs := make([]TermFreqRow, len(tokenFreqs))
|
|
|
|
|
2015-08-25 20:52:42 +02:00
|
|
|
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
|
|
|
|
|
2015-12-31 06:32:06 +01:00
|
|
|
if !includeTermVectors {
|
|
|
|
i := 0
|
|
|
|
for _, tf := range tokenFreqs {
|
2016-01-01 03:13:54 +01:00
|
|
|
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
|
2015-12-31 06:32:06 +01:00
|
|
|
i++
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
2015-12-31 06:32:06 +01:00
|
|
|
return rows
|
|
|
|
}
|
2015-08-25 20:52:42 +02:00
|
|
|
|
2015-12-31 06:32:06 +01:00
|
|
|
i := 0
|
|
|
|
for _, tf := range tokenFreqs {
|
2016-01-01 03:13:54 +01:00
|
|
|
var tv []*TermVector
|
|
|
|
tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
|
2015-12-31 06:32:06 +01:00
|
|
|
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
|
2015-12-31 06:15:24 +01:00
|
|
|
i++
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
|
|
|
return rows
|
|
|
|
}
|
|
|
|
|
2016-01-01 03:13:54 +01:00
|
|
|
func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
|
2015-08-25 20:52:42 +02:00
|
|
|
rv := make([]*TermVector, len(tf.Locations))
|
|
|
|
|
|
|
|
for i, l := range tf.Locations {
|
|
|
|
var newFieldRow *FieldRow
|
|
|
|
fieldIndex := field
|
|
|
|
if l.Field != "" {
|
|
|
|
// lookup correct field
|
|
|
|
fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
|
|
|
|
if newFieldRow != nil {
|
2016-01-01 03:13:54 +01:00
|
|
|
rows = append(rows, newFieldRow)
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
|
|
|
|
rv[i] = tv
|
|
|
|
}
|
|
|
|
|
2016-01-01 03:13:54 +01:00
|
|
|
return rv, rows
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
|
|
|
|
2015-12-30 23:03:32 +01:00
|
|
|
func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
|
2015-08-25 20:52:42 +02:00
|
|
|
fieldValue := make([]byte, 1+len(field.Value()))
|
|
|
|
fieldValue[0] = encodeFieldType(field)
|
|
|
|
copy(fieldValue[1:], field.Value())
|
2015-12-30 23:03:32 +01:00
|
|
|
storedRow := NewStoredRow(docID, docNum, fieldIndex, field.ArrayPositions(), fieldValue)
|
2015-08-25 20:52:42 +02:00
|
|
|
return storedRow
|
|
|
|
}
|
|
|
|
|
|
|
|
func encodeFieldType(f document.Field) byte {
|
|
|
|
fieldType := byte('x')
|
|
|
|
switch f.(type) {
|
|
|
|
case *document.TextField:
|
|
|
|
fieldType = 't'
|
|
|
|
case *document.NumericField:
|
|
|
|
fieldType = 'n'
|
|
|
|
case *document.DateTimeField:
|
|
|
|
fieldType = 'd'
|
|
|
|
case *document.CompositeField:
|
|
|
|
fieldType = 'c'
|
|
|
|
}
|
|
|
|
return fieldType
|
|
|
|
}
|