
added support for composite fields

Marty Schoch 2014-07-21 17:05:55 -04:00
parent 1f17195e7d
commit 70a8b03bed
10 changed files with 438 additions and 85 deletions
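
For context, a minimal sketch of how the new composite-field API below fits together. It is not part of this commit; it uses only identifiers introduced in this diff, and assumes the import paths seen in the changed files.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
	"github.com/couchbaselabs/bleve/document"
)

func main() {
	// an "_all"-style field: include every field by default, except "id"
	all := document.NewCompositeField("_all", true, nil, []string{"id"})

	// pretend the "desc" field analyzed into two tokens
	tokens := analysis.TokenStream{
		&analysis.Token{Term: []byte("water"), Position: 1, Start: 0, End: 5},
		&analysis.Token{Term: []byte("fall"), Position: 2, Start: 6, End: 10},
	}
	freqs := analysis.TokenFrequency(tokens)

	// fold the field's stats into the composite ("id" would be skipped)
	all.Compose("desc", len(tokens), freqs)

	length, composite := all.Analyze()
	fmt.Println(length, len(composite)) // 2 2
}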

@@ -8,7 +8,10 @@
// and limitations under the License.
package analysis
import ()
type TokenLocation struct {
Field string
Start int
End int
Position int
@@ -19,7 +22,38 @@ type TokenFreq struct {
Locations []*TokenLocation
}
func TokenFrequency(tokens TokenStream) []*TokenFreq {
type TokenFrequencies []*TokenFreq
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) TokenFrequencies {
// put existing tokens into a map
index := make(map[string]*TokenFreq)
for _, tf := range tfs {
index[string(tf.Term)] = tf
}
// walk the new token frequencies
for _, tf := range other {
// set the remoteField value in incoming token freqs
for _, l := range tf.Locations {
l.Field = remoteField
}
existingTf, exists := index[string(tf.Term)]
if exists {
existingTf.Locations = append(existingTf.Locations, tf.Locations...)
} else {
index[string(tf.Term)] = tf
}
}
// flatten map back to array
rv := make(TokenFrequencies, len(index))
i := 0
for _, tf := range index {
rv[i] = tf
i += 1
}
return rv
}
func TokenFrequency(tokens TokenStream) TokenFrequencies {
index := make(map[string]*TokenFreq)
for _, token := range tokens {
@@ -44,7 +78,7 @@ func TokenFrequency(tokens TokenStream) []*TokenFreq {
}
}
rv := make([]*TokenFreq, len(index))
rv := make(TokenFrequencies, len(index))
i := 0
for _, tf := range index {
rv[i] = tf

analysis/freq_test.go (new file, 159 additions)

@@ -0,0 +1,159 @@
package analysis
import (
"reflect"
"testing"
)
func TestTokenFrequency(t *testing.T) {
tokens := TokenStream{
&Token{
Term: []byte("water"),
Position: 1,
Start: 0,
End: 5,
},
&Token{
Term: []byte("water"),
Position: 2,
Start: 6,
End: 11,
},
}
expectedResult := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
result := TokenFrequency(tokens)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
}
}
func TestTokenFrequenciesMergeAll(t *testing.T) {
tf1 := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
tf2 := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
expectedResult := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
&TokenLocation{
Field: "tf2",
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Field: "tf2",
Position: 2,
Start: 6,
End: 11,
},
},
},
}
tf1.MergeAll("tf2", tf2)
if !reflect.DeepEqual(tf1, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, tf1)
}
}
func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
tf1 := TokenFrequencies{}
tf2 := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Position: 2,
Start: 6,
End: 11,
},
},
},
}
expectedResult := TokenFrequencies{
&TokenFreq{
Term: []byte("water"),
Locations: []*TokenLocation{
&TokenLocation{
Field: "tf2",
Position: 1,
Start: 0,
End: 5,
},
&TokenLocation{
Field: "tf2",
Position: 2,
Start: 6,
End: 11,
},
},
},
}
result := tf1.MergeAll("tf2", tf2)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
//t.Logf("%#v", tf1[0])
}
}

@@ -13,19 +13,26 @@ import (
)
type Document struct {
ID string `json:"id"`
Fields []Field `json:"fields"`
ID string `json:"id"`
Fields []Field `json:"fields"`
CompositeFields []*CompositeField
}
func NewDocument(id string) *Document {
return &Document{
ID: id,
Fields: make([]Field, 0),
ID: id,
Fields: make([]Field, 0),
CompositeFields: make([]*CompositeField, 0),
}
}
func (d *Document) AddField(f Field) *Document {
d.Fields = append(d.Fields, f)
switch f := f.(type) {
case *CompositeField:
d.CompositeFields = append(d.CompositeFields, f)
default:
d.Fields = append(d.Fields, f)
}
return d
}

@@ -15,6 +15,6 @@ import (
type Field interface {
Name() string
Options() IndexingOptions
Analyzer() *analysis.Analyzer
Analyze() (int, analysis.TokenFrequencies)
Value() []byte
}

@@ -0,0 +1,73 @@
package document
import (
"github.com/couchbaselabs/bleve/analysis"
)
const DEFAULT_COMPOSITE_INDEXING_OPTIONS = INDEX_FIELD
type CompositeField struct {
name string
includedFields map[string]bool
excludedFields map[string]bool
defaultInclude bool
options IndexingOptions
totalLength int
compositeFrequencies analysis.TokenFrequencies
}
func NewCompositeField(name string, defaultInclude bool, include []string, exclude []string) *CompositeField {
return NewCompositeFieldWithIndexingOptions(name, defaultInclude, include, exclude, DEFAULT_COMPOSITE_INDEXING_OPTIONS)
}
func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, include []string, exclude []string, options IndexingOptions) *CompositeField {
rv := &CompositeField{
name: name,
options: options,
defaultInclude: defaultInclude,
includedFields: make(map[string]bool, len(include)),
excludedFields: make(map[string]bool, len(exclude)),
}
for _, i := range include {
rv.includedFields[i] = true
}
for _, e := range exclude {
rv.excludedFields[e] = true
}
return rv
}
func (c *CompositeField) Name() string {
return c.name
}
func (c *CompositeField) Options() IndexingOptions {
return c.options
}
func (c *CompositeField) Analyze() (int, analysis.TokenFrequencies) {
return c.totalLength, c.compositeFrequencies
}
func (c *CompositeField) Value() []byte {
return []byte{}
}
func (c *CompositeField) Compose(field string, length int, freq analysis.TokenFrequencies) {
shouldInclude := c.defaultInclude
_, fieldShouldBeIncluded := c.includedFields[field]
if fieldShouldBeIncluded {
shouldInclude = true
}
_, fieldShouldBeExcluded := c.excludedFields[field]
if fieldShouldBeExcluded {
shouldInclude = false
}
if shouldInclude {
c.totalLength += length
c.compositeFrequencies = c.compositeFrequencies.MergeAll(field, freq)
}
}
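
A quick self-contained sketch (illustrative field names, not from the commit) of Compose's precedence: an explicit exclude beats an explicit include, which beats defaultInclude.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/document"
)

func main() {
	// "title" appears in both lists; excludedFields is checked last
	// in Compose, so the exclusion wins
	c := document.NewCompositeField("catalog", false, []string{"title"}, []string{"title"})
	c.Compose("title", 5, nil) // skipped: exclude wins over include
	c.Compose("body", 3, nil)  // skipped: defaultInclude is false

	length, _ := c.Analyze()
	fmt.Println(length) // 0: nothing was composed in
}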

@@ -42,8 +42,11 @@ func (t *TextField) Options() IndexingOptions {
return t.options
}
func (t *TextField) Analyzer() *analysis.Analyzer {
return t.analyzer
func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
tokens := t.analyzer.Analyze(t.Value())
fieldLength := len(tokens) // number of tokens in this doc field
tokenFreqs := analysis.TokenFrequency(tokens)
return fieldLength, tokenFreqs
}
func (t *TextField) Value() []byte {

@@ -225,7 +225,7 @@ func (udc *UpsideDownCouch) Close() {
}
type termMap map[string]bool
type fieldTermMap map[int]termMap
type fieldTermMap map[uint16]termMap
func (udc *UpsideDownCouch) Update(doc *document.Document) error {
// first we lookup the backindex row for the doc id if it exists
@@ -241,10 +241,10 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) error {
if backIndexRow != nil {
isAdd = false
for _, entry := range backIndexRow.entries {
existingTermMap, fieldExists := existingTermFieldMaps[int(entry.field)]
existingTermMap, fieldExists := existingTermFieldMaps[entry.field]
if !fieldExists {
existingTermMap = make(termMap, 0)
existingTermFieldMaps[int(entry.field)] = existingTermMap
existingTermFieldMaps[entry.field] = existingTermMap
}
existingTermMap[string(entry.term)] = true
}
@@ -265,75 +265,55 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) error {
backIndexStoredFields := make([]uint16, 0)
for _, field := range doc.Fields {
fieldIndex, fieldExists := udc.fieldIndexes[field.Name()]
if !fieldExists {
// assign next field id
fieldIndex = uint16(udc.lastFieldIndex + 1)
udc.fieldIndexes[field.Name()] = fieldIndex
// ensure this batch adds a row for this field
row := NewFieldRow(uint16(fieldIndex), field.Name())
updateRows = append(updateRows, row)
udc.lastFieldIndex = int(fieldIndex)
fieldIndex, newFieldRow := udc.fieldNameToFieldIndex(field.Name())
if newFieldRow != nil {
updateRows = append(updateRows, newFieldRow)
}
existingTermMap, fieldExistedInDoc := existingTermFieldMaps[int(fieldIndex)]
existingTermMap := existingTermFieldMaps[fieldIndex]
if field.Options().IsIndexed() {
analyzer := field.Analyzer()
tokens := analyzer.Analyze(field.Value())
fieldLength := len(tokens) // number of tokens in this doc field
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
tokenFreqs := analysis.TokenFrequency(tokens)
for _, tf := range tokenFreqs {
var termFreqRow *TermFrequencyRow
if field.Options().IncludeTermVectors() {
tv := termVectorsFromTokenFreq(uint16(fieldIndex), tf)
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
} else {
termFreqRow = NewTermFrequencyRow(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
}
fieldLength, tokenFreqs := field.Analyze()
// record the back index entry
backIndexEntry := BackIndexEntry{tf.Term, uint16(fieldIndex)}
backIndexEntries = append(backIndexEntries, &backIndexEntry)
// remove the entry from the map of existing term fields if it exists
if fieldExistedInDoc {
termString := string(tf.Term)
_, ok := existingTermMap[termString]
if ok {
// this is an update
updateRows = append(updateRows, termFreqRow)
// this term existed last time, delete it from that map
delete(existingTermMap, termString)
} else {
// this is an add
addRows = append(addRows, termFreqRow)
}
} else {
// this is an add
addRows = append(addRows, termFreqRow)
}
// see if any of the composite fields need this
for _, compositeField := range doc.CompositeFields {
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
}
// encode this field
indexAddRows, indexUpdateRows, indexBackIndexEntries := udc.indexField(doc.ID, field, fieldIndex, fieldLength, tokenFreqs, existingTermMap)
addRows = append(addRows, indexAddRows...)
updateRows = append(updateRows, indexUpdateRows...)
backIndexEntries = append(backIndexEntries, indexBackIndexEntries...)
}
if field.Options().IsStored() {
storedRow := NewStoredRow(doc.ID, uint16(fieldIndex), field.Value())
storeAddRows, storeUpdateRows := udc.storeField(doc.ID, field, fieldIndex, existingStoredFieldMap)
addRows = append(addRows, storeAddRows...)
updateRows = append(updateRows, storeUpdateRows...)
backIndexStoredFields = append(backIndexStoredFields, fieldIndex)
_, ok := existingStoredFieldMap[uint16(fieldIndex)]
if ok {
// this is an update
updateRows = append(updateRows, storedRow)
// this field was stored last time, delete it from that map
delete(existingStoredFieldMap, uint16(fieldIndex))
} else {
addRows = append(addRows, storedRow)
}
}
}
// now index the composite fields
for _, compositeField := range doc.CompositeFields {
fieldIndex, newFieldRow := udc.fieldNameToFieldIndex(compositeField.Name())
if newFieldRow != nil {
updateRows = append(updateRows, newFieldRow)
}
existingTermMap := existingTermFieldMaps[fieldIndex]
if compositeField.Options().IsIndexed() {
fieldLength, tokenFreqs := compositeField.Analyze()
// encode this field
indexAddRows, indexUpdateRows, indexBackIndexEntries := udc.indexField(doc.ID, compositeField, fieldIndex, fieldLength, tokenFreqs, existingTermMap)
addRows = append(addRows, indexAddRows...)
updateRows = append(updateRows, indexUpdateRows...)
backIndexEntries = append(backIndexEntries, indexBackIndexEntries...)
}
}
// build the back index row
backIndexRow = NewBackIndexRow(doc.ID, backIndexEntries, backIndexStoredFields)
updateRows = append(updateRows, backIndexRow)
@@ -361,6 +341,79 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) error {
return err
}
func (udc *UpsideDownCouch) storeField(docId string, field document.Field, fieldIndex uint16, existingStoredFieldMap map[uint16]bool) ([]UpsideDownCouchRow, []UpsideDownCouchRow) {
updateRows := make([]UpsideDownCouchRow, 0)
addRows := make([]UpsideDownCouchRow, 0)
storedRow := NewStoredRow(docId, fieldIndex, field.Value())
_, ok := existingStoredFieldMap[fieldIndex]
if ok {
// this is an update
updateRows = append(updateRows, storedRow)
// this field was stored last time, delete it from that map
delete(existingStoredFieldMap, fieldIndex)
} else {
addRows = append(addRows, storedRow)
}
return addRows, updateRows
}
func (udc *UpsideDownCouch) indexField(docId string, field document.Field, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, existingTermMap termMap) ([]UpsideDownCouchRow, []UpsideDownCouchRow, []*BackIndexEntry) {
updateRows := make([]UpsideDownCouchRow, 0)
addRows := make([]UpsideDownCouchRow, 0)
backIndexEntries := make([]*BackIndexEntry, 0)
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
for _, tf := range tokenFreqs {
var termFreqRow *TermFrequencyRow
if field.Options().IncludeTermVectors() {
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
updateRows = append(updateRows, newFieldRows...)
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docId, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
} else {
termFreqRow = NewTermFrequencyRow(tf.Term, fieldIndex, docId, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
}
// record the back index entry
backIndexEntry := BackIndexEntry{tf.Term, fieldIndex}
backIndexEntries = append(backIndexEntries, &backIndexEntry)
// remove the entry from the map of existing term fields if it exists
if existingTermMap != nil {
termString := string(tf.Term)
_, ok := existingTermMap[termString]
if ok {
// this is an update
updateRows = append(updateRows, termFreqRow)
// this term existed last time, delete it from that map
delete(existingTermMap, termString)
} else {
// this is an add
addRows = append(addRows, termFreqRow)
}
} else {
// this is an add
addRows = append(addRows, termFreqRow)
}
}
return addRows, updateRows, backIndexEntries
}
func (udc *UpsideDownCouch) fieldNameToFieldIndex(fieldName string) (uint16, *FieldRow) {
var fieldRow *FieldRow
fieldIndex, fieldExists := udc.fieldIndexes[fieldName]
if !fieldExists {
// assign next field id
fieldIndex = uint16(udc.lastFieldIndex + 1)
udc.fieldIndexes[fieldName] = fieldIndex
// ensure this batch adds a row for this field
fieldRow = NewFieldRow(uint16(fieldIndex), fieldName)
udc.lastFieldIndex = int(fieldIndex)
}
return fieldIndex, fieldRow
}
func (udc *UpsideDownCouch) Delete(id string) error {
// lookup the back index row
backIndexRow, err := udc.backIndexRowForDoc(id)
@@ -453,7 +506,6 @@ func (udc *UpsideDownCouch) DumpDoc(id string) ([]interface{}, error) {
keys = append(keys, key)
}
for _, entry := range back.entries {
//log.Printf("term: `%s`, field: %d", entry.term, entry.field)
tfr := NewTermFrequencyRow(entry.term, entry.field, id, 0, 0)
key := tfr.Key()
keys = append(keys, key)
@@ -515,12 +567,22 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
return len(tf.Locations)
}
func termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) []*TermVector {
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
rv := make([]*TermVector, len(tf.Locations))
newFieldRows := make([]UpsideDownCouchRow, 0)
for i, l := range tf.Locations {
var newFieldRow *FieldRow
fieldIndex := field
if l.Field != "" {
// lookup correct field
fieldIndex, newFieldRow = udc.fieldNameToFieldIndex(l.Field)
if newFieldRow != nil {
newFieldRows = append(newFieldRows, newFieldRow)
}
}
tv := TermVector{
field: field,
field: fieldIndex,
pos: uint64(l.Position),
start: uint64(l.Start),
end: uint64(l.End),
@@ -528,7 +590,7 @@ func termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) []*TermVector {
rv[i] = &tv
}
return rv
return rv, newFieldRows
}
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {

@@ -14,10 +14,11 @@ import (
)
type SyntaxQuery struct {
Query string `json:"query"`
BoostVal float64 `json:"boost,omitempty"`
Explain bool `json:"explain,omitempty"`
mapping document.Mapping
Query string `json:"query"`
BoostVal float64 `json:"boost,omitempty"`
Explain bool `json:"explain,omitempty"`
DefaultField string `json:"default_field,omitempty"`
mapping document.Mapping
}
func (q *SyntaxQuery) Boost() float64 {
@@ -25,7 +26,7 @@ func (q *SyntaxQuery) Boost() float64 {
}
func (q *SyntaxQuery) Searcher(index index.Index) (Searcher, error) {
newQuery, err := ParseQuerySyntax(q.Query, q.mapping)
newQuery, err := ParseQuerySyntax(q.Query, q.mapping, q.DefaultField)
if err != nil {
return nil, err
}

@@ -21,11 +21,12 @@ var parsingMustNotList *TermDisjunctionQuery
var parsingShouldList *TermDisjunctionQuery
var parsingMapping document.Mapping
func ParseQuerySyntax(query string, mapping document.Mapping) (rq Query, err error) {
func ParseQuerySyntax(query string, mapping document.Mapping, defaultField string) (rq Query, err error) {
parserMutex.Lock()
defer parserMutex.Unlock()
parsingMapping = mapping
parsingDefaultField = defaultField
parsingMustList = &TermConjunctionQuery{
Terms: make([]Query, 0),

@@ -133,19 +133,32 @@ func (s *TermQueryScorer) Score(termMatch *index.TermFieldDoc) *DocumentMatch {
}
if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 {
locations := make(Locations, len(termMatch.Vectors))
for i, v := range termMatch.Vectors {
rv.Locations = make(FieldTermLocationMap)
for _, v := range termMatch.Vectors {
tlm := rv.Locations[v.Field]
if tlm == nil {
tlm = make(TermLocationMap)
}
loc := Location{
Pos: float64(v.Pos),
Start: float64(v.Start),
End: float64(v.End),
}
locations[i] = &loc
locations := tlm[s.query.Term]
if locations == nil {
locations = make(Locations, 1)
locations[0] = &loc
} else {
locations = append(locations, &loc)
}
tlm[s.query.Term] = locations
rv.Locations[v.Field] = tlm
}
tlm := make(TermLocationMap)
tlm[s.query.Term] = locations
rv.Locations = make(FieldTermLocationMap)
rv.Locations[s.query.Field] = tlm
}
return &rv
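
To make the new nesting concrete, a small self-contained sketch using mirror types (the real definitions live in the search package; only the fields used above are assumed): locations are now grouped first by the field the term vector came from, then by term, so a hit routed through a composite field still reports the original field.

package main

import "fmt"

// mirrors of the search package types as used in the hunk above
type Location struct{ Pos, Start, End float64 }
type Locations []*Location
type TermLocationMap map[string]Locations
type FieldTermLocationMap map[string]TermLocationMap

func main() {
	// term "water" matched in field "desc", even though the query may
	// have run against a composite field such as "_all"
	locs := FieldTermLocationMap{
		"desc": TermLocationMap{
			"water": Locations{{Pos: 1, Start: 0, End: 5}},
		},
	}
	fmt.Println(locs["desc"]["water"][0].End) // 5
}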