0
0
Fork 0

DocValue Config, new API Changes

-VisitableDocValueFields API for the list of persisted doc value (DV) fields
-making DV configs overridable at the field level
-enabling on-the-fly (runtime) uninverting of doc values
-a few unit test updates
This commit is contained in:
Sreekanth Sivasankaran 2018-01-08 10:58:33 +05:30
parent 1788a03803
commit 4c256f5669
18 changed files with 409 additions and 69 deletions

View File

@ -20,7 +20,7 @@ import (
"github.com/blevesearch/bleve/analysis"
)
const DefaultBooleanIndexingOptions = StoreField | IndexField
const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues
type BooleanField struct {
name string

View File

@ -23,7 +23,7 @@ import (
"github.com/blevesearch/bleve/numeric"
)
const DefaultDateTimeIndexingOptions = StoreField | IndexField
const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues
const DefaultDateTimePrecisionStep uint = 4
var MinTimeRepresentable = time.Unix(0, math.MinInt64)

View File

@ -21,7 +21,7 @@ import (
"github.com/blevesearch/bleve/numeric"
)
const DefaultNumericIndexingOptions = StoreField | IndexField
const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues
const DefaultPrecisionStep uint = 4

View File

@ -20,7 +20,7 @@ import (
"github.com/blevesearch/bleve/analysis"
)
const DefaultTextIndexingOptions = IndexField
const DefaultTextIndexingOptions = IndexField | DocValues
type TextField struct {
name string

View File

@ -20,6 +20,7 @@ const (
IndexField IndexingOptions = 1 << iota
StoreField
IncludeTermVectors
DocValues
)
func (o IndexingOptions) IsIndexed() bool {
@ -34,6 +35,10 @@ func (o IndexingOptions) IncludeTermVectors() bool {
return o&IncludeTermVectors != 0
}
// IncludeDocValues reports whether the DocValues bit is set in o.
func (o IndexingOptions) IncludeDocValues() bool {
	// DocValues is a single-bit flag, so masking with it yields either
	// zero or the flag itself.
	return o&DocValues == DocValues
}
func (o IndexingOptions) String() string {
rv := ""
if o.IsIndexed() {
@ -51,5 +56,11 @@ func (o IndexingOptions) String() string {
}
rv += "TV"
}
if o.IncludeDocValues() {
if rv != "" {
rv += ", "
}
rv += "DV"
}
return rv
}

View File

@ -24,36 +24,56 @@ func TestIndexingOptions(t *testing.T) {
isIndexed bool
isStored bool
includeTermVectors bool
docValues bool
}{
{
options: IndexField | StoreField | IncludeTermVectors,
isIndexed: true,
isStored: true,
includeTermVectors: true,
docValues: false,
},
{
options: IndexField | IncludeTermVectors,
isIndexed: true,
isStored: false,
includeTermVectors: true,
docValues: false,
},
{
options: StoreField | IncludeTermVectors,
isIndexed: false,
isStored: true,
includeTermVectors: true,
docValues: false,
},
{
options: IndexField,
isIndexed: true,
isStored: false,
includeTermVectors: false,
docValues: false,
},
{
options: StoreField,
isIndexed: false,
isStored: true,
includeTermVectors: false,
docValues: false,
},
{
options: DocValues,
isIndexed: false,
isStored: false,
includeTermVectors: false,
docValues: true,
},
{
options: IndexField | StoreField | IncludeTermVectors | DocValues,
isIndexed: true,
isStored: true,
includeTermVectors: true,
docValues: true,
},
}
@ -70,5 +90,9 @@ func TestIndexingOptions(t *testing.T) {
if actuallyIncludeTermVectors != test.includeTermVectors {
t.Errorf("expected includeTermVectors to be %v, got %v for %d", test.includeTermVectors, actuallyIncludeTermVectors, test.options)
}
actuallyDocValues := test.options.IncludeDocValues()
if actuallyDocValues != test.docValues {
t.Errorf("expected docValue to be %v, got %v for %d", test.docValues, actuallyDocValues, test.options)
}
}
}

View File

@ -37,14 +37,6 @@ const Name = "scorch"
const Version uint8 = 1
// UnInvertIndex is implemented by various scorch index implementations
// to provide the un-inverting of the postings or other indexed values.
type UnInvertIndex interface {
// VisitDocumentFieldTerms takes a segment-local document number, the
// field names of interest and a visitor callback, and returns an error.
// NOTE(review): presumably the visitor is invoked once per (field, term)
// pair of the given document - confirm against the implementations.
// TODO: the naming of this interface and method needs revisiting.
VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor) error
}
type Scorch struct {
readOnly bool
version uint8

View File

@ -1638,3 +1638,72 @@ func TestIndexDocumentVisitFieldTermsWithMultipleDocs(t *testing.T) {
}
}
// TestIndexDocumentVisitFieldTermsWithMultipleFieldOptions indexes a single
// document whose fields mix the default text indexing options with custom
// options that leave DocValues out, and then checks that
// DocumentVisitFieldTerms still reports the terms of every requested field.
func TestIndexDocumentVisitFieldTermsWithMultipleFieldOptions(t *testing.T) {
	defer func() {
		err := DestroyTest()
		if err != nil {
			t.Fatal(err)
		}
	}()

	analysisQueue := index.NewAnalysisQueue(1)
	idx, err := NewScorch(Name, testConfig, analysisQueue)
	if err != nil {
		t.Fatal(err)
	}
	err = idx.Open()
	if err != nil {
		t.Fatalf("error opening index: %v", err)
	}
	defer func() {
		err := idx.Close()
		if err != nil {
			t.Fatal(err)
		}
	}()

	// mix of field options, this exercises the run time/ on the fly un inverting of
	// doc values for custom options enabled field like designation, dept.
	options := document.IndexField | document.StoreField | document.IncludeTermVectors
	doc := document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []uint64{}, []byte("test")))    // default doc value persisted
	doc.AddField(document.NewTextField("title", []uint64{}, []byte("mister"))) // default doc value persisted
	doc.AddField(document.NewTextFieldWithIndexingOptions("designation", []uint64{}, []byte("engineer"), options))
	doc.AddField(document.NewTextFieldWithIndexingOptions("dept", []uint64{}, []byte("bleve"), options))

	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}

	indexReader, err := idx.Reader()
	if err != nil {
		// Without a reader nothing below can run; continuing would
		// dereference a nil reader.
		t.Fatal(err)
	}
	// Deferred so the reader is released even when a later check aborts the
	// test; it runs before the deferred idx.Close above (LIFO order).
	defer func() {
		err := indexReader.Close()
		if err != nil {
			t.Fatal(err)
		}
	}()

	fieldTerms := make(index.FieldTerms)
	docNumber, err := indexReader.InternalID("1")
	if err != nil {
		t.Fatal(err)
	}
	err = indexReader.DocumentVisitFieldTerms(docNumber, []string{"name", "designation", "dept"}, func(field string, term []byte) {
		fieldTerms[field] = append(fieldTerms[field], string(term))
	})
	if err != nil {
		t.Error(err)
	}

	expectedFieldTerms := index.FieldTerms{
		"name":        []string{"test"},
		"designation": []string{"engineer"},
		"dept":        []string{"bleve"},
	}
	if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) {
		t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms)
	}
}

View File

@ -119,10 +119,10 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
if field.Options().IsStored() {
storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
}
// TODO with mapping changes for dv
//if field.Options().IncludeDocValues() {
s.DocValueFields[fieldID] = true
//}
if field.Options().IncludeDocValues() {
s.DocValueFields[fieldID] = true
}
}
// now that its been rolled up into docMap, walk that

View File

@ -91,9 +91,14 @@ type Location interface {
}
// DocumentFieldTermVisitable is implemented by various scorch segment
// implementations to provide the un inverting of the postings
// or other indexed values.
// implementations with persistence for the un inverting of the
// postings or other indexed values.
type DocumentFieldTermVisitable interface {
VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor) error
// VisitableDocValueFields implementation should return
// the list of fields which are document value persisted and
// therefore visitable by the above VisitDocumentFieldTerms method.
VisitableDocValueFields() ([]string, error)
}

View File

@ -286,3 +286,90 @@ func buildMemSegmentMulti() *mem.Segment {
return segment
}
// buildMemSegmentWithDefaultFieldMapping builds an in-memory segment holding
// one hand-analyzed document ("a") whose four text fields are all created
// with NewTextField, i.e. with the default text indexing options. It returns
// the segment together with the field names in the order they were added.
func buildMemSegmentWithDefaultFieldMapping() (*mem.Segment, []string) {
	fieldNames := []string{"_id", "name", "desc", "tag"}

	doc := &document.Document{
		ID: "a",
		Fields: []document.Field{
			document.NewTextField("_id", nil, []byte("a")),
			document.NewTextField("name", nil, []byte("wow")),
			document.NewTextField("desc", nil, []byte("some thing")),
			document.NewTextField("tag", []uint64{0}, []byte("cold")),
		},
		CompositeFields: []*document.CompositeField{
			document.NewCompositeField("_all", true, nil, []string{"_id"}),
		},
	}

	// forged token streams, one per field, in field order
	idTokens := analysis.TokenStream{
		&analysis.Token{Start: 0, End: 1, Position: 1, Term: []byte("a")},
	}
	nameTokens := analysis.TokenStream{
		&analysis.Token{Start: 0, End: 3, Position: 1, Term: []byte("wow")},
	}
	descTokens := analysis.TokenStream{
		&analysis.Token{Start: 0, End: 4, Position: 1, Term: []byte("some")},
		&analysis.Token{Start: 5, End: 10, Position: 2, Term: []byte("thing")},
	}
	tagTokens := analysis.TokenStream{
		&analysis.Token{Start: 0, End: 4, Position: 1, Term: []byte("cold")},
	}

	analyzed := &index.AnalysisResult{
		Document: doc,
		Analyzed: []analysis.TokenFrequencies{
			analysis.TokenFrequency(idTokens, nil, false),
			analysis.TokenFrequency(nameTokens, nil, true),
			analysis.TokenFrequency(descTokens, nil, true),
			analysis.TokenFrequency(tagTokens, []uint64{0}, true),
		},
		Length: []int{1, 1, 2, 1, 1},
	}
	results := []*index.AnalysisResult{analyzed}

	// fix up composite fields: offer every field to every composite field
	for _, res := range results {
		for pos, fld := range res.Document.Fields {
			for _, composite := range res.Document.CompositeFields {
				composite.Compose(fld.Name(), res.Length[pos], res.Analyzed[pos])
			}
		}
	}

	return mem.NewFromAnalyzedDocs(results), fieldNames
}

View File

@ -151,7 +151,8 @@ func (di *docValueIterator) getDocValueLocs(docID uint64) (uint64, uint64) {
return math.MaxUint64, math.MaxUint64
}
// VisitDocumentFieldTerms is an implementation of the UnInvertIndex interface
// VisitDocumentFieldTerms is an implementation of the
// DocumentFieldTermVisitable interface
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor) error {
fieldID := uint16(0)
@ -178,3 +179,22 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
}
return nil
}
// VisitableDocValueFields lists the names of the fields in this segment
// that have a non-nil doc value iterator registered in fieldDvIterMap,
// i.e. the fields that VisitDocumentFieldTerms can serve. The result is
// nil when no such field exists; the returned error is always nil.
func (s *Segment) VisitableDocValueFields() ([]string, error) {
	var visitable []string
	for id, name := range s.fieldsInv {
		dvIter, found := s.fieldDvIterMap[uint16(id)]
		if !found || dvIter == nil {
			continue
		}
		visitable = append(visitable, name)
	}
	return visitable, nil
}

View File

@ -19,6 +19,9 @@ import (
"os"
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
)
func TestOpen(t *testing.T) {
@ -515,3 +518,83 @@ func TestOpenMultiWithTwoChunks(t *testing.T) {
t.Errorf("expected count to be 1, got %d", count)
}
}
// TestSegmentVisitableDocValueFieldsList persists two different in-memory
// segments to disk, reopens each one and checks the field list reported by
// VisitableDocValueFields: empty for the segment built by
// buildMemSegmentMulti, and the full field list for the segment built by
// buildMemSegmentWithDefaultFieldMapping. For the latter it also verifies
// the terms visited through VisitDocumentFieldTerms.
func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
	_ = os.RemoveAll("/tmp/scorch.zap")

	memSegment := buildMemSegmentMulti()
	err := PersistSegment(memSegment, "/tmp/scorch.zap", 1)
	if err != nil {
		t.Fatalf("error persisting segment: %v", err)
	}

	seg, err := Open("/tmp/scorch.zap")
	if err != nil {
		t.Fatalf("error opening segment: %v", err)
	}

	if zaps, ok := seg.(segment.DocumentFieldTermVisitable); ok {
		fields, err := zaps.VisitableDocValueFields()
		if err != nil {
			t.Fatalf("segment VisitableDocValueFields err: %v", err)
		}
		// no persisted doc value fields
		if len(fields) != 0 {
			t.Errorf("expected no persisted fields for doc values, got: %#v", fields)
		}
	}

	// Close the first segment only after it has been inspected, and before
	// its backing file is removed and rewritten below.
	cerr := seg.Close()
	if cerr != nil {
		t.Fatalf("error closing segment: %v", cerr)
	}

	_ = os.RemoveAll("/tmp/scorch.zap")

	memSegment, expectedFields := buildMemSegmentWithDefaultFieldMapping()
	err = PersistSegment(memSegment, "/tmp/scorch.zap", 1)
	if err != nil {
		t.Fatalf("error persisting segment: %v", err)
	}

	seg, err = Open("/tmp/scorch.zap")
	if err != nil {
		t.Fatalf("error opening segment: %v", err)
	}

	defer func() {
		cerr := seg.Close()
		if cerr != nil {
			t.Fatalf("error closing segment: %v", cerr)
		}
	}()

	if zaps, ok := seg.(segment.DocumentFieldTermVisitable); ok {
		fields, err := zaps.VisitableDocValueFields()
		if err != nil {
			t.Fatalf("segment VisitableDocValueFields err: %v", err)
		}

		if !reflect.DeepEqual(fields, expectedFields) {
			t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields)
		}

		fieldTerms := make(index.FieldTerms)
		err = zaps.VisitDocumentFieldTerms(0, fields, func(field string, term []byte) {
			fieldTerms[field] = append(fieldTerms[field], string(term))
		})
		if err != nil {
			t.Error(err)
		}

		expectedFieldTerms := index.FieldTerms{
			"name": []string{"wow"},
			"desc": []string{"some", "thing"},
			"tag":  []string{"cold"},
			"_id":  []string{"a"},
		}
		if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) {
			t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms)
		}
	}
}

View File

@ -412,15 +412,64 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
ss := i.segment[segmentIndex]
if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok {
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
// get the list of doc value persisted fields
pFields, err := zaps.VisitableDocValueFields()
if err != nil {
return err
}
// assort the fields for which terms look up have to
// be performed runtime
dvPendingFields := extractDvPendingFields(fields, pFields)
if len(dvPendingFields) == 0 {
// all fields are doc value persisted
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
}
// concurrently trigger the runtime doc value preparations for
// pending fields as well as the visit of the persisted doc values
errCh := make(chan error, 1)
go func() {
defer close(errCh)
err := ss.cachedDocs.prepareFields(fields, ss)
if err != nil {
errCh <- err
}
}()
// visit the persisted dv while the cache preparation is in progress
err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
if err != nil {
return err
}
// err out if fieldCache preparation failed
err = <-errCh
if err != nil {
return err
}
visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor)
return nil
}
// else fallback to the in memory fieldCache
err = ss.cachedDocs.prepareFields(fields, ss)
return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor)
}
// prepareCacheVisitDocumentFieldTerms first asks the segment snapshot's
// cachedDocs to prepare the given fields and, if that succeeds, visits the
// cached terms of localDocNum for those fields. A preparation error is
// returned as is and no visiting takes place.
func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string,
	ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error {
	if prepErr := ss.cachedDocs.prepareFields(fields, ss); prepErr != nil {
		return prepErr
	}

	visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor)
	return nil
}
func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string,
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) {
for _, field := range fields {
if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists {
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
@ -436,5 +485,19 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
}
}
return nil
}
// extractDvPendingFields returns the entries of requestedFields that do not
// occur in persistedFields, keeping their original order (and any
// duplicates). The result is always a non-nil slice, possibly empty.
func extractDvPendingFields(requestedFields, persistedFields []string) []string {
	persisted := make(map[string]struct{})
	for i := range persistedFields {
		persisted[persistedFields[i]] = struct{}{}
	}

	pending := make([]string, 0, len(requestedFields))
	for i := range requestedFields {
		name := requestedFields[i]
		if _, found := persisted[name]; found {
			continue
		}
		pending = append(pending, name)
	}
	return pending
}

View File

@ -18,7 +18,6 @@ import (
"sync"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
)
@ -84,50 +83,6 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel
return s.segment.VisitDocument(num, visitor)
}
// DocumentVisitFieldTerms un-inverts the index for a single document: for
// every requested field it walks the field's term dictionary and, for each
// term, scans that term's postings looking for document number num. Every
// matching term is collected and, once all fields have been processed
// without error, handed to visitor as (field, term) pairs.
//
// The first error returned by the dictionary, the dictionary iterator, the
// postings lookup or the postings iterator aborts the walk and is returned;
// in that case visitor is not invoked at all.
func (s *SegmentSnapshot) DocumentVisitFieldTerms(num uint64, fields []string,
	visitor index.DocumentFieldTermVisitor) error {
	collection := make(map[string][][]byte)
	// collect field indexed values
	for _, field := range fields {
		dict, err := s.Dictionary(field)
		if err != nil {
			return err
		}
		dictItr := dict.Iterator()
		var next *index.DictEntry
		next, err = dictItr.Next()
		for next != nil && err == nil {
			postings, err2 := dict.PostingsList(next.Term, nil)
			if err2 != nil {
				return err2
			}
			postingsItr := postings.Iterator()
			nextPosting, err2 := postingsItr.Next()
			// The scan stops at the first posting numbered above num.
			// NOTE(review): this assumes postings come back in ascending
			// document-number order - confirm against the iterator.
			for err2 == nil && nextPosting != nil && nextPosting.Number() <= num {
				if nextPosting.Number() == num {
					// got what we're looking for
					collection[field] = append(collection[field], []byte(next.Term))
				}
				// Keep the iterator's error in err2 so that both the loop
				// condition and the check below observe it.
				nextPosting, err2 = postingsItr.Next()
			}
			if err2 != nil {
				return err2
			}
			next, err = dictItr.Next()
		}
		if err != nil {
			return err
		}
	}
	// invoke callback
	for field, values := range collection {
		for _, value := range values {
			visitor(field, value)
		}
	}
	return nil
}
func (s *SegmentSnapshot) Count() uint64 {
rv := s.segment.Count()

View File

@ -28,6 +28,7 @@ import (
var (
IndexDynamic = true
StoreDynamic = true
DocValues = true // TODO revisit default?
)
// A FieldMapping describes how a specific item
@ -54,6 +55,10 @@ type FieldMapping struct {
IncludeTermVectors bool `json:"include_term_vectors,omitempty"`
IncludeInAll bool `json:"include_in_all,omitempty"`
DateFormat string `json:"date_format,omitempty"`
// DocValues, if true makes the index uninverting possible for this field
// It is useful for faceting and sorting queries.
DocValues bool `json:"docvalues,omitempty"`
}
// NewTextFieldMapping returns a default field mapping for text
@ -64,6 +69,7 @@ func NewTextFieldMapping() *FieldMapping {
Index: true,
IncludeTermVectors: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -71,6 +77,7 @@ func newTextFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewTextFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValues
return rv
}
@ -81,6 +88,7 @@ func NewNumericFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -88,6 +96,7 @@ func newNumericFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewNumericFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValues
return rv
}
@ -98,6 +107,7 @@ func NewDateTimeFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -105,6 +115,7 @@ func newDateTimeFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewDateTimeFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValues
return rv
}
@ -115,6 +126,7 @@ func NewBooleanFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -122,6 +134,7 @@ func newBooleanFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping {
rv := NewBooleanFieldMapping()
rv.Store = im.StoreDynamic
rv.Index = im.IndexDynamic
rv.DocValues = im.DocValues
return rv
}
@ -132,6 +145,7 @@ func NewGeoPointFieldMapping() *FieldMapping {
Store: true,
Index: true,
IncludeInAll: true,
DocValues: true,
}
}
@ -147,6 +161,9 @@ func (fm *FieldMapping) Options() document.IndexingOptions {
if fm.IncludeTermVectors {
rv |= document.IncludeTermVectors
}
if fm.DocValues {
rv |= document.DocValues
}
return rv
}
@ -308,6 +325,11 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error {
if err != nil {
return err
}
case "docvalues":
err := json.Unmarshal(v, &fm.DocValues)
if err != nil {
return err
}
default:
invalidKeys = append(invalidKeys, k)
}

View File

@ -50,6 +50,7 @@ type IndexMappingImpl struct {
DefaultField string `json:"default_field"`
StoreDynamic bool `json:"store_dynamic"`
IndexDynamic bool `json:"index_dynamic"`
DocValues bool `json:"docvalues,omitempty"`
CustomAnalysis *customAnalysis `json:"analysis,omitempty"`
cache *registry.Cache
}
@ -154,6 +155,7 @@ func NewIndexMapping() *IndexMappingImpl {
DefaultField: defaultField,
IndexDynamic: IndexDynamic,
StoreDynamic: StoreDynamic,
DocValues: DocValues,
CustomAnalysis: newCustomAnalysis(),
cache: registry.NewCache(),
}
@ -217,6 +219,7 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
im.TypeMapping = make(map[string]*DocumentMapping)
im.StoreDynamic = StoreDynamic
im.IndexDynamic = IndexDynamic
im.DocValues = DocValues
var invalidKeys []string
for k, v := range tmp {
@ -271,6 +274,11 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
if err != nil {
return err
}
case "docvalues":
err := json.Unmarshal(v, &im.DocValues)
if err != nil {
return err
}
default:
invalidKeys = append(invalidKeys, k)
}

View File

@ -40,7 +40,8 @@ var mappingSource = []byte(`{
"store": true,
"index": true,
"include_term_vectors": true,
"include_in_all": true
"include_in_all": true,
"docvalues": true
}
]
}