From 89d17f01ef46a38b5114c174d7de2b2fc7230c1b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 5 Jan 2016 10:58:38 -0800 Subject: [PATCH] analyze locations only if includeTermVectors enabled With this change, TermLocations are computed and maintained only if includeTermVectors is enabled, for higher performance. --- analysis/benchmark_test.go | 2 +- analysis/freq.go | 58 +++++++++++++++++++++----------- analysis/freq_test.go | 3 +- document/field_datetime.go | 2 +- document/field_numeric.go | 2 +- document/field_text.go | 2 +- index/firestorm/dict_updater.go | 2 +- index/upside_down/upside_down.go | 2 +- 8 files changed, 46 insertions(+), 27 deletions(-) diff --git a/analysis/benchmark_test.go b/analysis/benchmark_test.go index 2c7f6a86..5c7425e4 100644 --- a/analysis/benchmark_test.go +++ b/analysis/benchmark_test.go @@ -18,7 +18,7 @@ func BenchmarkAnalysis(b *testing.B) { } ts := analyzer.Analyze(bleveWikiArticle) - freqs := analysis.TokenFrequency(ts, nil) + freqs := analysis.TokenFrequency(ts, nil, true) if len(freqs) != 511 { b.Errorf("expected %d freqs, got %d", 511, len(freqs)) } diff --git a/analysis/freq.go b/analysis/freq.go index a67e0afa..85b5d1a6 100644 --- a/analysis/freq.go +++ b/analysis/freq.go @@ -26,10 +26,11 @@ type TokenLocation struct { type TokenFreq struct { Term []byte Locations []*TokenLocation + frequency int } func (tf *TokenFreq) Frequency() int { - return len(tf.Locations) + return tf.frequency } // TokenFrequencies maps document terms to their combined frequencies from all @@ -46,37 +47,54 @@ func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) existingTf, exists := tfs[tfk] if exists { existingTf.Locations = append(existingTf.Locations, tf.Locations...) + existingTf.frequency = existingTf.frequency + tf.frequency } else { tfs[tfk] = tf } } } -func TokenFrequency(tokens TokenStream, arrayPositions []uint64) TokenFrequencies { +func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies { rv := make(map[string]*TokenFreq, len(tokens)) - tls := make([]TokenLocation, len(tokens)) - tlNext := 0 + if includeTermVectors { + tls := make([]TokenLocation, len(tokens)) + tlNext := 0 - for _, token := range tokens { - tls[tlNext] = TokenLocation{ - ArrayPositions: arrayPositions, - Start: token.Start, - End: token.End, - Position: token.Position, + for _, token := range tokens { + tls[tlNext] = TokenLocation{ + ArrayPositions: arrayPositions, + Start: token.Start, + End: token.End, + Position: token.Position, + } + + curr, ok := rv[string(token.Term)] + if ok { + curr.Locations = append(curr.Locations, &tls[tlNext]) + curr.frequency++ + } else { + rv[string(token.Term)] = &TokenFreq{ + Term: token.Term, + Locations: []*TokenLocation{&tls[tlNext]}, + frequency: 1, + } + } + + tlNext++ } - - curr, ok := rv[string(token.Term)] - if ok { - curr.Locations = append(curr.Locations, &tls[tlNext]) - } else { - rv[string(token.Term)] = &TokenFreq{ - Term: token.Term, - Locations: []*TokenLocation{&tls[tlNext]}, + } else { + for _, token := range tokens { + curr, exists := rv[string(token.Term)] + if exists { + curr.frequency++ + } else { + rv[string(token.Term)] = &TokenFreq{ + Term: token.Term, + frequency: 1, + } } } - - tlNext++ } return rv diff --git a/analysis/freq_test.go b/analysis/freq_test.go index 9bbd3fb2..111e3c82 100644 --- a/analysis/freq_test.go +++ b/analysis/freq_test.go @@ -44,9 +44,10 @@ func TestTokenFrequency(t *testing.T) { End: 11, }, }, + frequency: 2, }, } - result := TokenFrequency(tokens, nil) + result := TokenFrequency(tokens, nil, true) if !reflect.DeepEqual(result, expectedResult) { t.Errorf("expected %#v, got %#v", expectedResult, result) } diff --git a/document/field_datetime.go b/document/field_datetime.go index faf430c8..973904d4 100644 --- a/document/field_datetime.go +++ b/document/field_datetime.go @@ -75,7 +75,7 @@ func (n *DateTimeField) Analyze() (int, analysis.TokenFrequencies) { } fieldLength := len(tokens) - tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions) + tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors()) return fieldLength, tokenFreqs } diff --git a/document/field_numeric.go b/document/field_numeric.go index ddc8faae..9218318a 100644 --- a/document/field_numeric.go +++ b/document/field_numeric.go @@ -71,7 +71,7 @@ func (n *NumericField) Analyze() (int, analysis.TokenFrequencies) { } fieldLength := len(tokens) - tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions) + tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors()) return fieldLength, tokenFreqs } diff --git a/document/field_text.go b/document/field_text.go index 45d3063a..a18792fc 100644 --- a/document/field_text.go +++ b/document/field_text.go @@ -60,7 +60,7 @@ func (t *TextField) Analyze() (int, analysis.TokenFrequencies) { } } fieldLength := len(tokens) // number of tokens in this doc field - tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions) + tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions, t.options.IncludeTermVectors()) return fieldLength, tokenFreqs } diff --git a/index/firestorm/dict_updater.go b/index/firestorm/dict_updater.go index 74c66506..053ba0d4 100644 --- a/index/firestorm/dict_updater.go +++ b/index/firestorm/dict_updater.go @@ -59,7 +59,7 @@ func (d *DictUpdater) NotifyBatch(termUsages map[string]int64) { func (d *DictUpdater) Start() { d.closeWait.Add(1) - go d.runIncoming() + go d.runIncoming() go d.run() } diff --git a/index/upside_down/upside_down.go b/index/upside_down/upside_down.go index 87117c3e..139bcdc1 100644 --- a/index/upside_down/upside_down.go +++ b/index/upside_down/upside_down.go @@ -664,7 +664,7 @@ func decodeFieldType(typ byte, name string, pos []uint64, value []byte) document } func frequencyFromTokenFreq(tf *analysis.TokenFreq) int { - return len(tf.Locations) + return tf.Frequency() } func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {