0
0
Fork 0

analyze locations only if includeTermVectors enabled

With this change, TokenLocations are computed and maintained only if
includeTermVectors is enabled, for higher performance.
This commit is contained in:
Steve Yen 2016-01-05 10:58:38 -08:00
parent e5c1af4164
commit 89d17f01ef
8 changed files with 46 additions and 27 deletions

View File

@ -18,7 +18,7 @@ func BenchmarkAnalysis(b *testing.B) {
}
ts := analyzer.Analyze(bleveWikiArticle)
freqs := analysis.TokenFrequency(ts, nil)
freqs := analysis.TokenFrequency(ts, nil, true)
if len(freqs) != 511 {
b.Errorf("expected %d freqs, got %d", 511, len(freqs))
}

View File

@ -26,10 +26,11 @@ type TokenLocation struct {
// TokenFreq holds one term together with its recorded locations and an
// explicit occurrence counter.
type TokenFreq struct {
// Term is the raw bytes of the term.
Term []byte
// Locations points at the TokenLocation recorded for each occurrence.
// TokenFrequency fills it only when called with includeTermVectors set;
// otherwise it is left unset.
Locations []*TokenLocation
// frequency counts occurrences independently of Locations, so a count
// exists even when Locations is not populated.
frequency int
}
// Frequency returns the occurrence count for the term.
//
// NOTE(review): two return statements are shown. As written, the first one
// runs, so the result is len(tf.Locations) and `return tf.frequency` is
// unreachable. This looks like the removed/added pair of a diff rendered
// without +/- markers, with `return tf.frequency` as the post-change line;
// confirm against the committed file.
func (tf *TokenFreq) Frequency() int {
return len(tf.Locations)
return tf.frequency
}
// TokenFrequencies maps document terms to their combined frequencies from all
@ -46,37 +47,54 @@ func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies)
existingTf, exists := tfs[tfk]
if exists {
existingTf.Locations = append(existingTf.Locations, tf.Locations...)
existingTf.frequency = existingTf.frequency + tf.frequency
} else {
tfs[tfk] = tf
}
}
}
func TokenFrequency(tokens TokenStream, arrayPositions []uint64) TokenFrequencies {
func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
rv := make(map[string]*TokenFreq, len(tokens))
tls := make([]TokenLocation, len(tokens))
tlNext := 0
if includeTermVectors {
tls := make([]TokenLocation, len(tokens))
tlNext := 0
for _, token := range tokens {
tls[tlNext] = TokenLocation{
ArrayPositions: arrayPositions,
Start: token.Start,
End: token.End,
Position: token.Position,
for _, token := range tokens {
tls[tlNext] = TokenLocation{
ArrayPositions: arrayPositions,
Start: token.Start,
End: token.End,
Position: token.Position,
}
curr, ok := rv[string(token.Term)]
if ok {
curr.Locations = append(curr.Locations, &tls[tlNext])
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
Locations: []*TokenLocation{&tls[tlNext]},
frequency: 1,
}
}
tlNext++
}
curr, ok := rv[string(token.Term)]
if ok {
curr.Locations = append(curr.Locations, &tls[tlNext])
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
Locations: []*TokenLocation{&tls[tlNext]},
} else {
for _, token := range tokens {
curr, exists := rv[string(token.Term)]
if exists {
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
frequency: 1,
}
}
}
tlNext++
}
return rv

View File

@ -44,9 +44,10 @@ func TestTokenFrequency(t *testing.T) {
End: 11,
},
},
frequency: 2,
},
}
result := TokenFrequency(tokens, nil)
result := TokenFrequency(tokens, nil, true)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
}

View File

@ -75,7 +75,7 @@ func (n *DateTimeField) Analyze() (int, analysis.TokenFrequencies) {
}
fieldLength := len(tokens)
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors())
return fieldLength, tokenFreqs
}

View File

@ -71,7 +71,7 @@ func (n *NumericField) Analyze() (int, analysis.TokenFrequencies) {
}
fieldLength := len(tokens)
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors())
return fieldLength, tokenFreqs
}

View File

@ -60,7 +60,7 @@ func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
}
}
fieldLength := len(tokens) // number of tokens in this doc field
tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions)
tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions, t.options.IncludeTermVectors())
return fieldLength, tokenFreqs
}

View File

@ -59,7 +59,7 @@ func (d *DictUpdater) NotifyBatch(termUsages map[string]int64) {
// Start calls d.closeWait.Add(1) and then launches the updater's
// runIncoming and run loops as goroutines.
//
// NOTE(review): `go d.runIncoming()` appears twice. As written, two
// runIncoming goroutines would be started while closeWait.Add is called once.
// This is presumably the before/after pair of a whitespace-only diff line
// rendered without +/- markers; confirm that the committed file starts
// runIncoming only once.
func (d *DictUpdater) Start() {
d.closeWait.Add(1)
go d.runIncoming()
go d.runIncoming()
go d.run()
}

View File

@ -664,7 +664,7 @@ func decodeFieldType(typ byte, name string, pos []uint64, value []byte) document
}
// frequencyFromTokenFreq extracts the term frequency from tf.
//
// NOTE(review): two return statements are shown. As written, the first one
// runs (len(tf.Locations)) and `return tf.Frequency()` is unreachable. This
// looks like the removed/added pair of a diff rendered without +/- markers,
// with `return tf.Frequency()` as the post-change line; confirm against the
// committed file.
func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
return len(tf.Locations)
return tf.Frequency()
}
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {