Merge pull request #307 from steveyen/WIP-perf-20160105
analyze locations only if includeTermVectors enabled
This commit is contained in:
commit
83cd8da394
@ -18,7 +18,7 @@ func BenchmarkAnalysis(b *testing.B) {
|
||||
}
|
||||
|
||||
ts := analyzer.Analyze(bleveWikiArticle)
|
||||
freqs := analysis.TokenFrequency(ts, nil)
|
||||
freqs := analysis.TokenFrequency(ts, nil, true)
|
||||
if len(freqs) != 511 {
|
||||
b.Errorf("expected %d freqs, got %d", 511, len(freqs))
|
||||
}
|
||||
|
@ -26,10 +26,11 @@ type TokenLocation struct {
|
||||
type TokenFreq struct {
|
||||
Term []byte
|
||||
Locations []*TokenLocation
|
||||
frequency int
|
||||
}
|
||||
|
||||
func (tf *TokenFreq) Frequency() int {
|
||||
return len(tf.Locations)
|
||||
return tf.frequency
|
||||
}
|
||||
|
||||
// TokenFrequencies maps document terms to their combined frequencies from all
|
||||
@ -46,37 +47,54 @@ func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies)
|
||||
existingTf, exists := tfs[tfk]
|
||||
if exists {
|
||||
existingTf.Locations = append(existingTf.Locations, tf.Locations...)
|
||||
existingTf.frequency = existingTf.frequency + tf.frequency
|
||||
} else {
|
||||
tfs[tfk] = tf
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TokenFrequency(tokens TokenStream, arrayPositions []uint64) TokenFrequencies {
|
||||
func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
|
||||
rv := make(map[string]*TokenFreq, len(tokens))
|
||||
|
||||
tls := make([]TokenLocation, len(tokens))
|
||||
tlNext := 0
|
||||
if includeTermVectors {
|
||||
tls := make([]TokenLocation, len(tokens))
|
||||
tlNext := 0
|
||||
|
||||
for _, token := range tokens {
|
||||
tls[tlNext] = TokenLocation{
|
||||
ArrayPositions: arrayPositions,
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Position: token.Position,
|
||||
for _, token := range tokens {
|
||||
tls[tlNext] = TokenLocation{
|
||||
ArrayPositions: arrayPositions,
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Position: token.Position,
|
||||
}
|
||||
|
||||
curr, ok := rv[string(token.Term)]
|
||||
if ok {
|
||||
curr.Locations = append(curr.Locations, &tls[tlNext])
|
||||
curr.frequency++
|
||||
} else {
|
||||
rv[string(token.Term)] = &TokenFreq{
|
||||
Term: token.Term,
|
||||
Locations: []*TokenLocation{&tls[tlNext]},
|
||||
frequency: 1,
|
||||
}
|
||||
}
|
||||
|
||||
tlNext++
|
||||
}
|
||||
|
||||
curr, ok := rv[string(token.Term)]
|
||||
if ok {
|
||||
curr.Locations = append(curr.Locations, &tls[tlNext])
|
||||
} else {
|
||||
rv[string(token.Term)] = &TokenFreq{
|
||||
Term: token.Term,
|
||||
Locations: []*TokenLocation{&tls[tlNext]},
|
||||
} else {
|
||||
for _, token := range tokens {
|
||||
curr, exists := rv[string(token.Term)]
|
||||
if exists {
|
||||
curr.frequency++
|
||||
} else {
|
||||
rv[string(token.Term)] = &TokenFreq{
|
||||
Term: token.Term,
|
||||
frequency: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tlNext++
|
||||
}
|
||||
|
||||
return rv
|
||||
|
@ -44,9 +44,10 @@ func TestTokenFrequency(t *testing.T) {
|
||||
End: 11,
|
||||
},
|
||||
},
|
||||
frequency: 2,
|
||||
},
|
||||
}
|
||||
result := TokenFrequency(tokens, nil)
|
||||
result := TokenFrequency(tokens, nil, true)
|
||||
if !reflect.DeepEqual(result, expectedResult) {
|
||||
t.Errorf("expected %#v, got %#v", expectedResult, result)
|
||||
}
|
||||
|
@ -75,7 +75,7 @@ func (n *DateTimeField) Analyze() (int, analysis.TokenFrequencies) {
|
||||
}
|
||||
|
||||
fieldLength := len(tokens)
|
||||
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
|
||||
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors())
|
||||
return fieldLength, tokenFreqs
|
||||
}
|
||||
|
||||
|
@ -71,7 +71,7 @@ func (n *NumericField) Analyze() (int, analysis.TokenFrequencies) {
|
||||
}
|
||||
|
||||
fieldLength := len(tokens)
|
||||
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
|
||||
tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors())
|
||||
return fieldLength, tokenFreqs
|
||||
}
|
||||
|
||||
|
@ -60,7 +60,7 @@ func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
|
||||
}
|
||||
}
|
||||
fieldLength := len(tokens) // number of tokens in this doc field
|
||||
tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions)
|
||||
tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions, t.options.IncludeTermVectors())
|
||||
return fieldLength, tokenFreqs
|
||||
}
|
||||
|
||||
|
@ -59,7 +59,7 @@ func (d *DictUpdater) NotifyBatch(termUsages map[string]int64) {
|
||||
|
||||
func (d *DictUpdater) Start() {
|
||||
d.closeWait.Add(1)
|
||||
go d.runIncoming()
|
||||
go d.runIncoming()
|
||||
go d.run()
|
||||
}
|
||||
|
||||
|
@ -664,7 +664,7 @@ func decodeFieldType(typ byte, name string, pos []uint64, value []byte) document
|
||||
}
|
||||
|
||||
func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
|
||||
return len(tf.Locations)
|
||||
return tf.Frequency()
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []index.IndexRow) {
|
||||
|
Loading…
Reference in New Issue
Block a user