3eb63a887b
Stop words can be loaded from files/bytes (closes #19). Stop words are loaded for a large list of languages (closes #20). Language-specific analyzers are defined for as many languages as possible right now (closes #21). New issues were opened for some of the remaining gaps.
58 lines
1.1 KiB
Go
58 lines
1.1 KiB
Go
package stop_words_filter
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"io"
|
|
"io/ioutil"
|
|
"strings"
|
|
)
|
|
|
|
// StopWordsMap is a set of stop words keyed by the word itself; a word that
// has been added is stored with the value true.
type StopWordsMap map[string]bool
|
|
|
|
func NewStopWordsMap() StopWordsMap {
|
|
return make(StopWordsMap, 0)
|
|
}
|
|
|
|
func (s StopWordsMap) LoadFile(filename string) error {
|
|
data, err := ioutil.ReadFile(filename)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return s.LoadBytes(data)
|
|
}
|
|
|
|
func (s StopWordsMap) LoadBytes(data []byte) error {
|
|
bytesReader := bytes.NewReader(data)
|
|
bufioReader := bufio.NewReader(bytesReader)
|
|
line, err := bufioReader.ReadString('\n')
|
|
for err == nil {
|
|
s.LoadLine(line)
|
|
line, err = bufioReader.ReadString('\n')
|
|
}
|
|
// if the err was EOF still need to process last value
|
|
if err == io.EOF {
|
|
s.LoadLine(line)
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (s StopWordsMap) LoadLine(line string) error {
|
|
// find the start of comment, if any
|
|
startComment := strings.IndexAny(line, "#|")
|
|
if startComment >= 0 {
|
|
line = line[:startComment]
|
|
}
|
|
|
|
stopWords := strings.Fields(line)
|
|
for _, stopWord := range stopWords {
|
|
s.AddWord(stopWord)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s StopWordsMap) AddWord(word string) {
|
|
s[word] = true
|
|
}
|