added an abstract stemmer and an implementation of a simple stemmer
This commit is contained in:
parent
6285ef9fdf
commit
aa153bc68c
|
@ -8,4 +8,5 @@ module Polecat
|
|||
require 'polecat/index_searcher'
|
||||
require 'polecat/query'
|
||||
require 'polecat/term'
|
||||
require 'polecat/stemmer'
|
||||
end
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
module Polecat
  # Abstract base class for stemmers.
  #
  # Inherit from this class to build your own stemmer.
  # A stemmer is responsible for converting a document into an array of
  # fragments, which are then merged into the index. As every document can be
  # built of different words and fragments, the stemmer is very important for
  # getting the best result when searching.
  #
  # Be warned: use the same stemmer for the index as for the search input!
  #
  # To build your own stemmer, implement the method #stem.
  class Stemmer
    # Stems the word.
    #
    # This method changes the word into the form which gets inserted into the
    # index.
    #
    # @param [Object] word word to stem
    # @return [Object] the stemmed variant of the word or the same object
    # @raise [NotImplementedError] always, because this base class is abstract
    def stem(word)
      raise NotImplementedError, 'please implement #stem'
    end
  end
end
|
|
@ -0,0 +1,18 @@
|
|||
module Polecat
  # Simple stemmer which cleans strings in the most basic way.
  #
  # It removes a trailing "ing" or "ed" from strings and applies itself to
  # every element of an array. If it does not do what you intend it to do,
  # inherit from Polecat::Stemmer and implement your own.
  class SimpleStemmer < Polecat::Stemmer
    # Stems a single word or every word of an array.
    #
    # @param [Object] word a String, an Array of words, or any other object
    # @return [Object] a new Array holding the stemmed elements when an Array
    #   was given; a new String without the trailing "ing"/"ed" when a String
    #   longer than one character was given; otherwise the given object itself
    def stem(word)
      case word
      when Array
        # map, not each: each discards the block results and returns the
        # unstemmed input array.
        word.map { |w| stem(w) }
      when String
        # Strings of at most one character are handed back untouched.
        word.length > 1 ? word.gsub(/(ing|ed)$/, '') : word
      else
        # nil, numbers and any other object pass through unchanged.
        word
      end
    end
  end
end
|
|
@ -0,0 +1,12 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'polecat/stemmer/simple'

# Construction specs: the implicit subject is a fresh SimpleStemmer.
describe Polecat::SimpleStemmer do
  it "creates a new SimpleStemmer" do
    subject.should be_an_instance_of(Polecat::SimpleStemmer)
  end

  it "is a stemmer" do
    subject.should be_a_kind_of(Polecat::Stemmer)
  end
end
|
|
@ -0,0 +1,36 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require 'polecat/stemmer/simple'

# Behaviour of SimpleStemmer#stem for the supported input kinds.
describe Polecat::SimpleStemmer do
  it "takes one argument" do
    subject.method(:stem).arity.should be(1)
  end

  it "returns nil if nil was given" do
    subject.stem(nil).should be(nil)
  end

  it "returns the word if nothing was done" do
    subject.stem("a").should == "a"
  end

  it "deletes 'ing' from the end" do
    subject.stem("finding").should == "find"
  end

  it "deletes 'ed' from the word end" do
    subject.stem("coded").should == "cod"
  end

  it "returns numbers not as a string" do
    # Integer instead of Fixnum: Fixnum is deprecated since Ruby 2.4 and
    # removed in 3.2, while a kind_of check against Integer holds on old and
    # new Rubies alike.
    subject.stem(1).should be_a_kind_of(Integer)
  end

  it "returns a float not as a string" do
    subject.stem(1.1).class.should be(Float)
  end

  it "takes an array and stems every element in it" do
    # Every element has to come back stemmed, so 'coding' becomes 'cod'.
    subject.stem(['coding'])[0].should == 'cod'
  end
end
|
|
@ -0,0 +1,8 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

# The abstract base class itself can still be instantiated.
describe "Stemmer#new" do
  it "creates a new stemmer" do
    Polecat::Stemmer.new.should be_an_instance_of(Polecat::Stemmer)
  end
end
|
|
@ -0,0 +1,13 @@
|
|||
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

# Contract of the abstract Stemmer#stem: one parameter, never implemented.
describe "Stemmer#stem" do
  let(:stemmer) { Polecat::Stemmer.new }

  it "takes one argument" do
    stemmer.method(:stem).arity.should eq(1)
  end

  it "raises an error, because it's an abstract class" do
    lambda { stemmer.stem("word") }.should raise_error(NotImplementedError)
  end
end
|
Loading…
Reference in New Issue