From aa153bc68c2bf99dd1d75c3486ab795f4bd75ade Mon Sep 17 00:00:00 2001
From: Gibheer
Date: Fri, 10 Jun 2011 00:40:03 +0200
Subject: [PATCH] added an abstract stemmer and an implementation of a simple stemmer

---
 lib/polecat.rb                   |  1 +
 lib/polecat/stemmer.rb           | 25 ++++++++++++++++++++++
 lib/polecat/stemmer/simple.rb    | 28 +++++++++++++++++++++++++
 spec/simple_stemmer/new_spec.rb  | 12 +++++++++++
 spec/simple_stemmer/stem_spec.rb | 36 ++++++++++++++++++++++++++++++++
 spec/stemmer/new_spec.rb         |  8 +++++++
 spec/stemmer/stem_spec.rb        | 13 ++++++++++++
 7 files changed, 123 insertions(+)
 create mode 100644 lib/polecat/stemmer.rb
 create mode 100644 lib/polecat/stemmer/simple.rb
 create mode 100644 spec/simple_stemmer/new_spec.rb
 create mode 100644 spec/simple_stemmer/stem_spec.rb
 create mode 100644 spec/stemmer/new_spec.rb
 create mode 100644 spec/stemmer/stem_spec.rb

diff --git a/lib/polecat.rb b/lib/polecat.rb
index 2204385..a15bbcc 100644
--- a/lib/polecat.rb
+++ b/lib/polecat.rb
@@ -8,4 +8,5 @@ module Polecat
   require 'polecat/index_searcher'
   require 'polecat/query'
   require 'polecat/term'
+  require 'polecat/stemmer'
 end
diff --git a/lib/polecat/stemmer.rb b/lib/polecat/stemmer.rb
new file mode 100644
index 0000000..e057f75
--- /dev/null
+++ b/lib/polecat/stemmer.rb
@@ -0,0 +1,25 @@
+module Polecat
+  # abstract class for stemmer
+  #
+  # This class can be used for inheritance for your own stemmer.
+  # A stemmer is responsible for converting a document into an array of
+  # fragments which are then merged with the index. As every document can be
+  # built of different words and fragments, the stemmer is very important to
+  # get the best result when searching.
+  #
+  # Be warned that you have to use the same stemmer for the index as for the
+  # search input!
+  #
+  # To build your own stemmer implement the method #stem.
+  class Stemmer
+    # stems the word
+    #
+    # This method changes the word into the form which gets inserted into the
+    # index. The abstract implementation always raises NotImplementedError.
+    # @param [Object] word word to stem
+    # @return [Object] the stemmed variant of the word or the same object
+    def stem(word)
+      raise NotImplementedError, 'please implement #stem'
+    end
+  end
+end
diff --git a/lib/polecat/stemmer/simple.rb b/lib/polecat/stemmer/simple.rb
new file mode 100644
index 0000000..dbeacf1
--- /dev/null
+++ b/lib/polecat/stemmer/simple.rb
@@ -0,0 +1,28 @@
+module Polecat
+  # simple stemmer for cleaning in a simple way
+  #
+  # This class can be used for cleaning strings in the most simple way. If it
+  # does not do what you intend it to do, inherit from Polecat::Stemmer and
+  # implement your own.
+  class SimpleStemmer < Polecat::Stemmer
+    # stems a word or every word of an array
+    #
+    # Strings longer than one character lose a trailing 'ing' or 'ed'. Arrays
+    # are stemmed element by element into a new array. Every other object
+    # (nil, numbers, ...) is returned unchanged.
+    # @param [Object] word word or array of words to stem
+    # @return [Object] the stemmed word, a new array of stemmed words or the
+    #   same object
+    def stem(word)
+      if word.is_a?(Array)
+        # map (not each) so that the stemmed elements are actually returned
+        word.map { |w| stem(w) }
+      elsif word.is_a?(String) && word.length > 1
+        # \z anchors at the end of the string, so only a real suffix is cut
+        word.sub(/(ing|ed)\z/, '')
+      else
+        word
+      end
+    end
+  end
+end
diff --git a/spec/simple_stemmer/new_spec.rb b/spec/simple_stemmer/new_spec.rb
new file mode 100644
index 0000000..0c4536d
--- /dev/null
+++ b/spec/simple_stemmer/new_spec.rb
@@ -0,0 +1,12 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'polecat/stemmer/simple'
+
+describe Polecat::SimpleStemmer do
+  it "creates a new SimpleStemmer" do
+    subject.class.should be(Polecat::SimpleStemmer)
+  end
+
+  it "is a stemmer" do
+    subject.kind_of?(Polecat::Stemmer).should be(true)
+  end
+end
diff --git a/spec/simple_stemmer/stem_spec.rb b/spec/simple_stemmer/stem_spec.rb
new file mode 100644
index 0000000..a911c4c
--- /dev/null
+++ b/spec/simple_stemmer/stem_spec.rb
@@ -0,0 +1,36 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+require 'polecat/stemmer/simple'
+
+describe Polecat::SimpleStemmer do
+  it "takes one argument" do
+    subject.method(:stem).arity.should be(1)
+  end
+
+  it "returns nil if nil was given" do
+    subject.stem(nil).should be(nil)
+  end
+
+  it "returns the word if nothing was done" do
+    subject.stem("a").should == "a"
+  end
+
+  it "deletes 'ing' from the end" do
+    subject.stem("finding").should == "find"
+  end
+
+  it "deletes 'ed' from the word end" do
+    subject.stem("coded").should == "cod"
+  end
+
+  it "returns numbers not as a string" do
+    subject.stem(1).should be_kind_of(Integer)
+  end
+
+  it "returns a float not as a string" do
+    subject.stem(1.1).class.should be(Float)
+  end
+
+  it "takes an array and stems every element in it" do
+    subject.stem(['coding']).should == ['cod']
+  end
+end
diff --git a/spec/stemmer/new_spec.rb b/spec/stemmer/new_spec.rb
new file mode 100644
index 0000000..a1c6135
--- /dev/null
+++ b/spec/stemmer/new_spec.rb
@@ -0,0 +1,8 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe "Stemmer#new" do
+  it "creates a new stemmer" do
+    s = Polecat::Stemmer.new
+    s.class.should be(Polecat::Stemmer)
+  end
+end
diff --git a/spec/stemmer/stem_spec.rb b/spec/stemmer/stem_spec.rb
new file mode 100644
index 0000000..e2828e5
--- /dev/null
+++ b/spec/stemmer/stem_spec.rb
@@ -0,0 +1,13 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe "Stemmer#stem" do
+  let (:s) { Polecat::Stemmer.new }
+
+  it "takes one argument" do
+    s.method(:stem).arity.should == 1
+  end
+
+  it "raises an error, because it's an abstract class" do
+    lambda { s.stem "word" }.should raise_error(NotImplementedError)
+  end
+end