[Xapian-discuss] Proper noun stemming
Colin Bell
colinabell at gmail.com
Thu Mar 27 13:55:22 GMT 2008
Hi James
Thanks for your (very patient) help. You have convinced me that I was
barking up the wrong tree. I have changed over to TermGenerator now.
Xapian::Utf8Iterator myIterator;
myIterator.assign(text);
Xapian::TermGenerator indexer;
indexer.set_stemmer(Xapian::Stem("english"));
indexer.set_stopper(new MyStopper());
indexer.set_database(database);
indexer.set_document(doc);
indexer.set_flags(Xapian::TermGenerator::FLAG_SPELLING);
indexer.index_text(myIterator);
I've noticed that my stopper is not working. Can anyone tell me why?
I'm getting loads of junk terms now. (I'm using Xapian 1.0.5)
class MyStopper : public Xapian::Stopper {
public:
bool operator()(const string &t) const {
switch (t[0]) {
case 'a':
return (t == "a" || t == "able" || t == "about" || t == "above"
|| t == "absolutely" || t == "according" || t == "accordingly" || t ==
"across" || t == "actually" || t == "after" || t == "afterwards" || t
== "again" || t == "against" || t == "ago" || t == "all" || t ==
"allow" || t == "allowed" || t == "allowing" || t == "allows" || t ==
"almost" || t == "along" || t == "already" || t == "also" || t ==
"although" || t == "always" || t == "am" || t == "among" || t ==
"amongst" || t == "an" || t == "and" || t == "another" || t == "any"
|| t == "anybody" || t == "anyhow" || t == "anyone" || t == "anything"
|| t == "anyway" || t == "anywhere" || t == "apart" || t == "approx"
|| t == "approximately" || t == "apr" || t == "april" || t == "are" ||
t == "arent" || t == "around" || t == "as" || t == "ask" || t ==
"asking" || t == "at" || t == "aug" || t == "august" || t ==
"available" || t == "away");
case 'b':
return (t == "b" || t == "bad" || t == "be" || t == "became" || t
== "because" || t == "become" || t == "becomes" || t == "becoming" ||
t == "been" || t == "before" || t == "began" || t == "behind" || t ==
"below" || t == "beside" || t == "besides" || t == "best" || t ==
"better" || t == "between" || t == "big" || t == "bigg" || t ==
"bigger" || t == "biggest" || t == "both" || t == "bring" || t ==
"bringing" || t == "brings" || t == "brought" || t == "but" || t ==
"by");
case 'c':
return (t == "c" || t == "cable" || t == "call" || t == "called"
|| t == "calls" || t == "came" || t == "can" || t == "cannot" || t ==
"cant" || t == "cause" || t == "caused" || t == "causes" || t ==
"causing" || t == "certain" || t == "certainly" || t == "change" || t
== "changes" || t == "clearly" || t == "click" || t == "co" || t ==
"com" || t == "come" || t == "comes" || t == "contact" || t ==
"copyright" || t == "could" || t == "couldn" || t == "couldnt");
case 'd':
return (t == "d" || t == "day" || t == "days" || t == "dec" || t
== "december" || t == "definite" || t == "definitely" || t ==
"despite" || t == "did" || t == "didn" || t == "didnt" || t ==
"different" || t == "do" || t == "does" || t == "doesn" || t ==
"doesnt" || t == "doing" || t == "done" || t == "dont" || t == "down"
|| t == "due" || t == "during");
case 'e':
return (t == "e" || t == "each" || t == "eg" || t == "eight" || t
== "eighth" || t == "either" || t == "else" || t == "elsewhere" || t
== "email" || t == "enough" || t == "especially" || t == "etc" || t ==
"even" || t == "ever" || t == "every" || t == "everybody" || t ==
"everyone" || t == "everything" || t == "everywhere" || t == "except");
case 'f':
return (t == "f" || t == "far" || t == "feb" || t == "february"
|| t == "few" || t == "fewer" || t == "fifth" || t == "find" || t ==
"first" || t == "five" || t == "for" || t == "forward" || t == "four"
|| t == "fourth" || t == "fri" || t == "friday" || t == "fridays" || t
== "from" || t == "further" || t == "furthermore");
case 'g':
return (t == "g" || t == "gave" || t == "get" || t == "gets" || t
== "getting" || t == "give" || t == "given" || t == "gives" || t ==
"giving" || t == "go" || t == "goes" || t == "going" || t == "gone" ||
t == "good" || t == "got");
case 'h':
return (t == "h" || t == "had" || t == "hadn" || t == "hadnt" ||
t == "happens" || t == "has" || t == "hasn" || t == "hasnt" || t ==
"have" || t == "having" || t == "he" || t == "held" || t == "help" ||
t == "her" || t == "here" || t == "hers" || t == "herself" || t ==
"hes" || t == "high" || t == "higher" || t == "highest" || t == "him"
|| t == "himself" || t == "his" || t == "home" || t == "hour" || t ==
"how" || t == "however");
case 'i':
return (t == "i" || t == "ie" || t == "if" || t == "in" || t ==
"include" || t == "includes" || t == "including" || t == "indeed" || t
== "info" || t == "information" || t == "internet" || t == "into" || t
== "involve" || t == "involved" || t == "involves" || t == "involving"
|| t == "is" || t == "isn" || t == "isnt" || t == "it" || t == "its"
|| t == "itself" || t == "ive");
case 'j':
return (t == "j" || t == "jan" || t == "january" || t == "jul" ||
t == "july" || t == "jun" || t == "june" || t == "just");
case 'k':
return (t == "k");
case 'l':
return (t == "l" || t == "large" || t == "larger" || t ==
"largest" || t == "last" || t == "lasts" || t == "later" || t == "led"
|| t == "less" || t == "lesser" || t == "like" || t == "liked" || t ==
"likely" || t == "link" || t == "ll" || t == "low" || t == "lower" ||
t == "lowest");
case 'm':
return (t == "m" || t == "made" || t == "make" || t == "many" ||
t == "mar" || t == "march" || t == "may" || t == "maybe" || t == "me"
|| t == "might" || t == "mon" || t == "monday" || t == "mondays" || t
== "month" || t == "more" || t == "most" || t == "mostly" || t ==
"much" || t == "must" || t == "mustn" || t == "mustnt" || t == "my");
case 'n':
return (t == "n" || t == "nbsp;-" || t == "near" || t == "need"
|| t == "needs" || t == "neither" || t == "never" || t ==
"nevertheless" || t == "new" || t == "newer" || t == "newest" || t ==
"next" || t == "nine" || t == "ninth" || t == "no" || t == "nobody" ||
t == "non" || t == "none" || t == "noone" || t == "nor" || t == "not"
|| t == "nothing" || t == "nov" || t == "november" || t == "now" || t
== "nowhere");
case 'o':
return (t == "o" || t == "oct" || t == "october" || t == "of" ||
t == "off" || t == "often" || t == "on" || t == "once" || t == "one"
|| t == "online" || t == "only" || t == "onto" || t == "open" || t ==
"or" || t == "other" || t == "others" || t == "otherwise" || t ==
"our" || t == "ours" || t == "out" || t == "over" || t == "own");
case 'p':
return (t == "p" || t == "page" || t == "perhaps" || t == "plain"
|| t == "please" || t == "possible" || t == "probably" || t ==
"provide" || t == "provided" || t == "provides");
case 'q':
return (t == "q" || t == "quite");
case 'r':
return (t == "r" || t == "rather" || t == "re" || t == "read" ||
t == "reads" || t == "really" || t == "regular" || t == "relatively"
|| t == "right");
case 's':
return (t == "s" || t == "said" || t == "same" || t == "sat" || t
== "saturday" || t == "saturdays" || t == "say" || t == "says" || t ==
"second" || t == "see" || t == "seeing" || t == "seem" || t ==
"seemed" || t == "seeming" || t == "seems" || t == "seen" || t ==
"sees" || t == "sep" || t == "sept" || t == "september" || t ==
"seven" || t == "seventh" || t == "several" || t == "shall" || t ==
"she" || t == "should" || t == "shouldn" || t == "shouldnt" || t ==
"show" || t == "shown" || t == "shows" || t == "since" || t == "site"
|| t == "six" || t == "sixth" || t == "so" || t == "some" || t ==
"somebody" || t == "somehow" || t == "someone" || t == "something" ||
t == "sometime" || t == "sometimes" || t == "somewhat" || t ==
"somewhere" || t == "soon" || t == "still" || t == "such" || t ==
"sunday" || t == "sundays");
case 't':
return (t == "t" || t == "take" || t == "taken" || t == "takes"
|| t == "taking" || t == "tel" || t == "tenth" || t == "than" || t ==
"thank" || t == "thanks" || t == "that" || t == "thats" || t == "the"
|| t == "their" || t == "theirs" || t == "them" || t == "then" || t ==
"there" || t == "thereafter" || t == "thereby" || t == "therefore" ||
t == "therefrom" || t == "therein" || t == "theres" || t == "these" ||
t == "they" || t == "third" || t == "this" || t == "those" || t ==
"though" || t == "three" || t == "through" || t == "throughout" || t
== "thur" || t == "thurs" || t == "thursday" || t == "thursdays" || t
== "thus" || t == "time" || t == "to" || t == "today" || t == "todays"
|| t == "together" || t == "tomorrow" || t == "tomorrows" || t ==
"too" || t == "toward" || t == "towards" || t == "tries" || t == "try"
|| t == "trying" || t == "tue" || t == "tues" || t == "tuesday" || t
== "tuesdays" || t == "twice" || t == "two");
case 'u':
return (t == "u" || t == "un" || t == "unable" || t == "under" ||
t == "unless" || t == "until" || t == "unto" || t == "up" || t ==
"update" || t == "updated" || t == "upon" || t == "us" || t == "use"
|| t == "used" || t == "uses" || t == "using" || t == "usually");
case 'v':
return (t == "v" || t == "very");
case 'w':
return (t == "w" || t == "want" || t == "wants" || t == "was" ||
t == "wasnt" || t == "way" || t == "we" || t == "web" || t ==
"website" || t == "wed" || t == "wednesday" || t == "wednesdays" || t
== "weds" || t == "week" || t == "welcome" || t == "well" || t ==
"went" || t == "were" || t == "what" || t == "whatever" || t == "when"
|| t == "whenever" || t == "where" || t == "wherever" || t ==
"whether" || t == "which" || t == "while" || t == "who" || t ==
"whoever" || t == "whom" || t == "whose" || t == "why" || t == "will"
|| t == "with" || t == "within" || t == "without" || t == "wont" || t
== "work" || t == "worse" || t == "worst" || t == "would" || t ==
"wouldn" || t == "wouldnt" || t == "www");
case 'x':
return (t == "x");
case 'y':
return (t == "y" || t == "year" || t == "years" || t == "yes" ||
t == "yet" || t == "you" || t == "your" || t == "youre" || t ==
"yours");
case 'z':
return (t == "z");
case '1':
return (t == "1" || t == "10th" || t == "11th" || t == "12th" ||
t == "13th" || t == "14th" || t == "15th" || t == "16th" || t ==
"17th" || t == "18th" || t == "19th" || t == "1st");
case '2':
return (t == "2" || t == "20th" || t == "21st" || t == "22nd" ||
t == "23rd" || t == "24th" || t == "25th" || t == "26th" || t ==
"27th" || t == "28th" || t == "29th" || t == "2nd");
case '3':
return (t == "3" || t == "30th" || t == "31st" || t == "3rd");
case '4':
return (t == "4" || t == "4th");
case '5':
return (t == "5" || t == "5th");
case '6':
return (t == "6" || t == "6th");
case '7':
return (t == "7" || t == "7th");
case '8':
return (t == "8" || t == "8th");
case '9':
return (t == "9" || t == "9th");
default:
return false;
}
}
};
On 27 Mar 2008, at 13:08, James Aylett wrote:
> On Thu, Mar 27, 2008 at 12:47:33PM +0000, Colin Bell wrote:
>
>>> As one of the above documents says, the convention is to store
>>> unstemmed forms with positional information, so the proximity of
>>> 'Gordon' to 'Brown' is retained in the database, and PHRASE and NEAR
>>> searches will be able to take advantage of that. (So the search
>>> 'meeting "Gordon Brown"' should match the above well.)
>>
>> This sounds ideal. Storing "Gordon" "Brown" and "Gordon Brown" and
>> linking them is a great solution. The only trick is picking out
>> proper
>> nouns like "Gordon Brown" or "Prime Minister" during the stemming
>> process to store them as phrases. Will TermGenerator be able to do
>> this? I'm going through the docs on this right now.
>
> No, it doesn't do that at all. It will store "Gordon" and "Brown" with
> appropriate positional information so that phrase searches work. In
> most cases there isn't a good reason to store "Gordon Brown" at all.
>
> Have a think about what *queries* you want to support, and then figure
> out if the TermGenerator/QueryParser pairing will achieve that.
More information about the Xapian-discuss
mailing list