[Xapian-discuss] Xapian::Queryparser / Encoding Problem (Utf8)
Olly Betts
olly at survex.com
Sat Aug 20 18:08:31 BST 2005
On Tue, Aug 16, 2005 at 12:49:51AM +0200, R. Mattes wrote:
> Could you send me the patch?
Attached. I got this off the gmane machine, which lost a disk recently,
so I'm not totally certain it's the latest version. I can't get to my
dev box at present, as I'm away from home and failed to open a suitable
hole in the firewall before I left...
Cheers,
Olly
-------------- next part --------------
diff -ru orig/xapian-core-0.9.1/queryparser/Makefile.am xapian-core-0.9.1/queryparser/Makefile.am
--- orig/xapian-core-0.9.1/queryparser/Makefile.am 2005-06-07 17:59:07.000000000 +0200
+++ xapian-core-0.9.1/queryparser/Makefile.am 2005-06-23 01:39:35.000000000 +0200
@@ -1,6 +1,6 @@
## Process this file with automake to produce Makefile.in
-INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api
+INCLUDES = -I$(top_srcdir)/include -I${top_builddir}/include -I$(top_srcdir)/common -I$(top_srcdir)/api -I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include
noinst_HEADERS = accentnormalisingitor.h symboltab.h queryparser_internal.h \
queryparser_token.h
@@ -22,3 +22,4 @@
endif
libqueryparser_la_SOURCES = queryparser.cc queryparser_internal.cc
+libqueryparser_la_LIBADD = /usr/lib/libglib-2.0.la
diff -ru orig/xapian-core-0.9.1/queryparser/accentnormalisingitor.h xapian-core-0.9.1/queryparser/accentnormalisingitor.h
--- orig/xapian-core-0.9.1/queryparser/accentnormalisingitor.h 2005-06-07 17:59:07.000000000 +0200
+++ xapian-core-0.9.1/queryparser/accentnormalisingitor.h 2005-06-23 03:22:19.000000000 +0200
@@ -19,64 +19,74 @@
*/
#include "symboltab.h"
+#include <glib/gunicode.h>
#include <string>
using std::string;
-/** A wrapper class for a char which returns the char if dereferenced
+typedef gunichar char_type;
+
+/** A wrapper class for a char_type which returns the char_type if dereferenced
* with *. We need this to implement input_iterator semantics.
*/
class CharWrapper {
private:
- char ch;
+ char_type ch;
public:
- CharWrapper(char ch_) : ch(ch_) { }
- char operator*() const { return ch; }
+ CharWrapper(char_type ch_) : ch(ch_) { }
+ char_type operator*() const { return ch; }
};
class AccentNormalisingItor {
private:
- string::const_iterator itor;
- char queued;
+ /*string::const_iterator*/const gchar * itor;
+ const gchar * end;
+
+ char_type queued;
size_t trans;
public:
AccentNormalisingItor()
- : itor(), queued(0), trans(0) {}
- AccentNormalisingItor(string::const_iterator itor_)
- : itor(itor_), queued(0), trans(0) {}
+ : itor(NULL), queued(0), trans(0) {}
+ explicit AccentNormalisingItor(const char * itor_)
+ : itor(itor_), end(itor_), queued(0), trans(0) {}
+ AccentNormalisingItor(const char * itor_, const char *end_)
+ : itor(itor_), end(end_), queued(0), trans(0) {}
+#if 0
void operator=(string::const_iterator itor_)
{
itor = itor_;
+ end = end_;
queued = 0;
trans = 0;
}
+#endif
bool operator==(const AccentNormalisingItor &o) const {
return queued == o.queued && itor == o.itor;
}
bool operator!=(const AccentNormalisingItor &o) const {
return !(*this == o);
}
- char operator*() const {
+ char_type operator*() const {
if (queued) return queued;
- unsigned char ch = (unsigned char)*itor;
+ char_type ch = g_utf8_get_char_validated(itor, end - itor);
if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
- && ch < 256
-#endif
- ) return TRANSLIT1[ch - 160];
- return (char)ch;
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+ && ch < 0x240
+//#endif
+ ) ch = /*return*/ char_type(TRANSLIT1[ch - 160]);
+ return /*(char)*/ch;
}
AccentNormalisingItor & operator++() {
if (queued) {
queued = 0;
} else {
- unsigned char ch = (unsigned char)*itor;
+ char_type ch = g_utf8_get_char_validated(itor, end - itor);
if (ch >= 160
-#if CHAR_BIT > 8 // Avoid compiler warning.
- && ch < 256
-#endif
+//#if CHAR_BIT > 8 // Avoid compiler warning.
+ && ch < 0x240
+//#endif
) {
++trans;
ch = TRANSLIT2[ch - 160];
@@ -86,16 +96,22 @@
}
}
}
- ++itor;
+ // ++itor; becomes:
+ size_t skip = g_utf8_skip[*reinterpret_cast<const guchar *>(itor)];
+ if (size_t(end - itor) < skip) {
+ itor = end;
+ } else {
+ itor += skip;
+ }
return *this;
}
CharWrapper operator++(int) {
- char tmp = **this;
+ char_type tmp = **this;
operator++();
return CharWrapper(tmp);
}
size_t transliterations() const { return trans; }
- string::const_iterator raw() const { return itor; }
+ //string::const_iterator raw() const { return itor; }
/// We implement the semantics of an STL input_iterator.
//@{
diff -ru orig/xapian-core-0.9.1/queryparser/queryparser.lemony xapian-core-0.9.1/queryparser/queryparser.lemony
--- orig/xapian-core-0.9.1/queryparser/queryparser.lemony 2005-06-07 17:59:07.000000000 +0200
+++ xapian-core-0.9.1/queryparser/queryparser.lemony 2005-06-23 03:22:45.000000000 +0200
@@ -36,6 +36,41 @@
using namespace Xapian;
+static inline bool
+U_isupper(gunichar ch) {
+ return (ch < 128 && C_isupper(ch));
+}
+
+static inline bool
+U_isspace(gunichar ch) {
+ return (ch < 128 && C_isspace(ch));
+}
+
+static inline bool
+U_isnotspace(gunichar ch) {
+ return !U_isspace(ch);
+}
+
+static inline bool
+U_isalnum(gunichar ch) {
+ return (ch < 128 && C_isalnum(ch));
+}
+
+static inline bool
+U_isnotalnum(gunichar ch) {
+ return !U_isalnum(ch);
+}
+
+static inline bool
+U_issign(gunichar ch) {
+ return (ch < 128 && C_issign(ch));
+}
+
+static inline bool
+G_unichar_isnotalnum(gunichar ch) {
+ return !g_unichar_isalnum(ch);
+}
+
// Disable debug code lemon adds.
#define NDEBUG
@@ -93,27 +128,27 @@
static inline string
downcase_term(const string &term)
{
- string t;
- t.reserve(term.size());
- AccentNormalisingItor i(term.begin());
- const AccentNormalisingItor end(term.end());
- while (i != end) t += C_tolower(*i++);
+ gchar * r;
+ r = g_utf8_strdown(static_cast<const gchar*>(term.data()),
+ term.length());
+ string t(static_cast<char *>(r));
+ free(r);
return t;
}
static inline bool
-is_phrase_generator(unsigned char ch)
+is_phrase_generator(gunichar ch)
{
// These characters generate a phrase search.
// Ordered mostly by frequency of calls to this function done when
// running queryparsertest.
- return (ch && strchr(".-/':\\_@", ch) != NULL);
+ return (ch && ch < 128 && strchr(".-/':\\_@", ch) != NULL);
}
static inline bool
-prefix_needs_colon(const string & prefix, unsigned char ch)
+prefix_needs_colon(const string & prefix, gunichar ch)
{
- if (!C_isupper(ch)) return false;
+ if (!U_isupper(ch)) return false;
string::size_type len = prefix.length();
return (len > 1 && prefix[len - 1] != ':');
}
@@ -126,6 +161,7 @@
Query
QueryParser::Internal::parse_query(const string &qs, unsigned int flags)
{
+ gchar ubuf[6];
#ifndef NDEBUG
// Set the prefix added to Lemon's debug output, if it's enabled.
// FIXME: arrange to send this to the Xapian debug log, and turn
@@ -136,29 +172,29 @@
void * pParser = ParseAlloc(malloc);
termpos term_pos = 1;
- AccentNormalisingItor it(qs.begin()), end(qs.end());
+ AccentNormalisingItor it(qs.data(), qs.data() + qs.size()), end(qs.data() + qs.size());
State state(this);
enum { DEFAULT, IN_QUOTES, IN_PHRASED_TERM } mode = DEFAULT;
- unsigned char newprev = ' ';
+ gunichar newprev = ' ';
while (it != end) {
if (mode == IN_PHRASED_TERM) mode = DEFAULT;
- if (C_isspace(*it)) {
+ if (U_isspace(*it)) {
newprev = ' ';
++it;
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (it == end) break;
}
- if (!C_isalnum(*it)) {
- unsigned char prev = newprev;
- unsigned char ch = *it++;
+ if (!g_unichar_isalnum(*it)) {
+ gunichar prev = newprev;
+ gunichar ch = *it++;
if (it != end) newprev = *it;
switch (ch) {
case '"':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
if (mode != IN_QUOTES) {
if (it == end) {
// Ignore an unmatched " at the end of the query to
@@ -191,7 +227,7 @@
// Or if not after whitespace or an open bracket.
continue;
}
- if (C_isspace(*it) || *it == '+' || *it == '-') {
+ if (U_isspace(*it) || *it == '+' || *it == '-') {
// Ignore + or - followed by a space, or further + or -.
// Postfix + (such as in C++ and H+) is handled as part of
// the term lexing code below.
@@ -204,7 +240,7 @@
case '(':
// Skip whitespace.
- it = find_if(it, end, C_isnotspace);
+ it = find_if(it, end, U_isnotspace);
// Ignore ( at end of query.
if (it == end) goto done;
if (prev > ' ' && prev != '(' && prev != ')') {
@@ -239,24 +275,25 @@
string prefix;
if (mode == DEFAULT && !prefixes.empty()) {
// Check for fieldname prefixes (e.g. title:historical).
- AccentNormalisingItor p = find_if(it, end, C_isnotalnum);
+ AccentNormalisingItor p = find_if(it, end, G_unichar_isnotalnum);
if (p != end && *p == ':' && ++p != end) {
- unsigned char ch = *p;
- if (C_isalnum(ch) ||
+ gunichar ch = *p;
+ if (g_unichar_isalnum(ch) ||
((flags & FLAG_PHRASE) && ch == '"') ||
((flags & FLAG_BOOLEAN) && ch == '(')) {
string field;
p = it;
- while (*p != ':') field += *p++;
+ while (*p != ':')
+ field += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
map<string, pair<bool, string> >::const_iterator f;
f = prefixes.find(field);
if (f != prefixes.end()) {
// Can't boolean prefix a subexpression or phrase.
bool boolean_filter = f->second.first;
- if (!boolean_filter || C_isalnum(ch)) {
+ if (!boolean_filter || g_unichar_isalnum(ch)) {
it = p;
++it;
- if (!C_isalnum(ch)) {
+ if (!g_unichar_isalnum(ch)) {
newprev = ch;
++it;
state.push_prefix(f->second.second);
@@ -275,7 +312,7 @@
if (prefix_needs_colon(prefix, *it))
prefix += ':';
while (it != end && *it > ' ' && *it != ')')
- prefix += *it++;
+ prefix += string(ubuf, g_unichar_to_utf8(*it++, ubuf));
Parse(pParser, BOOLEAN_FILTER,
new Term(prefix, 0), &state);
continue;
@@ -291,18 +328,18 @@
size_t transliterations = it.transliterations();
// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
// Don't worry if there's a trailing '.' or not.
- if (C_isupper(*it)) {
+ if (U_isupper(*it)) {
string t;
AccentNormalisingItor p = it;
do {
- t += *p++;
- } while (p != end && *p == '.' && ++p != end && C_isupper(*p));
+ t += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
// One letter does not make an acronym! If we handled a single
// uppercase letter here, we wouldn't catch M&S below.
if (t.length() > 1) {
// Check there's not a (lower case) letter or digit
// immediately after it.
- if (p == end || !C_isalnum(*p)) {
+ if (p == end || !g_unichar_isalnum(*p)) {
it = p;
swap(term, t);
}
@@ -312,17 +349,17 @@
if (term.empty()) {
while (it != end) {
- if (!C_isalnum(*it)) {
+ if (!g_unichar_isalnum(*it)) {
// Treat a single embedded '&' as a word character
// (e.g. AT&T).
if (*it != '&') break;
AccentNormalisingItor p = it;
++p;
- if (p == end || !C_isalnum(*p)) break;
+ if (p == end || !g_unichar_isalnum(*p)) break;
}
- term += *it++;
+ term.append(ubuf, g_unichar_to_utf8(*it++, ubuf));
}
- if (it != end && (*it == '#' || C_issign(*it))) {
+ if (it != end && (*it == '#' || U_issign(*it))) {
string suff_term = term;
AccentNormalisingItor p = it;
if (*p == '#') {
@@ -331,17 +368,17 @@
while (++p != end && *p == '#') { }
} else {
// Keep trailing +, and - (e.g. C++, Na+, Cl-).
- // FIXME: keeping trailing "-" is of dubious utilpy and
+ // FIXME: keeping trailing "-" is of dubious utility and
// there's the risk of hyphens getting stuck onto the end of
// terms...
// FIXME: generating a term like foo+---+++ doesn't make
// much sense - we should probably be more conservative as
// to what combinations are allowed.
do {
- suff_term += *p++;
- } while (p != end && C_issign(*p));
+ suff_term += string(ubuf, g_unichar_to_utf8(*p++, ubuf));
+ } while (p != end && U_issign(*p));
}
- if (p == end || !C_isalnum(*p)) {
+ if (p == end || !g_unichar_isalnum(*p)) {
// If the suffixed term doesn't exist, check that the
// non-suffixed term does. This also takes care of
// the case when QueryParser::set_database() hasn't
@@ -358,7 +395,7 @@
if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
// Don't want to interpret A.N.D. or ?ND as an AND operator.
if (!was_acronym && transliterations == it.transliterations()) {
- if (prefix.empty() && !term.empty() && C_isupper(term[0])) {
+ if (prefix.empty() && !term.empty() && U_isupper(term[0])) {
if (term == "AND") {
Parse(pParser, AND, NULL, &state);
continue;
@@ -388,13 +425,13 @@
// stem terms in a phrased term with '.' phrase generators -
// e.g. "example.com" should give a phrase search for "exampl"
// and "com", not "example" and "com".
- if (p == end || C_isspace(*p)) {
+ if (p == end || U_isspace(*p)) {
it = p;
// If topterms added a term with a trailing '.', it will be
// lower case. So if it has an initial capital it must be an
// initial in someone's name, a full stop in pasted text or
// something like that.
- if (!C_isupper(term[0])) {
+ if (!U_isupper(term[0])) {
unstemmed_term = term + '.';
need_to_stem = false;
}
@@ -404,7 +441,7 @@
if (unstemmed_term.empty()) unstemmed_term = term;
term = downcase_term(term);
if (need_to_stem) {
- if (stem_action == STEM_SOME && C_isupper(unstemmed_term[0]))
+ if (stem_action == STEM_SOME && U_isupper(unstemmed_term[0]))
term = 'R' + term;
else
term = stemmer.stem_word(term);
@@ -436,7 +473,7 @@
} while (it != end && is_phrase_generator(*it));
// Don't generate a phrase unless the phrase generators are
// immediately followed by another term.
- if (it != end && C_isalnum(*it)) {
+ if (it != end && g_unichar_isalnum(*it)) {
mode = IN_PHRASED_TERM;
goto phrased_term;
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: symboltab.h
Type: text/x-chdr
Size: 1063 bytes
Desc: not available
Url : http://lists.tartarus.org/pipermail/xapian-discuss/attachments/20050820/5a08d009/symboltab.bin
More information about the Xapian-discuss
mailing list