ensuring all docs have a certain value before (collapsed) match

Eric Wong e at 80x24.org
Sat Feb 22 00:40:46 GMT 2025


Not a very important or urgent feature to me, but it'd be nice
to have :>

Similar to `thread:' in notmuch, I'm trying to add a
`wholethread:' field processor for searching mail.  That is, I'm
trying to search for mail threads where a subquery matches for
every single message in that thread, not just one message in
a thread as `thread:' does.

I currently have (what seems to be) working code below, but it's
extremely slow (>= 10min/query) since it has to run two full sub
queries for every threadid matched.  I'm hoping there's a more
efficient way to do this against existing (giant) DBs w/o
indexing changes.

I sprinkled some fprintf() calls in there to track progress and
it seems to just crawl along doing each query, but there are
a lot of them...

I store THREADID as a column value (not as a term like notmuch does).

// cur_srch is a global where cur_srch->db is Xapian::Database
// wholethread field processor, ensures every single message in a
// thread matches a given subquery.
// derived somewhat from thread-fp.{h,cc} in notmuch
// Disclaimer: I'm an old C hacker but very inexperienced at C++
// most of this code is written with C (not C++) hackers in mind
// since I expect more people interested in my projects know C.

class WholeThreadFieldProcessor : public Xapian::FieldProcessor {
protected:
	Xapian::QueryParser &qp;
public:
	WholeThreadFieldProcessor(Xapian::QueryParser &qp_) : qp(qp_) {};
	Xapian::Query operator()(const std::string &str);
};

// Result of one attempt at handling a single MSet entry
enum exc_iter {
	ITER_OK,	// entry handled (or skipped), move on to the next one
	ITER_RETRY,	// DB was reopened, try the same entry again
	ITER_ABORT	// stop iterating altogether
};

// ORs a value requirement into *xqry if ALL documents sharing the value
// which the current MSet entry has in `column' match orig_qry.
//
// xqry     - in/out: accumulated query, extended via OP_OR on success
// i        - MSet entry whose value in `column' is examined
// orig_qry - the subquery every document with that value must match
// column   - value slot holding the grouping key
//
// Returns ITER_RETRY after reopening the DB on DatabaseModifiedError
// (the caller is expected to call again for the same entry), ITER_OK
// otherwise.  A DocNotFoundError is only warned about and the entry is
// skipped.  ITER_ABORT is never returned from here.
static enum exc_iter collapse_col_iter(Xapian::Query *xqry,
					Xapian::MSetIterator *i,
					const Xapian::Query orig_qry,
					unsigned column)
{
	try {
		Xapian::Document doc = i->get_document();
		std::string val = doc.get_value(column);

		// matches exactly the documents whose value in column == val
		Xapian::Query val_qry = Xapian::Query(
						Xapian::Query::OP_VALUE_RANGE,
						column, val, val);

		Xapian::Enquire enq(*cur_srch->db);
		enq.set_weighting_scheme(Xapian::BoolWeight());

		// maybe there is a faster way to only get mset.size()?

		// first we count every message with a given value in column
		enq.set_query(val_qry);
		Xapian::doccount total = cur_srch->db->get_doccount();
		Xapian::doccount need = enq.get_mset(0, total).size();
fprintf(stderr, "val_qry<%s> mset.size:%llu\n",
	val_qry.get_description().c_str(), (unsigned long long)need);

		// we use the value only if every message with that value
		// matches orig_qry
		Xapian::Query tmp_qry = Xapian::Query(
				Xapian::Query::OP_FILTER, orig_qry, val_qry);
		// the Enquire object must be pointed at tmp_qry before
		// counting; otherwise val_qry is simply run a second time
		// and has == need holds unconditionally
		enq.set_query(tmp_qry);
		Xapian::doccount has = enq.get_mset(0, total).size();
fprintf(stderr, "tmp_qry<%s> mset.size:%llu\n",
	tmp_qry.get_description().c_str(), (unsigned long long)has);
		if (has == need)
			*xqry = Xapian::Query(Xapian::Query::OP_OR, *xqry,
						val_qry);
	} catch (const Xapian::DatabaseModifiedError &e) {
		cur_srch->db->reopen();
		return ITER_RETRY;
	} catch (const Xapian::DocNotFoundError &e) { // oh well...
		warnx("doc not found: %s", e.get_description().c_str());
	}
	return ITER_OK;
}

// Builds a query from the values in `column': qry is run collapsed on
// `column' so each distinct value shows up once, then every resulting
// entry is handed to collapse_col_iter(), which decides whether that
// value gets ORed into the result.  Starts from MatchNothing, so the
// result matches nothing if no value is accepted.
static Xapian::Query qry_collapse_col(Xapian::Query qry, unsigned column)
{
	Xapian::Query result = Xapian::Query::MatchNothing;

	// one representative document per distinct value in column
	Xapian::Enquire enquire(*cur_srch->db);
	enquire.set_weighting_scheme(Xapian::BoolWeight());
	enquire.set_query(qry);
	enquire.set_collapse_key(column);

	Xapian::MSet matches =
		enquire.get_mset(0, cur_srch->db->get_doccount());

	fprintf(stderr, "qry<%s> mset.size:%llu\n",
		qry.get_description().c_str(),
		(unsigned long long)matches.size());

	Xapian::MSetIterator it = matches.begin();
	for (; it != matches.end(); ++it) {
		// give each entry up to 10 attempts before moving on
		int tries_left = 10;
		while (tries_left-- > 0) {
			enum exc_iter rc =
				collapse_col_iter(&result, &it, qry, column);

			if (rc == ITER_ABORT) // impossible
				return result;
			if (rc == ITER_OK)
				break;
			// ITER_RETRY: go around again for the same entry
		}
	}
	return result;
}

// Turns the text given to `wholethread:' into a query.
//
// str is either a bare subquery (wholethread:"SUBQUERY") or a subquery
// wrapped in braces (wholethread:"{SUBQUERY}").  The subquery is parsed
// with the parser and flags from cur_srch and the result is passed
// through qry_collapse_col() on the THREADID column.
//
// Throws Xapian::QueryParserError if str is empty or if it opens with
// '{' without a closing '}'.
Xapian::Query WholeThreadFieldProcessor::operator()(const std::string &str)
{
	// str.at(0) on an empty string would throw std::out_of_range;
	// report it like the other malformed-input case instead
	if (str.empty())
		throw Xapian::QueryParserError("empty wholethread: subquery");

	Xapian::Query qry;

	if (str[0] != '{') { // wholethread:"SUBQUERY"
		qry = cur_srch->qp->parse_query(str, cur_srch->qp_flags);
	} else if (str.size() <= 1 || str[str.size() - 1] != '}') {
		throw Xapian::QueryParserError("missing } in '" + str + "'");
	} else { // wholethread:"{SUBQUERY}" (familiar to thread:{} users)
		// strip the surrounding braces
		std::string qstr = str.substr(1, str.size() - 2);
		qry = cur_srch->qp->parse_query(qstr, cur_srch->qp_flags);
	}
	return qry_collapse_col(qry, THREADID);
}

// TIA for any help you can provide, but again, not a high priority



More information about the Xapian-discuss mailing list