Eric Wong
2025-Feb-22 00:40 UTC
ensuring all docs have a certain value before (collapsed) match
Not a very important or urgent feature to me, but it'd be nice
to have :>
Similar to `thread:' in notmuch, I'm trying to add a
`wholethread:' field processor for searching mail. That is, I'm
trying to search for mail threads where a subquery matches for
every single message in that thread, not just one message in
a thread as `thread:' does.
I currently have (what seems to be) working code below, but it's
extremely slow (>= 10min/query) since it has to run two full sub
queries for every threadid matched. I'm hoping there's a more
efficient way to do this against existing (giant) DBs w/o
indexing changes.
I sprinkled some fprintf() calls in there to track progress and
it seems to just crawl along doing each query, and there are
a lot of them...
I store THREADID as a column value (not as a term like notmuch does).
// cur_srch is a global where cur_srch->db is Xapian::Database
// wholethread field processor, ensures every single message in a
// thread matches a given subquery.
// derived somewhat from thread-fp.{h,cc} in notmuch
// Disclaimer: I'm an old C hacker but very inexperienced at C++
// most of this code is written with C (not C++) hackers in mind
// since I expect more people interested in my projects know C.
class WholeThreadFieldProcessor : public Xapian::FieldProcessor {
protected:
Xapian::QueryParser &qp;
public:
WholeThreadFieldProcessor(Xapian::QueryParser &qp_) : qp(qp_) {};
Xapian::Query operator()(const std::string &str);
};
// Result of processing one MSet entry in collapse_col_iter()
enum exc_iter {
	ITER_OK    = 0,	// entry handled (or skipped), move on to the next one
	ITER_RETRY = 1,	// DB was modified and reopened, try this entry again
	ITER_ABORT = 2	// give up entirely (nothing visible here returns this)
};
// Looks at the document *i points to and, if EVERY document sharing its
// value in `column' also matches orig_qry, ORs a requirement for that
// value into *xqry.
//
// Returns ITER_RETRY after reopening the DB on DatabaseModifiedError
// (the caller should call again with the same arguments), and ITER_OK
// otherwise.  A document which can no longer be found only triggers a
// warning and still yields ITER_OK.
static enum exc_iter collapse_col_iter(Xapian::Query *xqry,
					Xapian::MSetIterator *i,
					const Xapian::Query orig_qry,
					unsigned column)
{
	try {
		Xapian::Document doc = i->get_document();
		std::string val = doc.get_value(column);
		// range with identical bounds: documents whose value == val
		Xapian::Query val_qry = Xapian::Query(
					Xapian::Query::OP_VALUE_RANGE,
					column, val, val);
		Xapian::Enquire enq(*cur_srch->db);
		enq.set_weighting_scheme(Xapian::BoolWeight());
		Xapian::doccount total = cur_srch->db->get_doccount();

		// maybe there is a faster way to only get mset.size()?
		// NOTE(review): get_mset(0, 0, total).get_matches_estimated()
		// looks like it could count without building the MSet --
		// confirm against the Xapian::Enquire docs before switching.

		// first we count every message with a given value in column
		enq.set_query(val_qry);
		Xapian::doccount need = enq.get_mset(0, total).size();
		fprintf(stderr, "val_qry<%s> mset.size:%llu\n",
			val_qry.get_description().c_str(),
			(unsigned long long)need);

		// then we count how many of those also match orig_qry
		Xapian::Query tmp_qry = Xapian::Query(
					Xapian::Query::OP_FILTER,
					orig_qry, val_qry);
		// without this the second get_mset() would re-run val_qry,
		// making has == need unconditionally
		enq.set_query(tmp_qry);
		Xapian::doccount has = enq.get_mset(0, total).size();
		fprintf(stderr, "tmp_qry<%s> mset.size:%llu\n",
			tmp_qry.get_description().c_str(),
			(unsigned long long)has);

		// we use the value only if every message with that value
		// matches orig_qry
		if (has == need)
			*xqry = Xapian::Query(Xapian::Query::OP_OR,
						*xqry, val_qry);
	} catch (const Xapian::DatabaseModifiedError &e) {
		cur_srch->db->reopen();
		return ITER_RETRY;
	} catch (const Xapian::DocNotFoundError &e) { // oh well...
		warnx("doc not found: %s", e.get_description().c_str());
	}
	return ITER_OK;
}
// Runs qry collapsed on `column' so each distinct value is visited once,
// lets collapse_col_iter() decide whether that value belongs in the
// result, and returns the OR of all accepted value requirements
// (MatchNothing when none were accepted).
static Xapian::Query qry_collapse_col(Xapian::Query qry, unsigned column)
{
	Xapian::Query result = Xapian::Query::MatchNothing;
	Xapian::Enquire enquire(*cur_srch->db);

	// one representative hit per distinct value in column
	enquire.set_weighting_scheme(Xapian::BoolWeight());
	enquire.set_query(qry);
	enquire.set_collapse_key(column);
	Xapian::MSet reps = enquire.get_mset(0, cur_srch->db->get_doccount());

	fprintf(stderr, "qry<%s> mset.size:%llu\n",
		qry.get_description().c_str(),
		(unsigned long long)reps.size());

	for (Xapian::MSetIterator it = reps.begin(); it != reps.end(); ++it) {
		// each hit gets at most 10 attempts; a hit which still
		// wants a retry after that is silently passed over
		int attempts_left = 10;
		bool done = false;

		while (!done && attempts_left > 0) {
			--attempts_left;
			enum exc_iter rc = collapse_col_iter(&result, &it,
								qry, column);
			if (rc == ITER_ABORT) // impossible
				return result;
			if (rc == ITER_OK)
				done = true;
			// ITER_RETRY: go around again
		}
	}
	return result;
}
// Builds the query for one `wholethread:' field value.
//
// str is either a bare subquery or a subquery wrapped in braces
// ("{SUBQUERY}").  The subquery is parsed with cur_srch->qp using
// cur_srch->qp_flags and the result is handed to qry_collapse_col()
// with the THREADID column.
//
// Throws Xapian::QueryParserError when str is empty or when an opening
// brace has no matching closing brace at the end of str.
Xapian::Query WholeThreadFieldProcessor::operator()(const std::string &str)
{
	// str.at(0) on an empty string would throw std::out_of_range,
	// which is not a Xapian error; report it as a parse error instead
	if (str.empty())
		throw Xapian::QueryParserError(
					"empty wholethread: subquery");

	Xapian::Query qry;

	if (str[0] != '{') { // wholethread:"SUBQUERY"
		qry = cur_srch->qp->parse_query(str, cur_srch->qp_flags);
	} else if (str.size() <= 1 || str[str.size() - 1] != '}') {
		throw Xapian::QueryParserError("missing } in '" + str +
						"'");
	} else { // wholethread:"{SUBQUERY}" (familiar to thread:{} users)
		// strip the surrounding braces
		std::string qstr = str.substr(1, str.size() - 2);
		qry = cur_srch->qp->parse_query(qstr, cur_srch->qp_flags);
	}
	return qry_collapse_col(qry, THREADID);
}
// TIA for any help you can provide, but again, not a high priority