Eric Wong
2025-Feb-22 00:40 UTC
ensuring all docs have a certain value before (collapsed) match
Not a very important or urgent feature to me, but it'd be nice to have :> Similar to `thread:' in notmuch, I'm trying to add a `wholethread:' field processor for searching mail. That is, I'm trying to search for mail threads where a subquery matches for every single message in that thread, not just one message in a thread as `thread:' does. I currently have (what seems to be) working code below, but it's extremely slow (>= 10min/query) since it has to run two full subqueries for every threadid matched. I'm hoping there's a more efficient way to do this against existing (giant) DBs w/o indexing changes. I sprinkled some fprintf() calls in there to track progress and it just seems to crawl along doing each query, but there's a lot... I store THREADID as a column value (not as a term like notmuch does). // cur_srch is a global where cur_srch->db is Xapian::Database // wholethread field processor, ensures every single message in a // thread matches a given subquery. // derived somewhat from thread-fp.{h,cc} in notmuch // Disclaimer: I'm an old C hacker but very inexperienced at C++ // most of this code is written with C (not C++) hackers in mind // since I expect more people interested in my projects know C. 
class WholeThreadFieldProcessor : public Xapian::FieldProcessor { protected: Xapian::QueryParser &qp; public: WholeThreadFieldProcessor(Xapian::QueryParser &qp_) : qp(qp_) {}; Xapian::Query operator()(const std::string &str); }; enum exc_iter { ITER_OK = 0, ITER_RETRY, ITER_ABORT }; // ORs a value requirement to xqry if ALL documents with a given value // matches orig_qry static enum exc_iter collapse_col_iter(Xapian::Query *xqry, Xapian::MSetIterator *i, const Xapian::Query orig_qry, unsigned column) { try { Xapian::Document doc = i->get_document(); std::string val = doc.get_value(column); Xapian::Query val_qry = Xapian::Query( Xapian::Query::OP_VALUE_RANGE, column, val, val); Xapian::Enquire enq(*cur_srch->db); enq.set_weighting_scheme(Xapian::BoolWeight()); // maybe there is a faster way to only get mset.size()? // first we count every message with a given value in column enq.set_query(val_qry); Xapian::doccount total = cur_srch->db->get_doccount(); Xapian::doccount need = enq.get_mset(0, total).size(); fprintf(stderr, "val_qry<%s> mset.size:%llu\n", val_qry.get_description().c_str(), (unsigned long long)need); // we use the value only if every message with that value // matches orig_qry Xapian::Query tmp_qry = Xapian::Query( Xapian::Query::OP_FILTER, orig_qry, val_qry); Xapian::doccount has = enq.get_mset(0, total).size(); fprintf(stderr, "tmp_qry<%s> mset.size:%llu\n", tmp_qry.get_description().c_str(), (unsigned long long)has); if (has == need) *xqry = Xapian::Query(Xapian::Query::OP_OR, *xqry, Xapian::Query( Xapian::Query::OP_VALUE_RANGE, column, val, val)); } catch (const Xapian::DatabaseModifiedError &e) { cur_srch->db->reopen(); return ITER_RETRY; } catch (const Xapian::DocNotFoundError &e) { // oh well... 
warnx("doc not found: %s", e.get_description().c_str()); } return ITER_OK; } static Xapian::Query qry_collapse_col(Xapian::Query qry, unsigned column) { Xapian::Query xqry = Xapian::Query::MatchNothing; Xapian::Enquire enq(*cur_srch->db); // grab a list of values in column matching qry: enq.set_weighting_scheme(Xapian::BoolWeight()); enq.set_query(qry); enq.set_collapse_key(column); Xapian::MSet mset = enq.get_mset(0, cur_srch->db->get_doccount()); fprintf(stderr, "qry<%s> mset.size:%llu\n", qry.get_description().c_str(), (unsigned long long)mset.size()); for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { for (int t = 10; t > 0; --t) { switch (collapse_col_iter(&xqry, &i, qry, column)) { case ITER_OK: t = 0; break; // leave inner loop case ITER_RETRY: break; // continue for-loop case ITER_ABORT: return xqry; // impossible } } } return xqry; } Xapian::Query WholeThreadFieldProcessor::operator()(const std::string &str) { Xapian::Query qry; if (str.at(0) != '{') { // wholethread:"SUBQUERY" qry = cur_srch->qp->parse_query(str, cur_srch->qp_flags); } else if (str.size() <= 1 || str.at(str.size() - 1) != '}') { throw Xapian::QueryParserError("missing } in '" + str + "'"); } else { // wholethread:"{SUBQUERY}" (familiar to thread:{} users) std::string qstr = str.substr(1, str.size() - 2); qry = cur_srch->qp->parse_query(qstr, cur_srch->qp_flags); } return qry_collapse_col(qry, THREADID); } // TIA for any help you can provide, but again, not a high priority