Rewrite of keyword matching code. This fixes #20

Uses new iterator based tagged string functions.
2026-06-10 04:57:00 -04:00 · 2020-05-17 20:37:53 +02:00
parent 0b653938cc
commit b3ddb295fc
8 changed files with 528 additions and 267 deletions
@@ -56,7 +56,7 @@ public:
  IndexMap<FieldP, ValueP>& extraDataFor(const StyleSheet& stylesheet);
  
  /// Keyword usage statistics
-  vector<pair<Value*,const Keyword*>> keyword_usage;
+  vector<pair<const Value*,const Keyword*>> keyword_usage;
  
  /// Get the identification of this card, an identification is something like a name, title, etc.
  /** May return "" */
@@ -10,6 +10,7 @@
 #include <data/keyword.hpp>
 #include <util/tagged_string.hpp>
 #include <unordered_map>
+#include <unordered_set>

 class KeywordTrie;
 DECLARE_POINTER_TYPE(KeywordParamValue);
@@ -253,6 +254,8 @@ namespace std {
 }

 /// A node in a trie to match keywords
+/* The trie is used to speed up matching, by quickly finding candidate keywords.
+*/
 class KeywordTrie {
 public:
  KeywordTrie();
@@ -376,6 +379,19 @@ void KeywordDatabase::prepare_parameters(const vector<KeywordParamP>& ps, const
  }
 }

+#ifdef _DEBUG
+void dump(int i, const KeywordTrie* t) {
+  FOR_EACH(c, t->children) {
+    wxLogDebug(String(i, _(' ')) + c.first + _("     ") + String::Format(_("%p"), c.second.get()));
+    dump(i + 2, c.second.get());
+  }
+  if (t->on_any_star) {
+    wxLogDebug(String(i, _(' ')) + _(".*") + _("     ") + String::Format(_("%p"), t->on_any_star));
+    if (t->on_any_star != t) dump(i + 2, t->on_any_star);
+  }
+}
+#endif
+
 // ----------------------------------------------------------------------------- : KeywordDatabase : matching

 // transitive closure of a state, follow all on_any_star links
@@ -387,209 +403,222 @@ void closure(vector<const KeywordTrie*>& state) {
  }
 }

-#ifdef _DEBUG
-void dump(int i, const KeywordTrie* t) {
-  FOR_EACH(c, t->children) {
-    wxLogDebug(String(i,_(' ')) + c.first + _("     ") + String::Format(_("%p"),c.second.get()));
-    dump(i+2, c.second.get());
-  }
-  if (t->on_any_star) {
-    wxLogDebug(String(i,_(' ')) + _(".*") + _("     ") + String::Format(_("%p"),t->on_any_star));
-    if (t->on_any_star != t) dump(i+2, t->on_any_star);
-  }
-}
-#endif
-
-String KeywordDatabase::expand(const String& text,
-                               const ScriptValueP& match_condition,
-                               const ScriptValueP& expand_default,
-                               const ScriptValueP& combine_script,
-                               Context& ctx,
-                               KeywordUsageStatistics* stat) const {
-  assert(combine_script);
-  assert_tagged(text);
-  
-  // Clean up usage statistics
-  Value* stat_key = value_being_updated();
-  if (stat && stat_key) {
-    for (size_t i = stat->size() - 1 ; i + 1 > 0 ; --i) { // loop backwards
-      if ((*stat)[i].first == stat_key) {
-        stat->erase(stat->begin() + i);
-      }
+void step_state(vector<const KeywordTrie*>& state, wxUniChar c) {
+  vector<const KeywordTrie*> next;
+  for(auto kt : state) {
+    auto it = kt->children.find(c);
+    if (it != kt->children.end()) {
+      next.push_back(it->second.get());
+    }
+    // TODO: on any star first or last?
+    if (kt->on_any_star) {
+      next.push_back(kt->on_any_star);
    }
  }
-  
-  // Remove all old reminder texts
-  String tagged = remove_tag_contents(text, _("<atom-reminder"));
-  tagged = remove_tag_contents(tagged, _("<atom-keyword")); // OLD, TODO: REMOVEME
-  tagged = remove_tag_contents(tagged, _("<atom-kwpph>"));
-  tagged = remove_tag(tagged, _("<keyword-param"));
-  tagged = remove_tag(tagged, _("<param-"));
-  String untagged = untag_no_escape(tagged);
-  
-  if (!root) return tagged;
-  
-  String result;
-  
-  // Find keywords
-  while (!tagged.empty()) {
-    vector<const KeywordTrie*> current; // current location(s) in the trie
-    vector<const KeywordTrie*> next;    // location(s) after this step
-    set<const Keyword*> used; // keywords already investigated
-    current.push_back(root.get());
-    closure(current);
-    // is the keyword expanded? From <kw-?> tag
-    // Possible values are:
-    //  - '0' = reminder text explicitly hidden
-    //  - '1' = reminder text explicitly shown
-    //  - 'a' = reminder text in default state, hidden
-    //  - 'A' = reminder text in default state, shown
-    const char default_expand_type = 'a';
-    char expand_type = default_expand_type;
-    
-    for (size_t i = 0 ; i < tagged.size() ;) {
-      wxUniChar c = tagged.GetChar(i);
-      // tag?
-      if (c == _('<')) {
-        if (is_substr(tagged, i, _("<kw-")) && i + 4 < tagged.size()) {
-          expand_type = tagged.GetChar(i + 4); // <kw-?>
-          tagged = tagged.erase(i, skip_tag(tagged,i)-i); // remove the tag from the string
-        } else if (is_substr(tagged, i, _("</kw-"))) {
-          expand_type = default_expand_type;
-          tagged = tagged.erase(i, skip_tag(tagged,i)-i); // remove the tag from the string
-        } else if (is_substr(tagged, i, _("<atom"))) {
-          i = match_close_tag_end(tagged, i); // skip <atom>s
-        } else {
-          i = skip_tag(tagged, i);
-        }
-        continue;
-      } else {
-        #if USE_CASE_INSENSITIVE_KEYWORDS
-          c = toLower(c); // case insensitive matching
-        #endif
-        ++i;
-      }
-      // find 'next' trie node set matching c
-      FOR_EACH(kt, current) {
-        auto it = kt->children.find(c);
-        if (it != kt->children.end()) {
-          next.push_back(it->second.get());
-        }
-        // TODO: on any star first or last?
-        if (kt->on_any_star) {
-          next.push_back(kt->on_any_star);
-        }
-      }
-      // next becomes current
-      swap(current, next);
-      // in the MSVC stl clear frees memory, that is a waste, because we need it again in the next iteration
-      //next.clear();
-      next.resize(0);
-      closure(current);
-      // are we done?
-      for (int set_or_game = 0 ; set_or_game <= 1 ; ++set_or_game) {
-        FOR_EACH(n, current) {
-          FOR_EACH(kw, n->finished) {
-            if (kw->fixed != (bool)set_or_game) {
-              continue; // first try set keywords, try game keywords in the second round
-            }
-            if (!used.insert(kw).second) {
-              continue; // already seen this keyword
-            }
-            // we have found a possible match, for a keyword which we have not seen before
-            if (tryExpand(*kw, i, tagged, untagged, result, expand_type,
-                          match_condition, expand_default, combine_script, ctx,
-                          stat, stat_key))
-            {
-              // it matches
-              goto matched_keyword;
-            }
-          }
-        }
-      }
-    }
-    // Remainder of the string
-    result += tagged;
-    tagged.clear();
-    
-    matched_keyword:;
-  }
-  
-  assert_tagged(result);
-  return result;
+  swap(state,next);
 }

-bool KeywordDatabase::tryExpand(const Keyword& kw,
-                                size_t expand_type_known_upto,
-                                String& tagged,
-                                String& untagged,
-                                String& result,
-                                char expand_type,
-                                const ScriptValueP& match_condition,
-                                const ScriptValueP& expand_default,
-                                const ScriptValueP& combine_script,
-                                Context& ctx,
-                                KeywordUsageStatistics* stat,
-                                Value* stat_key) const
-{
-  // try to match regex against the *untagged* string
-  assert(!kw.match_re.empty());
-  Regex::Results match;
-  if (!kw.match_re.matches(match, untagged)) return false;
-  
-  // Find match position
-  size_t start_u = match.position();
-  size_t len_u   = match.length();
-  size_t start = untagged_to_index(tagged, start_u, true),
-         end   = untagged_to_index(tagged, start_u + len_u, false);
-  if (start == end) return false; // don't match empty keywords
-  
-  // a part of tagged has not been searched for <kw- tags
-  // this can happen when the trie incorrectly matches too early
-  for (size_t j = expand_type_known_upto+1 ; j < start ;) {
-    Char c = tagged.GetChar(j);
-    if (c == _('<')) {
-      if (is_substr(tagged, j, _("<kw-")) && j + 4 < tagged.size()) {
-        expand_type = tagged.GetChar(j + 4); // <kw-?>
-      } else if (is_substr(tagged, j, _("</kw-"))) {
-        expand_type = 'a';
-      }
-      j = skip_tag(tagged, j);
+// Collect possible matching keywords
+/* First step in matching is to run over the string, and use the trie to find keywords that *potentially* appear in it.
+ */
+unordered_set<Keyword const*> possible_matches(String const& tagged_str, KeywordTrie const* trie_root) {
+  unordered_set<const Keyword*> possible_matches;
+  if (!trie_root) return possible_matches;
+
+  vector<const KeywordTrie*> state;
+  state.push_back(trie_root);
+
+  for (String::const_iterator it = tagged_str.begin(); it != tagged_str.end();) {
+    wxUniChar c = *it;
+    // tag?
+    if (c == '<') {
+      it = skip_tag(it, tagged_str.end());
    } else {
-      ++j;
+      ++it;
+      c = toLower(c); // case insensitive matching
+      // find 'next' trie node set matching c
+      step_state(state, c);
+      closure(state);
+      // matches
+      for (auto kt : state) {
+        for (auto kw : kt->finished) {
+          possible_matches.insert(kw);
+        }
+      }
    }
  }
-  
-  // To determine if the case matches exactly we compare plain text parts with the original match string
-  size_t pos_in_match_string = 0;
-  bool correct_case = true;
-  // also check if there are missing parameters
+  return possible_matches;
+}
+
+struct KeywordMatch {
+  Keyword const* keyword;
+  // match in the untagged string
+  Regex::Results match;
+  KeywordMatch(Keyword const& keyword, Regex::Results match) : keyword(&keyword), match(match) {}
+};
+
+// Collect exact matching keywords
+/* Second step in matching is to match regexes
+ */
+void keyword_matches(const String& untagged_str, const Keyword& keyword, vector<KeywordMatch>& out) {
+  Regex::Results match;
+  size_t i = 0;
+  String::const_iterator it = untagged_str.begin();
+  while (keyword.match_re.matches(match, it, untagged_str.end())) {
+    out.emplace_back(keyword, match);
+    it = max(it+1, match[0].end());
+  }
+}
+void keyword_matches(const String& untagged_str, unordered_set<Keyword const*> keywords, vector<KeywordMatch>& out) {
+  for (auto keyword : keywords) {
+    keyword_matches(untagged_str, *keyword, out);
+  }
+}
+void sort_keyword_matches(vector<KeywordMatch>& matches) {
+  // sort matches by their start position
+  sort(matches.begin(), matches.end(), [](KeywordMatch const& a, KeywordMatch const& b) {
+    if (a.match[0].begin() < b.match[0].begin()) return true;
+    if (a.match[0].begin() > b.match[0].begin()) return false;
+    // otherwise sort by matching set keywords (non-fixed) first
+    if (a.keyword->fixed < b.keyword->fixed) return true;
+    if (a.keyword->fixed > b.keyword->fixed) return false;
+    // otherwise sort by name
+    return a.keyword->keyword < b.keyword->keyword;
+  });
+}
+vector<KeywordMatch> keyword_matches(const String& untagged_str, unordered_set<Keyword const*> keywords) {
+  vector<KeywordMatch> out;
+  keyword_matches(untagged_str, keywords, out);
+  sort_keyword_matches(out);
+  return out;
+}
+
+
+
+tuple<bool,String::const_iterator> expand_keyword(String::const_iterator it, String::const_iterator end, KeywordMatch const& match, char expand_type, String& out, KeywordExpandOptions const& options);
+
+/* Last step in matching is to go over the string, and expand each of the matches, as long as they don't overlap
+ * Note that matches are already sorted, so we can try them in order.
+ * But as a complication, positions and lengths in matches refer to the untagged string.
+ */
+String expand_keywords(const String& tagged_str, vector<KeywordMatch> const& matches, KeywordExpandOptions const& options) {
+  vector<KeywordMatch>::const_iterator match_it = matches.begin();
+  size_t untagged_pos = 0;
+
+  // tags to skip
+  int atom = 0;
+  // Possible values are:
+  //  - '0' = reminder text explicitly hidden
+  //  - '1' = reminder text explicitly shown
+  //  - 'a' = reminder text in default state, hidden
+  //  - 'A' = reminder text in default state, shown
+  const char default_expand_type = 'a';
+  char expand_type = default_expand_type;
+
+  String out;
+  String::const_iterator it = tagged_str.begin();
+  const String::const_iterator end = tagged_str.end();
+
+  // in the loop below, skip past tags
+  auto skip_tags_for_keyword = [&](bool open, bool close) {
+    while (it != end && *it == '<') {
+      if (is_substr(it, end, "<kw-")) {
+        if (it + 4 != end) expand_type = *(it + 4); // <kw-?>
+        it = skip_tag(it, end);
+      } else if (is_substr(it, end, "</kw-")) {
+        expand_type = default_expand_type;
+        it = skip_tag(it, end);
+      } else {
+        bool is_close = (it+1) != end && *(it+1) == '/';
+        if (is_close && !close || !is_close && !open) return;
+        if (is_substr(it, end, "<atom")) {
+          atom++;
+        } else if (is_substr(it, end, "</atom")) {
+          atom++;
+        }
+        // keep tag in output
+        auto after = skip_tag(it, end);
+        out.append(it, after);
+        it = after;
+      }
+    }
+  };
+
+  while (true) {
+    // prefer to match 'outside' tags, so before open tags and after close tags
+    // that way we avoid breaking up atoms
+    // so here match only close tags
+    skip_tags_for_keyword(false, true);
+    if (it == end) break;
+    // is there a match here?
+    while (match_it != matches.end() && (size_t)match_it->match.position() <= untagged_pos) {
+      if ((size_t)match_it->match.position() > untagged_pos) {
+        ++match_it;
+        continue;
+      }
+      // try to expand
+      auto [match,new_it] = expand_keyword(it, end, *match_it, expand_type, out, options);
+      if (match) {
+        untagged_pos += untagged_length(it,new_it);
+        it = new_it;
+        ++match_it;
+        goto after_match;
+      } else {
+        ++match_it;
+      }
+    }
+    // No match, so there is at least one character not part of a keyword
+    // and possibly some tags before it that we missed
+    skip_tags_for_keyword(true, true);
+    out += *it;
+    ++it;
+    ++untagged_pos;
+    // after matching or skipping, go past close tags, to remain as much outside tags as possible
+    after_match:
+    skip_tags_for_keyword(true, false);
+  }
+  return out;
+}
+
+// Get detailed information on a keyword match:
+//  * The value of each of the parameters
+//  * Whether the case matches
+// Add these things to the context
+// Return iterator after the whole match
+String::const_iterator keyword_match_detail(String::const_iterator it, String::const_iterator end, KeywordMatch const& kw_match, Context& ctx) {
+  Keyword const& keyword = *kw_match.keyword;
+  Regex::Results const& match = kw_match.match;
+
+  // used placeholders?
  bool used_placeholders = false;
-  
-  
+  // case errors? For finding these we will loop over the keyword.match string
+  bool correct_case = true;
+  String::const_iterator match_str_it = keyword.match.begin();
+
+  // Combined tagged match string
+  String total;
+
  // Split the keyword, set parameters in context
  // The even captures are parameter values, the odd ones are the plain text in between
-  String total; // the total keyword
-  assert(match.size() - 1 == 1 + 2 * kw.parameters.size());
-  size_t part_start = start;
-  for (size_t submatch = 1 ; submatch < match.size() ; ++submatch) {
-    // the matched part
-    size_t part_start_u = match.position(submatch);
-    size_t part_len_u   = match.length((int)submatch);
-    size_t part_end_u   = part_start_u + part_len_u;
-    // note: start_u can be (uint)-1 when part_len_u == 0
-    size_t part_end = part_len_u > 0 ? untagged_to_index(tagged, part_end_u, false) : part_start;
-    String part(tagged, part_start, part_end - part_start);
+  // submatch 0 is the whole match
+  assert(match.size() - 1 == 1 + 2 * keyword.parameters.size());
+  for (int sub = 1; sub < match.size(); ++sub) {
+    // The matched part, indices in untagged string. We only need the length
+    size_t part_len_untagged = match.length(sub);
+    // Translate back to tagged position
+    // Note: when part_len_untagged==0, the positions are invalid
+    String::const_iterator part_end = advance_untagged(it, end, part_len_untagged, false,true);
+    String part(it,part_end);
    // strip left over </kw tags
-    part = remove_tag(part,_("</kw-"));
-    
+    part = remove_tag(part, _("</kw-"));
+
    // we start counting at 1, so
-    // submatch = 1 mod 2 -> text
-    // submatch = 0 mod 2 -> parameter
-    if ((submatch % 2) == 0) {
+    // sub = 1 mod 2 -> text
+    // sub = 0 mod 2 -> parameter
+    if ((sub % 2) == 0) {
      // parameter
-      KeywordParam& kwp = *kw.parameters[(submatch - 2) / 2];
-      String param = match.str((int)submatch); // untagged version
+      KeywordParam& kwp = *keyword.parameters[(sub - 2) / 2];
+      String param = match.str(sub); // untagged version
      // strip separator_before
      String separator_before, separator_after;
      Regex::Results sep_match;
@@ -623,96 +652,157 @@ bool KeywordDatabase::tryExpand(const Keyword& kw,
        }
      }
      // to script
-      KeywordParamValueP script_param(new KeywordParamValue(kwp.name, separator_before, separator_after, param));
-      KeywordParamValueP script_part (new KeywordParamValue(kwp.name, separator_before, separator_after, part));
+      KeywordParamValueP script_param = make_intrusive<KeywordParamValue>(kwp.name, separator_before, separator_after, param);
+      KeywordParamValueP script_part  = make_intrusive<KeywordParamValue>(kwp.name, separator_before, separator_after, part);
      // process param
      if (param.empty()) {
        // placeholder
        used_placeholders = true;
        script_param->value = _("<atom-kwpph>") + (kwp.placeholder.empty() ? kwp.name : kwp.placeholder) + _("</atom-kwpph>");
-        script_part->value  = part + script_param->value; // keep tags
+        script_part->value = part + script_param->value; // keep tags
      } else {
        // apply parameter script
        if (kwp.script) {
          ctx.setVariable(_("input"), script_part);
-          script_part->value  = kwp.script.invoke(ctx)->toString();
+          script_part->value = kwp.script.invoke(ctx)->toString();
        }
        if (kwp.reminder_script) {
          ctx.setVariable(_("input"), script_param);
          script_param->value = kwp.reminder_script.invoke(ctx)->toString();
        }
      }
-      part  = separator_before + script_part->toString() + separator_after;
-      ctx.setVariable(String(_("param")) << (int)(submatch/2), script_param);
-      
-    } else if (correct_case) {
+      part = separator_before + script_part->toString() + separator_after;
+      ctx.setVariable(String(_("param")) << (int)(sub / 2), script_param);
+
+    } else {
      // Plain text, check if the case matches
-      for (size_t i = part_start_u ; i < part_start_u + part_len_u ; ++i, ++pos_in_match_string) {
-        if (pos_in_match_string > kw.match.size()) {
-          // outside match string, shouldn't happen, strings should be the same length
-          correct_case = false;
-          break;
+      if (correct_case) {
+        while (it != part_end) {
+          it = skip_all_tags(it, part_end);
+          if (it == part_end) break;
+          while (match_str_it != keyword.match.end() && is_substr(match_str_it, keyword.match.end(), "<param")) {
+            match_str_it = skip_tag(match_str_it, keyword.match.end());
+            while (match_str_it != keyword.match.end() && !is_substr(match_str_it, keyword.match.end(), "</param")) ++match_str_it;
+            match_str_it = skip_tag(match_str_it, keyword.match.end());
+          }
+          if (match_str_it == keyword.match.end()) break;
+          // does the text match the keyword match string exactly?
+          if (*it != *match_str_it) {
+            correct_case = false;
+            break;
+          }
+          ++it;
+          ++match_str_it;
        }
-        Char actual_char = untagged.GetChar(i);
-        Char match_char  = kw.match.GetChar(pos_in_match_string);
-        if (actual_char != match_char) {
-          correct_case = false;
-          break;
-        }
-      }
-      // we should have arrived at a param tag, skip it
-      if (pos_in_match_string < kw.match.size() && is_substr(kw.match, pos_in_match_string, _("<atom-param"))) {
-        pos_in_match_string = match_close_tag_end(kw.match, pos_in_match_string);
      }
    }
-    
+    // build total match
    total += part;
-    part_start = part_end;
+    // next part starts after this
+    it = part_end;
  }
-  ctx.setVariable(_("mode"), to_script(kw.mode));
+  assert_tagged(total);
+  ctx.setVariable(_("keyword"), to_script(total));
+  ctx.setVariable(_("mode"), to_script(keyword.mode));
  ctx.setVariable(_("correct_case"), to_script(correct_case));
  ctx.setVariable(_("used_placeholders"), to_script(used_placeholders));
-  
+  return it;
+};
+
+// expand a keyword that matches at it
+tuple<bool, String::const_iterator> expand_keyword(String::const_iterator it, String::const_iterator end, KeywordMatch const& kw_match, char expand_type, String& out, KeywordExpandOptions const& options) {
+  Keyword const& keyword = *kw_match.keyword;
+
+  // Perform script stuff in a local scope to not leave a mess
+  Context& ctx = options.ctx;
+  LocalScope scope(ctx);
+
+  // Get details of the match
+  String::const_iterator after = keyword_match_detail(it, end, kw_match, ctx);
+
  // Final check whether the keyword matches
-  if (match_condition && match_condition->eval(ctx)->toBool() == false) {
-    return false;
+  if (options.match_condition && options.match_condition->eval(ctx)->toBool() == false) {
+    return {false,it};
  }
-  
+
  // Show reminder text?
  bool expand = expand_type == _('1');
  if (!expand && expand_type != _('0')) {
    // default expand, determined by script
-    expand = expand_default ? expand_default->eval(ctx)->toBool() : true;
+    expand = options.expand_default ? options.expand_default->eval(ctx)->toBool() : true;
    expand_type = expand ? _('A') : _('a');
  }
-  
-  // Copy text before keyword
-  result += remove_tag(tagged.substr(0, start), _("<kw-"));
-  
-  // Combine keyword & reminder with result
+  ctx.setVariable(_("expand"), to_script(expand));
+
+  // Reminder text
  String reminder;
  try {
-    reminder = kw.reminder.invoke(ctx)->toString();
+    reminder = keyword.reminder.invoke(ctx)->toString();
  } catch (const Error& e) {
-    handle_error(_ERROR_2_("in keyword reminder", e.what(), kw.keyword));
+    handle_error(_ERROR_2_("in keyword reminder", e.what(), keyword.keyword));
  }
-  ctx.setVariable(_("keyword"),  to_script(total));
  ctx.setVariable(_("reminder"), to_script(reminder));
-  ctx.setVariable(_("expand"),   to_script(expand));
-  result +=  _("<kw-"); result += expand_type; result += _(">");
-  result += combine_script->eval(ctx)->toString();
-  result += _("</kw-"); result += expand_type; result += _(">");
-  
+
+  // Combine, add to output
+  out += _("<kw-");
+  out += expand_type;
+  out += _(">");
+  out += options.combine_script->eval(ctx)->toString();
+  out += _("</kw-");
+  out += expand_type;
+  out += _(">");
+
  // Add to usage statistics
-  if (stat && stat_key) {
-    stat->push_back(make_pair(stat_key, &kw));
+  if (options.stat && options.stat_key) {
+    options.stat->emplace_back(options.stat_key, &keyword);
  }
+
+  return {true,after};
+}
+
+String remove_keyword_tags(String const& tagged_str) {
+  // Remove all old reminder texts
+  String s = remove_tag_contents(tagged_str, _("<atom-reminder"));
+  s = remove_tag_contents(s, _("<atom-keyword")); // OLD, TODO: REMOVEME
+  s = remove_tag_contents(s, _("<atom-kwpph>"));
+  s = remove_tag(s, _("<keyword-param"));
+  s = remove_tag(s, _("<param-"));
+  return s;
+}
+
+void remove_from_stats(KeywordUsageStatistics* stat, const Value* stat_key) {
+  if (stat && stat_key) {
+    auto condition = [stat_key](KeywordUsageStatistics::value_type const& it) {
+      return it.first == stat_key;
+    };
+    stat->erase(std::remove_if(stat->begin(), stat->end(), condition), stat->end());
+  }
+}
+
+String KeywordDatabase::expand(const String& text, KeywordExpandOptions const& options) const {
+  assert(options.combine_script);
+  assert_tagged(text);
+
+  // Clean up usage statistics
+  remove_from_stats(options.stat, options.stat_key);
  
-  // After keyword
-  tagged   = tagged.substr(end);
-  untagged = untagged.substr(start_u + len_u);
+  // Remove all old reminder texts
+  String tagged = remove_keyword_tags(text);
+
+  // any keywords in database?
+  if (!root) return tagged;
+
+  // Find potential matches
+  auto possible_matches = ::possible_matches(tagged, root.get());
+
+  // Refine
+  String untagged = untag_no_escape(tagged);
+  auto matches = keyword_matches(untagged, possible_matches);
  
-  return true;
+  // Expand
+  String result = expand_keywords(tagged, matches, options);
+  assert_tagged(result);
+  return result;
 }

 // ----------------------------------------------------------------------------- : KeywordParamValue
@@ -132,7 +132,16 @@ inline String type_name(const vector<KeywordP>&) {
 // ----------------------------------------------------------------------------- : Using keywords

 /// Store keyword usage statistics here, using value_being_updated as the key
-typedef vector<pair<Value*, const Keyword*>> KeywordUsageStatistics;
+typedef vector<pair<const Value*, const Keyword*>> KeywordUsageStatistics;
+
+struct KeywordExpandOptions {
+  ScriptValueP match_condition;
+  ScriptValueP expand_default;
+  ScriptValueP combine_script;
+  Context& ctx;
+  KeywordUsageStatistics* stat;
+  const Value* stat_key;
+};

 /// A database of keywords to allow for fast matching
 /** NOTE: keywords may not be altered after they are added to the database,
@@ -157,13 +166,13 @@ public:
  inline bool empty() const { return !root; }
  
  /// Expand/update all keywords in the given string.
-  /** @param expand_default script function indicating whether reminder text should be shown by default
-   *  @param combine_script script function to combine keyword and reminder text in some way
-   *  @param case_sensitive case sensitive matching of keywords?
-   *  @param ctx            context for evaluation of scripts
-   *  @param stats          where to put keyword statistics
+  /** @param options.match_condition script function giving the final check on whether a candidate keyword matches
+   *  @param options.expand_default  script function indicating whether reminder text should be shown by default
+   *  @param options.combine_script  script function to combine keyword and reminder text in some way
+   *  @param options.ctx             context for evaluation of scripts
+   *  @param options.stat            where to put keyword statistics (recorded under options.stat_key), may be null
   */
-  String expand(const String& text, const ScriptValueP& match_condition, const ScriptValueP& expand_default, const ScriptValueP& combine_script, Context& ctx, KeywordUsageStatistics* stats = nullptr) const;
+  String expand(const String& text, const KeywordExpandOptions&) const;
  
 private:
  unique_ptr<KeywordTrie> root; ///< Data structure for finding keywords
@@ -403,7 +403,7 @@ SCRIPT_FUNCTION(sort_text) {

 /// Replace the contents of a specific tag with the value of a script function
 String replace_tag_contents(String input, const String& tag, const ScriptValueP& contents, Context& ctx) {
-  assert_tagged(input, false);
+  assert_tagged(input);
  String ret;
  size_t start = 0, pos = input.find(tag);
  while (pos != String::npos) {
@@ -674,7 +674,8 @@ SCRIPT_FUNCTION_WITH_DEP(expand_keywords) {
  SCRIPT_OPTIONAL_PARAM_C_(CardP, card);
  try {
    KeywordUsageStatistics* stat = card ? &card->keyword_usage : nullptr;
-    SCRIPT_RETURN(db.expand(input, match_condition, default_expand, combine, ctx, stat));
+    Value* stat_key = value_being_updated();
+    SCRIPT_RETURN(db.expand(input, KeywordExpandOptions{match_condition, default_expand, combine, ctx, stat, stat_key}));
  } catch (const Error& e) {
    throw ScriptError(_ERROR_2_("in function", e.what(), _("expand_keywords")));
  }
@@ -220,3 +220,39 @@ String regex_escape(const String& s);
 /** Basically replaces "(" with "(?:" */
 String make_non_capturing(const String& re);

+// ----------------------------------------------------------------------------- : Iterator utilities
+
+struct end_sentinel_t {} end_sentinel;
+
+// Iterate over a string, removing all matching substrings.
+// match(it, end) should either return false, or return true and advance it past the substring
+template <typename It, typename End, typename Match>
+struct SkipSubstringIterator {
+public:
+  SkipSubstringIterator(It it, End end, Match const& match) : it(it), end(end), match(match) {
+    while (match(it, end));
+  }
+  bool operator == (end_sentinel_t) const {
+    return it == end;
+  }
+  bool operator != (end_sentinel_t) const {
+    return it != end;
+  }
+  auto operator * () const {
+    return *it;
+  }
+  auto& operator ++ () {
+    ++it;
+    while (match(it, end));
+    return *this;
+  }
+private:
+  It it;
+  End end;
+  Match match;
+};
+
+template <typename It, typename End, typename Match>
+inline SkipSubstringIterator<It,End,Match> skip_substring_iterator(It it, End end, Match const& match) {
+  return SkipSubstringIterator<It,End,Match>(it, end, match);
+}
@@ -106,9 +106,105 @@ String fix_old_tags(const String& str) {
  return ret;
 }

+// ----------------------------------------------------------------------------- : Iterator algorithms
+
+[[nodiscard]] String::const_iterator skip_tag(String::const_iterator it, String::const_iterator end) {
+  assert(it != end && *it == '<');
+  ++it;
+  while (it != end && *it != '>') ++it;
+  if (it != end) ++it;
+  return it;
+}
+
+[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end) {
+  while (it != end && *it == '<') {
+    it = skip_tag(it, end);
+  }
+  return it;
+}
+
+[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end, bool skip_open, bool skip_close) {
+  // skip consecutive tags, stopping at the first tag of a kind we were not asked to skip
+  while (it != end && *it == '<') {
+    if (it + 1 != end && *(it + 1) == '/') {
+      if (skip_close) {
+        it = skip_tag(it, end);
+      } else {
+        return it;
+      }
+    } else {
+      if (skip_open) {
+        it = skip_tag(it, end);
+      } else {
+        return it;
+      }
+    }
+  }
+  return it;
+}
+
+[[nodiscard]] String::const_iterator advance_untagged(String::const_iterator it, String::const_iterator end, size_t n, bool after_open, bool after_close) {
+  while (n > 0) {
+    it = skip_all_tags(it, end);
+    if (it != end) {
+      ++it;
+      --n;
+    } else {
+      return it;
+    }
+  }
+  return skip_all_tags(it, end, after_open, after_close);
+}
+
+/*
+// Does the string [it..end) contain the matching close tag for [tag..tag_end)?
+bool is_close_tag(String::const_iterator it, String::const_iterator end, String::const_iterator tag, String::const_iterator tag_end) {
+  if (it == end) return false;
+  if (*it != '<') return false;
+  ++it;
+  if (it == end) return false;
+  if (*it != '/') return false;
+  assert(tag != tag_end && *tag == '<');
+  ++tag;
+  return is_substr(it,end, tag,end);
+}
+
+String::const_iterator find_close_tag(String::const_iterator tag, String::const_iterator end) {
+  assert(tag != end && *tag == '<');
+  auto tag_end = skip_tag(tag,end);
+  int nesting = 1;
+  String::const_iterator it = tag_end;
+  while (it != end) {
+    if (*it == '<') {
+      if (is_substr(it,end, tag,tag_end)) {
+        ++nesting;
+      } else if (is_close_tag(it,end, tag,tag_end)) {
+        --nesting;
+        if (nesting == 0) return it;
+      }
+      it = skip_tag(it,end);
+    } else {
+      ++it;
+    }
+  }
+  return end;
+}*/
+
+[[nodiscard]] size_t untagged_length(String::const_iterator it, String::const_iterator end) {
+  size_t n = 0;
+  while (it != end) {
+    it = skip_all_tags(it, end);
+    if (it != end) {
+      ++n;
+      ++it;
+    }
+  }
+  return n;
+}
+
 // ----------------------------------------------------------------------------- : Finding tags

-size_t tag_start(const String& str, size_t pos) {
+[[nodiscard]] size_t tag_start(const String& str, size_t pos) {
  size_t start = str.find_last_of(_('<'), pos);
  if (start == String::npos) return String::npos;
  size_t end   = skip_tag(str, start);
@@ -116,13 +212,13 @@ size_t tag_start(const String& str, size_t pos) {
  return start;
 }

-size_t skip_tag(const String& str, size_t start) {
+[[nodiscard]] size_t skip_tag(const String& str, size_t start) {
  if (start >= str.size()) return String::npos;
  size_t end = str.find_first_of(_('>'), start);
  return end == String::npos ? String::npos : end + 1;
 }

-size_t match_close_tag(const String& str, size_t start) {
+[[nodiscard]] size_t match_close_tag(const String& str, size_t start) {
  String tag  = tag_type_at(str, start);
  String ctag = _("/") + tag;
  size_t size = str.size();
@@ -143,11 +239,11 @@ size_t match_close_tag(const String& str, size_t start) {
  return String::npos;
 }

-size_t match_close_tag_end(const String& str, size_t start) {
+[[nodiscard]] size_t match_close_tag_end(const String& str, size_t start) {
  return skip_tag(str, match_close_tag(str, start));
 }

-size_t last_start_tag_before(const String& str, const String& tag, size_t start) {
+[[nodiscard]] size_t last_start_tag_before(const String& str, const String& tag, size_t start) {
  start = min(str.size(), start);
  for (size_t pos = start ; pos > 0 ; --pos) {
    if (is_substr(str, pos - 1, tag)) {
@@ -157,7 +253,7 @@ size_t last_start_tag_before(const String& str, const String& tag, size_t start)
  return String::npos;
 }

-size_t in_tag(const String& str, const String& tag, size_t start, size_t end) {
+[[nodiscard]] size_t in_tag(const String& str, const String& tag, size_t start, size_t end) {
  size_t last_start = String::npos;
  size_t size = str.size();
  int taglevel = 0;
@@ -604,17 +700,19 @@ String simplify_tagged_overlap(const String& str) {

 // ----------------------------------------------------------------------------- : Verification

-void check_tagged(const String& str, bool check_balance) {
+bool check_tagged(const String& str, bool check_balance) {
  for (size_t i = 0 ; i < str.size() ; ) {
    if (str.GetChar(i) == _('<')) {
      size_t end = skip_tag(str,i);
      if (end == String::npos) {
        queue_message(MESSAGE_WARNING, _("Invalid tagged string: missing '>'"));
+        return false;
      }
      for (size_t j = i + 1 ; j + 1 < end ; ++j) {
        Char c = str.GetChar(j);
        if (c == ESCAPED_LANGLE || c == _('<')) {
          queue_message(MESSAGE_WARNING, _("Invalid character in tag"));
+          return false;
        }
      }
      if (check_balance) {
@@ -626,6 +724,7 @@ void check_tagged(const String& str, bool check_balance) {
          size_t close = match_close_tag(str,i);
          if (close == String::npos) {
            queue_message(MESSAGE_WARNING, _("Invalid tagged string: missing close tag for <") + tag_at(str,i) + _(">"));
+            return false;
          }
        }
      }
@@ -634,6 +733,7 @@ void check_tagged(const String& str, bool check_balance) {
      ++i;
    }
  }
+  return true;
 }

 // ----------------------------------------------------------------------------- : Other utilities
@@ -53,24 +53,24 @@ String fix_old_tags(const String&);
 *   < t a g >
 *  n y y y y n
 */
-size_t tag_start(const String& str, size_t pos);
+[[nodiscard]] size_t tag_start(const String& str, size_t pos);

 /// Returns the position just beyond the tag starting at start
-size_t skip_tag(const String& str, size_t start);
+[[nodiscard]] size_t skip_tag(const String& str, size_t start);

 /// Find the position of the closing tag matching the tag at start
 /** If not found returns String::npos */
-size_t match_close_tag(const String& str, size_t start);
+[[nodiscard]] size_t match_close_tag(const String& str, size_t start);

 /// Find the position of the closing tag matching the tag at start
 /** Returns the position just after that tag.
 *    match_close_tag_end(s,i) == skip_tag(s, match_close_tag(s,i) )
 *  If not found returns String::npos */
-size_t match_close_tag_end(const String& str, size_t start);
+[[nodiscard]] size_t match_close_tag_end(const String& str, size_t start);

 /// Find the last start tag before position start
 /** If not found returns String::npos */
-size_t last_start_tag_before(const String& str, const String& tag, size_t start);
+[[nodiscard]] size_t last_start_tag_before(const String& str, const String& tag, size_t start);

 /// Is the given range entirely contained in a given tagged block?
 /** If so: return the start position of that tag, otherwise returns String::npos
@@ -79,7 +79,7 @@ size_t last_start_tag_before(const String& str, const String& tag, size_t start)
 *          <tag><tag></tag>x</tag>
 *        the x is in_tag
 */
-size_t in_tag(const String& str, const String& tag, size_t start, size_t end);
+[[nodiscard]] size_t in_tag(const String& str, const String& tag, size_t start, size_t end);
 /// Boolean returning version of the above
 bool is_in_tag(const String& str, const String& tag, size_t start, size_t end);

@@ -96,6 +96,29 @@ String close_tag(const String& tag);
 /// The matching close tag for an open tag and vice versa
 String anti_tag(const String& tag);

+// ----------------------------------------------------------------------------- : Iterators in tagged strings
+
+/// Skip to the end of a tag; it must point to the start of a tag
+[[nodiscard]] String::const_iterator skip_tag(String::const_iterator it, String::const_iterator end);
+
+/// Skip past all tags
+[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end);
+
+/// Skip past all open/close tags; skip_open / skip_close presumably select which kinds are skipped (confirm against the definition)
+[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end, bool skip_open, bool skip_close);
+
+/// Advance an iterator by n positions, not counting tags; returns early if end is reached first
+/** For example: advance_untagged("<b>abc</b>",_,2) = "c</b>"; after_open/after_close are passed to skip_all_tags at the final position */
+[[nodiscard]] String::const_iterator advance_untagged(String::const_iterator it, String::const_iterator end, size_t n, bool after_open=false, bool after_close=false);
+
+/// Find the position of the closing tag matching the tag at it; if not found, returns end
+/** NOTE(review): the only definition visible in this change is commented out; confirm one exists before calling */
+[[nodiscard]] String::const_iterator find_close_tag(String::const_iterator it, String::const_iterator end);
+
+/// Length of a string when not counting tags
+/** For example: untagged_length("<b>abc</b>",_) = 3 */
+[[nodiscard]] size_t untagged_length(String::const_iterator it, String::const_iterator end);
+
 // ----------------------------------------------------------------------------- : Cursor position

 /// Directions of cursor movement
@@ -188,13 +211,14 @@ String tagged_substr_replace(const String& input, size_t start, size_t end, cons
 *   - There are no tags containing '<' or whitespace
 *   - For each open tag there is a matching close tag.
 *
- *  In case of an error, throws an exception.
+ *  In case of an error, shows a warning
+ *  Return true if the string is a valid tagged string
 */
-void check_tagged(const String& str, bool check_balance = true);
+bool check_tagged(const String& str, bool check_balance = true);
 #ifdef _DEBUG
-  #define assert_tagged check_tagged
+  #define assert_tagged(...) assert(check_tagged(__VA_ARGS__)) // variadic: the optional check_balance argument of check_tagged stays usable
 #else
-  inline void assert_tagged(const String& str, bool check_balance = true){}
+  #define assert_tagged(...) do{}while(0) // expands to a no-op statement; the arguments are not evaluated
 #endif

 /// Simplify a tagged string