From b3ddb295fc1c0ecb26eabfbb160ae1198e09c32b Mon Sep 17 00:00:00 2001 From: Twan van Laarhoven Date: Sun, 17 May 2020 20:37:53 +0200 Subject: [PATCH] Rewrite of keyword matching code. This fixes #20 Uses new iterator based tagged string functions. --- CHANGES.txt | 1 + src/data/card.hpp | 2 +- src/data/keyword.cpp | 570 +++++++++++++++++++-------------- src/data/keyword.hpp | 23 +- src/script/functions/basic.cpp | 5 +- src/util/string.hpp | 36 +++ src/util/tagged_string.cpp | 114 ++++++- src/util/tagged_string.hpp | 44 ++- 8 files changed, 528 insertions(+), 267 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 8bc5ad49..e0f522fd 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -14,6 +14,7 @@ Features: * Added a keyboard shortcut for the search box (Ctrl+K) Bug fixes: + * Keywords that appear multiple times don't mess up reminder text (#20) * card variable in console panel now refers to the selected card * length function now gives correct results for maps * substr("foo",begin:3) now returns "" instead of true diff --git a/src/data/card.hpp b/src/data/card.hpp index 37471725..7d6f0947 100644 --- a/src/data/card.hpp +++ b/src/data/card.hpp @@ -56,7 +56,7 @@ public: IndexMap& extraDataFor(const StyleSheet& stylesheet); /// Keyword usage statistics - vector> keyword_usage; + vector> keyword_usage; /// Get the identification of this card, an identification is something like a name, title, etc. /** May return "" */ diff --git a/src/data/keyword.cpp b/src/data/keyword.cpp index adcb7ebe..d44b4a30 100644 --- a/src/data/keyword.cpp +++ b/src/data/keyword.cpp @@ -10,6 +10,7 @@ #include #include #include +#include class KeywordTrie; DECLARE_POINTER_TYPE(KeywordParamValue); @@ -253,6 +254,8 @@ namespace std { } /// A node in a trie to match keywords +/* The trie is used to speed up matching, by quickly finding candidate keywords. 
+*/ class KeywordTrie { public: KeywordTrie(); @@ -376,6 +379,19 @@ void KeywordDatabase::prepare_parameters(const vector& ps, const } } +#ifdef _DEBUG +void dump(int i, const KeywordTrie* t) { + FOR_EACH(c, t->children) { + wxLogDebug(String(i, _(' ')) + c.first + _(" ") + String::Format(_("%p"), c.second.get())); + dump(i + 2, c.second.get()); + } + if (t->on_any_star) { + wxLogDebug(String(i, _(' ')) + _(".*") + _(" ") + String::Format(_("%p"), t->on_any_star)); + if (t->on_any_star != t) dump(i + 2, t->on_any_star); + } +} +#endif + // ----------------------------------------------------------------------------- : KeywordDatabase : matching // transitive closure of a state, follow all on_any_star links @@ -387,209 +403,222 @@ void closure(vector& state) { } } -#ifdef _DEBUG -void dump(int i, const KeywordTrie* t) { - FOR_EACH(c, t->children) { - wxLogDebug(String(i,_(' ')) + c.first + _(" ") + String::Format(_("%p"),c.second.get())); - dump(i+2, c.second.get()); - } - if (t->on_any_star) { - wxLogDebug(String(i,_(' ')) + _(".*") + _(" ") + String::Format(_("%p"),t->on_any_star)); - if (t->on_any_star != t) dump(i+2, t->on_any_star); - } -} -#endif - -String KeywordDatabase::expand(const String& text, - const ScriptValueP& match_condition, - const ScriptValueP& expand_default, - const ScriptValueP& combine_script, - Context& ctx, - KeywordUsageStatistics* stat) const { - assert(combine_script); - assert_tagged(text); - - // Clean up usage statistics - Value* stat_key = value_being_updated(); - if (stat && stat_key) { - for (size_t i = stat->size() - 1 ; i + 1 > 0 ; --i) { // loop backwards - if ((*stat)[i].first == stat_key) { - stat->erase(stat->begin() + i); - } +void step_state(vector& state, wxUniChar c) { + vector next; + for(auto kt : state) { + auto it = kt->children.find(c); + if (it != kt->children.end()) { + next.push_back(it->second.get()); + } + // TODO: on any star first or last? 
+ if (kt->on_any_star) { + next.push_back(kt->on_any_star); } } - - // Remove all old reminder texts - String tagged = remove_tag_contents(text, _("")); - tagged = remove_tag(tagged, _(" current; // current location(s) in the trie - vector next; // location(s) after this step - set used; // keywords already investigated - current.push_back(root.get()); - closure(current); - // is the keyword expanded? From tag - // Possible values are: - // - '0' = reminder text explicitly hidden - // - '1' = reminder text explicitly shown - // - 'a' = reminder text in default state, hidden - // - 'A' = reminder text in default state, shown - const char default_expand_type = 'a'; - char expand_type = default_expand_type; - - for (size_t i = 0 ; i < tagged.size() ;) { - wxUniChar c = tagged.GetChar(i); - // tag? - if (c == _('<')) { - if (is_substr(tagged, i, _(" - tagged = tagged.erase(i, skip_tag(tagged,i)-i); // remove the tag from the string - } else if (is_substr(tagged, i, _("s - } else { - i = skip_tag(tagged, i); - } - continue; - } else { - #if USE_CASE_INSENSITIVE_KEYWORDS - c = toLower(c); // case insensitive matching - #endif - ++i; - } - // find 'next' trie node set matching c - FOR_EACH(kt, current) { - auto it = kt->children.find(c); - if (it != kt->children.end()) { - next.push_back(it->second.get()); - } - // TODO: on any star first or last? - if (kt->on_any_star) { - next.push_back(kt->on_any_star); - } - } - // next becomes current - swap(current, next); - // in the MSVC stl clear frees memory, that is a waste, because we need it again in the next iteration - //next.clear(); - next.resize(0); - closure(current); - // are we done? 
- for (int set_or_game = 0 ; set_or_game <= 1 ; ++set_or_game) { - FOR_EACH(n, current) { - FOR_EACH(kw, n->finished) { - if (kw->fixed != (bool)set_or_game) { - continue; // first try set keywords, try game keywords in the second round - } - if (!used.insert(kw).second) { - continue; // already seen this keyword - } - // we have found a possible match, for a keyword which we have not seen before - if (tryExpand(*kw, i, tagged, untagged, result, expand_type, - match_condition, expand_default, combine_script, ctx, - stat, stat_key)) - { - // it matches - goto matched_keyword; - } - } - } - } - } - // Remainder of the string - result += tagged; - tagged.clear(); - - matched_keyword:; - } - - assert_tagged(result); - return result; + swap(state,next); } -bool KeywordDatabase::tryExpand(const Keyword& kw, - size_t expand_type_known_upto, - String& tagged, - String& untagged, - String& result, - char expand_type, - const ScriptValueP& match_condition, - const ScriptValueP& expand_default, - const ScriptValueP& combine_script, - Context& ctx, - KeywordUsageStatistics* stat, - Value* stat_key) const -{ - // try to match regex against the *untagged* string - assert(!kw.match_re.empty()); - Regex::Results match; - if (!kw.match_re.matches(match, untagged)) return false; - - // Find match position - size_t start_u = match.position(); - size_t len_u = match.length(); - size_t start = untagged_to_index(tagged, start_u, true), - end = untagged_to_index(tagged, start_u + len_u, false); - if (start == end) return false; // don't match empty keywords - - // a part of tagged has not been searched for - } else if (is_substr(tagged, j, _(" possible_matches(String const& tagged_str, KeywordTrie const* trie_root) { + unordered_set possible_matches; + if (!trie_root) return possible_matches; + + vector state; + state.push_back(trie_root); + + for (String::const_iterator it = tagged_str.begin(); it != tagged_str.end();) { + wxUniChar c = *it; + // tag? 
+ if (c == '<') { + it = skip_tag(it, tagged_str.end()); } else { - ++j; + ++it; + c = toLower(c); // case insensitive matching + // find 'next' trie node set matching c + step_state(state, c); + closure(state); + // matches + for (auto kt : state) { + for (auto kw : kt->finished) { + possible_matches.insert(kw); + } + } } } - - // To determine if the case matches exactly we compare plain text parts with the original match string - size_t pos_in_match_string = 0; - bool correct_case = true; - // also check if there are missing parameters + return possible_matches; +} + +struct KeywordMatch { + Keyword const* keyword; + // match in the untagged string + Regex::Results match; + KeywordMatch(Keyword const& keyword, Regex::Results match) : keyword(&keyword), match(match) {} +}; + +// Collect exact matching keywords +/* Second step in matching is to match regexes + */ +void keyword_matches(const String& untagged_str, const Keyword& keyword, vector& out) { + Regex::Results match; + size_t i = 0; + String::const_iterator it = untagged_str.begin(); + while (keyword.match_re.matches(match, it, untagged_str.end())) { + out.emplace_back(keyword, match); + it = max(it+1, match[0].end()); + } +} +void keyword_matches(const String& untagged_str, unordered_set keywords, vector& out) { + for (auto keyword : keywords) { + keyword_matches(untagged_str, *keyword, out); + } +} +void sort_keyword_matches(vector& matches) { + // sort matches by their start position + sort(matches.begin(), matches.end(), [](KeywordMatch const& a, KeywordMatch const& b) { + if (a.match[0].begin() < b.match[0].begin()) return true; + if (a.match[0].begin() > b.match[0].begin()) return false; + // otherwise sort by matching set keywords (non-fixed) first + if (a.keyword->fixed < b.keyword->fixed) return true; + if (a.keyword->fixed > b.keyword->fixed) return false; + // otherwise sort by name + return a.keyword->keyword < b.keyword->keyword; + }); +} +vector keyword_matches(const String& untagged_str, 
unordered_set keywords) { + vector out; + keyword_matches(untagged_str, keywords, out); + sort_keyword_matches(out); + return out; +} + + + +tuple expand_keyword(String::const_iterator it, String::const_iterator end, KeywordMatch const& match, char expand_type, String& out, KeywordExpandOptions const& options); + +/* Last step in matching is to go over the string, and expand each of the matches, as long as they don't overlap + * Note that matches are already sorted, so we can try them in order. + * But as a complication, positions and lengths in matches refer to the untagged string. + */ +String expand_keywords(const String& tagged_str, vector const& matches, KeywordExpandOptions const& options) { + vector::const_iterator match_it = matches.begin(); + size_t untagged_pos = 0; + + // tags to skip + int atom = 0; + // Possible values are: + // - '0' = reminder text explicitly hidden + // - '1' = reminder text explicitly shown + // - 'a' = reminder text in default state, hidden + // - 'A' = reminder text in default state, shown + const char default_expand_type = 'a'; + char expand_type = default_expand_type; + + String out; + String::const_iterator it = tagged_str.begin(); + const String::const_iterator end = tagged_str.end(); + + // in the loop below, skip past tags + auto skip_tags_for_keyword = [&](bool open, bool close) { + while (it != end && *it == '<') { + if (is_substr(it, end, " + it = skip_tag(it, end); + } else if (is_substr(it, end, "match.position() <= untagged_pos) { + if ((size_t)match_it->match.position() > untagged_pos) { + ++match_it; + continue; + } + // try to expand + auto [match,new_it] = expand_keyword(it, end, *match_it, expand_type, out, options); + if (match) { + untagged_pos += untagged_length(it,new_it); + it = new_it; + ++match_it; + goto after_match; + } else { + ++match_it; + } + } + // No match, so there is at least one character not part of a keyword + // and possibly some tags before it that we missed + skip_tags_for_keyword(true, 
true); + out += *it; + ++it; + ++untagged_pos; + // after matching or skipping, go past close tags, to remain as much outside tags as possible + after_match: + skip_tags_for_keyword(true, false); + } + return out; +} + +// Get detailed information on a keyword match: +// * The value of each of the parameters +// * Whether the case matches +// Add these things to the context +// Return iterator after the whole match +String::const_iterator keyword_match_detail(String::const_iterator it, String::const_iterator end, KeywordMatch const& kw_match, Context& ctx) { + Keyword const& keyword = *kw_match.keyword; + Regex::Results const& match = kw_match.match; + + // used placeholders? bool used_placeholders = false; - - + // case errors? For finding these we will loop over the keyword.match string + bool correct_case = true; + String::const_iterator match_str_it = keyword.match.begin(); + + // Combined tagged match string + String total; + // Split the keyword, set parameters in context // The even captures are parameter values, the odd ones are the plain text in between - String total; // the total keyword - assert(match.size() - 1 == 1 + 2 * kw.parameters.size()); - size_t part_start = start; - for (size_t submatch = 1 ; submatch < match.size() ; ++submatch) { - // the matched part - size_t part_start_u = match.position(submatch); - size_t part_len_u = match.length((int)submatch); - size_t part_end_u = part_start_u + part_len_u; - // note: start_u can be (uint)-1 when part_len_u == 0 - size_t part_end = part_len_u > 0 ? untagged_to_index(tagged, part_end_u, false) : part_start; - String part(tagged, part_start, part_end - part_start); + // submatch 0 is the whole match + assert(match.size() - 1 == 1 + 2 * keyword.parameters.size()); + for (int sub = 1; sub < match.size(); ++sub) { + // The matched part, indices in untagged string. 
We only need the length + size_t part_len_untagged = match.length(sub); + // Translate back to tagged position + // Note: when part_len_untagged==0, the positions are invalid + String::const_iterator part_end = advance_untagged(it, end, part_len_untagged, false,true); + String part(it,part_end); // strip left over text - // submatch = 0 mod 2 -> parameter - if ((submatch % 2) == 0) { + // sub = 1 mod 2 -> text + // sub = 0 mod 2 -> parameter + if ((sub % 2) == 0) { // parameter - KeywordParam& kwp = *kw.parameters[(submatch - 2) / 2]; - String param = match.str((int)submatch); // untagged version + KeywordParam& kwp = *keyword.parameters[(sub - 2) / 2]; + String param = match.str(sub); // untagged version // strip separator_before String separator_before, separator_after; Regex::Results sep_match; @@ -623,96 +652,157 @@ bool KeywordDatabase::tryExpand(const Keyword& kw, } } // to script - KeywordParamValueP script_param(new KeywordParamValue(kwp.name, separator_before, separator_after, param)); - KeywordParamValueP script_part (new KeywordParamValue(kwp.name, separator_before, separator_after, part)); + KeywordParamValueP script_param = make_intrusive(kwp.name, separator_before, separator_after, param); + KeywordParamValueP script_part = make_intrusive(kwp.name, separator_before, separator_after, part); // process param if (param.empty()) { // placeholder used_placeholders = true; script_param->value = _("") + (kwp.placeholder.empty() ? 
kwp.name : kwp.placeholder) + _(""); - script_part->value = part + script_param->value; // keep tags + script_part->value = part + script_param->value; // keep tags } else { // apply parameter script if (kwp.script) { ctx.setVariable(_("input"), script_part); - script_part->value = kwp.script.invoke(ctx)->toString(); + script_part->value = kwp.script.invoke(ctx)->toString(); } if (kwp.reminder_script) { ctx.setVariable(_("input"), script_param); script_param->value = kwp.reminder_script.invoke(ctx)->toString(); } } - part = separator_before + script_part->toString() + separator_after; - ctx.setVariable(String(_("param")) << (int)(submatch/2), script_param); - - } else if (correct_case) { + part = separator_before + script_part->toString() + separator_after; + ctx.setVariable(String(_("param")) << (int)(sub / 2), script_param); + + } else { // Plain text, check if the case matches - for (size_t i = part_start_u ; i < part_start_u + part_len_u ; ++i, ++pos_in_match_string) { - if (pos_in_match_string > kw.match.size()) { - // outside match string, shouldn't happen, strings should be the same length - correct_case = false; - break; + if (correct_case) { + while (it != part_end) { + it = skip_all_tags(it, part_end); + if (it == part_end) break; + while (match_str_it != keyword.match.end() && is_substr(match_str_it, keyword.match.end(), " expand_keyword(String::const_iterator it, String::const_iterator end, KeywordMatch const& kw_match, char expand_type, String& out, KeywordExpandOptions const& options) { + Keyword const& keyword = *kw_match.keyword; + + // Perform script stuff in a local scope to not leave a mess + Context& ctx = options.ctx; + LocalScope scope(ctx); + + // Get details of the match + String::const_iterator after = keyword_match_detail(it, end, kw_match, ctx); + // Final check whether the keyword matches - if (match_condition && match_condition->eval(ctx)->toBool() == false) { - return false; + if (options.match_condition && 
options.match_condition->eval(ctx)->toBool() == false) { + return {false,it}; } - + // Show reminder text? bool expand = expand_type == _('1'); if (!expand && expand_type != _('0')) { // default expand, determined by script - expand = expand_default ? expand_default->eval(ctx)->toBool() : true; + expand = options.expand_default ? options.expand_default->eval(ctx)->toBool() : true; expand_type = expand ? _('A') : _('a'); } - - // Copy text before keyword - result += remove_tag(tagged.substr(0, start), _("toString(); + reminder = keyword.reminder.invoke(ctx)->toString(); } catch (const Error& e) { - handle_error(_ERROR_2_("in keyword reminder", e.what(), kw.keyword)); + handle_error(_ERROR_2_("in keyword reminder", e.what(), keyword.keyword)); } - ctx.setVariable(_("keyword"), to_script(total)); ctx.setVariable(_("reminder"), to_script(reminder)); - ctx.setVariable(_("expand"), to_script(expand)); - result += _(""); - result += combine_script->eval(ctx)->toString(); - result += _(""); - + + // Combine, add to output + out += _(""); + out += options.combine_script->eval(ctx)->toString(); + out += _(""); + // Add to usage statistics - if (stat && stat_key) { - stat->push_back(make_pair(stat_key, &kw)); + if (options.stat && options.stat_key) { + options.stat->emplace_back(options.stat_key, &keyword); } + + return {true,after}; +} + +String remove_keyword_tags(String const& tagged_str) { + // Remove all old reminder texts + String s = remove_tag_contents(tagged_str, _("")); + s = remove_tag(s, _("erase(std::remove_if(stat->begin(), stat->end(), condition), stat->end()); + } +} + +String KeywordDatabase::expand(const String& text, KeywordExpandOptions const& options) const { + assert(options.combine_script); + assert_tagged(text); + + // Clean up usage statistics + remove_from_stats(options.stat, options.stat_key); - // After keyword - tagged = tagged.substr(end); - untagged = untagged.substr(start_u + len_u); + // Remove all old reminder texts + String tagged = 
remove_keyword_tags(text); + + // any keywords in database? + if (!root) return tagged; + + // Find potential matches + auto possible_matches = ::possible_matches(tagged, root.get()); + + // Refine + String untagged = untag_no_escape(tagged); + auto matches = keyword_matches(untagged, possible_matches); - return true; + // Expand + String result = expand_keywords(tagged, matches, options); + assert_tagged(result); + return result; } // ----------------------------------------------------------------------------- : KeywordParamValue diff --git a/src/data/keyword.hpp b/src/data/keyword.hpp index 6467de45..5888a853 100644 --- a/src/data/keyword.hpp +++ b/src/data/keyword.hpp @@ -132,7 +132,16 @@ inline String type_name(const vector&) { // ----------------------------------------------------------------------------- : Using keywords /// Store keyword usage statistics here, using value_being_updated as the key -typedef vector> KeywordUsageStatistics; +typedef vector> KeywordUsageStatistics; + +struct KeywordExpandOptions { + ScriptValueP match_condition; + ScriptValueP expand_default; + ScriptValueP combine_script; + Context& ctx; + KeywordUsageStatistics* stat; + const Value* stat_key; +}; /// A database of keywords to allow for fast matching /** NOTE: keywords may not be altered after they are added to the database, @@ -157,13 +166,13 @@ public: inline bool empty() const { return !root; } /// Expand/update all keywords in the given string. - /** @param expand_default script function indicating whether reminder text should be shown by default - * @param combine_script script function to combine keyword and reminder text in some way - * @param case_sensitive case sensitive matching of keywords? 
- * @param ctx context for evaluation of scripts - * @param stats where to put keyword statistics + /** @param options.expand_default script function indicating whether reminder text should be shown by default + * @param options.combine_script script function to combine keyword and reminder text in some way + * @param options.match_condition script function that decides whether a candidate keyword match is accepted (optional) + * @param options.ctx context for evaluation of scripts + * @param options.stat where to put keyword statistics, recorded under the key options.stat_key (either may be null) */ - String expand(const String& text, const ScriptValueP& match_condition, const ScriptValueP& expand_default, const ScriptValueP& combine_script, Context& ctx, KeywordUsageStatistics* stats = nullptr) const; + String expand(const String& text, const KeywordExpandOptions&) const; private: unique_ptr root; ///< Data structure for finding keywords diff --git a/src/script/functions/basic.cpp b/src/script/functions/basic.cpp index 5f01bd0f..c898f592 100644 --- a/src/script/functions/basic.cpp +++ b/src/script/functions/basic.cpp @@ -403,7 +403,7 @@ SCRIPT_FUNCTION(sort_text) { /// Replace the contents of a specific tag with the value of a script function String replace_tag_contents(String input, const String& tag, const ScriptValueP& contents, Context& ctx) { - assert_tagged(input, false); + assert_tagged(input); String ret; size_t start = 0, pos = input.find(tag); while (pos != String::npos) { @@ -674,7 +674,8 @@ SCRIPT_FUNCTION_WITH_DEP(expand_keywords) { SCRIPT_OPTIONAL_PARAM_C_(CardP, card); try { KeywordUsageStatistics* stat = card ? 
&card->keyword_usage : nullptr; - SCRIPT_RETURN(db.expand(input, match_condition, default_expand, combine, ctx, stat)); + Value* stat_key = value_being_updated(); + SCRIPT_RETURN(db.expand(input, KeywordExpandOptions{match_condition, default_expand, combine, ctx, stat, stat_key})); } catch (const Error& e) { throw ScriptError(_ERROR_2_("in function", e.what(), _("expand_keywords"))); } diff --git a/src/util/string.hpp b/src/util/string.hpp index 48c1c3cc..c6149391 100644 --- a/src/util/string.hpp +++ b/src/util/string.hpp @@ -220,3 +220,39 @@ String regex_escape(const String& s); /** Basicly replaces "(" with "(?:" */ String make_non_capturing(const String& re); +// ----------------------------------------------------------------------------- : Iterator utilities + +struct end_sentinel_t {} end_sentinel; + +// Iterate over a string, removing all matching substrings. +// match.operator(it,end) should return false or return true and advance it past the substring +template +struct SkipSubstringIterator { +public: + SkipSubstringIterator(It it, End end, Match const& match) : it(it), end(end), match(match) { + while (match(it, end)); + } + bool operator == (end_sentinel_t) const { + return it == end; + } + bool operator != (end_sentinel_t) const { + return it != end; + } + auto operator * () const { + return *it; + } + auto& operator ++ () { + ++it; + while (match(it, end)); + return *this; + } +private: + It it; + End end; + Match match; +}; + +template +inline SkipSubstringIterator skip_substring_iterator(It it, End end, Match const& match) { + return SkipSubstringIterator(it, end, match); +} diff --git a/src/util/tagged_string.cpp b/src/util/tagged_string.cpp index 29589949..81676a8d 100644 --- a/src/util/tagged_string.cpp +++ b/src/util/tagged_string.cpp @@ -106,9 +106,105 @@ String fix_old_tags(const String& str) { return ret; } +// ----------------------------------------------------------------------------- : Iterator algorithms + +[[nodiscard]] 
String::const_iterator skip_tag(String::const_iterator it, String::const_iterator end) { + assert(it != end && *it == '<'); + ++it; + while (it != end && *it != '>') ++it; + if (it != end) ++it; + return it; +} + +[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end) { + while (it != end && *it == '<') { + it = skip_tag(it, end); + } + return it; +} + +[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end, bool skip_open, bool skip_close) { + // move after first possible position corresponding + while (it != end && *it == '<') { + if (it + 1 != end && *(it + 1) == '/') { + if (skip_close) { + it = skip_tag(it, end); + } else { + return it; + } + } else { + if (skip_open) { + it = skip_tag(it, end); + } else { + return it; + } + } + } + return it; +} + +[[nodiscard]] String::const_iterator advance_untagged(String::const_iterator it, String::const_iterator end, size_t n, bool after_open, bool after_close) { + while (n > 0) { + it = skip_all_tags(it, end); + if (it != end) { + ++it; + --n; + } else { + return it; + } + } + return skip_all_tags(it, end, after_open, after_close); +} + +/* +// Does the string [it..end) contain the matching close tag for [tag..tag_end)? 
+bool is_close_tag(String::const_iterator it, String::const_iterator end, String::const_iterator tag, String::const_iterator tag_end) { + if (it == end) return false; + if (*it != '<') return false; + ++it; + if (it == end) return false; + if (*it != '/') return false; + assert(tag != tag_end && *tag == '<'); + ++tag; + return is_substr(it,end, tag,end); +} + +String::const_iterator find_close_tag(String::const_iterator tag, String::const_iterator end) { + assert(tag != end && *tag == '<'); + auto tag_end = skip_tag(tag,end); + int nesting = 1; + String::const_iterator it = tag_end; + while (it != end) { + if (*it == '<') { + if (is_substr(it,end, tag,tag_end)) { + ++nesting; + } else if (is_close_tag(it,end, tag,tag_end)) { + --nesting; + if (nesting == 0) return it; + } + it = skip_tag(it,end); + } else { + ++it; + } + } + return end; +}*/ + +[[nodiscard]] size_t untagged_length(String::const_iterator it, String::const_iterator end) { + size_t n = 0; + while (it != end) { + it = skip_all_tags(it, end); + if (it != end) { + ++n; + ++it; + } + } + return n; +} + // ----------------------------------------------------------------------------- : Finding tags -size_t tag_start(const String& str, size_t pos) { +[[nodiscard]] size_t tag_start(const String& str, size_t pos) { size_t start = str.find_last_of(_('<'), pos); if (start == String::npos) return String::npos; size_t end = skip_tag(str, start); @@ -116,13 +212,13 @@ size_t tag_start(const String& str, size_t pos) { return start; } -size_t skip_tag(const String& str, size_t start) { +[[nodiscard]] size_t skip_tag(const String& str, size_t start) { if (start >= str.size()) return String::npos; size_t end = str.find_first_of(_('>'), start); return end == String::npos ? 
String::npos : end + 1; } -size_t match_close_tag(const String& str, size_t start) { +[[nodiscard]] size_t match_close_tag(const String& str, size_t start) { String tag = tag_type_at(str, start); String ctag = _("/") + tag; size_t size = str.size(); @@ -143,11 +239,11 @@ size_t match_close_tag(const String& str, size_t start) { return String::npos; } -size_t match_close_tag_end(const String& str, size_t start) { +[[nodiscard]] size_t match_close_tag_end(const String& str, size_t start) { return skip_tag(str, match_close_tag(str, start)); } -size_t last_start_tag_before(const String& str, const String& tag, size_t start) { +[[nodiscard]] size_t last_start_tag_before(const String& str, const String& tag, size_t start) { start = min(str.size(), start); for (size_t pos = start ; pos > 0 ; --pos) { if (is_substr(str, pos - 1, tag)) { @@ -157,7 +253,7 @@ size_t last_start_tag_before(const String& str, const String& tag, size_t start) return String::npos; } -size_t in_tag(const String& str, const String& tag, size_t start, size_t end) { +[[nodiscard]] size_t in_tag(const String& str, const String& tag, size_t start, size_t end) { size_t last_start = String::npos; size_t size = str.size(); int taglevel = 0; @@ -604,17 +700,19 @@ String simplify_tagged_overlap(const String& str) { // ----------------------------------------------------------------------------- : Verification -void check_tagged(const String& str, bool check_balance) { +bool check_tagged(const String& str, bool check_balance) { for (size_t i = 0 ; i < str.size() ; ) { if (str.GetChar(i) == _('<')) { size_t end = skip_tag(str,i); if (end == String::npos) { queue_message(MESSAGE_WARNING, _("Invalid tagged string: missing '>'")); + return false; } for (size_t j = i + 1 ; j + 1 < end ; ++j) { Char c = str.GetChar(j); if (c == ESCAPED_LANGLE || c == _('<')) { queue_message(MESSAGE_WARNING, _("Invalid character in tag")); + return false; } } if (check_balance) { @@ -626,6 +724,7 @@ void check_tagged(const String& 
str, bool check_balance) { size_t close = match_close_tag(str,i); if (close == String::npos) { queue_message(MESSAGE_WARNING, _("Invalid tagged string: missing close tag for <") + tag_at(str,i) + _(">")); + return false; } } } @@ -634,6 +733,7 @@ void check_tagged(const String& str, bool check_balance) { ++i; } } + return true; } // ----------------------------------------------------------------------------- : Other utilities diff --git a/src/util/tagged_string.hpp b/src/util/tagged_string.hpp index 66833ab9..bf52e7e3 100644 --- a/src/util/tagged_string.hpp +++ b/src/util/tagged_string.hpp @@ -53,24 +53,24 @@ String fix_old_tags(const String&); * < t a g > * n y y y y n */ -size_t tag_start(const String& str, size_t pos); +[[nodiscard]] size_t tag_start(const String& str, size_t pos); /// Returns the position just beyond the tag starting at start -size_t skip_tag(const String& str, size_t start); +[[nodiscard]] size_t skip_tag(const String& str, size_t start); /// Find the position of the closing tag matching the tag at start /** If not found returns String::npos */ -size_t match_close_tag(const String& str, size_t start); +[[nodiscard]] size_t match_close_tag(const String& str, size_t start); /// Find the position of the closing tag matching the tag at start /** Returns the position just after that tag. * match_close_tag_end(s,i) == skip_tag(s, match_close_tag(s,i) ) * If not found returns String::npos */ -size_t match_close_tag_end(const String& str, size_t start); +[[nodiscard]] size_t match_close_tag_end(const String& str, size_t start); /// Find the last start tag before position start /** If not found returns String::npos */ -size_t last_start_tag_before(const String& str, const String& tag, size_t start); +[[nodiscard]] size_t last_start_tag_before(const String& str, const String& tag, size_t start); /// Is the given range entirely contained in a given tagged block? 
/** If so: return the start position of that tag, otherwise returns String::npos @@ -79,7 +79,7 @@ size_t last_start_tag_before(const String& str, const String& tag, size_t start) * x * the x is in_tag */ -size_t in_tag(const String& str, const String& tag, size_t start, size_t end); +[[nodiscard]] size_t in_tag(const String& str, const String& tag, size_t start, size_t end); /// Boolean returning version of the above bool is_in_tag(const String& str, const String& tag, size_t start, size_t end); @@ -96,6 +96,29 @@ String close_tag(const String& tag); /// The matching close tag for an open tag and vice versa String anti_tag(const String& tag); +// ----------------------------------------------------------------------------- : Iterators in tagged strings + +// Skip to the end of a tag, it must point to the start of a tag +[[nodiscard]] String::const_iterator skip_tag(String::const_iterator it, String::const_iterator end); + +// Skip past all tags +[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end); + +// Skip past all open/close tags +[[nodiscard]] String::const_iterator skip_all_tags(String::const_iterator it, String::const_iterator end, bool skip_open, bool skip_close); + +// Advance an iterator by n positions, not counting tags +// For example: advance_untagged("abc",_,2) = "c" +[[nodiscard]] String::const_iterator advance_untagged(String::const_iterator it, String::const_iterator end, size_t n, bool after_open=false, bool after_close=false); + +// Find the position of the closing tag matching the tag at it +// If not found, returns end +[[nodiscard]] String::const_iterator find_close_tag(String::const_iterator it, String::const_iterator end); + +// Length of a string when not counting tags +// For example: untagged_length("abc",_) = 3 +[[nodiscard]] size_t untagged_length(String::const_iterator it, String::const_iterator end); + // ----------------------------------------------------------------------------- 
: Cursor position /// Directions of cursor movement @@ -188,13 +211,14 @@ String tagged_substr_replace(const String& input, size_t start, size_t end, cons * - There are no tags containing '<' or whitespace * - For each open tag there is a matching close tag. * - * In case of an error, throws an exception. + * In case of an error, shows a warning + * Return true if the string is a valid tagged string */ -void check_tagged(const String& str, bool check_balance = true); +bool check_tagged(const String& str, bool check_balance = true); #ifdef _DEBUG - #define assert_tagged check_tagged + #define assert_tagged(x) assert(check_tagged(x)) #else - inline void assert_tagged(const String& str, bool check_balance = true){} + #define assert_tagged(x) do{}while(0) #endif /// Simplify a tagged string