diff --git a/src/ccutil/unicharset.cpp b/src/ccutil/unicharset.cpp
index b29ec3b7fe..622056f23e 100644
--- a/src/ccutil/unicharset.cpp
+++ b/src/ccutil/unicharset.cpp
@@ -167,8 +167,7 @@ void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {
   fragment = saved_fragment;
 }
 
-UNICHARSET::UNICHARSET()
-    : ids(), script_table(nullptr), script_table_size_used(0) {
+UNICHARSET::UNICHARSET() : ids() {
   clear();
   for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
     unichar_insert(kSpecialUnicharCodes[i]);
@@ -960,17 +959,18 @@ void UNICHARSET::post_load_setup() {
 
   // Compute default script. Use the highest-counting alpha script, that is
   // not the common script, as that still contains some "alphas".
-  int *script_counts = new int[script_table_size_used];
-  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
+  auto script_table_size = script_names_.size();
+  int *script_counts = new int[script_table_size];
+  memset(script_counts, 0, sizeof(*script_counts) * script_table_size);
   for (unsigned id = 0; id < unichars.size(); ++id) {
     if (get_isalpha(id)) {
       ++script_counts[get_script(id)];
     }
   }
   default_sid_ = 0;
-  for (int s = 1; s < script_table_size_used; ++s) {
-    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
-      default_sid_ = s;
+  for (size_t s = 1; s < script_table_size; ++s) {
+    if (script_counts[s] > script_counts[default_sid_] && static_cast<int>(s) != common_sid_) {
+      default_sid_ = static_cast<int>(s);
     }
   }
   delete[] script_counts;
@@ -1061,26 +1061,20 @@ bool UNICHARSET::AnyRepeatedUnicodes() const {
 }
 
 int UNICHARSET::add_script(const char *script) {
-  for (int i = 0; i < script_table_size_used; ++i) {
-    if (strcmp(script, script_table[i]) == 0) {
-      return i;
-    }
-  }
-  if (script_table_size_reserved == 0) {
-    script_table_size_reserved = 8;
-    script_table = new char *[script_table_size_reserved];
-  } else if (script_table_size_used >= script_table_size_reserved) {
-    assert(script_table_size_used == script_table_size_reserved);
-    script_table_size_reserved += script_table_size_reserved;
-    char **new_script_table = new char *[script_table_size_reserved];
-    memcpy(new_script_table, script_table,
-           script_table_size_used * sizeof(char *));
-    delete[] script_table;
-    script_table = new_script_table;
-  }
-  script_table[script_table_size_used] = new char[strlen(script) + 1];
-  strcpy(script_table[script_table_size_used], script);
-  return script_table_size_used++;
+  std::string script_str(script);
+
+  // Check if script already exists using hash map lookup
+  auto it = script_name_to_id_.find(script_str);
+  if (it != script_name_to_id_.end()) {
+    return it->second;
+  }
+
+  // Add new script
+  int script_id = static_cast<int>(script_names_.size());
+  script_names_.push_back(script_str);
+  script_name_to_id_[script_str] = script_id;
+
+  return script_id;
 }
 
 // Returns the string that represents a fragment
@@ -1144,10 +1138,9 @@ CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
 }
 
 int UNICHARSET::get_script_id_from_name(const char *script_name) const {
-  for (int i = 0; i < script_table_size_used; ++i) {
-    if (strcmp(script_name, script_table[i]) == 0) {
-      return i;
-    }
+  auto it = script_name_to_id_.find(std::string(script_name));
+  if (it != script_name_to_id_.end()) {
+    return it->second;
   }
   return 0; // 0 is always the null_script
 }
diff --git a/src/ccutil/unicharset.h b/src/ccutil/unicharset.h
index dd0ff8f3dd..067ba06002 100644
--- a/src/ccutil/unicharset.h
+++ b/src/ccutil/unicharset.h
@@ -27,6 +27,8 @@
 #include "serialis.h"
 
 #include <functional> // for std::function
+#include <unordered_map> // for script name -> id mapping
+#include <vector> // for id -> script name mapping
 
 namespace tesseract {
 
@@ -322,15 +324,9 @@ class TESS_API UNICHARSET {
 
   // Clear the UNICHARSET (all the previous data is lost).
   void clear() {
-    if (script_table != nullptr) {
-      for (int i = 0; i < script_table_size_used; ++i) {
-        delete[] script_table[i];
-      }
-      delete[] script_table;
-      script_table = nullptr;
-      script_table_size_used = 0;
-    }
-    script_table_size_reserved = 0;
+    // Clear script storage - no manual memory management needed with STL containers
+    script_name_to_id_.clear();
+    script_names_.clear();
     delete_pointers_in_unichars();
     unichars.clear();
     ids.clear();
@@ -879,22 +875,21 @@ class TESS_API UNICHARSET {
 
   // Return the (current) number of scripts in the script table
   int get_script_table_size() const {
-    return script_table_size_used;
+    return static_cast<int>(script_names_.size());
   }
 
   // Return the script string from its id
   const char *get_script_from_script_id(int id) const {
-    if (id >= script_table_size_used || id < 0) {
+    if (id >= static_cast<int>(script_names_.size()) || id < 0) {
       return null_script;
     }
-    return script_table[id];
+    return script_names_[id].c_str();
   }
 
   // Returns the id from the name of the script, or 0 if script is not found.
-  // Note that this is an expensive operation since it involves iteratively
-  // comparing strings in the script table. To avoid dependency on STL, we
-  // won't use a hash. Instead, the calling function can use this to lookup
-  // and save the ID for relevant scripts for fast comparisons later.
+  // Note that this is now an efficient O(1) hash map lookup operation.
+  // The calling function can use this to lookup and save the ID for relevant
+  // scripts for fast comparisons later.
   int get_script_id_from_name(const char *script_name) const;
 
   // Return true if the given script is the null script
@@ -903,7 +898,7 @@ class TESS_API UNICHARSET {
   }
 
   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
-  // then the returned pointer will be the same.
+  // then the returned id will be the same.
   // The script parameter is copied and thus can be a temporary.
   int add_script(const char *script);
 
@@ -1065,9 +1060,9 @@ class TESS_API UNICHARSET {
 
   std::vector<UNICHAR_SLOT> unichars;
   UNICHARMAP ids;
-  char **script_table;
-  int script_table_size_used;
-  int script_table_size_reserved;
+  // Hash map for efficient script name to id lookup and vector for id to name lookup
+  std::unordered_map<std::string, int> script_name_to_id_;
+  std::vector<std::string> script_names_;
   // True if the unichars have their tops/bottoms set.
   bool top_bottom_set_;
   // True if the unicharset has significant upper/lower case chars.
@@ -1078,7 +1073,7 @@ class TESS_API UNICHARSET {
   // True if the set contains chars that would be changed by the cleanup.
   bool old_style_included_;
 
-  // A few convenient script name-to-id mapping without using hash.
+  // A few convenient script name-to-id mapping for common scripts.
   // These are initialized when unicharset file is loaded. Anything
   // missing from this list can be looked up using get_script_id_from_name.
   int null_sid_;