tesseract-ocr · Copilot · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025 · Jul 4, 2025
diff --git a/src/ccutil/unicharset.cpp b/src/ccutil/unicharset.cpp
@@ -167,8 +167,7 @@ void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {
   fragment = saved_fragment;
 }
 
-UNICHARSET::UNICHARSET()
-    : ids(), script_table(nullptr), script_table_size_used(0) {
+UNICHARSET::UNICHARSET() : ids() {
   clear();
   for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
     unichar_insert(kSpecialUnicharCodes[i]);
@@ -960,17 +959,18 @@ void UNICHARSET::post_load_setup() {
 
   // Compute default script. Use the highest-counting alpha script, that is
   // not the common script, as that still contains some "alphas".
-  int *script_counts = new int[script_table_size_used];
-  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
+  auto script_table_size = script_names_.size();
+  int *script_counts = new int[script_table_size];
+  memset(script_counts, 0, sizeof(*script_counts) * script_table_size);
   for (unsigned id = 0; id < unichars.size(); ++id) {
     if (get_isalpha(id)) {
       ++script_counts[get_script(id)];
     }
   }
   default_sid_ = 0;
-  for (int s = 1; s < script_table_size_used; ++s) {
-    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
-      default_sid_ = s;
+  for (size_t s = 1; s < script_table_size; ++s) {
+    if (script_counts[s] > script_counts[default_sid_] && static_cast<int>(s) != common_sid_) {
+      default_sid_ = static_cast<int>(s);
     }
   }
   delete[] script_counts;
@@ -1061,26 +1061,20 @@ bool UNICHARSET::AnyRepeatedUnicodes() const {
 }
 
 int UNICHARSET::add_script(const char *script) {
-  for (int i = 0; i < script_table_size_used; ++i) {
-    if (strcmp(script, script_table[i]) == 0) {
-      return i;
-    }
-  }
-  if (script_table_size_reserved == 0) {
-    script_table_size_reserved = 8;
-    script_table = new char *[script_table_size_reserved];
-  } else if (script_table_size_used >= script_table_size_reserved) {
-    assert(script_table_size_used == script_table_size_reserved);
-    script_table_size_reserved += script_table_size_reserved;
-    char **new_script_table = new char *[script_table_size_reserved];
-    memcpy(new_script_table, script_table,
-           script_table_size_used * sizeof(char *));
-    delete[] script_table;
-    script_table = new_script_table;
-  }
-  script_table[script_table_size_used] = new char[strlen(script) + 1];
-  strcpy(script_table[script_table_size_used], script);
-  return script_table_size_used++;
+  const std::string name(script);
+
+  // Reuse the existing id when this script name has been registered before.
+  const auto found = script_name_to_id_.find(name);
+  if (found != script_name_to_id_.end()) {
+    return found->second;
+  }
+
+  // Otherwise append the name; its index in script_names_ becomes the new id,
+  // and the reverse map is updated so both containers stay in step.
+  const auto new_id = static_cast<int>(script_names_.size());
+  script_names_.push_back(name);
+  script_name_to_id_.emplace(name, new_id);
+  return new_id;
 }
 
 // Returns the string that represents a fragment
@@ -1144,10 +1138,9 @@ CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
 }
 
 int UNICHARSET::get_script_id_from_name(const char *script_name) const {
-  for (int i = 0; i < script_table_size_used; ++i) {
-    if (strcmp(script_name, script_table[i]) == 0) {
-      return i;
-    }
+  const auto entry = script_name_to_id_.find(script_name);
+  if (entry != script_name_to_id_.end()) {
+    return entry->second; // known script: hand back its registered id
   }
   return 0; // 0 is always the null_script
 }

diff --git a/src/ccutil/unicharset.h b/src/ccutil/unicharset.h
@@ -27,6 +27,8 @@
 #include "serialis.h"
 
 #include <functional> // for std::function
+#include <unordered_map> // for script name -> id mapping
+#include <vector> // for id -> script name mapping
 
 namespace tesseract {
 
@@ -322,15 +324,9 @@ class TESS_API UNICHARSET {
 
   // Clear the UNICHARSET (all the previous data is lost).
   void clear() {
-    if (script_table != nullptr) {
-      for (int i = 0; i < script_table_size_used; ++i) {
-        delete[] script_table[i];
-      }
-      delete[] script_table;
-      script_table = nullptr;
-      script_table_size_used = 0;
-    }
-    script_table_size_reserved = 0;
+    // Clear script storage - no manual memory management needed with STL containers
+    script_name_to_id_.clear();
+    script_names_.clear();
     delete_pointers_in_unichars();
     unichars.clear();
     ids.clear();
@@ -879,22 +875,21 @@ class TESS_API UNICHARSET {
 
   // Return the (current) number of scripts in the script table
   int get_script_table_size() const {
-    return script_table_size_used;
+    return static_cast<int>(script_names_.size()); // one name per script id
   }
 
   // Return the script string from its id
   const char *get_script_from_script_id(int id) const {
-    if (id >= script_table_size_used || id < 0) {
+    if (id >= static_cast<int>(script_names_.size()) || id < 0) {
       return null_script;
     }
-    return script_table[id];
+    return script_names_[id].c_str(); // NOTE(review): invalidated if add_script() grows script_names_ — confirm callers
   }
 
   // Returns the id from the name of the script, or 0 if script is not found.
-  // Note that this is an expensive operation since it involves iteratively
-  // comparing strings in the script table.  To avoid dependency on STL, we
-  // won't use a hash.  Instead, the calling function can use this to lookup
-  // and save the ID for relevant scripts for fast comparisons later.
+  // Note that this is an average-case O(1) hash map lookup, plus the cost of
+  // building a temporary std::string key. Callers can still look up and save
+  // the ID for relevant scripts for fast comparisons later.
   int get_script_id_from_name(const char *script_name) const;
 
   // Return true if the given script is the null script
@@ -903,7 +898,7 @@ class TESS_API UNICHARSET {
   }
 
   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
-  // then the returned pointer will be the same.
+  // then the returned id will be the same.
   // The script parameter is copied and thus can be a temporary.
   int add_script(const char *script);
 
@@ -1065,9 +1060,9 @@ class TESS_API UNICHARSET {
 
   std::vector<UNICHAR_SLOT> unichars;
   UNICHARMAP ids;
-  char **script_table;
-  int script_table_size_used;
-  int script_table_size_reserved;
+  // Hash map for efficient script name to id lookup and vector for id to name lookup
+  std::unordered_map<std::string, int> script_name_to_id_;
+  std::vector<std::string> script_names_;
   // True if the unichars have their tops/bottoms set.
   bool top_bottom_set_;
   // True if the unicharset has significant upper/lower case chars.
@@ -1078,7 +1073,7 @@ class TESS_API UNICHARSET {
   // True if the set contains chars that would be changed by the cleanup.
   bool old_style_included_;
 
-  // A few convenient script name-to-id mapping without using hash.
+  // A few convenient script name-to-id mappings for common scripts.
   // These are initialized when unicharset file is loaded.  Anything
   // missing from this list can be looked up using get_script_id_from_name.
   int null_sid_;