Skip to content

Refactor UNICHARSET script storage to use hash map instead of raw array #4435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 24 additions & 31 deletions src/ccutil/unicharset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,7 @@ void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {
fragment = saved_fragment;
}

UNICHARSET::UNICHARSET()
: ids(), script_table(nullptr), script_table_size_used(0) {
UNICHARSET::UNICHARSET() : ids() {
clear();
for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
unichar_insert(kSpecialUnicharCodes[i]);
Expand Down Expand Up @@ -960,17 +959,18 @@ void UNICHARSET::post_load_setup() {

// Compute default script. Use the highest-counting alpha script, that is
// not the common script, as that still contains some "alphas".
int *script_counts = new int[script_table_size_used];
memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
auto script_table_size = script_names_.size();
int *script_counts = new int[script_table_size];
memset(script_counts, 0, sizeof(*script_counts) * script_table_size);
for (unsigned id = 0; id < unichars.size(); ++id) {
if (get_isalpha(id)) {
++script_counts[get_script(id)];
}
}
default_sid_ = 0;
for (int s = 1; s < script_table_size_used; ++s) {
if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
default_sid_ = s;
for (size_t s = 1; s < script_table_size; ++s) {
if (script_counts[s] > script_counts[default_sid_] && static_cast<int>(s) != common_sid_) {
default_sid_ = static_cast<int>(s);
}
}
delete[] script_counts;
Expand Down Expand Up @@ -1061,26 +1061,20 @@ bool UNICHARSET::AnyRepeatedUnicodes() const {
}

// Returns the id of the given script name, registering the name first if it
// has not been seen before. For two names a and b with strcmp(a, b) == 0 the
// same id is returned. Ids are handed out in insertion order, so every id
// returned here is a valid index into script_names_.
// script must be a non-null, NUL-terminated string; it is copied and thus can
// be a temporary.
int UNICHARSET::add_script(const char *script) {
  const std::string name(script);

  // Hit path: the name is already registered, nothing is modified.
  const auto found = script_name_to_id_.find(name);
  if (found != script_name_to_id_.end()) {
    return found->second;
  }

  // Miss path: the new id is the next free slot of the id -> name vector.
  const int new_id = static_cast<int>(script_names_.size());
  script_names_.push_back(name);
  // The key is known to be absent, so emplace() inserts directly instead of
  // default-constructing a value and assigning it as operator[] would.
  script_name_to_id_.emplace(name, new_id);
  return new_id;
}

// Returns the string that represents a fragment
Expand Down Expand Up @@ -1144,10 +1138,9 @@ CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
}

// Returns the id registered for script_name, or 0 when the name is unknown.
// script_name must be a non-null, NUL-terminated string.
int UNICHARSET::get_script_id_from_name(const char *script_name) const {
  // The map is keyed by std::string, so a temporary key is built for the probe.
  const auto found = script_name_to_id_.find(std::string(script_name));
  if (found == script_name_to_id_.end()) {
    return 0; // 0 is always the null_script
  }
  return found->second;
}
Expand Down
37 changes: 16 additions & 21 deletions src/ccutil/unicharset.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include "serialis.h"

#include <functional> // for std::function
#include <unordered_map> // for script name -> id mapping
#include <vector> // for id -> script name mapping

namespace tesseract {

Expand Down Expand Up @@ -322,15 +324,9 @@ class TESS_API UNICHARSET {

// Clear the UNICHARSET (all the previous data is lost).
void clear() {
if (script_table != nullptr) {
for (int i = 0; i < script_table_size_used; ++i) {
delete[] script_table[i];
}
delete[] script_table;
script_table = nullptr;
script_table_size_used = 0;
}
script_table_size_reserved = 0;
// Clear script storage - no manual memory management needed with STL containers
script_name_to_id_.clear();
script_names_.clear();
delete_pointers_in_unichars();
unichars.clear();
ids.clear();
Expand Down Expand Up @@ -879,22 +875,21 @@ class TESS_API UNICHARSET {

// Return the (current) number of scripts in the script table, i.e. the number
// of entries in the id -> name vector.
int get_script_table_size() const {
  return static_cast<int>(script_names_.size());
}

// Return the script string from its id, or null_script when the id is
// negative or not smaller than the number of registered scripts.
// The returned pointer refers to a std::string stored in script_names_, so
// it should not be kept across calls that modify that vector (add_script()
// appends to it, clear() empties it).
const char *get_script_from_script_id(int id) const {
  if (id < 0 || id >= static_cast<int>(script_names_.size())) {
    return null_script;
  }
  return script_names_[id].c_str();
}

// Returns the id from the name of the script, or 0 if script is not found.
// The lookup goes through a hash map (constant time on average), but each
// call still builds a temporary std::string from script_name. A caller that
// compares scripts repeatedly should look the id up once and save it for
// fast comparisons later.
int get_script_id_from_name(const char *script_name) const;

// Return true if the given script is the null script
Expand All @@ -903,7 +898,7 @@ class TESS_API UNICHARSET {
}

// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
// then the returned id will be the same.
// The script parameter is copied and thus can be a temporary.
int add_script(const char *script);

Expand Down Expand Up @@ -1065,9 +1060,9 @@ class TESS_API UNICHARSET {

std::vector<UNICHAR_SLOT> unichars;
UNICHARMAP ids;
char **script_table;
int script_table_size_used;
int script_table_size_reserved;
// Hash map for efficient script name to id lookup and vector for id to name lookup
std::unordered_map<std::string, int> script_name_to_id_;
std::vector<std::string> script_names_;
// True if the unichars have their tops/bottoms set.
bool top_bottom_set_;
// True if the unicharset has significant upper/lower case chars.
Expand All @@ -1078,7 +1073,7 @@ class TESS_API UNICHARSET {
// True if the set contains chars that would be changed by the cleanup.
bool old_style_included_;

// A few convenient script name-to-id mappings for common scripts.
// These are initialized when unicharset file is loaded. Anything
// missing from this list can be looked up using get_script_id_from_name.
int null_sid_;
Expand Down
Loading