From: Robin Watts Date: Wed, 3 May 2023 14:40:16 +0000 (+0100) Subject: Attempt to speed up "copy: chinese-example.pdf" PyMuPDF test. X-Git-Tag: 1.22.2~9 X-Git-Url: https://git.ghostscript.com/?a=commitdiff_plain;h=76713d6f75a8d1fac1afadcebe9471fc1fb8d93f;p=mupdf.git Attempt to speed up "copy: chinese-example.pdf" PyMuPDF test. In order to copy the file, we first open it, then save it. As part of opening it, we perform tests on each of the objects in the file. This takes a long time. This file is strangely constructed: of the 4 xrefs in the file, the most basic one is extremely fragmented, which makes looking up an object in the xref almost an O(n) process. We look up every object as part of the tests, making this an O(n^2) process. Here we move to a process whereby we 'map' the checks across the objects in the file, bringing the cost back to O(n). --- diff --git a/include/mupdf/pdf/xref.h b/include/mupdf/pdf/xref.h index b05a0528b..931376556 100644 --- a/include/mupdf/pdf/xref.h +++ b/include/mupdf/pdf/xref.h @@ -160,6 +160,12 @@ pdf_xref_entry *pdf_get_populating_xref_entry(fz_context *ctx, pdf_document *doc */ pdf_xref_entry *pdf_get_xref_entry(fz_context *ctx, pdf_document *doc, int i); +/* + Map a function across all xref entries in a document. +*/ +void pdf_xref_entry_map(fz_context *ctx, pdf_document *doc, void (*fn)(fz_context *, pdf_xref_entry *, int i, pdf_document *doc, void *), void *arg); + + /* Used after loading a document to access entries. 
diff --git a/source/pdf/pdf-xref.c b/source/pdf/pdf-xref.c index 427040420..30c884a0c 100644 --- a/source/pdf/pdf-xref.c +++ b/source/pdf/pdf-xref.c @@ -462,6 +462,55 @@ pdf_xref_entry *pdf_get_xref_entry_no_null(fz_context *ctx, pdf_document *doc, i fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find object in xref (%d 0 R), but not allowed to return NULL", i); } +void pdf_xref_entry_map(fz_context *ctx, pdf_document *doc, void (*fn)(fz_context *, pdf_xref_entry *, int, pdf_document *, void *), void *arg) +{ + int xref_len = pdf_xref_len(ctx, doc); + int i, j; + pdf_xref_subsec *sub; + int xref_base = doc->xref_base; + + fz_try(ctx) + { + /* Map over any active local xref first. */ + if (doc->local_xref && doc->local_xref_nesting > 0) + { + pdf_xref *xref = doc->local_xref; + + for (sub = xref->subsec; sub != NULL; sub = sub->next) + { + for (i = sub->start; i < sub->start + sub->len; i++) + { + pdf_xref_entry *entry = &sub->table[i - sub->start]; + if (entry->type) + fn(ctx, entry, i, doc, arg); + } + } + } + + for (j = 0; j < doc->num_xref_sections; j++) + { + pdf_xref *xref = &doc->xref_sections[j]; + doc->xref_base = j; + + for (sub = xref->subsec; sub != NULL; sub = sub->next) + { + for (i = sub->start; i < sub->start + sub->len; i++) + { + pdf_xref_entry *entry = &sub->table[i - sub->start]; + if (entry->type) + fn(ctx, entry, i, doc, arg); + } + } + } + } + fz_always(ctx) + { + doc->xref_base = xref_base; + } + fz_catch(ctx) + fz_rethrow(ctx); +} + /* Ensure we have an incremental xref section where we can store updated versions of indirect objects. 
This is a new xref section @@ -1547,6 +1596,31 @@ pdf_prime_xref_index(fz_context *ctx, pdf_document *doc) } } +static void +check_xref_entry_offsets(fz_context *ctx, pdf_xref_entry *entry, int i, pdf_document *doc, void *arg) +{ + int xref_len = (int)(intptr_t)arg; + + if (entry->type == 'n') + { + /* Special case code: "0000000000 * n" means free, + * according to some producers (inc Quartz) */ + if (entry->ofs == 0) + entry->type = 'f'; + else if (entry->ofs <= 0 || entry->ofs >= doc->file_size) + fz_throw(ctx, FZ_ERROR_GENERIC, "object offset out of range: %d (%d 0 R)", (int)entry->ofs, i); + } + else if (entry->type == 'o') + { + /* Read this into a local variable here, because pdf_get_xref_entry + * may solidify the xref, hence invalidating "entry", meaning we + * need a stashed value for the throw. */ + int64_t ofs = entry->ofs; + if (ofs <= 0 || ofs >= xref_len || pdf_get_xref_entry_no_null(ctx, doc, ofs)->type != 'n') + fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to an objstm that does not exist: %d (%d 0 R)", (int)ofs, i); + } +} + /* * load xref tables from pdf * @@ -1556,7 +1630,6 @@ pdf_prime_xref_index(fz_context *ctx, pdf_document *doc) static void pdf_load_xref(fz_context *ctx, pdf_document *doc) { - int i; int xref_len; pdf_xref_entry *entry; @@ -1583,28 +1656,7 @@ pdf_load_xref(fz_context *ctx, pdf_document *doc) /* broken pdfs where object offsets are out of range */ xref_len = pdf_xref_len(ctx, doc); - for (i = 0; i < xref_len; i++) - { - entry = pdf_get_xref_entry(ctx, doc, i); - if (entry && entry->type == 'n') - { - /* Special case code: "0000000000 * n" means free, - * according to some producers (inc Quartz) */ - if (entry->ofs == 0) - entry->type = 'f'; - else if (entry->ofs <= 0 || entry->ofs >= doc->file_size) - fz_throw(ctx, FZ_ERROR_GENERIC, "object offset out of range: %d (%d 0 R)", (int)entry->ofs, i); - } - if (entry && entry->type == 'o') - { - /* Read this into a local variable here, because pdf_get_xref_entry - * may 
solidify the xref, hence invalidating "entry", meaning we - * need a stashed value for the throw. */ - int64_t ofs = entry->ofs; - if (ofs <= 0 || ofs >= xref_len || pdf_get_xref_entry_no_null(ctx, doc, ofs)->type != 'n') - fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to an objstm that does not exist: %d (%d 0 R)", (int)ofs, i); - } - } + pdf_xref_entry_map(ctx, doc, check_xref_entry_offsets, (void *)(intptr_t)xref_len); } static void