Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c326f14

Browse files
authored Feb 20, 2025
Optimized Recognizer.sort_X_firstly and Recognizer.sort_Y_firstly (infiniflow#5182)
### What problem does this PR solve?

Optimized Recognizer.sort_X_firstly and Recognizer.sort_Y_firstly

### Type of change

- [x] Performance Improvement
1 parent 07ddb8f commit c326f14

File tree

3 files changed

+16
-26
lines changed

3 files changed

+16
-26
lines changed
 

‎deepdoc/parser/pdf_parser.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -957,7 +957,7 @@ def __images__(self, fnm, zoomin=3, page_from=0,
957957
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
958958
enumerate(self.pdf.pages[page_from:page_to])]
959959
try:
960-
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
960+
self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
961961
except Exception as e:
962962
logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
963963
self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.

‎deepdoc/vision/recognizer.py

Lines changed: 14 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
import math
2020
import numpy as np
2121
import cv2
22-
from copy import deepcopy
22+
from functools import cmp_to_key
2323

2424
import onnxruntime as ort
2525
from huggingface_hub import snapshot_download
@@ -99,30 +99,22 @@ def cuda_is_available():
9999

100100
@staticmethod
101101
def sort_Y_firstly(arr, threashold):
102-
# sort using y1 first and then x1
103-
arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
104-
for i in range(len(arr) - 1):
105-
for j in range(i, -1, -1):
106-
# restore the order using th
107-
if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
108-
and arr[j + 1]["x0"] < arr[j]["x0"]:
109-
tmp = deepcopy(arr[j])
110-
arr[j] = deepcopy(arr[j + 1])
111-
arr[j + 1] = deepcopy(tmp)
102+
def cmp(c1, c2):
103+
diff = c1["top"] - c2["top"]
104+
if abs(diff) < threashold:
105+
diff = c1["x0"] - c2["x0"]
106+
return diff
107+
arr = sorted(arr, key=cmp_to_key(cmp))
112108
return arr
113109

114110
@staticmethod
115-
def sort_X_firstly(arr, threashold, copy=True):
116-
# sort using y1 first and then x1
117-
arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
118-
for i in range(len(arr) - 1):
119-
for j in range(i, -1, -1):
120-
# restore the order using th
121-
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
122-
and arr[j + 1]["top"] < arr[j]["top"]:
123-
tmp = deepcopy(arr[j]) if copy else arr[j]
124-
arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
125-
arr[j + 1] = deepcopy(tmp) if copy else tmp
111+
def sort_X_firstly(arr, threashold):
112+
def cmp(c1, c2):
113+
diff = c1["x0"] - c2["x0"]
114+
if abs(diff) < threashold:
115+
diff = c1["top"] - c2["top"]
116+
return diff
117+
arr = sorted(arr, key=cmp_to_key(cmp))
126118
return arr
127119

128120
@staticmethod
@@ -145,8 +137,6 @@ def sort_C_firstly(arr, thr=0):
145137
arr[j + 1] = tmp
146138
return arr
147139

148-
return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
149-
150140
@staticmethod
151141
def sort_R_firstly(arr, thr=0):
152142
# sort using y1 first and then x1

‎deepdoc/vision/table_structure_recognizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ def construct_table(boxes, is_english=False, html=False):
177177
colwm = np.min(colwm) if colwm else 0
178178
crosspage = len(set([b["page_number"] for b in boxes])) > 1
179179
if crosspage:
180-
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2, False)
180+
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
181181
else:
182182
boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
183183
boxes[0]["cn"] = 0

0 commit comments

Comments
 (0)
Failed to load comments.