Optimized Recognizer.sort_X_firstly and Recognizer.sort_Y_firstly (infiniflow#5182)

yuzhichang · web-flow · commit c326f14fedf3 · 2025-02-20T15:41:12.000+08:00
### What problem does this PR solve?

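Optimized Recognizer.sort_X_firstly and Recognizer.sort_Y_firstly: the initial sort followed by a nested-loop, deepcopy-based swap pass is replaced with a single `sorted` call driven by a `functools.cmp_to_key` comparator, which compares the secondary coordinate when the primary coordinates differ by less than the threshold. The `copy` parameter of sort_X_firstly is removed (its caller in table_structure_recognizer.py is updated accordingly), an unreachable second `return` in sort_C_firstly is deleted, and the redundant per-character dict copy in pdf_parser.py is dropped.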

### Type of change

- [x] Performance Improvement
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
@@ -957,7 +957,7 @@ def __images__(self, fnm, zoomin=3, page_from=0,
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                 enumerate(self.pdf.pages[page_from:page_to])]
             try:
-                self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
+                self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
             except Exception as e:
                 logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                 self.page_chars = [[] for _ in range(page_to - page_from)]  # If failed to extract, using empty list instead.
diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py
@@ -19,7 +19,7 @@
 import math
 import numpy as np
 import cv2
-from copy import deepcopy
+from functools import cmp_to_key
 
 import onnxruntime as ort
 from huggingface_hub import snapshot_download
@@ -99,30 +99,22 @@ def cuda_is_available():
 
     @staticmethod
     def sort_Y_firstly(arr, threashold):
-        # sort using y1 first and then x1
-        arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
-        for i in range(len(arr) - 1):
-            for j in range(i, -1, -1):
-                # restore the order using th
-                if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
-                        and arr[j + 1]["x0"] < arr[j]["x0"]:
-                    tmp = deepcopy(arr[j])
-                    arr[j] = deepcopy(arr[j + 1])
-                    arr[j + 1] = deepcopy(tmp)
+        def cmp(c1, c2):
+            diff = c1["top"] - c2["top"]
+            if abs(diff) < threashold:
+                diff = c1["x0"] - c2["x0"]
+            return diff
+        arr = sorted(arr, key=cmp_to_key(cmp))
         return arr
 
     @staticmethod
-    def sort_X_firstly(arr, threashold, copy=True):
-        # sort using y1 first and then x1
-        arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
-        for i in range(len(arr) - 1):
-            for j in range(i, -1, -1):
-                # restore the order using th
-                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
-                        and arr[j + 1]["top"] < arr[j]["top"]:
-                    tmp = deepcopy(arr[j]) if copy else arr[j]
-                    arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
-                    arr[j + 1] = deepcopy(tmp) if copy else tmp
+    def sort_X_firstly(arr, threashold):
+        def cmp(c1, c2):
+            diff = c1["x0"] - c2["x0"]
+            if abs(diff) < threashold:
+                diff = c1["top"] - c2["top"]
+            return diff
+        arr = sorted(arr, key=cmp_to_key(cmp))
         return arr
 
     @staticmethod
@@ -145,8 +137,6 @@ def sort_C_firstly(arr, thr=0):
                     arr[j + 1] = tmp
         return arr
 
-        return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
-
     @staticmethod
     def sort_R_firstly(arr, thr=0):
         # sort using y1 first and then x1
diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py
@@ -177,7 +177,7 @@ def construct_table(boxes, is_english=False, html=False):
         colwm = np.min(colwm) if colwm else 0
         crosspage = len(set([b["page_number"] for b in boxes])) > 1
         if crosspage:
-            boxes = Recognizer.sort_X_firstly(boxes, colwm / 2, False)
+            boxes = Recognizer.sort_X_firstly(boxes, colwm / 2)
         else:
             boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
         boxes[0]["cn"] = 0