working on crop

This commit is contained in:
Boki 2026-02-12 17:48:16 -05:00
parent 93e2234c4e
commit f74e3e1c85
12 changed files with 1135 additions and 220 deletions

View file

@ -71,6 +71,51 @@ def split_into_words(text, x, y, width, height):
return words
def merge_nearby_detections(items, merge_gap):
    """Merge adjacent detections on the same Y baseline when X gap < merge_gap.
    items: list of {"text", "x", "y", "w", "h"}
    Merge when: Y overlap > 50% of min height AND 0 <= X gap <= merge_gap.
    """
    if not items or merge_gap <= 0:
        return items
    # Order by vertical center, then left edge, so same-line neighbors are adjacent.
    ordered = sorted(items, key=lambda d: (d["y"] + d["h"] / 2, d["x"]))
    result = [dict(ordered[0])]
    for cur in ordered[1:]:
        prev = result[-1]
        # Vertical overlap measured against the shorter of the two boxes.
        y_overlap = min(prev["y"] + prev["h"], cur["y"] + cur["h"]) - max(prev["y"], cur["y"])
        shorter = min(prev["h"], cur["h"])
        gap = cur["x"] - (prev["x"] + prev["w"])
        same_line = shorter > 0 and y_overlap / shorter > 0.5
        if same_line and 0 <= gap <= merge_gap:
            # Grow the previous box into the union of both and join the text.
            left = min(prev["x"], cur["x"])
            top = min(prev["y"], cur["y"])
            right = max(prev["x"] + prev["w"], cur["x"] + cur["w"])
            bottom = max(prev["y"] + prev["h"], cur["y"] + cur["h"])
            prev["x"] = left
            prev["y"] = top
            prev["w"] = right - left
            prev["h"] = bottom - top
            prev["text"] = prev["text"] + " " + cur["text"]
        else:
            result.append(dict(cur))
    return result
def items_to_response(items):
    """Convert list of {"text", "x", "y", "w", "h"} to OcrResponse format."""
    lines = [
        {
            "text": it["text"],
            "words": split_into_words(it["text"], it["x"], it["y"], it["w"], it["h"]),
        }
        for it in items
    ]
    full_text = "\n".join(it["text"] for it in items)
    return {"ok": True, "text": full_text, "lines": lines}
def run_easyocr(image_path):
from PIL import Image
import numpy as np
@ -78,27 +123,28 @@ def run_easyocr(image_path):
return run_easyocr_array(img)
def run_easyocr_array(img, merge_gap=0, **easyocr_kwargs):
    """Run EasyOCR on a numpy image array and return an OcrResponse dict.

    img: numpy array image as accepted by easyocr's readtext.
    merge_gap: if > 0, merge same-line detections whose X gap <= merge_gap
        (see merge_nearby_detections).
    easyocr_kwargs: extra keyword arguments forwarded to reader.readtext
        (e.g. link_threshold, width_ths).
    """
    reader = get_easyocr()
    # Redirect stdout during inference — easyocr can print warnings
    real_stdout = _redirect_stdout_to_stderr()
    try:
        # batch_size=32: batch GPU recognition of detected text regions
        results = reader.readtext(img, batch_size=32, **easyocr_kwargs)
    finally:
        _restore_stdout(real_stdout)
    # results: [(bbox_4corners, text, conf), ...]
    items = []
    for bbox, text, conf in results:
        if not text.strip():
            continue
        x, y, w, h = bbox_to_rect(bbox)
        items.append({"text": text.strip(), "x": x, "y": y, "w": w, "h": h})
    if merge_gap > 0:
        items = merge_nearby_detections(items, merge_gap)
    return items_to_response(items)
def get_paddleocr():
@ -106,10 +152,18 @@ def get_paddleocr():
if _paddle_ocr is None:
sys.stderr.write("Loading PaddleOCR model...\n")
sys.stderr.flush()
import os
os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True")
real_stdout = _redirect_stdout_to_stderr()
try:
from paddleocr import PaddleOCR
_paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=True, show_log=False)
_paddle_ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
lang="en",
ocr_version="PP-OCRv4",
)
finally:
_restore_stdout(real_stdout)
sys.stderr.write("PaddleOCR model loaded.\n")
@ -117,28 +171,41 @@ def get_paddleocr():
return _paddle_ocr
def run_paddleocr_array(img, merge_gap=0):
    """Run PaddleOCR on a numpy image array and return an OcrResponse dict.

    img: numpy array image; grayscale is expanded to 3 channels and RGBA is
        truncated to RGB before inference.
    merge_gap: if > 0, merge same-line detections whose X gap <= merge_gap
        (see merge_nearby_detections).
    """
    ocr = get_paddleocr()
    # Ensure RGB 3-channel
    if len(img.shape) == 2:
        import numpy as np
        img = np.stack([img, img, img], axis=-1)
    elif img.shape[2] == 4:
        img = img[:, :, :3]
    # Redirect stdout during inference — paddle can print to stdout
    real_stdout = _redirect_stdout_to_stderr()
    try:
        results = ocr.predict(img)
    finally:
        _restore_stdout(real_stdout)
    items = []
    # PaddleOCR 3.x: results is a list of OCRResult objects; each exposes
    # recognized strings ("rec_texts") and detection polygons ("dt_polys"),
    # either mapping-style via .get or as attributes.
    for res in results:
        texts = res.get("rec_texts", []) if hasattr(res, "get") else getattr(res, "rec_texts", [])
        polys = res.get("dt_polys", []) if hasattr(res, "get") else getattr(res, "dt_polys", [])
        for i, text in enumerate(texts):
            if not text.strip():
                continue
            if i < len(polys):
                bbox = polys[i]
                x, y, w, h = bbox_to_rect(bbox)
            else:
                # No matching polygon for this text — fall back to a zero box.
                x, y, w, h = 0, 0, 0, 0
            items.append({"text": text.strip(), "x": x, "y": y, "w": w, "h": h})
    if merge_gap > 0:
        items = merge_nearby_detections(items, merge_gap)
    return items_to_response(items)
def load_image(req):
@ -170,10 +237,22 @@ def handle_request(req):
if img is None:
return {"ok": False, "error": "Missing imagePath or imageBase64"}
merge_gap = req.get("mergeGap", 0)
if engine == "easyocr":
return run_easyocr_array(img)
easyocr_kwargs = {}
for json_key, py_param in [
("linkThreshold", "link_threshold"),
("textThreshold", "text_threshold"),
("lowText", "low_text"),
("widthThs", "width_ths"),
("paragraph", "paragraph"),
]:
if json_key in req:
easyocr_kwargs[py_param] = req[json_key]
return run_easyocr_array(img, merge_gap=merge_gap, **easyocr_kwargs)
elif engine == "paddleocr":
return run_paddleocr_array(img)
return run_paddleocr_array(img, merge_gap=merge_gap)
else:
return {"ok": False, "error": f"Unknown engine: {engine}"}