# poe2-bot/tools/python-ocr/daemon.py

"""
Persistent Python OCR daemon (stdin/stdout JSON-per-line protocol).

Supports the EasyOCR and PaddleOCR engines, each lazy-loaded on first use.
Managed as a subprocess by the C# OcrDaemon.

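Startup:  {"ok": true, "ready": true} is written once before any request is read.
Request:  {"cmd": "ocr", "engine": "easyocr", "imagePath": "C:\\temp\\screenshot.png"}
          ("engine" may also be "paddleocr"; "imageBase64" may be sent instead of "imagePath")
Response: {"ok": true, "text": "...", "lines": [{"text": "...", "words": [...]}]}
Failure:  {"ok": false, "error": "..."}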
"""

import sys
import json

# Engine singletons. Each stays None until the matching get_easyocr() /
# get_paddleocr() loader builds the engine on first use, then is reused.
_easyocr_reader = None
_paddle_ocr = None


def _redirect_stdout_to_stderr():
    """Point ``sys.stdout`` at ``sys.stderr`` and hand back the previous stdout.

    Used around library calls so that their ``print()`` output cannot end up
    in the JSON-per-line stream. The caller must pass the returned object to
    ``_restore_stdout`` afterwards.

    Returns:
        The object that was bound to ``sys.stdout`` before the swap.
    """
    # The right-hand side is evaluated first, so ``previous`` is the old stdout.
    previous, sys.stdout = sys.stdout, sys.stderr
    return previous


def _restore_stdout(real_stdout):
    """Rebind ``sys.stdout`` to ``real_stdout``.

    Counterpart of ``_redirect_stdout_to_stderr``: pass it the object that
    function returned to undo the redirection.
    """
    sys.stdout = real_stdout


def get_easyocr():
    """Return the shared EasyOCR reader, building it on the first call.

    The reader is cached in the module-level ``_easyocr_reader``; later calls
    return the cached object without touching EasyOCR again. Progress
    messages are written to stderr so they stay out of the JSON protocol.

    Returns:
        The ``easyocr.Reader`` created with ``["en"]`` and ``gpu=True``.
    """
    global _easyocr_reader
    if _easyocr_reader is not None:
        return _easyocr_reader

    sys.stderr.write("Loading EasyOCR model...\n")
    sys.stderr.flush()
    # EasyOCR prints download progress to stdout, so stdout is diverted for
    # the import and construction and always put back, even on failure.
    saved_stdout = _redirect_stdout_to_stderr()
    try:
        import easyocr
        _easyocr_reader = easyocr.Reader(["en"], gpu=True)
    finally:
        _restore_stdout(saved_stdout)
    sys.stderr.write("EasyOCR model loaded.\n")
    sys.stderr.flush()
    return _easyocr_reader


def bbox_to_rect(corners):
    """Collapse a polygon of corner points into an axis-aligned rectangle.

    Args:
        corners: Sequence of points such as ``[[x1, y1], [x2, y2], ...]``;
            only index 0 (x) and index 1 (y) of each point are read.

    Returns:
        A tuple ``(x, y, width, height)`` of ints. Each extreme is truncated
        with ``int()`` before the subtraction, so ``width`` is
        ``int(max_x) - int(min_x)`` and likewise for ``height``.
    """
    left = int(min(point[0] for point in corners))
    top = int(min(point[1] for point in corners))
    right = int(max(point[0] for point in corners))
    bottom = int(max(point[1] for point in corners))
    return left, top, right - left, bottom - top


def split_into_words(text, x, y, width, height):
    """Split a detection's text into words with estimated bounding boxes.

    Args:
        text: Recognized text of one detection.
        x, y, width, height: Axis-aligned box of the whole detection.

    Returns:
        A list of ``{"text", "x", "y", "width", "height"}`` dicts. Text with
        zero or one whitespace-separated token yields a single entry that
        covers the whole box. Otherwise there is one entry per token, laid
        out left to right starting at ``x``.

    The per-word boxes are an approximation: each width is the token's share
    of the non-whitespace characters, truncated to an int and at least 1.
    Whitespace gets no width, so the boxes are contiguous, and their widths
    may add up to slightly less or more than ``width``.
    """
    parts = text.split()
    if len(parts) <= 1:
        return [{"text": text.strip(), "x": x, "y": y, "width": width, "height": height}]

    # str.split() never yields empty tokens, so with two or more parts the
    # character total is at least 2 and the division below is always safe.
    total_chars = sum(map(len, parts))

    words = []
    cx = x
    for part in parts:
        w = max(1, int(width * len(part) / total_chars))
        words.append({"text": part, "x": cx, "y": y, "width": w, "height": height})
        cx += w
    return words


def run_easyocr(image_path):
    """Run EasyOCR on an image file and return the protocol response dict.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        Whatever ``run_easyocr_array`` returns for the decoded pixel array.

    Raises:
        Any exception Pillow raises when the file cannot be opened or decoded.
    """
    from PIL import Image
    import numpy as np

    # Copy the pixels into an array inside the ``with`` so the underlying
    # file handle is closed before the (potentially slow) OCR step runs.
    with Image.open(image_path) as image:
        img = np.array(image)
    return run_easyocr_array(img)


def run_easyocr_array(img):
    """Run EasyOCR on an in-memory image and build the protocol response.

    Args:
        img: Image data handed straight to ``reader.readtext``.

    Returns:
        ``{"ok": True, "text": ..., "lines": [...]}`` where ``lines`` holds
        one ``{"text", "words"}`` entry per non-blank detection and ``text``
        is those line texts joined with newlines. The confidence value of
        each detection is not included in the response.
    """
    reader = get_easyocr()

    # EasyOCR can print warnings during inference; keep them off stdout.
    saved_stdout = _redirect_stdout_to_stderr()
    try:
        # batch_size=32: batch GPU recognition of detected text regions
        detections = reader.readtext(img, batch_size=32)
    finally:
        _restore_stdout(saved_stdout)

    # Each detection is (bbox_4corners, text, confidence).
    lines = []
    for bbox, raw_text, _confidence in detections:
        cleaned = raw_text.strip()
        if not cleaned:
            continue
        x, y, w, h = bbox_to_rect(bbox)
        lines.append({"text": cleaned, "words": split_into_words(raw_text, x, y, w, h)})

    full_text = "\n".join(entry["text"] for entry in lines)
    return {"ok": True, "text": full_text, "lines": lines}


def get_paddleocr():
    """Return the shared PaddleOCR engine, building it on the first call.

    The engine is cached in the module-level ``_paddle_ocr``; later calls
    return the cached object. Progress messages go to stderr, and stdout is
    diverted while the library is imported and constructed.

    Returns:
        The ``PaddleOCR`` instance created with angle classification enabled,
        English language, GPU use requested and library logging disabled.
    """
    global _paddle_ocr
    if _paddle_ocr is not None:
        return _paddle_ocr

    sys.stderr.write("Loading PaddleOCR model...\n")
    sys.stderr.flush()
    saved_stdout = _redirect_stdout_to_stderr()
    try:
        from paddleocr import PaddleOCR
        _paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=True, show_log=False)
    finally:
        _restore_stdout(saved_stdout)
    sys.stderr.write("PaddleOCR model loaded.\n")
    sys.stderr.flush()
    return _paddle_ocr


def run_paddleocr_array(img):
    """Run PaddleOCR on an in-memory image and build the protocol response.

    Args:
        img: Image data handed straight to ``ocr.ocr``.

    Returns:
        ``{"ok": True, "text": ..., "lines": [...]}`` with one
        ``{"text", "words"}`` entry per non-blank detection on the first
        page; ``text`` is those line texts joined with newlines. An empty or
        missing first page yields empty ``text`` and ``lines``.
    """
    engine = get_paddleocr()

    saved_stdout = _redirect_stdout_to_stderr()
    try:
        pages = engine.ocr(img, cls=True)
    finally:
        _restore_stdout(saved_stdout)

    # PaddleOCR returns [page_results]; each item of a page is
    # [bbox_4corners, (text, conf)]. Only the first page is read, and a
    # falsy result list or falsy first page means there is nothing to report.
    detections = (pages and pages[0]) or []

    lines = []
    for bbox, (raw_text, _confidence) in detections:
        cleaned = raw_text.strip()
        if not cleaned:
            continue
        x, y, w, h = bbox_to_rect(bbox)
        lines.append({"text": cleaned, "words": split_into_words(raw_text, x, y, w, h)})

    full_text = "\n".join(entry["text"] for entry in lines)
    return {"ok": True, "text": full_text, "lines": lines}


def load_image(req):
    """Decode the request's image into a numpy array.

    Args:
        req: Request mapping. A truthy ``imageBase64`` (base64-encoded image
            bytes) takes precedence; otherwise a truthy ``imagePath`` (file
            path) is used.

    Returns:
        The image as a numpy array, or ``None`` when neither field is set.

    Raises:
        Any exception raised by base64 decoding or by Pillow when the data
        cannot be opened or decoded.
    """
    image_base64 = req.get("imageBase64")
    image_path = req.get("imagePath")
    if not image_base64 and not image_path:
        # Nothing to decode: skip importing Pillow/numpy altogether.
        return None

    from PIL import Image
    import numpy as np

    if image_base64:
        import base64
        import io
        source = io.BytesIO(base64.b64decode(image_base64))
    else:
        source = image_path

    # np.array() forces the pixel data to load while the image is still open;
    # leaving the ``with`` then releases the file handle deterministically
    # instead of waiting for garbage collection.
    with Image.open(source) as image:
        return np.array(image)


def handle_request(req):
    """Dispatch one decoded request and return the response dict.

    Args:
        req: Parsed JSON request; expected to be an object with ``cmd``,
            ``engine`` and either ``imagePath`` or ``imageBase64``.

    Returns:
        The OCR result from the selected engine, or
        ``{"ok": False, "error": ...}`` when the request is not an object,
        the command is not ``"ocr"``, the engine is unknown, or no image
        was supplied.

    Raises:
        Whatever image decoding or the OCR engine raises; the caller is
        expected to turn that into an error response.
    """
    if not isinstance(req, dict):
        return {"ok": False, "error": "Request must be a JSON object"}

    cmd = req.get("cmd")
    if cmd != "ocr":
        return {"ok": False, "error": f"Unknown command: {cmd}"}

    # Validate the engine before decoding the image so an unservable request
    # does not pay for base64/image decoding first.
    engine = req.get("engine", "")
    if engine not in ("easyocr", "paddleocr"):
        return {"ok": False, "error": f"Unknown engine: {engine}"}

    img = load_image(req)
    if img is None:
        return {"ok": False, "error": "Missing imagePath or imageBase64"}

    if engine == "easyocr":
        return run_easyocr_array(img)
    return run_paddleocr_array(img)


def main():
    """Serve OCR requests: one JSON object per stdin line, one reply per line.

    Writes ``{"ok": true, "ready": true}`` first, then for every non-blank
    input line writes exactly one JSON response line and flushes. Returns
    when stdin reaches end of file.
    """
    # Signal ready
    sys.stdout.write(json.dumps({"ok": True, "ready": True}) + "\n")
    sys.stdout.flush()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            req = json.loads(line)
            # Serialize inside the try: a response that json cannot encode
            # must become an error reply, not an exception that ends the loop.
            payload = json.dumps(handle_request(req))
        except Exception as e:
            # Top-level boundary: every failure is reported to the client.
            # Some exceptions have an empty message, so fall back to the type.
            payload = json.dumps({"ok": False, "error": str(e) or type(e).__name__})
        sys.stdout.write(payload + "\n")
        sys.stdout.flush()


# Start the request loop only when run as a script, not when imported.
if __name__ == "__main__":
    main()