From cf5d944fd1c7279b4a2c0f3b4f76de0c96e556f2 Mon Sep 17 00:00:00 2001 From: Boki Date: Thu, 12 Feb 2026 11:24:31 -0500 Subject: [PATCH] finished easyocr and pipeline --- src/dashboard/DashboardServer.ts | 24 ++++- src/dashboard/index.html | 25 +++++ src/game/OcrDaemon.ts | 11 ++- src/game/ScreenReader.ts | 19 ++-- tools/OcrDaemon/Daemon.cs | 151 +++++++++++++++++++++++-------- tools/OcrDaemon/Models.cs | 3 + tools/OcrDaemon/OcrHandler.cs | 27 ++++++ tools/python-ocr/daemon.py | 43 +++++++++ 8 files changed, 252 insertions(+), 51 deletions(-) diff --git a/src/dashboard/DashboardServer.ts b/src/dashboard/DashboardServer.ts index 39f2c27..2bf573c 100644 --- a/src/dashboard/DashboardServer.ts +++ b/src/dashboard/DashboardServer.ts @@ -8,7 +8,7 @@ import { logger } from '../util/logger.js'; import { sleep } from '../util/sleep.js'; import type { BotController } from './BotController.js'; import type { ScreenReader } from '../game/ScreenReader.js'; -import type { OcrEngine } from '../game/OcrDaemon.js'; +import type { OcrEngine, OcrPreprocess } from '../game/OcrDaemon.js'; import { GRID_LAYOUTS } from '../game/GridReader.js'; import type { GameController } from '../game/GameController.js'; @@ -131,8 +131,8 @@ export class DashboardServer { this.app.post('/api/debug/ocr-engine', (req, res) => { if (!this.debug) { res.status(503).json({ error: 'Debug not available' }); return; } const { engine } = req.body as { engine: string }; - if (!['tesseract', 'easyocr'].includes(engine)) { - res.status(400).json({ error: 'Invalid engine. Must be tesseract or easyocr.' }); + if (!['tesseract', 'easyocr', 'paddleocr'].includes(engine)) { + res.status(400).json({ error: 'Invalid engine. Must be tesseract, easyocr, or paddleocr.' }); return; } this.debug.screenReader.debugOcrEngine = engine as OcrEngine; @@ -140,6 +140,24 @@ export class DashboardServer { res.json({ ok: true }); }); + // OCR preprocess selection + this.app.get('/api/debug/ocr-preprocess', (_req, res) => { + if (!this.debug) { res.status(503).json({ error: 'Debug not available' }); return; } + res.json({ ok: true, preprocess: this.debug.screenReader.debugPreprocess }); + }); + + this.app.post('/api/debug/ocr-preprocess', (req, res) => { + if (!this.debug) { res.status(503).json({ error: 'Debug not available' }); return; } + const { preprocess } = req.body as { preprocess: string }; + if (!['none', 'bgsub', 'tophat'].includes(preprocess)) { + res.status(400).json({ error: 'Invalid preprocess. Must be none, bgsub, or tophat.' }); + return; + } + this.debug.screenReader.debugPreprocess = preprocess as OcrPreprocess; + this.broadcastLog('info', `OCR preprocess set to: ${preprocess}`); + res.json({ ok: true }); + }); + this.app.post('/api/debug/ocr', async (_req, res) => { if (!this.debug) { res.status(503).json({ error: 'Debug not available' }); return; } try { diff --git a/src/dashboard/index.html b/src/dashboard/index.html index 986992f..5babde6 100644 --- a/src/dashboard/index.html +++ b/src/dashboard/index.html @@ -455,6 +455,12 @@ + @@ -1004,8 +1010,27 @@ } catch {} } + async function setOcrPreprocess(preprocess) { + await fetch('/api/debug/ocr-preprocess', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ preprocess }), + }); + } + + async function loadOcrPreprocess() { + try { + const res = await fetch('/api/debug/ocr-preprocess'); + const data = await res.json(); + if (data.ok && data.preprocess) { + document.getElementById('ocrPreprocessSelect').value = data.preprocess; + } + } catch {} + } + connect(); loadOcrEngine(); + loadOcrPreprocess(); diff --git a/src/game/OcrDaemon.ts b/src/game/OcrDaemon.ts index 73b4fa4..21f9a2a 100644 --- a/src/game/OcrDaemon.ts +++ b/src/game/OcrDaemon.ts @@ -67,7 +67,9 @@ export interface TemplateMatchResult { confidence: number; } -export type OcrEngine = 'tesseract' | 'easyocr'; +export type OcrEngine = 'tesseract' | 'easyocr' | 'paddleocr'; + +export type OcrPreprocess = 'none' | 'bgsub' | 'tophat'; interface DaemonRequest { cmd: string; @@ -79,6 +81,7 @@ interface DaemonRequest { minCellSize?: number; maxCellSize?: number; engine?: string; + preprocess?: string; } interface DaemonResponse { @@ -133,10 +136,11 @@ export class OcrDaemon { // ── Public API ────────────────────────────────────────────────────────── - async ocr(region?: Region, engine?: OcrEngine): Promise { + async ocr(region?: Region, engine?: OcrEngine, preprocess?: OcrPreprocess): Promise { const req: DaemonRequest = { cmd: 'ocr' }; if (region) req.region = region; if (engine && engine !== 'tesseract') req.engine = engine; + if (preprocess && preprocess !== 'none') req.preprocess = preprocess; // Python engines need longer timeout for first model load + download const timeout = (engine && engine !== 'tesseract') ? 120_000 : CAPTURE_TIMEOUT; const resp = await this.sendWithRetry(req, timeout); @@ -182,11 +186,12 @@ export class OcrDaemon { await this.sendWithRetry({ cmd: 'snapshot' }, REQUEST_TIMEOUT); } - async diffOcr(savePath?: string, region?: Region, engine?: OcrEngine): Promise { + async diffOcr(savePath?: string, region?: Region, engine?: OcrEngine, preprocess?: OcrPreprocess): Promise { const req: DaemonRequest = { cmd: 'diff-ocr' }; if (savePath) req.path = savePath; if (region) req.region = region; if (engine && engine !== 'tesseract') req.engine = engine; + if (preprocess) req.preprocess = preprocess; const timeout = (engine && engine !== 'tesseract') ? 120_000 : CAPTURE_TIMEOUT; const resp = await this.sendWithRetry(req, timeout); return { diff --git a/src/game/ScreenReader.ts b/src/game/ScreenReader.ts index 7b513f3..94e1afb 100644 --- a/src/game/ScreenReader.ts +++ b/src/game/ScreenReader.ts @@ -1,7 +1,7 @@ import { mkdir } from 'fs/promises'; import { join } from 'path'; import { logger } from '../util/logger.js'; -import { OcrDaemon, type OcrResponse, type OcrEngine, type DiffOcrResponse, type TemplateMatchResult } from './OcrDaemon.js'; +import { OcrDaemon, type OcrResponse, type OcrEngine, type OcrPreprocess, type DiffOcrResponse, type TemplateMatchResult } from './OcrDaemon.js'; import { GridReader, type GridLayout, type CellCoord } from './GridReader.js'; import type { Region } from '../types.js'; @@ -13,6 +13,7 @@ export class ScreenReader { private daemon = new OcrDaemon(); readonly grid = new GridReader(this.daemon); debugOcrEngine: OcrEngine = 'tesseract'; + debugPreprocess: OcrPreprocess = 'bgsub'; // ── Screenshot capture ────────────────────────────────────────────── @@ -241,20 +242,20 @@ export class ScreenReader { async debugDiffOcr(savePath?: string, region?: Region): Promise { const t = performance.now(); - const result = await this.daemon.diffOcr(savePath, region, this.debugOcrEngine); - logger.info({ engine: this.debugOcrEngine, ms: elapsed(t) }, 'debugDiffOcr'); + const result = await this.daemon.diffOcr(savePath, region, this.debugOcrEngine, this.debugPreprocess); + logger.info({ engine: this.debugOcrEngine, preprocess: this.debugPreprocess, ms: elapsed(t) }, 'debugDiffOcr'); return result; } async debugOcr(region?: Region): Promise { const t = performance.now(); - const result = await this.daemon.ocr(region, this.debugOcrEngine); - logger.info({ engine: this.debugOcrEngine, ms: elapsed(t) }, 'debugOcr'); + const result = await this.daemon.ocr(region, this.debugOcrEngine, this.debugPreprocess); + logger.info({ engine: this.debugOcrEngine, preprocess: this.debugPreprocess, ms: elapsed(t) }, 'debugOcr'); return result; } async debugReadFullScreen(): Promise { - const result = await this.daemon.ocr(undefined, this.debugOcrEngine); + const result = await this.daemon.ocr(undefined, this.debugOcrEngine, this.debugPreprocess); return result.text; } @@ -263,13 +264,13 @@ export class ScreenReader { fuzzy: boolean = false, ): Promise<{ x: number; y: number } | null> { const t = performance.now(); - const result = await this.daemon.ocr(undefined, this.debugOcrEngine); + const result = await this.daemon.ocr(undefined, this.debugOcrEngine, this.debugPreprocess); const pos = this.findWordInOcrResult(result, searchText, fuzzy); if (pos) { - logger.info({ searchText, engine: this.debugOcrEngine, x: pos.x, y: pos.y, totalMs: elapsed(t) }, 'debugFindText found'); + logger.info({ searchText, engine: this.debugOcrEngine, preprocess: this.debugPreprocess, x: pos.x, y: pos.y, totalMs: elapsed(t) }, 'debugFindText found'); } else { - logger.info({ searchText, engine: this.debugOcrEngine, totalMs: elapsed(t) }, 'debugFindText not found'); + logger.info({ searchText, engine: this.debugOcrEngine, preprocess: this.debugPreprocess, totalMs: elapsed(t) }, 'debugFindText not found'); } return pos; } diff --git a/tools/OcrDaemon/Daemon.cs b/tools/OcrDaemon/Daemon.cs index 2fc36d5..6cfb898 100644 --- a/tools/OcrDaemon/Daemon.cs +++ b/tools/OcrDaemon/Daemon.cs @@ -1,5 +1,6 @@ namespace OcrDaemon; +using System.Drawing; using System.Text.Json; using System.Text.Json.Serialization; using Tesseract; @@ -74,15 +75,11 @@ static class Daemon object response = request.Cmd?.ToLowerInvariant() switch { - "ocr" when request.Engine is "easyocr" - => pythonBridge.HandleOcr(request, request.Engine), - "ocr" => ocrHandler.HandleOcr(request), + "ocr" => HandleOcrPipeline(ocrHandler, pythonBridge, request), "screenshot" => ocrHandler.HandleScreenshot(request), "capture" => ocrHandler.HandleCapture(request), "snapshot" => ocrHandler.HandleSnapshot(request), - "diff-ocr" when request.Engine is "easyocr" - => HandleDiffOcrPython(ocrHandler, pythonBridge, request), - "diff-ocr" => ocrHandler.HandleDiffOcr(request), + "diff-ocr" => HandleDiffOcrPipeline(ocrHandler, pythonBridge, request), "test" => ocrHandler.HandleTest(request), "tune" => ocrHandler.HandleTune(request), "grid" => gridHandler.HandleGrid(request), @@ -102,11 +99,67 @@ static class Daemon return 0; } - private static object HandleDiffOcrPython(OcrHandler ocrHandler, PythonOcrBridge pythonBridge, Request request) + /// + /// Unified OCR pipeline for full/region captures. + /// Capture → optional preprocess → route to engine (tesseract / easyocr / paddleocr). + /// + private static object HandleOcrPipeline(OcrHandler ocrHandler, PythonOcrBridge pythonBridge, Request request) { + var engine = request.Engine ?? "tesseract"; + var preprocess = request.Preprocess ?? "none"; + + // No preprocess + tesseract = original fast path + if (engine == "tesseract" && preprocess == "none") + return ocrHandler.HandleOcr(request); + + // Capture + using var bitmap = ScreenCapture.CaptureOrLoad(request.File, request.Region); + + // Preprocess + Bitmap processed; + if (preprocess == "tophat") + { + processed = ImagePreprocessor.PreprocessForOcr(bitmap); + } + else if (preprocess == "bgsub") + { + return new ErrorResponse("bgsub preprocess requires a reference frame; use diff-ocr instead."); + } + else // "none" + { + processed = (Bitmap)bitmap.Clone(); + } + using var _processed = processed; + + // Route to engine + if (engine == "tesseract") + { + var region = request.Region != null + ? new RegionRect { X = request.Region.X, Y = request.Region.Y, Width = request.Region.Width, Height = request.Region.Height } + : new RegionRect { X = 0, Y = 0, Width = processed.Width, Height = processed.Height }; + return ocrHandler.RunTesseractOnBitmap(processed, region); + } + else // easyocr, paddleocr + { + return pythonBridge.OcrFromBitmap(processed, engine); + } + } + + /// + /// Unified diff-OCR pipeline for tooltip detection. + /// DiffCrop → preprocess (default=bgsub) → route to engine. + /// + private static object HandleDiffOcrPipeline(OcrHandler ocrHandler, PythonOcrBridge pythonBridge, Request request) + { + var engine = request.Engine ?? "tesseract"; + var preprocess = request.Preprocess ?? "bgsub"; + var isPythonEngine = engine is "easyocr" or "paddleocr"; + + // No engine override + no preprocess override = original Tesseract path (supports test/tune params) + if (engine == "tesseract" && request.Preprocess == null) + return ocrHandler.HandleDiffOcr(request); + var sw = System.Diagnostics.Stopwatch.StartNew(); - // Use default params (same wide crop as Tesseract path). - // Background subtraction below eliminates stash items from the image. var p = new DiffOcrParams(); if (request.Threshold > 0) p.DiffThresh = request.Threshold; @@ -117,46 +170,72 @@ static class Daemon var (cropped, refCropped, current, region) = cropResult.Value; using var _current = current; - // Apply background subtraction to isolate tooltip text. - // This removes stash items and game world — only tooltip text remains. - // No upscale (upscale=1) to keep the image small for EasyOCR speed. - // Hard threshold (softThreshold=false) produces clean binary for OCR. - using var processed = ImagePreprocessor.PreprocessWithBackgroundSub( - cropped, refCropped, dimPercentile: 40, textThresh: 60, upscale: 1, softThreshold: false); + // Preprocess + Bitmap processed; + if (preprocess == "bgsub") + { + int upscale = isPythonEngine ? 1 : 2; + processed = ImagePreprocessor.PreprocessWithBackgroundSub( + cropped, refCropped, dimPercentile: 40, textThresh: 60, upscale: upscale, softThreshold: false); + } + else if (preprocess == "tophat") + { + processed = ImagePreprocessor.PreprocessForOcr(cropped); + } + else // "none" + { + processed = (Bitmap)cropped.Clone(); + } cropped.Dispose(); refCropped.Dispose(); - var diffMs = sw.ElapsedMilliseconds; - // Save processed crop if path provided + var diffMs = sw.ElapsedMilliseconds; + using var _processed = processed; + + // Save debug images if path provided if (!string.IsNullOrEmpty(request.Path)) { var dir = Path.GetDirectoryName(request.Path); if (!string.IsNullOrEmpty(dir) && !Directory.Exists(dir)) Directory.CreateDirectory(dir); + // Save preprocessed crop processed.Save(request.Path, ImageUtils.GetImageFormat(request.Path)); + + var ext = Path.GetExtension(request.Path); + var fullPath = Path.ChangeExtension(request.Path, ".full" + ext); + current.Save(fullPath, ImageUtils.GetImageFormat(fullPath)); } - // Send processed image to Python OCR via base64 + // Route to engine sw.Restart(); - var ocrResult = pythonBridge.OcrFromBitmap(processed, request.Engine!); - var ocrMs = sw.ElapsedMilliseconds; - - Console.Error.WriteLine($" diff-ocr-python: diff={diffMs}ms ocr={ocrMs}ms total={diffMs + ocrMs}ms crop={region.Width}x{region.Height}"); - - // Offset word coordinates to screen space - foreach (var line in ocrResult.Lines) - foreach (var word in line.Words) - { - word.X += region.X; - word.Y += region.Y; - } - - return new DiffOcrResponse + if (engine == "tesseract") { - Text = ocrResult.Text, - Lines = ocrResult.Lines, - Region = region, - }; + var result = ocrHandler.RunTesseractOnBitmap(processed, region); + var ocrMs = sw.ElapsedMilliseconds; + Console.Error.WriteLine($" diff-ocr-pipeline: engine={engine} preprocess={preprocess} diff={diffMs}ms ocr={ocrMs}ms crop={region.Width}x{region.Height}"); + return result; + } + else // easyocr, paddleocr + { + var ocrResult = pythonBridge.OcrFromBitmap(processed, engine); + var ocrMs = sw.ElapsedMilliseconds; + Console.Error.WriteLine($" diff-ocr-pipeline: engine={engine} preprocess={preprocess} diff={diffMs}ms ocr={ocrMs}ms crop={region.Width}x{region.Height}"); + + // Offset word coordinates to screen space + foreach (var line in ocrResult.Lines) + foreach (var word in line.Words) + { + word.X += region.X; + word.Y += region.Y; + } + + return new DiffOcrResponse + { + Text = ocrResult.Text, + Lines = ocrResult.Lines, + Region = region, + }; + } } private static void WriteResponse(object response) diff --git a/tools/OcrDaemon/Models.cs b/tools/OcrDaemon/Models.cs index 55427b0..03ef8c4 100644 --- a/tools/OcrDaemon/Models.cs +++ b/tools/OcrDaemon/Models.cs @@ -42,6 +42,9 @@ class Request [JsonPropertyName("engine")] public string? Engine { get; set; } + + [JsonPropertyName("preprocess")] + public string? Preprocess { get; set; } } class RegionRect diff --git a/tools/OcrDaemon/OcrHandler.cs b/tools/OcrDaemon/OcrHandler.cs index 04dbe07..26cfe46 100644 --- a/tools/OcrDaemon/OcrHandler.cs +++ b/tools/OcrDaemon/OcrHandler.cs @@ -401,6 +401,33 @@ class OcrHandler(TesseractEngine engine) } } + /// + /// Run Tesseract OCR on an already-preprocessed bitmap. Converts to Mat, pads, + /// runs PSM-6, and adjusts word coordinates to screen space using the supplied region. + /// + public DiffOcrResponse RunTesseractOnBitmap(Bitmap processedBmp, RegionRect region, int pad = 10, int upscale = 2, int psm = 6) + { + using var processedMat = BitmapConverter.ToMat(processedBmp); + using var padded = new Mat(); + Cv2.CopyMakeBorder(processedMat, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White); + using var bmp = BitmapConverter.ToBitmap(padded); + using var pix = ImageUtils.BitmapToPix(bmp); + using var page = engine.Process(pix, (PageSegMode)psm); + + var text = page.GetText(); + int effUpscale = upscale > 0 ? upscale : 1; + var lines = ImageUtils.ExtractLinesFromPage(page, + offsetX: region.X - pad / effUpscale, + offsetY: region.Y - pad / effUpscale); + + return new DiffOcrResponse + { + Text = text, + Lines = lines, + Region = region, + }; + } + public object HandleTest(Request req) => RunTestCases(new DiffOcrParams(), verbose: true); public object HandleTune(Request req) diff --git a/tools/python-ocr/daemon.py b/tools/python-ocr/daemon.py index 1d38b60..116a1ac 100644 --- a/tools/python-ocr/daemon.py +++ b/tools/python-ocr/daemon.py @@ -12,6 +12,7 @@ import sys import json _easyocr_reader = None +_paddle_ocr = None def _redirect_stdout_to_stderr(): @@ -100,6 +101,46 @@ def run_easyocr_array(img): return {"ok": True, "text": "\n".join(all_text_parts), "lines": lines} +def get_paddleocr(): + global _paddle_ocr + if _paddle_ocr is None: + sys.stderr.write("Loading PaddleOCR model...\n") + sys.stderr.flush() + real_stdout = _redirect_stdout_to_stderr() + try: + from paddleocr import PaddleOCR + _paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=True, show_log=False) + finally: + _restore_stdout(real_stdout) + sys.stderr.write("PaddleOCR model loaded.\n") + sys.stderr.flush() + return _paddle_ocr + + +def run_paddleocr_array(img): + ocr = get_paddleocr() + + real_stdout = _redirect_stdout_to_stderr() + try: + results = ocr.ocr(img, cls=True) + finally: + _restore_stdout(real_stdout) + + lines = [] + all_text_parts = [] + # PaddleOCR returns [page_results], each item is [bbox_4corners, (text, conf)] + if results and results[0]: + for item in results[0]: + bbox, (text, conf) = item + if not text.strip(): + continue + x, y, w, h = bbox_to_rect(bbox) + words = split_into_words(text, x, y, w, h) + lines.append({"text": text.strip(), "words": words}) + all_text_parts.append(text.strip()) + return {"ok": True, "text": "\n".join(all_text_parts), "lines": lines} + + def load_image(req): """Load image from either imagePath (file) or imageBase64 (base64-encoded PNG).""" from PIL import Image @@ -131,6 +172,8 @@ def handle_request(req): if engine == "easyocr": return run_easyocr_array(img) + elif engine == "paddleocr": + return run_paddleocr_array(img) else: return {"ok": False, "error": f"Unknown engine: {engine}"}