work on OCR

This commit is contained in:
Boki 2026-02-11 17:42:28 -05:00
parent 6600969947
commit 854a474435
13 changed files with 4374 additions and 38 deletions

View file

@ -4,6 +4,8 @@ using System.Drawing;
using System.Drawing.Imaging;
using System.Runtime.InteropServices;
using System.Text.Json;
using OpenCvSharp;
using OpenCvSharp.Extensions;
using Tesseract;
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
@ -188,11 +190,10 @@ class OcrHandler(TesseractEngine engine)
return new OcrResponse { Text = "", Lines = [] };
}
int pad = 0;
int minX = Math.Max(bestColStart - pad, 0);
int minY = Math.Max(bestRowStart - pad, 0);
int maxX = Math.Min(bestColEnd + pad, w - 1);
int maxY = Math.Min(bestRowEnd + pad, h - 1);
int minX = bestColStart;
int minY = bestRowStart;
int maxX = Math.Min(bestColEnd, w - 1);
int maxY = Math.Min(bestRowEnd, h - 1);
// Dynamic right-edge trim: if the rightmost columns are much sparser than
// the tooltip body, trim them. This handles the ~5% of cases where ambient
@ -232,10 +233,18 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
}
// Pre-process for OCR
using var processed = p.UseBackgroundSub
? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale)
: ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
// Pre-process for OCR — get Mat for per-line detection and padding
Mat processedMat;
if (p.UseBackgroundSub)
{
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale, p.SoftThreshold);
}
else
{
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
processedMat = BitmapConverter.ToMat(topHatBmp);
}
using var _processedMat = processedMat; // ensure disposal
// Save fullscreen and preprocessed versions alongside raw
if (!string.IsNullOrEmpty(req.Path))
@ -246,21 +255,114 @@ class OcrHandler(TesseractEngine engine)
if (debug) Console.Error.WriteLine($" diff-ocr: saved fullscreen to {fullPath}");
var prePath = Path.ChangeExtension(req.Path, ".pre" + ext);
processed.Save(prePath, ImageUtils.GetImageFormat(prePath));
using var preBmp = BitmapConverter.ToBitmap(processedMat);
preBmp.Save(prePath, ImageUtils.GetImageFormat(prePath));
if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
}
using var pix = ImageUtils.BitmapToPix(processed);
using var page = engine.Process(pix);
var text = page.GetText();
var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY);
int pad = p.OcrPad;
int upscale = p.Upscale > 0 ? p.Upscale : 1;
var lines = new List<OcrLineResult>();
return new DiffOcrResponse
// Per-line OCR: detect text lines via horizontal projection, OCR each individually
if (p.UsePerLineOcr)
{
Text = text,
Lines = lines,
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
};
// DetectTextLines needs binary input; if soft threshold produced grayscale, binarize a copy
int minRowPx = Math.Max(processedMat.Cols / 200, 3);
using var detectionMat = p.SoftThreshold ? new Mat() : null;
if (p.SoftThreshold)
Cv2.Threshold(processedMat, detectionMat!, 128, 255, ThresholdTypes.Binary);
var lineDetectInput = p.SoftThreshold ? detectionMat! : processedMat;
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: p.LineGapTolerance * upscale);
if (debug) Console.Error.WriteLine($" diff-ocr: detected {textLines.Count} text lines");
if (textLines.Count > 0)
{
int linePadY = p.LinePadY;
foreach (var (yStart, yEnd) in textLines)
{
int y0 = Math.Max(yStart - linePadY, 0);
int y1 = Math.Min(yEnd + linePadY, processedMat.Rows - 1);
int lineH = y1 - y0 + 1;
// Crop line strip (full width)
using var lineStrip = new Mat(processedMat, new OpenCvSharp.Rect(0, y0, processedMat.Cols, lineH));
// Add whitespace padding around the line
using var padded = new Mat();
Cv2.CopyMakeBorder(lineStrip, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White);
using var lineBmp = BitmapConverter.ToBitmap(padded);
using var linePix = ImageUtils.BitmapToPix(lineBmp);
using var linePage = engine.Process(linePix, (PageSegMode)p.Psm);
// Extract words, adjusting coordinates back to screen space
// Word coords are in padded image space → subtract pad, add line offset, scale to original, add region offset
var lineWords = new List<OcrWordResult>();
using var iter = linePage.GetIterator();
if (iter != null)
{
iter.Begin();
do
{
var wordText = iter.GetText(PageIteratorLevel.Word);
if (string.IsNullOrWhiteSpace(wordText)) continue;
float conf = iter.GetConfidence(PageIteratorLevel.Word);
if (conf < 50) continue;
if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var bounds))
{
lineWords.Add(new OcrWordResult
{
Text = wordText.Trim(),
X = (bounds.X1 - pad + 0) / upscale + minX,
Y = (bounds.Y1 - pad + y0) / upscale + minY,
Width = bounds.Width / upscale,
Height = bounds.Height / upscale,
});
}
} while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
}
if (lineWords.Count > 0)
{
var lineText = string.Join(" ", lineWords.Select(w => w.Text));
lines.Add(new OcrLineResult { Text = lineText, Words = lineWords });
}
}
var text = string.Join("\n", lines.Select(l => l.Text)) + "\n";
return new DiffOcrResponse
{
Text = text,
Lines = lines,
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
};
}
if (debug) Console.Error.WriteLine(" diff-ocr: no text lines detected, falling back to whole-block OCR");
}
// Whole-block fallback: add padding and use configurable PSM
{
using var padded = new Mat();
Cv2.CopyMakeBorder(processedMat, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White);
using var bmp = BitmapConverter.ToBitmap(padded);
using var pix = ImageUtils.BitmapToPix(bmp);
using var page = engine.Process(pix, (PageSegMode)p.Psm);
var text = page.GetText();
// Adjust word coordinates: subtract padding offset
lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX - pad / upscale, offsetY: minY - pad / upscale);
return new DiffOcrResponse
{
Text = text,
Lines = lines,
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
};
}
}
/// <summary>
/// Runs the test cases using a default-constructed <see cref="DiffOcrParams"/> with verbose output enabled.
/// </summary>
/// <param name="req">Incoming request; not read by this handler.</param>
/// <returns>The value produced by <c>RunTestCases</c> for the default parameters.</returns>
public object HandleTest(Request req)
{
    var defaultParams = new DiffOcrParams();
    return RunTestCases(defaultParams, verbose: true);
}
@ -314,6 +416,8 @@ class OcrHandler(TesseractEngine engine)
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
("ocrPad", [0, 5, 10, 15, 20, 30], (p, v) => p.OcrPad = v),
("psm", [4, 6, 11, 13], (p, v) => p.Psm = v),
};
// Top-hat specific
@ -325,8 +429,10 @@ class OcrHandler(TesseractEngine engine)
// Background-subtraction specific
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
{
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
("lineGapTolerance", [3, 5, 8, 10, 15], (p, v) => p.LineGapTolerance = v),
("linePadY", [5, 10, 15, 20], (p, v) => p.LinePadY = v),
};
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];