work on OCR
This commit is contained in:
parent
6600969947
commit
854a474435
13 changed files with 4374 additions and 38 deletions
|
|
@ -4,6 +4,8 @@ using System.Drawing;
|
|||
using System.Drawing.Imaging;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text.Json;
|
||||
using OpenCvSharp;
|
||||
using OpenCvSharp.Extensions;
|
||||
using Tesseract;
|
||||
using SdImageFormat = System.Drawing.Imaging.ImageFormat;
|
||||
|
||||
|
|
@ -188,11 +190,10 @@ class OcrHandler(TesseractEngine engine)
|
|||
return new OcrResponse { Text = "", Lines = [] };
|
||||
}
|
||||
|
||||
int pad = 0;
|
||||
int minX = Math.Max(bestColStart - pad, 0);
|
||||
int minY = Math.Max(bestRowStart - pad, 0);
|
||||
int maxX = Math.Min(bestColEnd + pad, w - 1);
|
||||
int maxY = Math.Min(bestRowEnd + pad, h - 1);
|
||||
int minX = bestColStart;
|
||||
int minY = bestRowStart;
|
||||
int maxX = Math.Min(bestColEnd, w - 1);
|
||||
int maxY = Math.Min(bestRowEnd, h - 1);
|
||||
|
||||
// Dynamic right-edge trim: if the rightmost columns are much sparser than
|
||||
// the tooltip body, trim them. This handles the ~5% of cases where ambient
|
||||
|
|
@ -232,10 +233,18 @@ class OcrHandler(TesseractEngine engine)
|
|||
if (debug) Console.Error.WriteLine($" diff-ocr: saved raw to {req.Path}");
|
||||
}
|
||||
|
||||
// Pre-process for OCR
|
||||
using var processed = p.UseBackgroundSub
|
||||
? ImagePreprocessor.PreprocessWithBackgroundSub(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale)
|
||||
: ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
|
||||
// Pre-process for OCR — get Mat for per-line detection and padding
|
||||
Mat processedMat;
|
||||
if (p.UseBackgroundSub)
|
||||
{
|
||||
processedMat = ImagePreprocessor.PreprocessWithBackgroundSubMat(cropped, refCropped, p.DimPercentile, p.TextThresh, p.Upscale, p.SoftThreshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
using var topHatBmp = ImagePreprocessor.PreprocessForOcr(cropped, p.KernelSize, p.Upscale);
|
||||
processedMat = BitmapConverter.ToMat(topHatBmp);
|
||||
}
|
||||
using var _processedMat = processedMat; // ensure disposal
|
||||
|
||||
// Save fullscreen and preprocessed versions alongside raw
|
||||
if (!string.IsNullOrEmpty(req.Path))
|
||||
|
|
@ -246,21 +255,114 @@ class OcrHandler(TesseractEngine engine)
|
|||
if (debug) Console.Error.WriteLine($" diff-ocr: saved fullscreen to {fullPath}");
|
||||
|
||||
var prePath = Path.ChangeExtension(req.Path, ".pre" + ext);
|
||||
processed.Save(prePath, ImageUtils.GetImageFormat(prePath));
|
||||
using var preBmp = BitmapConverter.ToBitmap(processedMat);
|
||||
preBmp.Save(prePath, ImageUtils.GetImageFormat(prePath));
|
||||
if (debug) Console.Error.WriteLine($" diff-ocr: saved preprocessed to {prePath}");
|
||||
}
|
||||
using var pix = ImageUtils.BitmapToPix(processed);
|
||||
using var page = engine.Process(pix);
|
||||
|
||||
var text = page.GetText();
|
||||
var lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX, offsetY: minY);
|
||||
int pad = p.OcrPad;
|
||||
int upscale = p.Upscale > 0 ? p.Upscale : 1;
|
||||
var lines = new List<OcrLineResult>();
|
||||
|
||||
return new DiffOcrResponse
|
||||
// Per-line OCR: detect text lines via horizontal projection, OCR each individually
|
||||
if (p.UsePerLineOcr)
|
||||
{
|
||||
Text = text,
|
||||
Lines = lines,
|
||||
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
|
||||
};
|
||||
// DetectTextLines needs binary input; if soft threshold produced grayscale, binarize a copy
|
||||
int minRowPx = Math.Max(processedMat.Cols / 200, 3);
|
||||
using var detectionMat = p.SoftThreshold ? new Mat() : null;
|
||||
if (p.SoftThreshold)
|
||||
Cv2.Threshold(processedMat, detectionMat!, 128, 255, ThresholdTypes.Binary);
|
||||
var lineDetectInput = p.SoftThreshold ? detectionMat! : processedMat;
|
||||
var textLines = ImagePreprocessor.DetectTextLines(lineDetectInput, minRowPixels: minRowPx, gapTolerance: p.LineGapTolerance * upscale);
|
||||
if (debug) Console.Error.WriteLine($" diff-ocr: detected {textLines.Count} text lines");
|
||||
|
||||
if (textLines.Count > 0)
|
||||
{
|
||||
int linePadY = p.LinePadY;
|
||||
foreach (var (yStart, yEnd) in textLines)
|
||||
{
|
||||
int y0 = Math.Max(yStart - linePadY, 0);
|
||||
int y1 = Math.Min(yEnd + linePadY, processedMat.Rows - 1);
|
||||
int lineH = y1 - y0 + 1;
|
||||
|
||||
// Crop line strip (full width)
|
||||
using var lineStrip = new Mat(processedMat, new OpenCvSharp.Rect(0, y0, processedMat.Cols, lineH));
|
||||
|
||||
// Add whitespace padding around the line
|
||||
using var padded = new Mat();
|
||||
Cv2.CopyMakeBorder(lineStrip, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White);
|
||||
|
||||
using var lineBmp = BitmapConverter.ToBitmap(padded);
|
||||
using var linePix = ImageUtils.BitmapToPix(lineBmp);
|
||||
using var linePage = engine.Process(linePix, (PageSegMode)p.Psm);
|
||||
|
||||
// Extract words, adjusting coordinates back to screen space
|
||||
// Word coords are in padded image space → subtract pad, add line offset, scale to original, add region offset
|
||||
var lineWords = new List<OcrWordResult>();
|
||||
using var iter = linePage.GetIterator();
|
||||
if (iter != null)
|
||||
{
|
||||
iter.Begin();
|
||||
do
|
||||
{
|
||||
var wordText = iter.GetText(PageIteratorLevel.Word);
|
||||
if (string.IsNullOrWhiteSpace(wordText)) continue;
|
||||
|
||||
float conf = iter.GetConfidence(PageIteratorLevel.Word);
|
||||
if (conf < 50) continue;
|
||||
|
||||
if (iter.TryGetBoundingBox(PageIteratorLevel.Word, out var bounds))
|
||||
{
|
||||
lineWords.Add(new OcrWordResult
|
||||
{
|
||||
Text = wordText.Trim(),
|
||||
X = (bounds.X1 - pad + 0) / upscale + minX,
|
||||
Y = (bounds.Y1 - pad + y0) / upscale + minY,
|
||||
Width = bounds.Width / upscale,
|
||||
Height = bounds.Height / upscale,
|
||||
});
|
||||
}
|
||||
} while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
|
||||
}
|
||||
|
||||
if (lineWords.Count > 0)
|
||||
{
|
||||
var lineText = string.Join(" ", lineWords.Select(w => w.Text));
|
||||
lines.Add(new OcrLineResult { Text = lineText, Words = lineWords });
|
||||
}
|
||||
}
|
||||
|
||||
var text = string.Join("\n", lines.Select(l => l.Text)) + "\n";
|
||||
return new DiffOcrResponse
|
||||
{
|
||||
Text = text,
|
||||
Lines = lines,
|
||||
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
|
||||
};
|
||||
}
|
||||
|
||||
if (debug) Console.Error.WriteLine(" diff-ocr: no text lines detected, falling back to whole-block OCR");
|
||||
}
|
||||
|
||||
// Whole-block fallback: add padding and use configurable PSM
|
||||
{
|
||||
using var padded = new Mat();
|
||||
Cv2.CopyMakeBorder(processedMat, padded, pad, pad, pad, pad, BorderTypes.Constant, Scalar.White);
|
||||
using var bmp = BitmapConverter.ToBitmap(padded);
|
||||
using var pix = ImageUtils.BitmapToPix(bmp);
|
||||
using var page = engine.Process(pix, (PageSegMode)p.Psm);
|
||||
|
||||
var text = page.GetText();
|
||||
// Adjust word coordinates: subtract padding offset
|
||||
lines = ImageUtils.ExtractLinesFromPage(page, offsetX: minX - pad / upscale, offsetY: minY - pad / upscale);
|
||||
|
||||
return new DiffOcrResponse
|
||||
{
|
||||
Text = text,
|
||||
Lines = lines,
|
||||
Region = new RegionRect { X = minX, Y = minY, Width = rw, Height = rh },
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Invokes <c>RunTestCases</c> with a default-constructed <see cref="DiffOcrParams"/>
/// and verbose output enabled.
/// </summary>
/// <param name="req">Incoming request; this handler does not read it.</param>
/// <returns>The value produced by <c>RunTestCases</c>, passed through unchanged.</returns>
public object HandleTest(Request req)
{
    var defaultParams = new DiffOcrParams();
    return RunTestCases(defaultParams, verbose: true);
}
|
||||
|
|
@ -314,6 +416,8 @@ class OcrHandler(TesseractEngine engine)
|
|||
("colThreshDiv", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.ColThreshDiv = v),
|
||||
("maxGap", [5, 8, 10, 12, 15, 20, 25, 30], (p, v) => p.MaxGap = v),
|
||||
("upscale", [1, 2, 3], (p, v) => p.Upscale = v),
|
||||
("ocrPad", [0, 5, 10, 15, 20, 30], (p, v) => p.OcrPad = v),
|
||||
("psm", [4, 6, 11, 13], (p, v) => p.Psm = v),
|
||||
};
|
||||
|
||||
// Top-hat specific
|
||||
|
|
@ -325,8 +429,10 @@ class OcrHandler(TesseractEngine engine)
|
|||
// Background-subtraction specific
|
||||
var bgSubSweeps = new (string Name, int[] Values, Action<DiffOcrParams, int> Set)[]
|
||||
{
|
||||
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
|
||||
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
|
||||
("dimPercentile", [5, 10, 15, 20, 25, 30, 40, 50], (p, v) => p.DimPercentile = v),
|
||||
("textThresh", [10, 15, 20, 25, 30, 40, 50, 60, 80], (p, v) => p.TextThresh = v),
|
||||
("lineGapTolerance", [3, 5, 8, 10, 15], (p, v) => p.LineGapTolerance = v),
|
||||
("linePadY", [5, 10, 15, 20], (p, v) => p.LinePadY = v),
|
||||
};
|
||||
|
||||
double[] trimValues = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5];
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue