RED-9746: improve invisible element removal

This commit is contained in:
Kilian Schüttler 2024-08-19 14:47:48 +02:00
parent b019ca8e2d
commit 2cadfdb8bf
6 changed files with 12 additions and 10 deletions

View File

@ -70,7 +70,7 @@ int concurrency = 8;
int batchSize = 128;
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
boolean tableDetection; // writes the tables to the PDF as invisible lines.
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless of whether it has images or not
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....

View File

@ -21,7 +21,7 @@ dependencies {
api("org.apache.commons:commons-math3:3.6.1")
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
api("com.iqser.red.commons:pdftron-logic-commons:2.30.0")
api("com.knecon.fforesight:viewer-doc-processor:0.148.0")
api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")

View File

@ -18,7 +18,7 @@ public class OcrServiceSettings {
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
boolean tableDetection; // writes the tables to the PDF as invisible lines.
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless of whether it has images or not
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....

View File

@ -20,10 +20,12 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
ConcurrentHashMap<Integer, PageInformation> pageInformationMap = new ConcurrentHashMap<>();
int pageNumber = 1;
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
Page page = iterator.next();
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
try (PageIterator iterator = pdfDoc.getPageIterator()) {
while (iterator.hasNext()) {
Page page = iterator.next();
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
pageNumber++;
}
}
return pageInformationMap;
}

View File

@ -53,7 +53,7 @@ public class ImageProcessingSupervisor {
public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {
if (hasErros()) {
if (hasErrors()) {
return null;
}
getPageLatch(pageNumber).await();
@ -61,7 +61,7 @@ public class ImageProcessingSupervisor {
}
private boolean hasErros() {
/**
 * Reports whether at least one error has been recorded in {@code errors}.
 *
 * <p>{@code awaitProcessedPage} returns {@code null} instead of waiting on the
 * page latch when this method returns {@code true}.
 *
 * @return {@code true} if {@code errors} is not empty, {@code false} otherwise
 */
private boolean hasErrors() {
    // Must be negated: returning errors.isEmpty() directly reported "has errors"
    // exactly when there were none, which made the caller bail out with null on
    // the error-free path and block on the latch after a failure.
    return !errors.isEmpty();
}

View File

@ -88,7 +88,7 @@ public class WritableOcrResultFactory {
var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words);
if (settings.isTableDetection()) {
if (settings.isDrawTablesAsLines()) {
builder.tableLines(getTableLines(analyzeResult, pageInformation, pageCtm));
}