RED-9746: improve invisible element removal
This commit is contained in:
parent
b019ca8e2d
commit
2cadfdb8bf
@ -70,7 +70,7 @@ int concurrency = 8;
|
||||
int batchSize = 128;
|
||||
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
|
||||
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
|
||||
boolean tableDetection; // writes the tables to the PDF as invisible lines.
|
||||
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
|
||||
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
|
||||
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
|
||||
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
|
||||
|
||||
@ -21,7 +21,7 @@ dependencies {
|
||||
api("org.apache.commons:commons-math3:3.6.1")
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.30.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.148.0")
|
||||
api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
|
||||
@ -18,7 +18,7 @@ public class OcrServiceSettings {
|
||||
|
||||
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
|
||||
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
|
||||
boolean tableDetection; // writes the tables to the PDF as invisible lines.
|
||||
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
|
||||
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
|
||||
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
|
||||
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
|
||||
|
||||
@ -20,10 +20,12 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
|
||||
|
||||
ConcurrentHashMap<Integer, PageInformation> pageInformationMap = new ConcurrentHashMap<>();
|
||||
int pageNumber = 1;
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
|
||||
|
||||
Page page = iterator.next();
|
||||
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
|
||||
try (PageIterator iterator = pdfDoc.getPageIterator()) {
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
|
||||
pageNumber++;
|
||||
}
|
||||
}
|
||||
return pageInformationMap;
|
||||
}
|
||||
|
||||
@ -53,7 +53,7 @@ public class ImageProcessingSupervisor {
|
||||
|
||||
public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {
|
||||
|
||||
if (hasErros()) {
|
||||
if (hasErrors()) {
|
||||
return null;
|
||||
}
|
||||
getPageLatch(pageNumber).await();
|
||||
@ -61,7 +61,7 @@ public class ImageProcessingSupervisor {
|
||||
}
|
||||
|
||||
|
||||
private boolean hasErros() {
|
||||
private boolean hasErrors() {
|
||||
|
||||
return errors.isEmpty();
|
||||
}
|
||||
|
||||
@ -88,7 +88,7 @@ public class WritableOcrResultFactory {
|
||||
|
||||
var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words);
|
||||
|
||||
if (settings.isTableDetection()) {
|
||||
if (settings.isDrawTablesAsLines()) {
|
||||
builder.tableLines(getTableLines(analyzeResult, pageInformation, pageCtm));
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user