From eb2ea755a5e42c031ce5ab2ad520575ca9e53c3b Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 3 May 2024 14:06:58 +0200 Subject: [PATCH] akra-certificates: fine-tuning for certificates --- .gitattributes | 1 - .../docstrum/DocstrumSegmentationService.java | 5 +- .../processor/docstrum/model/BoundingBox.java | 70 +++++++++++++++++- .../processor/docstrum/model/Zone.java | 2 + .../docstrum/service/ZoneBuilderService.java | 73 +++++++++++++++++++ .../model/text/StringFrequencyCounter.java | 12 +++ .../DocstrumBlockificationService.java | 8 +- .../DocuMineClassificationService.java | 4 +- .../services/factory/SectionNodeFactory.java | 2 +- .../visualization/LayoutGridService.java | 2 +- .../LayoutparsingVisualizations.java | 4 +- .../server/LayoutparserEnd2EndTest.java | 4 +- 12 files changed, 174 insertions(+), 13 deletions(-) delete mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index b634d85..0000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index 2b095a4..6f361ff 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -63,7 +63,10 @@ public class DocstrumSegmentationService { double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); List lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, 
rulings); - return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings); + List zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings); +// return zones; + return zoneBuilderService.mergeZonesUntilConvergence(zones, characterSpacing, lineSpacing, rulings); } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java index 9f79eed..ae2fd62 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java @@ -135,6 +135,12 @@ public abstract class BoundingBox { } + public boolean intersectsYJava(BoundingBox other) { + + return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY(); + } + + public boolean intersectsY(BoundingBox other, float threshold) { return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY(); @@ -143,7 +149,13 @@ public abstract class BoundingBox { public boolean intersectsX(BoundingBox other) { - return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX(); + return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX(); + } + + + public boolean intersectsXJava(BoundingBox other) { + + return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX(); } @@ -182,4 +194,60 @@ public abstract class BoundingBox { } }; + + public double horizontalDistance(BoundingBox other) { + + Rectangle2D left; + Rectangle2D right; + if (this.leftOf(other)) { + left = 
this.getBBox(); + right = other.getBBox(); + } else { + left = other.getBBox(); + right = this.getBBox(); + } + + return Math.max(0, right.getMinX() - left.getMaxX()); + } + + + public double verticalDistance(BoundingBox other) { + + Rectangle2D bottom; + Rectangle2D top; + if (this.isAbove(other)) { + top = this.getBBox(); + bottom = other.getBBox(); + } else { + bottom = this.getBBox(); + top = other.getBBox(); + } + + return Math.max(0, bottom.getMinY() - top.getMaxY()); + } + + + public boolean rightOf(BoundingBox other) { + + return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX(); + } + + + public boolean leftOf(BoundingBox other) { + + return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX(); + } + + + public boolean isAbove(BoundingBox other) { + + return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY(); + } + + + public boolean isBelow(BoundingBox other) { + + return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java index f1c61c5..aa23493 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/Zone.java @@ -29,4 +29,6 @@ public class Zone extends BoundingBox { return sb.toString().trim(); } + + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index ec1871c..3061bfc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; import java.util.ArrayList; +import java.util.Collection; import java.util.Comparator; import java.util.HashSet; import java.util.List; @@ -81,6 +82,78 @@ public class ZoneBuilderService { } + public List mergeZonesUntilConvergence(List zones, double characterSpacing, double lineSpacing, CleanRulings rulings) { + + double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; + double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; + double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER; + double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER; + + UnionFind unionFind = new UnionFind<>(new HashSet<>(zones)); + + double meanHeight = calculateMeanHeight(zones.stream() + .map(Zone::getLines) + .flatMap(Collection::stream) + .toList()); + + zones.forEach(outerZone -> { + zones.forEach(innerZone -> { + + if (innerZone == outerZone // + || unionFind.inSameSet(outerZone, innerZone)// + || !outerZone.intersectsYJava(innerZone) && !innerZone.intersectsXJava(innerZone)) { + return; + } + + double scale = Math.min(outerZone.getHeight(), innerZone.getHeight()) / meanHeight; + scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); + + double horizontalDistance = outerZone.horizontalDistance(innerZone); + double verticalDistance = 
outerZone.verticalDistance(innerZone); + + if (rulings.lineBetween(outerZone, innerZone)) { + return; + } + + if (outerZone.intersectsYJava(innerZone) && horizontalDistance < 10) { + unionFind.union(outerZone, innerZone); + return; + } + + if (outerZone.intersectsXJava(innerZone) && verticalDistance < 6) { + unionFind.union(outerZone, innerZone); + return; + } + + boolean outerZoneEndsWithColon = outerZone.getLines() + .stream() + .allMatch(line -> line.toString().trim().endsWith(":")); + boolean innerZoneDoesNotEndWithColon = innerZone.getLines() + .stream() + .noneMatch(line -> line.toString().trim().endsWith(":")); + if (outerZoneEndsWithColon && innerZoneDoesNotEndWithColon && outerZone.leftOf(innerZone) && innerZone.getMinX() - outerZone.getMinX() < 250) { + unionFind.union(outerZone, innerZone); + return; + } + if (outerZoneEndsWithColon && innerZoneDoesNotEndWithColon && (outerZone.isAbove(innerZone) && verticalDistance < lineSpacing * 2)) { + unionFind.union(outerZone, innerZone); + return; + } + + }); + }); + + return unionFind.getGroups() + .stream() + .map(groupOfZones -> groupOfZones.stream() + .map(Zone::getLines) + .flatMap(Collection::stream) + .collect(Collectors.toList())) + .map(linesInZoneToMerge -> mergeLinesInZone(linesInZoneToMerge, characterSpacing, lineSpacing)) + .toList(); + } + + private double calculateMeanHeight(List lines) { double meanHeight = 0.0; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java index 934b1b3..0fd707d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java @@ -41,6 +41,18 @@ public class StringFrequencyCounter { mostPopular = entry; } } + if (mostPopular != null && mostPopular.getKey().equals("standard")) { + int standard = countPerValue.get(mostPopular.getKey()); + double total = countPerValue.values() + .stream() + .mapToDouble(v -> v).sum(); + if ((double) standard / total > 0.85) { + return mostPopular.getKey(); + } + countPerValue.remove(mostPopular.getKey()); + return getMostPopular(); + + } return mostPopular != null ? mostPopular.getKey() : null; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 4290da9..a5c8959 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -56,9 +56,9 @@ public class DocstrumBlockificationService { var classificationPage = new ClassificationPage(pageBlocks); classificationPage.setCleanRulings(rulings); - mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0); + mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f); - if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) { + if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) { combineBlocks(classificationPage); } @@ -264,7 +264,9 @@ public class 
DocstrumBlockificationService { continue; } - if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) { + if (current.getDir() == inner.getDir() && (Math.abs(current.getHighestFontSize() - inner.getHighestFontSize()) < 1.1f && current.getHighestFontSize() > 12 && inner.getHighestFontSize() > 12 // + && current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle()) // + && current.intersects(inner, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) { boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); current.getSequences().addAll(inner.getSequences()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index d0ee204..8a3ff80 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -62,6 +62,7 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } + /* if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) @@ -73,7 +74,8 @@ public class DocuMineClassificationService { .getMostPopular() == null || 
textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) ) { textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, + } else */ + if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() .size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index cca8558..b6e09f5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -117,7 +117,7 @@ public class SectionNodeFactory { if (abstractPageBlock instanceof TextPageBlock) { switch (layoutParsingType) { - case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { + case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { alreadyMerged.add(abstractPageBlock); remainingBlocks.remove(abstractPageBlock); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 4cdc5bc..e0fe7cd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -68,7 +68,7 @@ public class LayoutGridService { public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) { List allVisualizations; - Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false); + Visualizations layoutGrid = this.addLayoutGrid(document, true, false); if (writeVisualLayoutParsingGrid) { Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true); allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java index e89ef31..1ca34b5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java @@ -74,7 +74,7 @@ public class LayoutparsingVisualizations { final Visualizations 
words = Visualizations.builder().layer(ContentStreams.WORDS).build(); final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build(); - final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build(); + final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(true).build(); final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build(); final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build(); final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build(); @@ -180,7 +180,7 @@ public class LayoutparsingVisualizations { visualizationsOnPage.getColoredRectangles() .addAll(zones.stream() .map(BoundingBox::getBBoxInitialUserSpace) - .map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1)) + .map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 0.5f)) .toList()); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 8763e37..ed044d6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @SneakyThrows public void testLayoutParserEndToEndWithFolder() { - String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files"; + String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/certificates-ocred"; List pdfFiles = Files.walk(Path.of(folder)) .filter(path -> 
path.getFileName().toString().endsWith(".pdf")) .sorted(Comparator.comparing(Path::getFileName)) @@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { file = new File(filePath); } - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.CLARIFYND_PARAGRAPH_DEBUG, true); prepareStorage(layoutParsingRequest, file); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);