akra-certificates: fine-tuning for certificates

This commit is contained in:
Kilian Schuettler 2024-05-03 14:06:58 +02:00
parent 07733d0855
commit eb2ea755a5
12 changed files with 174 additions and 13 deletions

1
.gitattributes vendored
View File

@ -1 +0,0 @@
*.pdf filter=lfs diff=lfs merge=lfs -text

View File

@ -63,7 +63,10 @@ public class DocstrumSegmentationService {
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
// return zones;
return zoneBuilderService.mergeZonesUntilConvergence(zones, characterSpacing, lineSpacing, rulings);
}
}

View File

@ -135,6 +135,12 @@ public abstract class BoundingBox {
}
/**
 * Tests whether the Y interval of this box ({@code getY()} .. {@code getMaxY()}) shares at least one
 * point with the Y interval of {@code other}. Boxes whose edges merely touch count as intersecting.
 * NOTE(review): the "Java" suffix presumably refers to the Java (top-left origin) coordinate space,
 * as opposed to the {@code getPdf...} accessors - confirm.
 *
 * @param other the box to compare against
 * @return {@code true} if the two Y intervals overlap or touch
 */
public boolean intersectsYJava(BoundingBox other) {
    // This box starts after the other one has already ended: no overlap possible.
    if (!(this.getY() <= other.getMaxY())) {
        return false;
    }
    // Otherwise they overlap exactly when this box ends at or after the other one's start.
    return this.getMaxY() >= other.getY();
}
public boolean intersectsY(BoundingBox other, float threshold) {
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
@ -143,7 +149,13 @@ public abstract class BoundingBox {
public boolean intersectsX(BoundingBox other) {
return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX();
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
}
/**
 * Tests whether the X interval of this box shares at least one point with the X interval of
 * {@code other}. Boxes whose edges merely touch count as intersecting.
 * NOTE(review): this mixes {@code getX()} (for this box) and {@code getMinX()} (for the other box);
 * presumably both return the same left edge - confirm.
 *
 * @param other the box to compare against
 * @return {@code true} if the two X intervals overlap or touch
 */
public boolean intersectsXJava(BoundingBox other) {
    // This box starts right of the other one's end: no overlap possible.
    if (!(this.getX() <= other.getMaxX())) {
        return false;
    }
    // Otherwise they overlap exactly when this box ends at or after the other one's start.
    return this.getMaxX() >= other.getMinX();
}
@ -182,4 +194,60 @@ public abstract class BoundingBox {
}
};
/**
 * Computes the horizontal gap between this box and {@code other}, measured on the rectangles
 * returned by {@code getBBox()}.
 * <p>
 * The result is symmetric ({@code a.horizontalDistance(b) == b.horizontalDistance(a)}) and is
 * {@code 0} whenever the two X intervals overlap or touch.
 *
 * @param other the box to measure against
 * @return the non-negative distance between the facing vertical edges, or {@code 0} if the boxes overlap horizontally
 */
public double horizontalDistance(BoundingBox other) {
    Rectangle2D own = this.getBBox();
    Rectangle2D theirs = other.getBBox();

    // Deliberately not decided via leftOf(): that predicate also requires a vertical overlap, which
    // would make the result depend on the argument order for boxes that are diagonal to each other.
    double gapIfThisIsLeft = theirs.getMinX() - own.getMaxX();
    double gapIfThisIsRight = own.getMinX() - theirs.getMaxX();

    // At most one of the two gaps can be positive; if neither is, the boxes overlap in X.
    return Math.max(0, Math.max(gapIfThisIsLeft, gapIfThisIsRight));
}
/**
 * Computes the vertical gap between this box and {@code other}, measured on the rectangles
 * returned by {@code getBBox()}.
 * <p>
 * The result is symmetric ({@code a.verticalDistance(b) == b.verticalDistance(a)}) and is
 * {@code 0} whenever the two Y intervals overlap or touch.
 *
 * @param other the box to measure against
 * @return the non-negative distance between the facing horizontal edges, or {@code 0} if the boxes overlap vertically
 */
public double verticalDistance(BoundingBox other) {
    Rectangle2D own = this.getBBox();
    Rectangle2D theirs = other.getBBox();

    // Deliberately not decided via isAbove(): that predicate also requires a horizontal overlap, which
    // would make the result depend on the argument order for boxes that are diagonal to each other.
    double gapIfThisIsFirst = theirs.getMinY() - own.getMaxY();
    double gapIfOtherIsFirst = own.getMinY() - theirs.getMaxY();

    // At most one of the two gaps can be positive; if neither is, the boxes overlap in Y.
    return Math.max(0, Math.max(gapIfThisIsFirst, gapIfOtherIsFirst));
}
/**
 * Tests whether this box lies to the right of {@code other}: both boxes must share some vertical
 * extent (see {@code intersectsYJava}) and the right edge of {@code other} must not extend past
 * the left edge of this box. Touching edges count as "right of".
 *
 * @param other the box expected on the left-hand side
 * @return {@code true} if this box is on the same row as, and entirely right of, {@code other}
 */
public boolean rightOf(BoundingBox other) {
    // Boxes without any vertical overlap are never considered side by side.
    if (!this.intersectsYJava(other)) {
        return false;
    }
    return other.getMaxX() <= this.getMinX();
}
/**
 * Tests whether this box lies to the left of {@code other}: both boxes must share some vertical
 * extent (see {@code intersectsYJava}) and the left edge of {@code other} must not start before
 * the right edge of this box. Touching edges count as "left of".
 *
 * @param other the box expected on the right-hand side
 * @return {@code true} if this box is on the same row as, and entirely left of, {@code other}
 */
public boolean leftOf(BoundingBox other) {
    // Boxes without any vertical overlap are never considered side by side.
    if (!this.intersectsYJava(other)) {
        return false;
    }
    return other.getMinX() >= this.getMaxX();
}
/**
 * Tests whether this box lies above {@code other}: both boxes must share some horizontal extent
 * (see {@code intersectsXJava}) and {@code getMaxY()} of this box must not exceed
 * {@code getMinY()} of {@code other}. Touching edges count as "above".
 * NOTE(review): "above" presumably assumes a Y axis that grows downwards - confirm.
 *
 * @param other the box expected underneath
 * @return {@code true} if this box is in the same column as {@code other} and ends before it starts
 */
public boolean isAbove(BoundingBox other) {
    // Boxes without any horizontal overlap are never considered stacked.
    if (!this.intersectsXJava(other)) {
        return false;
    }
    return other.getMinY() >= this.getMaxY();
}
/**
 * Tests whether this box lies below {@code other}: both boxes must share some horizontal extent
 * (see {@code intersectsXJava}) and {@code getMinY()} of this box must be at least
 * {@code getMaxY()} of {@code other}. Touching edges count as "below".
 * NOTE(review): "below" presumably assumes a Y axis that grows downwards - confirm.
 *
 * @param other the box expected on top
 * @return {@code true} if this box is in the same column as {@code other} and starts after it ends
 */
public boolean isBelow(BoundingBox other) {
    // Boxes without any horizontal overlap are never considered stacked.
    if (!this.intersectsXJava(other)) {
        return false;
    }
    return this.getMinY() >= other.getMaxY();
}
}

View File

@ -29,4 +29,6 @@ public class Zone extends BoundingBox {
return sb.toString().trim();
}
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
@ -81,6 +82,78 @@ public class ZoneBuilderService {
}
/**
 * Groups the given zones with a union-find structure and rebuilds one zone per group.
 * <p>
 * Every ordered pair of distinct zones is inspected once. A pair is joined when no ruling lies
 * between the two zones (as reported by {@code rulings.lineBetween}) and one of these holds:
 * <ul>
 * <li>they overlap vertically and their horizontal gap is below 10,</li>
 * <li>they overlap horizontally and their vertical gap is below 6,</li>
 * <li>every line of the outer zone ends with ":" while no line of the inner zone does, and the
 * inner zone is either to the right of the outer zone (left edges less than 250 apart) or
 * underneath it (vertical gap below two line spacings).</li>
 * </ul>
 * Transitive merges are handled by the union-find structure, so a single pass over all pairs is made.
 * The lines of each resulting group are collected and handed to {@code mergeLinesInZone}.
 *
 * @param zones            the zones to merge; not modified by this method itself
 * @param characterSpacing character spacing forwarded to {@code mergeLinesInZone}
 * @param lineSpacing      line spacing, used for the label/value vertical threshold and forwarded to {@code mergeLinesInZone}
 * @param rulings          rulings used to veto merges across a line
 * @return one zone per group of merged input zones
 */
public List<Zone> mergeZonesUntilConvergence(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {

    // Merge thresholds. NOTE(review): units presumably match the bounding box coordinates - confirm.
    final double maxHorizontalGap = 10;
    final double maxVerticalGap = 6;
    final double maxLabelValueOffset = 250;
    final double maxLabelValueVerticalGap = lineSpacing * 2;

    UnionFind<Zone> unionFind = new UnionFind<>(new HashSet<>(zones));

    for (Zone outerZone : zones) {
        // Depends on the outer zone only, so it is evaluated once instead of once per pair.
        boolean outerZoneEndsWithColon = outerZone.getLines()
                .stream()
                .allMatch(line -> line.toString().trim().endsWith(":"));

        for (Zone innerZone : zones) {
            if (innerZone == outerZone || unionFind.inSameSet(outerZone, innerZone)) {
                continue;
            }

            boolean overlapsInY = outerZone.intersectsYJava(innerZone);
            boolean overlapsInX = outerZone.intersectsXJava(innerZone);
            // Zones that overlap on neither axis can satisfy none of the merge rules below.
            // (Both checks must compare outer with inner; testing a zone against itself would be always true.)
            if (!overlapsInY && !overlapsInX) {
                continue;
            }

            // A ruling between the two zones vetoes every kind of merge.
            if (rulings.lineBetween(outerZone, innerZone)) {
                continue;
            }

            // Rule 1: same row and close together.
            if (overlapsInY && outerZone.horizontalDistance(innerZone) < maxHorizontalGap) {
                unionFind.union(outerZone, innerZone);
                continue;
            }

            // Rule 2: same column and close together.
            double verticalDistance = outerZone.verticalDistance(innerZone);
            if (overlapsInX && verticalDistance < maxVerticalGap) {
                unionFind.union(outerZone, innerZone);
                continue;
            }

            // Rule 3: "label:" zone followed by its value, either to the right or underneath.
            if (!outerZoneEndsWithColon) {
                continue;
            }
            boolean innerZoneDoesNotEndWithColon = innerZone.getLines()
                    .stream()
                    .noneMatch(line -> line.toString().trim().endsWith(":"));
            if (!innerZoneDoesNotEndWithColon) {
                continue;
            }

            boolean valueToTheRight = outerZone.leftOf(innerZone) && innerZone.getMinX() - outerZone.getMinX() < maxLabelValueOffset;
            boolean valueUnderneath = outerZone.isAbove(innerZone) && verticalDistance < maxLabelValueVerticalGap;
            if (valueToTheRight || valueUnderneath) {
                unionFind.union(outerZone, innerZone);
            }
        }
    }

    return unionFind.getGroups()
            .stream()
            .map(groupOfZones -> groupOfZones.stream()
                    .map(Zone::getLines)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toList()))
            .map(linesInZoneToMerge -> mergeLinesInZone(linesInZoneToMerge, characterSpacing, lineSpacing))
            .toList();
}
private double calculateMeanHeight(List<Line> lines) {
double meanHeight = 0.0;

View File

@ -41,6 +41,18 @@ public class StringFrequencyCounter {
mostPopular = entry;
}
}
if (mostPopular != null && mostPopular.getKey().equals("standard")) {
int standard = countPerValue.get(mostPopular.getKey());
double total = countPerValue.values()
.stream()
.mapToDouble(v -> v).sum();
if ((double) standard / total > 0.85) {
return mostPopular.getKey();
}
countPerValue.remove(mostPopular.getKey());
return getMostPopular();
}
return mostPopular != null ? mostPopular.getKey() : null;
}

View File

@ -56,9 +56,9 @@ public class DocstrumBlockificationService {
var classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(rulings);
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f);
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
combineBlocks(classificationPage);
}
@ -264,7 +264,9 @@ public class DocstrumBlockificationService {
continue;
}
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
if (current.getDir() == inner.getDir() && (Math.abs(current.getHighestFontSize() - inner.getHighestFontSize()) < 1.1f && current.getHighestFontSize() > 12 && inner.getHighestFontSize() > 12 //
&& current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle()) //
&& current.intersects(inner, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences());

View File

@ -62,6 +62,7 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
/*
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
@ -73,7 +74,8 @@ public class DocuMineClassificationService {
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
} else */
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {

View File

@ -117,7 +117,7 @@ public class SectionNodeFactory {
if (abstractPageBlock instanceof TextPageBlock) {
switch (layoutParsingType) {
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);

View File

@ -68,7 +68,7 @@ public class LayoutGridService {
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
List<Visualizations> allVisualizations;
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
Visualizations layoutGrid = this.addLayoutGrid(document, true, false);
if (writeVisualLayoutParsingGrid) {
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())

View File

@ -74,7 +74,7 @@ public class LayoutparsingVisualizations {
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(true).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
@ -180,7 +180,7 @@ public class LayoutparsingVisualizations {
visualizationsOnPage.getColoredRectangles()
.addAll(zones.stream()
.map(BoundingBox::getBBoxInitialUserSpace)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 0.5f))
.toList());
}

View File

@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/certificates-ocred";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.CLARIFYND_PARAGRAPH_DEBUG, true);
prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);