akra-certificates: fine-tuning for certificates
This commit is contained in:
parent
07733d0855
commit
eb2ea755a5
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -1 +0,0 @@
|
||||
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
@ -63,7 +63,10 @@ public class DocstrumSegmentationService {
|
||||
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||
|
||||
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||
List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||
// return zones;
|
||||
return zoneBuilderService.mergeZonesUntilConvergence(zones, characterSpacing, lineSpacing, rulings);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -135,6 +135,12 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYJava(BoundingBox other) {
|
||||
|
||||
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||
@ -143,7 +149,13 @@ public abstract class BoundingBox {
|
||||
|
||||
public boolean intersectsX(BoundingBox other) {
|
||||
|
||||
return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX();
|
||||
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsXJava(BoundingBox other) {
|
||||
|
||||
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
|
||||
}
|
||||
|
||||
|
||||
@ -182,4 +194,60 @@ public abstract class BoundingBox {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public double horizontalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D left;
|
||||
Rectangle2D right;
|
||||
if (this.leftOf(other)) {
|
||||
left = this.getBBox();
|
||||
right = other.getBBox();
|
||||
} else {
|
||||
left = other.getBBox();
|
||||
right = this.getBBox();
|
||||
}
|
||||
|
||||
return Math.max(0, right.getMinX() - left.getMaxX());
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D bottom;
|
||||
Rectangle2D top;
|
||||
if (this.isAbove(other)) {
|
||||
top = this.getBBox();
|
||||
bottom = other.getBBox();
|
||||
} else {
|
||||
bottom = this.getBBox();
|
||||
top = other.getBBox();
|
||||
}
|
||||
|
||||
return Math.max(0, bottom.getMinY() - top.getMaxY());
|
||||
}
|
||||
|
||||
|
||||
public boolean rightOf(BoundingBox other) {
|
||||
|
||||
return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean leftOf(BoundingBox other) {
|
||||
|
||||
return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public boolean isAbove(BoundingBox other) {
|
||||
|
||||
return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean isBelow(BoundingBox other) {
|
||||
|
||||
return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -29,4 +29,6 @@ public class Zone extends BoundingBox {
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -81,6 +82,78 @@ public class ZoneBuilderService {
|
||||
}
|
||||
|
||||
|
||||
public List<Zone> mergeZonesUntilConvergence(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
|
||||
UnionFind<Zone> unionFind = new UnionFind<>(new HashSet<>(zones));
|
||||
|
||||
double meanHeight = calculateMeanHeight(zones.stream()
|
||||
.map(Zone::getLines)
|
||||
.flatMap(Collection::stream)
|
||||
.toList());
|
||||
|
||||
zones.forEach(outerZone -> {
|
||||
zones.forEach(innerZone -> {
|
||||
|
||||
if (innerZone == outerZone //
|
||||
|| unionFind.inSameSet(outerZone, innerZone)//
|
||||
|| !outerZone.intersectsYJava(innerZone) && !innerZone.intersectsXJava(innerZone)) {
|
||||
return;
|
||||
}
|
||||
|
||||
double scale = Math.min(outerZone.getHeight(), innerZone.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
|
||||
double horizontalDistance = outerZone.horizontalDistance(innerZone);
|
||||
double verticalDistance = outerZone.verticalDistance(innerZone);
|
||||
|
||||
if (rulings.lineBetween(outerZone, innerZone)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (outerZone.intersectsYJava(innerZone) && horizontalDistance < 10) {
|
||||
unionFind.union(outerZone, innerZone);
|
||||
return;
|
||||
}
|
||||
|
||||
if (outerZone.intersectsXJava(innerZone) && verticalDistance < 6) {
|
||||
unionFind.union(outerZone, innerZone);
|
||||
return;
|
||||
}
|
||||
|
||||
boolean outerZoneEndsWithColon = outerZone.getLines()
|
||||
.stream()
|
||||
.allMatch(line -> line.toString().trim().endsWith(":"));
|
||||
boolean innerZoneDoesNotEndWithColon = innerZone.getLines()
|
||||
.stream()
|
||||
.noneMatch(line -> line.toString().trim().endsWith(":"));
|
||||
if (outerZoneEndsWithColon && innerZoneDoesNotEndWithColon && outerZone.leftOf(innerZone) && innerZone.getMinX() - outerZone.getMinX() < 250) {
|
||||
unionFind.union(outerZone, innerZone);
|
||||
return;
|
||||
}
|
||||
if (outerZoneEndsWithColon && innerZoneDoesNotEndWithColon && (outerZone.isAbove(innerZone) && verticalDistance < lineSpacing * 2)) {
|
||||
unionFind.union(outerZone, innerZone);
|
||||
return;
|
||||
}
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
return unionFind.getGroups()
|
||||
.stream()
|
||||
.map(groupOfZones -> groupOfZones.stream()
|
||||
.map(Zone::getLines)
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.map(linesInZoneToMerge -> mergeLinesInZone(linesInZoneToMerge, characterSpacing, lineSpacing))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private double calculateMeanHeight(List<Line> lines) {
|
||||
|
||||
double meanHeight = 0.0;
|
||||
|
||||
@ -41,6 +41,18 @@ public class StringFrequencyCounter {
|
||||
mostPopular = entry;
|
||||
}
|
||||
}
|
||||
if (mostPopular != null && mostPopular.getKey().equals("standard")) {
|
||||
int standard = countPerValue.get(mostPopular.getKey());
|
||||
double total = countPerValue.values()
|
||||
.stream()
|
||||
.mapToDouble(v -> v).sum();
|
||||
if ((double) standard / total > 0.85) {
|
||||
return mostPopular.getKey();
|
||||
}
|
||||
countPerValue.remove(mostPopular.getKey());
|
||||
return getMostPopular();
|
||||
|
||||
}
|
||||
return mostPopular != null ? mostPopular.getKey() : null;
|
||||
}
|
||||
|
||||
|
||||
@ -56,9 +56,9 @@ public class DocstrumBlockificationService {
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
classificationPage.setCleanRulings(rulings);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
combineBlocks(classificationPage);
|
||||
}
|
||||
|
||||
@ -264,7 +264,9 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
|
||||
if (current.getDir() == inner.getDir() && (Math.abs(current.getHighestFontSize() - inner.getHighestFontSize()) < 1.1f && current.getHighestFontSize() > 12 && inner.getHighestFontSize() > 12 //
|
||||
&& current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle()) //
|
||||
&& current.intersects(inner, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
current.getSequences().addAll(inner.getSequences());
|
||||
|
||||
@ -62,6 +62,7 @@ public class DocuMineClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
@ -73,7 +74,8 @@ public class DocuMineClassificationService {
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
} else */
|
||||
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
|
||||
@ -117,7 +117,7 @@ public class SectionNodeFactory {
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||
|
||||
@ -68,7 +68,7 @@ public class LayoutGridService {
|
||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
|
||||
|
||||
List<Visualizations> allVisualizations;
|
||||
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
|
||||
Visualizations layoutGrid = this.addLayoutGrid(document, true, false);
|
||||
if (writeVisualLayoutParsingGrid) {
|
||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
|
||||
|
||||
@ -74,7 +74,7 @@ public class LayoutparsingVisualizations {
|
||||
|
||||
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
||||
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(true).build();
|
||||
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
||||
@ -180,7 +180,7 @@ public class LayoutparsingVisualizations {
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(zones.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 0.5f))
|
||||
.toList());
|
||||
|
||||
}
|
||||
|
||||
@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithFolder() {
|
||||
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/certificates-ocred";
|
||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.sorted(Comparator.comparing(Path::getFileName))
|
||||
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
file = new File(filePath);
|
||||
}
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.CLARIFYND_PARAGRAPH_DEBUG, true);
|
||||
prepareStorage(layoutParsingRequest, file);
|
||||
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user