akra-certificates: finetuning for certificates
This commit is contained in:
parent
07733d0855
commit
eb2ea755a5
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -1 +0,0 @@
|
|||||||
*.pdf filter=lfs diff=lfs merge=lfs -text
|
|
||||||
@ -63,7 +63,10 @@ public class DocstrumSegmentationService {
|
|||||||
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||||
|
|
||||||
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||||
|
// return zones;
|
||||||
|
return zoneBuilderService.mergeZonesUntilConvergence(zones, characterSpacing, lineSpacing, rulings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -135,6 +135,12 @@ public abstract class BoundingBox {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYJava(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersectsY(BoundingBox other, float threshold) {
|
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||||
@ -143,7 +149,13 @@ public abstract class BoundingBox {
|
|||||||
|
|
||||||
public boolean intersectsX(BoundingBox other) {
|
public boolean intersectsX(BoundingBox other) {
|
||||||
|
|
||||||
return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX();
|
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Tells whether the horizontal extent of this box overlaps the horizontal extent of other.
// Boxes that merely touch count as overlapping (the comparisons are inclusive).
// NOTE(review): the lower bound is read with getX() for this box but getMinX() for other,
// while intersectsYJava uses getY() on both sides - confirm getX() and getMinX() are equivalent.
public boolean intersectsXJava(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -182,4 +194,60 @@ public abstract class BoundingBox {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistance(BoundingBox other) {
|
||||||
|
|
||||||
|
Rectangle2D left;
|
||||||
|
Rectangle2D right;
|
||||||
|
if (this.leftOf(other)) {
|
||||||
|
left = this.getBBox();
|
||||||
|
right = other.getBBox();
|
||||||
|
} else {
|
||||||
|
left = other.getBBox();
|
||||||
|
right = this.getBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Math.max(0, right.getMinX() - left.getMaxX());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistance(BoundingBox other) {
|
||||||
|
|
||||||
|
Rectangle2D bottom;
|
||||||
|
Rectangle2D top;
|
||||||
|
if (this.isAbove(other)) {
|
||||||
|
top = this.getBBox();
|
||||||
|
bottom = other.getBBox();
|
||||||
|
} else {
|
||||||
|
bottom = this.getBBox();
|
||||||
|
top = other.getBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Math.max(0, bottom.getMinY() - top.getMaxY());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean rightOf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean leftOf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isAbove(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isBelow(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,4 +29,6 @@ public class Zone extends BoundingBox {
|
|||||||
return sb.toString().trim();
|
return sb.toString().trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -81,6 +82,78 @@ public class ZoneBuilderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> mergeZonesUntilConvergence(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||||
|
|
||||||
|
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||||
|
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||||
|
double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||||
|
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||||
|
|
||||||
|
UnionFind<Zone> unionFind = new UnionFind<>(new HashSet<>(zones));
|
||||||
|
|
||||||
|
double meanHeight = calculateMeanHeight(zones.stream()
|
||||||
|
.map(Zone::getLines)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList());
|
||||||
|
|
||||||
|
zones.forEach(outerZone -> {
|
||||||
|
zones.forEach(innerZone -> {
|
||||||
|
|
||||||
|
if (innerZone == outerZone //
|
||||||
|
|| unionFind.inSameSet(outerZone, innerZone)//
|
||||||
|
|| !outerZone.intersectsYJava(innerZone) && !innerZone.intersectsXJava(innerZone)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
double scale = Math.min(outerZone.getHeight(), innerZone.getHeight()) / meanHeight;
|
||||||
|
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||||
|
|
||||||
|
double horizontalDistance = outerZone.horizontalDistance(innerZone);
|
||||||
|
double verticalDistance = outerZone.verticalDistance(innerZone);
|
||||||
|
|
||||||
|
if (rulings.lineBetween(outerZone, innerZone)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outerZone.intersectsYJava(innerZone) && horizontalDistance < 10) {
|
||||||
|
unionFind.union(outerZone, innerZone);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outerZone.intersectsXJava(innerZone) && verticalDistance < 6) {
|
||||||
|
unionFind.union(outerZone, innerZone);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean outerZoneEndsWithColon = outerZone.getLines()
|
||||||
|
.stream()
|
||||||
|
.allMatch(line -> line.toString().trim().endsWith(":"));
|
||||||
|
boolean innerZoneDoesNotEndWithColon = innerZone.getLines()
|
||||||
|
.stream()
|
||||||
|
.noneMatch(line -> line.toString().trim().endsWith(":"));
|
||||||
|
if (outerZoneEndsWithColon && innerZoneDoesNotEndWithColon && outerZone.leftOf(innerZone) && innerZone.getMinX() - outerZone.getMinX() < 250) {
|
||||||
|
unionFind.union(outerZone, innerZone);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (outerZoneEndsWithColon && innerZoneDoesNotEndWithColon && (outerZone.isAbove(innerZone) && verticalDistance < lineSpacing * 2)) {
|
||||||
|
unionFind.union(outerZone, innerZone);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return unionFind.getGroups()
|
||||||
|
.stream()
|
||||||
|
.map(groupOfZones -> groupOfZones.stream()
|
||||||
|
.map(Zone::getLines)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.collect(Collectors.toList()))
|
||||||
|
.map(linesInZoneToMerge -> mergeLinesInZone(linesInZoneToMerge, characterSpacing, lineSpacing))
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private double calculateMeanHeight(List<Line> lines) {
|
private double calculateMeanHeight(List<Line> lines) {
|
||||||
|
|
||||||
double meanHeight = 0.0;
|
double meanHeight = 0.0;
|
||||||
|
|||||||
@ -41,6 +41,18 @@ public class StringFrequencyCounter {
|
|||||||
mostPopular = entry;
|
mostPopular = entry;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (mostPopular != null && mostPopular.getKey().equals("standard")) {
|
||||||
|
int standard = countPerValue.get(mostPopular.getKey());
|
||||||
|
double total = countPerValue.values()
|
||||||
|
.stream()
|
||||||
|
.mapToDouble(v -> v).sum();
|
||||||
|
if ((double) standard / total > 0.85) {
|
||||||
|
return mostPopular.getKey();
|
||||||
|
}
|
||||||
|
countPerValue.remove(mostPopular.getKey());
|
||||||
|
return getMostPopular();
|
||||||
|
|
||||||
|
}
|
||||||
return mostPopular != null ? mostPopular.getKey() : null;
|
return mostPopular != null ? mostPopular.getKey() : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -56,9 +56,9 @@ public class DocstrumBlockificationService {
|
|||||||
var classificationPage = new ClassificationPage(pageBlocks);
|
var classificationPage = new ClassificationPage(pageBlocks);
|
||||||
classificationPage.setCleanRulings(rulings);
|
classificationPage.setCleanRulings(rulings);
|
||||||
|
|
||||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f);
|
||||||
|
|
||||||
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||||
combineBlocks(classificationPage);
|
combineBlocks(classificationPage);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -264,7 +264,9 @@ public class DocstrumBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
|
if (current.getDir() == inner.getDir() && (Math.abs(current.getHighestFontSize() - inner.getHighestFontSize()) < 1.1f && current.getHighestFontSize() > 12 && inner.getHighestFontSize() > 12 //
|
||||||
|
&& current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle()) //
|
||||||
|
&& current.intersects(inner, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
|
||||||
|
|
||||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||||
current.getSequences().addAll(inner.getSequences());
|
current.getSequences().addAll(inner.getSequences());
|
||||||
|
|||||||
@ -62,6 +62,7 @@ public class DocuMineClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||||
@ -73,7 +74,8 @@ public class DocuMineClassificationService {
|
|||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||||
) {
|
) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else */
|
||||||
|
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
.size() == 1)) {
|
.size() == 1)) {
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
|
|||||||
@ -117,7 +117,7 @@ public class SectionNodeFactory {
|
|||||||
if (abstractPageBlock instanceof TextPageBlock) {
|
if (abstractPageBlock instanceof TextPageBlock) {
|
||||||
|
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||||
alreadyMerged.add(abstractPageBlock);
|
alreadyMerged.add(abstractPageBlock);
|
||||||
remainingBlocks.remove(abstractPageBlock);
|
remainingBlocks.remove(abstractPageBlock);
|
||||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||||
|
|||||||
@ -68,7 +68,7 @@ public class LayoutGridService {
|
|||||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
|
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
|
||||||
|
|
||||||
List<Visualizations> allVisualizations;
|
List<Visualizations> allVisualizations;
|
||||||
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
|
Visualizations layoutGrid = this.addLayoutGrid(document, true, false);
|
||||||
if (writeVisualLayoutParsingGrid) {
|
if (writeVisualLayoutParsingGrid) {
|
||||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||||
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
|
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
|
||||||
|
|||||||
@ -74,7 +74,7 @@ public class LayoutparsingVisualizations {
|
|||||||
|
|
||||||
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
||||||
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
||||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(true).build();
|
||||||
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||||
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||||
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
||||||
@ -180,7 +180,7 @@ public class LayoutparsingVisualizations {
|
|||||||
visualizationsOnPage.getColoredRectangles()
|
visualizationsOnPage.getColoredRectangles()
|
||||||
.addAll(zones.stream()
|
.addAll(zones.stream()
|
||||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 0.5f))
|
||||||
.toList());
|
.toList());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -44,7 +44,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testLayoutParserEndToEndWithFolder() {
|
public void testLayoutParserEndToEndWithFolder() {
|
||||||
|
|
||||||
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
|
String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/certificates-ocred";
|
||||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||||
.sorted(Comparator.comparing(Path::getFileName))
|
.sorted(Comparator.comparing(Path::getFileName))
|
||||||
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
file = new File(filePath);
|
file = new File(filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.CLARIFYND_PARAGRAPH_DEBUG, true);
|
||||||
prepareStorage(layoutParsingRequest, file);
|
prepareStorage(layoutParsingRequest, file);
|
||||||
|
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user