Merge branch 'RED-8825' into 'main'
RED-8825: improve layoutparsing See merge request fforesight/layout-parser!132
This commit is contained in:
commit
07733d0855
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
4
.gitmodules
vendored
4
.gitmodules
vendored
@ -1,8 +1,8 @@
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
|
||||
url = https://gitlab.knecon.com/fforesight/documents/basf.git
|
||||
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
|
||||
update = merge
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
|
||||
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
|
||||
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
|
||||
update = merge
|
||||
|
||||
@ -5,6 +5,7 @@ public enum LayoutParsingType {
|
||||
REDACT_MANAGER_OLD,
|
||||
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||
DOCUMINE,
|
||||
DOCUMINE_OLD,
|
||||
CLARIFYND,
|
||||
CLARIFYND_PARAGRAPH_DEBUG
|
||||
}
|
||||
|
||||
@ -45,6 +45,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
@ -52,12 +53,14 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
@ -119,11 +122,11 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
visualLayoutParsingResponse,
|
||||
layoutParsingRequest.identifier());
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
visualLayoutParsingResponse,
|
||||
layoutParsingRequest.identifier());
|
||||
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -131,7 +134,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -155,25 +158,25 @@ public class LayoutParsingPipeline {
|
||||
.numberOfPages(documentGraph.getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.build();
|
||||
|
||||
}
|
||||
@ -194,14 +197,14 @@ public class LayoutParsingPipeline {
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@ -220,6 +223,9 @@ public class LayoutParsingPipeline {
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
|
||||
classificationDocument.getVisualizations().setActive(identifier.containsKey("debug"));
|
||||
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
@ -244,10 +250,12 @@ public class LayoutParsingPipeline {
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
stripper.setSortByPosition(true);
|
||||
}
|
||||
stripper.getText(originDocument);
|
||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
@ -255,28 +263,29 @@ public class LayoutParsingPipeline {
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
|
||||
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
|
||||
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
||||
pdPage,
|
||||
pageNumber,
|
||||
cleanRulings,
|
||||
stripper.getTextPositionSequences(),
|
||||
emptyTableCells,
|
||||
false);
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||
.toList());
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
@ -286,17 +295,18 @@ public class LayoutParsingPipeline {
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
|
||||
// Whether an image is OCR needs to be determined before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
|
||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||
if (pdfImages.containsKey(pageNumber)) {
|
||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
if (signatures.containsKey(pageNumber)) {
|
||||
if (classificationPage.getImages() == null || classificationPage.getImages().size() == 0) {
|
||||
if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
|
||||
classificationPage.setImages(signatures.get(pageNumber));
|
||||
} else {
|
||||
classificationPage.getImages().addAll(signatures.get(pageNumber));
|
||||
@ -305,12 +315,6 @@ public class LayoutParsingPipeline {
|
||||
|
||||
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
docstrumBlockificationService.combineBlocks(classificationPage);
|
||||
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f);
|
||||
}
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
|
||||
@ -321,11 +325,14 @@ public class LayoutParsingPipeline {
|
||||
|
||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||
}
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
|
||||
@ -7,14 +7,18 @@ import java.util.stream.Collectors;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -29,31 +33,37 @@ public class DocstrumSegmentationService {
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
|
||||
return readingOrderService.resolve(zones, xyOrder);
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
|
||||
|
||||
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
List<RedTextPosition> positions = textPositions.stream()
|
||||
.filter(t -> t.getDir() == direction)
|
||||
.map(TextPositionSequence::getTextPositions)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
|
||||
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||
List<Character> characters = positions.stream()
|
||||
.map(Character::new)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
var characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||
|
||||
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,13 +1,27 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public abstract class BoundingBox {
|
||||
|
||||
private Rectangle2D bBox;
|
||||
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
|
||||
// should be used when determining reading order or other tasks which require coordinates in a harmonized system.
|
||||
protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.
|
||||
|
||||
// PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
|
||||
// This rotates completely in 90 degree steps with page rotation.
|
||||
// Needs to be used when writing to a PDF.
|
||||
// Also, these are definitely correct and should be used whenever possible.
|
||||
protected Rectangle2D bBoxInitialUserSpace;
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
|
||||
|
||||
public double getX() {
|
||||
@ -22,6 +36,42 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public double getMinX() {
|
||||
|
||||
return bBox.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public double getMinY() {
|
||||
|
||||
return bBox.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMinX() {
|
||||
|
||||
return bBoxInitialUserSpace.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxX() {
|
||||
|
||||
return bBoxInitialUserSpace.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMinY() {
|
||||
|
||||
return bBoxInitialUserSpace.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxY() {
|
||||
|
||||
return bBoxInitialUserSpace.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return bBox.getWidth();
|
||||
@ -34,21 +84,102 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public double getMaxX() {
|
||||
|
||||
return bBox.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxY() {
|
||||
|
||||
return bBox.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return (bBox.getHeight() * bBox.getWidth());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D contained, double tolerance) {
|
||||
public boolean contains(BoundingBox contained) {
|
||||
|
||||
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||
return contains(contained, 0);
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(BoundingBox contained, double tolerance) {
|
||||
|
||||
return getPdfMinX() <= contained.getPdfMinX() + tolerance
|
||||
&& getPdfMinY() <= contained.getPdfMinY() + tolerance
|
||||
&& getPdfMaxX() >= contained.getPdfMaxX() - tolerance
|
||||
&& getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(BoundingBox other) {
|
||||
|
||||
return this.intersectsX(other) && this.intersectsY(other);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.intersectsX(other, xThreshold) && this.intersectsY(other, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other) {
|
||||
|
||||
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
|
||||
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(BoundingBox other) {
|
||||
|
||||
return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
|
||||
|
||||
this.bBox = components.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
this.bBoxInitialUserSpace = components.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
|
||||
|
||||
public double verticalOverlap(BoundingBox other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
|
||||
}
|
||||
|
||||
|
||||
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
|
||||
return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
|
||||
} else {
|
||||
return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -27,8 +27,8 @@ public class Character {
|
||||
|
||||
public Character(RedTextPosition chunk) {
|
||||
|
||||
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
|
||||
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
|
||||
this.x = chunk.getBBoxDirAdj().getCenterX();
|
||||
this.y = chunk.getBBoxDirAdj().getCenterY();
|
||||
this.textPosition = chunk;
|
||||
}
|
||||
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Data;
|
||||
@ -72,7 +72,7 @@ public class Line extends BoundingBox {
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return Math.atan2(y1 - y0, x1 - x0);
|
||||
return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
@ -84,7 +84,9 @@ public class Line extends BoundingBox {
|
||||
|
||||
private double computeHeight() {
|
||||
|
||||
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size();
|
||||
return characters.stream()
|
||||
.map(Character::getHeight)
|
||||
.reduce(0d, Double::sum) / characters.size();
|
||||
}
|
||||
|
||||
|
||||
@ -116,7 +118,7 @@ public class Line extends BoundingBox {
|
||||
|
||||
double ym = (y0 + y1) / 2;
|
||||
double yn = (other.y0 + other.y1) / 2;
|
||||
return Math.abs(ym - yn) / Math.sqrt(1);
|
||||
return Math.abs(ym - yn);
|
||||
}
|
||||
|
||||
|
||||
@ -141,21 +143,9 @@ public class Line extends BoundingBox {
|
||||
|
||||
private void buildBBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character character : characters) {
|
||||
|
||||
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
|
||||
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
|
||||
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
|
||||
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
this.setToBBoxOfComponents(characters.stream()
|
||||
.map(Character::getTextPosition)
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@ -15,29 +16,9 @@ public class Zone extends BoundingBox {
|
||||
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||
public Zone(List<Line> lines) {
|
||||
|
||||
lines.sort(Comparator.comparingDouble(Line::getY));
|
||||
lines.sort(Comparator.comparingDouble(Line::getY0));
|
||||
this.lines = lines;
|
||||
buildBBox();
|
||||
}
|
||||
|
||||
|
||||
public void buildBBox() {
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double minY = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
double maxY = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Line line : lines) {
|
||||
|
||||
minX = Math.min(minX, line.getX());
|
||||
minY = Math.min(minY, line.getY());
|
||||
maxX = Math.max(maxX, line.getX() + line.getWidth());
|
||||
maxY = Math.max(maxY, line.getY() + line.getHeight());
|
||||
|
||||
}
|
||||
|
||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
||||
setToBBoxOfComponents(lines);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -11,43 +10,49 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Angle
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
||||
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||
double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;
|
||||
|
||||
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
|
||||
|
||||
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
|
||||
2) <= 1) {
|
||||
unionFind.union(character, neighbor.getCharacter());
|
||||
}
|
||||
});
|
||||
character.getNeighbors()
|
||||
.forEach(neighbor -> {
|
||||
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
|
||||
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|
||||
|| !angleFilter.matches(neighbor) //
|
||||
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|
||||
|| rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
|
||||
return;
|
||||
}
|
||||
|
||||
unionFind.union(character, neighbor.getCharacter());
|
||||
});
|
||||
});
|
||||
|
||||
List<Line> lines = new ArrayList<>();
|
||||
unionFind.getGroups().forEach(group -> {
|
||||
List<Character> lineCharacters = new ArrayList<>(group);
|
||||
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new Line(lineCharacters, characterSpacing));
|
||||
});
|
||||
|
||||
return lines;
|
||||
return unionFind.getGroups()
|
||||
.stream()
|
||||
.map(lineCharacters -> lineCharacters.stream()
|
||||
.sorted(Comparator.comparingDouble(Character::getX))
|
||||
.toList())
|
||||
.map(lineCharacters -> new Line(lineCharacters, characterSpacing))
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -39,7 +39,10 @@ public class ReadingOrderService {
|
||||
}
|
||||
}
|
||||
|
||||
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
if (histogram.values()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue).average()
|
||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
} else {
|
||||
|
||||
@ -52,7 +55,7 @@ public class ReadingOrderService {
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
return zones;
|
||||
}
|
||||
|
||||
@ -90,14 +93,14 @@ public class ReadingOrderService {
|
||||
}
|
||||
|
||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
/*
|
||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||
for (Zone leftZone : leftOf) {
|
||||
boolean intersects = false;
|
||||
@ -139,7 +142,7 @@ public class ReadingOrderService {
|
||||
|
||||
middle.addAll(leftNotIntersecting);
|
||||
middle.addAll(rightNotIntersecting);
|
||||
|
||||
*/
|
||||
List<Zone> sortedZones = new ArrayList<>();
|
||||
sortedZones.addAll(leftOf);
|
||||
sortedZones.addAll(rightOf);
|
||||
|
||||
@ -5,6 +5,7 @@ import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -12,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Chara
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
@ -29,12 +31,10 @@ public class ZoneBuilderService {
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
private static final int MAX_ZONES = 300;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||
|
||||
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
@ -45,38 +45,39 @@ public class ZoneBuilderService {
|
||||
|
||||
double meanHeight = calculateMeanHeight(lines);
|
||||
|
||||
lines.forEach(outerLine -> //
|
||||
lines.forEach(innerLine -> {
|
||||
lines.forEach(outerLine -> {
|
||||
lines.forEach(innerLine -> {
|
||||
|
||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
if (innerLine == outerLine //
|
||||
|| unionFind.inSameSet(outerLine, innerLine)//
|
||||
|| outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
|
||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||
|
||||
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|
||||
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
||||
unionFind.union(outerLine, innerLine);
|
||||
}
|
||||
}
|
||||
}));
|
||||
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
|
||||
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
|
||||
return;
|
||||
}
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
unionFind.getGroups().forEach(group -> {
|
||||
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
|
||||
if (rulings.lineBetween(outerLine, innerLine)) {
|
||||
return;
|
||||
}
|
||||
|
||||
unionFind.union(outerLine, innerLine);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
List<Line> oneZoneLines = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
oneZoneLines.addAll(zone.getLines());
|
||||
}
|
||||
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
|
||||
}
|
||||
|
||||
return zones;
|
||||
return unionFind.getGroups()
|
||||
.stream()
|
||||
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -103,35 +104,40 @@ public class ZoneBuilderService {
|
||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
||||
|
||||
lines.forEach(outer -> {
|
||||
|
||||
lines.forEach(inner -> {
|
||||
if (inner != outer) {
|
||||
if (inner == outer) {
|
||||
return;
|
||||
}
|
||||
|
||||
double horizontalDistance = outer.horizontalDistance(inner);
|
||||
double verticalDistance = outer.verticalDistance(inner);
|
||||
double horizontalDistance = outer.horizontalDistance(inner);
|
||||
double verticalDistance = outer.verticalDistance(inner);
|
||||
|
||||
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
||||
unionFind.union(outer, inner);
|
||||
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
|
||||
inner.getLength())) < 0.1) {
|
||||
boolean characterOverlap = false;
|
||||
int overlappingCount = 0;
|
||||
for (Character outerCharacter : outer.getCharacters()) {
|
||||
for (Character innerCharacter : inner.getCharacters()) {
|
||||
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
||||
if (characterOverlapDistance > 2) {
|
||||
characterOverlap = true;
|
||||
}
|
||||
if (characterOverlapDistance > 0) {
|
||||
overlappingCount++;
|
||||
}
|
||||
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
||||
|
||||
unionFind.union(outer, inner);
|
||||
|
||||
} else if (minVerticalDistance <= verticalDistance
|
||||
&& verticalDistance <= maxVerticalDistance
|
||||
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
|
||||
|
||||
boolean characterOverlap = false;
|
||||
int overlappingCount = 0;
|
||||
for (Character outerCharacter : outer.getCharacters()) {
|
||||
for (Character innerCharacter : inner.getCharacters()) {
|
||||
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
||||
if (characterOverlapDistance > 2) {
|
||||
characterOverlap = true;
|
||||
}
|
||||
if (characterOverlapDistance > 0) {
|
||||
overlappingCount++;
|
||||
}
|
||||
}
|
||||
if (!characterOverlap && overlappingCount <= 2) {
|
||||
unionFind.union(outer, inner);
|
||||
}
|
||||
}
|
||||
if (!characterOverlap && overlappingCount <= 2) {
|
||||
unionFind.union(outer, inner);
|
||||
}
|
||||
}
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
@ -146,7 +152,9 @@ public class ZoneBuilderService {
|
||||
outputZone.add(new Line(characters, characterSpacing));
|
||||
}
|
||||
|
||||
return new Zone(outputZone);
|
||||
return new Zone(outputZone.stream()
|
||||
.sorted(Comparator.comparing(Line::getY0))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -13,16 +16,8 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public abstract class AbstractPageBlock extends Rectangle {
|
||||
public abstract class AbstractPageBlock extends BoundingBox {
|
||||
|
||||
@JsonIgnore
|
||||
protected float minX;
|
||||
@JsonIgnore
|
||||
protected float maxX;
|
||||
@JsonIgnore
|
||||
protected float minY;
|
||||
@JsonIgnore
|
||||
protected float maxY;
|
||||
@JsonIgnore
|
||||
protected PageBlockType classification;
|
||||
@JsonIgnore
|
||||
@ -41,63 +36,6 @@ public abstract class AbstractPageBlock extends Rectangle {
|
||||
}
|
||||
|
||||
|
||||
public boolean containsBlock(TextPageBlock other) {
|
||||
|
||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(AbstractPageBlock other) {
|
||||
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
|
||||
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractPageBlock apb) {
|
||||
|
||||
return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY();
|
||||
}
|
||||
|
||||
|
||||
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public abstract boolean isEmpty();
|
||||
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -22,6 +23,7 @@ public class ClassificationDocument {
|
||||
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
|
||||
private boolean headlines;
|
||||
|
||||
private long rulesVersion;
|
||||
|
||||
@ -12,6 +12,7 @@ import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
@ -40,6 +41,8 @@ public class Document implements GenericSemanticNode {
|
||||
@Builder.Default
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
LayoutparsingVisualizations visualizations;
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
@ -18,7 +20,7 @@ import lombok.NoArgsConstructor;
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@NoArgsConstructor
|
||||
public class Cell extends Rectangle {
|
||||
public class Cell extends BoundingBox {
|
||||
|
||||
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
@ -33,13 +35,24 @@ public class Cell extends Rectangle {
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBox = bBoxInitialUserSpace;
|
||||
}
|
||||
|
||||
|
||||
public Cell(Rectangle2D r) {
|
||||
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
|
||||
|
||||
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
|
||||
this.bBoxInitialUserSpace = bBoxInitialUserSpace;
|
||||
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
public static Cell copy(Cell cell) {
|
||||
|
||||
Cell copy = new Cell();
|
||||
copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
|
||||
copy.bBox = cell.bBox;
|
||||
return copy;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,15 +1,206 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
public class CleanRulings {
|
||||
|
||||
List<Ruling> horizontal;
|
||||
List<Ruling> vertical;
|
||||
List<Ruling> horizontals; // unmodifiable sorted by Y list
|
||||
List<Ruling> verticals; // unmodifiable sorted by X list
|
||||
|
||||
|
||||
/**
 * Creates a ruling set from the given horizontal and vertical rulings.
 * <p>
 * Each list is first checked with {@code Ruling::assertHorizontal} / {@code Ruling::assertVertical}
 * (presumably throwing for a ruling of the wrong orientation; confirm against Ruling) and then
 * stored as an unmodifiable copy sorted by {@code y1} (horizontals) or {@code x1} (verticals).
 * The interval lookups of this class rely on that ordering.
 *
 * @param horizontals horizontal rulings, in any order
 * @param verticals   vertical rulings, in any order
 */
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {

    // Validate with an explicit traversal: Stream.peek exists mainly for debugging and the
    // stream implementation is free to skip it when it can optimise the pipeline.
    horizontals.forEach(Ruling::assertHorizontal);
    this.horizontals = horizontals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getY1))
            .toList();

    verticals.forEach(Ruling::assertVertical);
    this.verticals = verticals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getX1))
            .toList();
}
|
||||
|
||||
|
||||
public CleanRulings getTableLines() {
|
||||
|
||||
return new CleanRulings(horizontals.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||
.toList(),
|
||||
verticals.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public CleanRulings withoutTextRulings() {
|
||||
|
||||
return new CleanRulings(horizontals.stream()
|
||||
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||
.toList(),
|
||||
verticals.stream()
|
||||
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> buildAll() {
|
||||
|
||||
ArrayList<Ruling> rulings = new ArrayList<>(horizontals.size() + verticals.size());
|
||||
rulings.addAll(horizontals);
|
||||
rulings.addAll(verticals);
|
||||
return rulings;
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(BoundingBox a, BoundingBox b) {
|
||||
|
||||
return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace());
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {
|
||||
|
||||
return lineBetween(new Point2D.Double(a.getCenterX(), a.getCenterY()), new Point2D.Double(b.getCenterX(), b.getCenterY()));
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Point2D p1, Point2D p2) {
|
||||
|
||||
Ruling ruling = new Ruling(p1, p2);
|
||||
|
||||
if (ruling.isHorizontal()) {
|
||||
return getVerticalsInXInterval(ruling.x1, ruling.x2).stream()
|
||||
.anyMatch(vertical -> vertical.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
if (ruling.isVertical()) {
|
||||
return getHorizontalsInYInterval(ruling.y1, ruling.y2).stream()
|
||||
.anyMatch(horizontal -> horizontal.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
return Stream.of(getVerticalsInXInterval(ruling.x1, ruling.x2), getHorizontalsInYInterval(ruling.y1, ruling.y2))
|
||||
.flatMap(Collection::stream)
|
||||
.anyMatch(other -> other.intersectsLine(ruling));
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> getHorizontalsInYInterval(float y1, float y2) {
|
||||
|
||||
float startY = Math.min(y1, y2);
|
||||
float endY = Math.max(y1, y2);
|
||||
|
||||
if (horizontals.isEmpty() || Float.isNaN(startY) || Float.isNaN(endY)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int firstGreaterThanIdx = findFirstHorizontalRulingIdxAbove(startY);
|
||||
|
||||
if (firstGreaterThanIdx == -1) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new LinkedList<>();
|
||||
for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) {
|
||||
Ruling horizontal = horizontals.get(i);
|
||||
if (horizontal.y1 > endY) {
|
||||
break;
|
||||
}
|
||||
result.add(horizontal);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private int findFirstHorizontalRulingIdxAbove(float y) {
|
||||
|
||||
int low = 0;
|
||||
int high = horizontals.size() - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
Line2D.Float midLine = horizontals.get(mid);
|
||||
float midY = midLine.y1;
|
||||
|
||||
if (midY == y) {
|
||||
return mid;
|
||||
} else if (midY > y) {
|
||||
high = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the index of the first element greater than y or -1 if not found
|
||||
return horizontals.size() > low && horizontals.get(low).y1 > y ? low : -1;
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> getVerticalsInXInterval(float x1, float x2) {
|
||||
|
||||
float startX = Math.min(x1, x2);
|
||||
float endX = Math.max(x1, x2);
|
||||
|
||||
if (verticals.isEmpty() || Float.isNaN(startX) || Float.isNaN(endX)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int firstGreaterThanIdx = findFirstVerticalRulingIdxRightOf(startX);
|
||||
|
||||
if (firstGreaterThanIdx == -1) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new LinkedList<>();
|
||||
for (int i = firstGreaterThanIdx; i < verticals.size(); i++) {
|
||||
Ruling horizontal = verticals.get(i);
|
||||
if (horizontal.x1 > endX) {
|
||||
break;
|
||||
}
|
||||
result.add(horizontal);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private int findFirstVerticalRulingIdxRightOf(float x) {
|
||||
|
||||
int low = 0;
|
||||
int high = verticals.size() - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
Line2D.Float midLine = verticals.get(mid);
|
||||
float midX = midLine.x1;
|
||||
|
||||
if (midX == x) {
|
||||
return mid;
|
||||
} else if (midX > x) {
|
||||
high = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the index of the first element greater than y or -1 if not found
|
||||
return verticals.size() > low && verticals.get(low).x1 > x ? low : -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,218 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * A float-precision rectangle with top/left/bottom/right accessors and overlap helpers.
 * <p>
 * Note: inside this class the simple names {@code Float} and {@code Double} resolve to the nested
 * classes inherited from {@link Rectangle2D}; the boxed types therefore have to be written as
 * {@code java.lang.Float} / {@code java.lang.Double}.
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
        @Override
        public int compare(Rectangle o1, Rectangle o2) {

            if (o1.equals(o2)) {
                return 0;
            }
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                // same "row": order by x, reversed when both sides report right-to-left dominance
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
                        ? -java.lang.Double.compare(o1.getX(), o2.getX())
                        : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };


    /** Creates an empty rectangle at the origin. */
    public Rectangle() {

        super();
    }


    /**
     * Creates a rectangle from its top-left corner and size.
     * Note the argument order: {@code top} (y) comes before {@code left} (x).
     */
    public Rectangle(float top, float left, float width, float height) {

        super();
        this.setRect(left, top, width, height);
    }


    /**
     * Computes the minimum bounding box of the given rectangles.
     * <p>
     * The running maxima start at negative infinity. {@code java.lang.Float.MIN_VALUE} is the
     * smallest <em>positive</em> float, so using it as the start value yields a wrong box
     * whenever all rectangles lie at negative coordinates.
     * For an empty list the returned rectangle has a negative-infinite size, i.e.
     * {@link #isEmpty()} is {@code true}.
     *
     * @param rectangles rectangles to enclose
     * @return minimum bounding box that contains all the rectangles
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {

        float minx = java.lang.Float.POSITIVE_INFINITY;
        float miny = java.lang.Float.POSITIVE_INFINITY;
        float maxx = java.lang.Float.NEGATIVE_INFINITY;
        float maxy = java.lang.Float.NEGATIVE_INFINITY;

        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }


    /** Compares using {@link #ILL_DEFINED_ORDER}. */
    public int compareTo(Rectangle other) {

        return ILL_DEFINED_ORDER.compare(this, other);
    }


    // I'm bad at Java and need this for fancy sorting in
    // technology.tabula.TextChunk.
    public int isLtrDominant() {

        return 0;
    }


    /** @return width times height */
    public float getArea() {

        return this.width * this.height;
    }


    /** @return length of the shared vertical extent of both rectangles, or 0 if there is none */
    public float verticalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }


    public boolean verticallyOverlaps(Rectangle other) {

        return verticalOverlap(other) > 0;
    }


    /** @return length of the shared horizontal extent of both rectangles, or 0 if there is none */
    public float horizontalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }


    public boolean horizontallyOverlaps(Rectangle other) {

        return horizontalOverlap(other) > 0;
    }


    /**
     * Returns the vertical overlap of both rectangles relative to the height of the smaller one,
     * or 0 if they do not overlap vertically.
     */
    public float verticalOverlapRatio(Rectangle other) {

        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - this.getTop()) / delta;
        }

        return rv;
    }


    /** @return intersection area divided by union area of both rectangles */
    public float overlapRatio(Rectangle other) {

        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;

        return (float) (intersectionArea / unionArea);
    }


    /** Grows this rectangle in place to the union with {@code other} and returns it. */
    public Rectangle merge(Rectangle other) {

        this.setRect(this.createUnion(other));
        return this;
    }


    public float getTop() {

        return (float) this.getMinY();
    }


    /** Moves the top edge while keeping the bottom edge in place. */
    public void setTop(float top) {

        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }


    public float getRight() {

        return (float) this.getMaxX();
    }


    /** Moves the right edge while keeping the left edge in place. */
    public void setRight(float right) {

        this.setRect(this.x, this.y, right - this.x, this.height);
    }


    public float getLeft() {

        return (float) this.getMinX();
    }


    /** Moves the left edge while keeping the right edge in place. */
    public void setLeft(float left) {

        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }


    public float getBottom() {

        return (float) this.getMaxY();
    }


    /** Moves the bottom edge while keeping the top edge in place. */
    public void setBottom(float bottom) {

        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }


    /** @return the four corners in the order top-left, top-right, bottom-right, bottom-left */
    public Point2D[] getPoints() {

        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }


    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        // drop the closing ']' of the inherited representation and append bottom/right
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }

}
|
||||
@ -4,16 +4,14 @@ import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Formatter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -23,10 +21,24 @@ public class Ruling extends Line2D.Float {
|
||||
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
|
||||
|
||||
/**
 * Categories that can be assigned to a ruling.
 * NOTE(review): {@code STRIKETROUGH} is spelled without the second "H"; the name is kept as is
 * because renaming it would affect every caller.
 */
public enum Classification {
    TABLE_LINE,
    UNDERLINE,
    STRIKETROUGH,
    HEADER_SEPARATOR,
    FOOTER_SEPARATOR,
    OTHER
}
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
private Classification classification;
|
||||
|
||||
|
||||
/**
 * Creates a ruling between two points and classifies it as {@link Classification#OTHER}.
 *
 * @param p1 first end point of the line
 * @param p2 second end point of the line
 */
public Ruling(Point2D p1, Point2D p2) {

    super(p1, p2);
    // Every ruling starts out unclassified.
    classification = Classification.OTHER;
}
|
||||
|
||||
|
||||
@ -60,126 +72,32 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
public void assertHorizontal() {
|
||||
|
||||
if (isHorizontal()) {
|
||||
return;
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
|
||||
int rv;
|
||||
if (DoubleComparisons.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
throw new IllegalArgumentException("Ruling " + this + " is not horizontal");
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean vertical() {
|
||||
public void assertVertical() {
|
||||
|
||||
if (isVertical()) {
|
||||
return;
|
||||
}
|
||||
throw new IllegalArgumentException("Ruling " + this + " is not vertical");
|
||||
}
|
||||
|
||||
|
||||
public boolean isVertical() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
public boolean horizontal() {
|
||||
public boolean isHorizontal() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
@ -188,36 +106,36 @@ public class Ruling extends Line2D.Float {
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
|
||||
public boolean oblique() {
|
||||
public boolean isOblique() {
|
||||
|
||||
return !(this.vertical() || this.horizontal());
|
||||
return !(this.isVertical() || this.isHorizontal());
|
||||
}
|
||||
|
||||
|
||||
public float getPosition() {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getLeft() : this.getTop();
|
||||
return this.isVertical() ? this.getLeft() : this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public float getStart() {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getTop() : this.getLeft();
|
||||
return this.isVertical() ? this.getTop() : this.getLeft();
|
||||
}
|
||||
|
||||
|
||||
public void setStart(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
if (this.isVertical()) {
|
||||
this.setTop(v);
|
||||
} else {
|
||||
this.setLeft(v);
|
||||
@ -227,19 +145,19 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
public float getEnd() {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getBottom() : this.getRight();
|
||||
return this.isVertical() ? this.getBottom() : this.getRight();
|
||||
}
|
||||
|
||||
|
||||
public void setEnd(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
if (this.isVertical()) {
|
||||
this.setBottom(v);
|
||||
} else {
|
||||
this.setRight(v);
|
||||
@ -249,10 +167,10 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
public void setStartEnd(float start, float end) {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
if (this.isVertical()) {
|
||||
this.setTop(start);
|
||||
this.setBottom(end);
|
||||
} else {
|
||||
@ -264,7 +182,7 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
public boolean perpendicularTo(Ruling other) {
|
||||
|
||||
return this.vertical() == other.horizontal();
|
||||
return this.isVertical() == other.isHorizontal();
|
||||
}
|
||||
|
||||
|
||||
@ -318,30 +236,6 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
|
||||
|
||||
public Point2D intersectionPoint(Ruling other) {
|
||||
|
||||
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||
Ruling horizontal, vertical;
|
||||
|
||||
if (!this_l.intersectsLine(other_l)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (this_l.horizontal() && other_l.vertical()) {
|
||||
horizontal = this_l;
|
||||
vertical = other_l;
|
||||
} else if (this_l.vertical() && other_l.horizontal()) {
|
||||
vertical = this_l;
|
||||
horizontal = other_l;
|
||||
} else {
|
||||
log.warn("lines must be orthogonal, vertical and horizontal");
|
||||
return null;
|
||||
}
|
||||
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
@ -451,16 +345,9 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
final float TOLERANCE = 1;
|
||||
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
||||
}
|
||||
|
||||
|
||||
private enum SOType {
|
||||
VERTICAL,
|
||||
HRIGHT,
|
||||
HLEFT
|
||||
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -36,14 +36,11 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
private List<Cell> cells;
|
||||
|
||||
|
||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||
public TablePageBlock(List<Cell> cells, int rotation) {
|
||||
|
||||
setToBBoxOfComponents(cells);
|
||||
this.cells = cells;
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
maxX = area.getRight();
|
||||
maxY = area.getTop();
|
||||
classification = PageBlockType.TABLE;
|
||||
this.rotation = rotation;
|
||||
}
|
||||
@ -230,15 +227,15 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
Set<Float> uniqueX = new HashSet<>();
|
||||
Set<Float> uniqueY = new HashSet<>();
|
||||
Set<Double> uniqueX = new HashSet<>();
|
||||
Set<Double> uniqueY = new HashSet<>();
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
uniqueX.add(c.getPdfMinX());
|
||||
uniqueX.add(c.getPdfMaxX());
|
||||
uniqueY.add(c.getPdfMinY());
|
||||
uniqueY.add(c.getPdfMaxY());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream()
|
||||
@ -250,22 +247,24 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||
|
||||
Float prevY = null;
|
||||
Double prevY = null;
|
||||
|
||||
for (Float y : sortedUniqueY) {
|
||||
for (Double y : sortedUniqueY) {
|
||||
|
||||
List<Cell> row = new ArrayList<>();
|
||||
|
||||
Float prevX = null;
|
||||
for (Float x : sortedUniqueX) {
|
||||
Double prevX = null;
|
||||
for (Double x : sortedUniqueX) {
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
|
||||
|
||||
if (cellFromGridStructure.hasMinimumSize()) {
|
||||
|
||||
cells.stream()
|
||||
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
|
||||
.map(originalCell -> new CellWithIntersection(originalCell,
|
||||
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
|
||||
originalCell.getBBoxInitialUserSpace())))
|
||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||
|
||||
@ -1,8 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -14,9 +18,11 @@ import lombok.SneakyThrows;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RedTextPosition {
|
||||
public class RedTextPosition extends BoundingBox {
|
||||
|
||||
private float[] position;
|
||||
public final static int HEIGHT_PADDING = 2;
|
||||
|
||||
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
@ -58,43 +64,71 @@ public class RedTextPosition {
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
var position = new float[4];
|
||||
//TODO: There is a mismatch in the java coords of the text and the rulings,
|
||||
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
|
||||
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
|
||||
|
||||
position[0] = textPosition.getXDirAdj();
|
||||
position[1] = textPosition.getYDirAdj();
|
||||
position[2] = textPosition.getWidthDirAdj();
|
||||
position[3] = textPosition.getHeightDir();
|
||||
float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
|
||||
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
pos.setBBoxDirAdj(dirAdjPosition);
|
||||
|
||||
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
||||
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
||||
|
||||
pos.setBBoxInitialUserSpace(initialUserSpacePositionRect); // These are definitely correct
|
||||
|
||||
pos.setPosition(position);
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
|
||||
if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageHeight);
|
||||
} else if (textDirection == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageWidth / 2f);
|
||||
transform.translate(0f, pageWidth);
|
||||
} else {
|
||||
transform.rotate(textDirection.getRadians(), pageHeight / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageWidth);
|
||||
}
|
||||
transform.scale(1., -1.);
|
||||
return transform;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getXDirAdj() {
|
||||
|
||||
return position[0];
|
||||
return this.bBoxDirAdj.x;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getYDirAdj() {
|
||||
|
||||
return position[1];
|
||||
return this.bBoxDirAdj.y;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidthDirAdj() {
|
||||
|
||||
return position[2];
|
||||
return this.bBoxDirAdj.width;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeightDir() {
|
||||
|
||||
return position[3];
|
||||
return this.bBoxDirAdj.height;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,16 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -29,34 +26,31 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
|
||||
@JsonIgnore
|
||||
private String mostPopularWordFont;
|
||||
|
||||
@JsonIgnore
|
||||
private String mostPopularWordStyle;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordFontSize;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordHeight;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordSpaceWidth;
|
||||
|
||||
@JsonIgnore
|
||||
private float highestFontSize;
|
||||
|
||||
@JsonIgnore
|
||||
private PageBlockType classification;
|
||||
|
||||
@JsonIgnore
|
||||
private boolean toDuplicate;
|
||||
|
||||
|
||||
/**
 * Creates a block from the given sequences, then derives the frequency counters and the
 * bounding box from them.
 *
 * @param sequences the sequences this block consists of; the list is stored as is, not copied
 */
public TextPageBlock(List<TextPositionSequence> sequences) {

    this.sequences = sequences;

    // Both derived states are computed from the sequences stored above.
    this.calculateFrequencyCounters();
    this.calculateBBox();
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public TextDirection getDir() {
|
||||
|
||||
@ -64,31 +58,40 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageHeight() {
|
||||
private void calculateBBox() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
if (sequences == null) {
|
||||
this.bBox = new Rectangle2D.Double();
|
||||
this.bBoxInitialUserSpace = new Rectangle2D.Double();
|
||||
return;
|
||||
}
|
||||
setToBBoxOfComponents(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
||||
if (textBlocksToMerge.isEmpty()) {
|
||||
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
|
||||
}
|
||||
if (textBlocksToMerge.stream()
|
||||
.map(AbstractPageBlock::getPage)
|
||||
.distinct()
|
||||
.count() != 1) {
|
||||
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
|
||||
}
|
||||
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
||||
.map(TextPageBlock::getSequences)
|
||||
.flatMap(java.util.Collection::stream)
|
||||
.toList();
|
||||
sequences = new ArrayList<>(sequences);
|
||||
return fromTextPositionSequences(sequences);
|
||||
|
||||
return new TextPageBlock(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
private void calculateFrequencyCounters() {
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
@ -96,7 +99,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
for (TextPositionSequence wordBlock : sequences) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
@ -104,160 +107,23 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet())
|
||||
.size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minX value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMinX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - maxX;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
|
||||
return getPageWidth() - maxY;
|
||||
} else {
|
||||
return minX;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxX value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMaxX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - minX;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageWidth() - minY;
|
||||
|
||||
} else {
|
||||
return maxX;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minY value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMinY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return maxY;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - maxX;
|
||||
|
||||
} else {
|
||||
return getPageHeight() - maxY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxY value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMaxY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - minX;
|
||||
} else {
|
||||
return getPageHeight() - minY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
||||
|
||||
this.minX = minX;
|
||||
this.maxX = maxX;
|
||||
this.minY = minY;
|
||||
this.maxY = maxY;
|
||||
this.sequences = sequences;
|
||||
this.rotation = rotation;
|
||||
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock union(TextPositionSequence r) {
|
||||
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
union.getSequences().add(r);
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
return union;
|
||||
}
|
||||
|
||||
@ -265,64 +131,32 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
public TextPageBlock union(TextPageBlock r) {
|
||||
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
union.getSequences().addAll(r.getSequences());
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPageBlock r) {
|
||||
|
||||
if (r.getMinX() < minX) {
|
||||
minX = r.getMinX();
|
||||
}
|
||||
if (r.getMaxX() > maxX) {
|
||||
maxX = r.getMaxX();
|
||||
}
|
||||
if (r.getMinY() < minY) {
|
||||
minY = r.getMinY();
|
||||
}
|
||||
if (r.getMaxY() > maxY) {
|
||||
maxY = r.getMaxY();
|
||||
}
|
||||
sequences.addAll(r.getSequences());
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPositionSequence r) {
|
||||
|
||||
if (r.getMinXDirAdj() < minX) {
|
||||
minX = r.getMinXDirAdj();
|
||||
}
|
||||
if (r.getMaxXDirAdj() > maxX) {
|
||||
maxX = r.getMaxXDirAdj();
|
||||
}
|
||||
if (r.getMinYDirAdj() < minY) {
|
||||
minY = r.getMinYDirAdj();
|
||||
}
|
||||
if (r.getMaxYDirAdj() > maxY) {
|
||||
maxY = r.getMaxYDirAdj();
|
||||
}
|
||||
sequences.add(r);
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock copy() {
|
||||
|
||||
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
|
||||
}
|
||||
|
||||
|
||||
public void resize(float x1, float y1, float width, float height) {
|
||||
|
||||
set(x1, y1, x1 + width, y1 + height);
|
||||
}
|
||||
|
||||
|
||||
public void set(float x1, float y1, float x2, float y2) {
|
||||
|
||||
this.minX = Math.min(x1, x2);
|
||||
this.maxX = Math.max(x1, x2);
|
||||
this.minY = Math.min(y1, y2);
|
||||
this.maxY = Math.max(y1, y2);
|
||||
return new TextPageBlock(new ArrayList<>(sequences));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
@ -9,15 +8,14 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -25,8 +23,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||
public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
|
||||
@ -36,29 +34,38 @@ public class TextPositionSequence implements CharSequence {
|
||||
@EqualsAndHashCode.Include
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private Rectangle2D bBoxDirAdj;
|
||||
@EqualsAndHashCode.Include
|
||||
private TextDirection dir;
|
||||
private int rotation;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private boolean isParagraphStart;
|
||||
private boolean strikethrough;
|
||||
private boolean underline;
|
||||
|
||||
|
||||
public TextPositionSequence(int page) {
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
|
||||
|
||||
this.page = page;
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
|
||||
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
this.page = page;
|
||||
this.textPositions = textPositions.stream()
|
||||
.map(RedTextPosition::fromTextPosition)
|
||||
.collect(Collectors.toList());
|
||||
this.page = pageNumber;
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
this.isParagraphStart = isParagraphStart;
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
private void calculateBBox() {
|
||||
|
||||
this.bBoxDirAdj = textPositions.stream()
|
||||
.map(RedTextPosition::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
}
|
||||
|
||||
|
||||
@ -70,6 +77,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
@ -107,7 +115,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
textPositionSequence.rotation = rotation;
|
||||
textPositionSequence.pageHeight = pageHeight;
|
||||
textPositionSequence.pageWidth = pageWidth;
|
||||
|
||||
textPositionSequence.setToBBoxOfComponents(getTextPositions());
|
||||
return textPositionSequence;
|
||||
}
|
||||
|
||||
@ -137,18 +145,18 @@ public class TextPositionSequence implements CharSequence {
|
||||
this.rotation = textPositionSequence.getRotation();
|
||||
this.pageHeight = textPositionSequence.getPageHeight();
|
||||
this.pageWidth = textPositionSequence.getPageWidth();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
@ -220,18 +228,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public float getHeight() {
|
||||
|
||||
return getMaxYDirAdj() - getMinYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public float getWidth() {
|
||||
|
||||
return getMaxXDirAdj() - getMinXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public String getFont() {
|
||||
|
||||
if (textPositions.get(0).getFontName() == null) {
|
||||
@ -271,54 +267,5 @@ public class TextPositionSequence implements CharSequence {
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return bounding box of the word in Pdf Coordinate System
|
||||
*/
|
||||
|
||||
@SneakyThrows
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
|
||||
|
||||
float textHeight = getTextHeight();
|
||||
|
||||
RedTextPosition firstTextPos = textPositions.get(0);
|
||||
RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
|
||||
|
||||
Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
|
||||
Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageHeight + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
} else if (dir == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
|
||||
transform.translate(0f, pageWidth + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
} else {
|
||||
transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageWidth + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
}
|
||||
|
||||
bottomLeft = transform.transform(bottomLeft, null);
|
||||
topRight = transform.transform(topRight, null);
|
||||
|
||||
return new Rectangle( //
|
||||
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
|
||||
(float) (topRight.getX() - bottomLeft.getX()),
|
||||
(float) (topRight.getY() - bottomLeft.getY()),
|
||||
page);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
@ -54,11 +55,12 @@ public class ImageServiceResponseAdapter {
|
||||
|
||||
classificationPage.getImages().forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
classificationPage.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
return;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@ -31,8 +31,9 @@ public class BodyTextFrameService {
|
||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
|
||||
}
|
||||
}
|
||||
|
||||
@ -58,24 +59,26 @@ public class BodyTextFrameService {
|
||||
|
||||
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
|
||||
|
||||
return page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
return page.getCleanRulings().getHorizontals()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
|
||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||
.sorted(Comparator.comparingDouble(Ruling::getTop))
|
||||
.peek(ruling -> ruling.setClassification(Ruling.Classification.FOOTER_SEPARATOR))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
|
||||
|
||||
return page.getCleanRulings()
|
||||
.getHorizontal()
|
||||
return page.getCleanRulings().getHorizontals()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
|
||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
|
||||
.peek(ruling -> ruling.setClassification(Ruling.Classification.HEADER_SEPARATOR))
|
||||
.toList();
|
||||
}
|
||||
|
||||
@ -99,16 +102,16 @@ public class BodyTextFrameService {
|
||||
|
||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
|
||||
textFrame.getHeight(),
|
||||
textFrame.getWidth(),
|
||||
0);
|
||||
textFrame.getHeight(),
|
||||
textFrame.getWidth(),
|
||||
0);
|
||||
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
|
||||
} else if (page.getRotation() == 180) {
|
||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
|
||||
textFrame.getWidth(),
|
||||
textFrame.getHeight(),
|
||||
0);
|
||||
textFrame.getWidth(),
|
||||
textFrame.getHeight(),
|
||||
0);
|
||||
}
|
||||
page.setBodyTextFrame(textFrame);
|
||||
}
|
||||
@ -152,14 +155,17 @@ public class BodyTextFrameService {
|
||||
}
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|
||||
page.getMarkedContentBboxPerType(),
|
||||
MarkedContentUtils.FOOTER)) {
|
||||
page.getMarkedContentBboxPerType(),
|
||||
MarkedContentUtils.FOOTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
|
||||
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
|
||||
double approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
|
||||
&& approxLineCount < approximateHeaderLineCount //
|
||||
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)//
|
||||
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
|
||||
&& approxLineCount < approximateHeaderLineCount) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -185,10 +191,10 @@ public class BodyTextFrameService {
|
||||
}
|
||||
}
|
||||
}
|
||||
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
|
||||
expansionsRectangle.maxX - expansionsRectangle.minX,
|
||||
expansionsRectangle.maxY - expansionsRectangle.minY,
|
||||
0);
|
||||
return new Rectangle(new Point((float) expansionsRectangle.minX, (float) expansionsRectangle.minY),
|
||||
(float) (expansionsRectangle.maxX - expansionsRectangle.minX),
|
||||
(float) (expansionsRectangle.maxY - expansionsRectangle.minY),
|
||||
0);
|
||||
}
|
||||
|
||||
|
||||
@ -226,10 +232,10 @@ public class BodyTextFrameService {
|
||||
|
||||
private class BodyTextFrameExpansionsRectangle {
|
||||
|
||||
float minX = 10000;
|
||||
float maxX = -100;
|
||||
float minY = 10000;
|
||||
float maxY = -100;
|
||||
double minX = 10000;
|
||||
double maxX = -100;
|
||||
double minY = 10000;
|
||||
double maxY = -100;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -44,9 +44,9 @@ public class GapDetectionService {
|
||||
|
||||
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||
yGapContext.addGap(mainBodyTextFrame.getMinX(),
|
||||
previousTextPositionBBox.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||
previousTextPositionBBox.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||
}
|
||||
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||
|
||||
@ -69,32 +69,37 @@ public class GapDetectionService {
|
||||
|
||||
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
||||
|
||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||
return mirrorY(textPosition.getBBox());
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||
}
|
||||
|
||||
|
||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||
|
||||
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
||||
previousTextPosition.getMinY(),
|
||||
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
||||
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
||||
previousTextPosition.getMinY(),
|
||||
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
||||
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
||||
}
|
||||
|
||||
|
||||
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
||||
assert textPositionSequences.stream()
|
||||
.map(TextPositionSequence::getDir)
|
||||
.allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
||||
}
|
||||
|
||||
|
||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
return textPositionSequences.stream()
|
||||
.mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
@ -142,9 +147,9 @@ public class GapDetectionService {
|
||||
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
|
||||
textPosition.getMinY(),
|
||||
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
||||
textPosition.getHeight());
|
||||
textPosition.getMinY(),
|
||||
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
@ -152,9 +157,9 @@ public class GapDetectionService {
|
||||
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
||||
textPosition.getMinY(),
|
||||
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
||||
textPosition.getHeight());
|
||||
textPosition.getMinY(),
|
||||
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
|
||||
@ -180,7 +180,7 @@ public class LineDetectionService {
|
||||
|
||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
||||
return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -12,9 +13,9 @@ import java.util.stream.Collectors;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -31,7 +32,7 @@ public class RulingCleaningService {
|
||||
private static final float THRESHOLD_Y_HORIZONTAL = 3;
|
||||
|
||||
|
||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||
public CleanRulings deduplicateAndStraightenRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||
|
||||
Rulings verticalAndHorizontalRulingLines;
|
||||
|
||||
@ -45,43 +46,43 @@ public class RulingCleaningService {
|
||||
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
|
||||
|
||||
return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build();
|
||||
return new CleanRulings(verticalAndHorizontalRulingLines.horizontalLines(), verticalAndHorizontalRulingLines.verticalLines());
|
||||
}
|
||||
|
||||
|
||||
private Rulings cleanRulings(Rulings rulings) {
|
||||
|
||||
List<List<Rectangle>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||
.map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||
.toList();
|
||||
|
||||
List<List<Rectangle>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||
List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||
.map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||
.toList();
|
||||
|
||||
List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
|
||||
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
|
||||
.map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||
.map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
|
||||
private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
|
||||
|
||||
UnionFind<Rectangle> unionFind = new UnionFind<>();
|
||||
UnionFind<Rectangle2D> unionFind = new UnionFind<>();
|
||||
for (int i = 0; i < rectangles.size(); i++) {
|
||||
for (int j = i + 1; j < rectangles.size(); j++) {
|
||||
Rectangle rectangle1 = rectangles.get(i);
|
||||
Rectangle rectangle2 = rectangles.get(j);
|
||||
Rectangle2D rectangle1 = rectangles.get(i);
|
||||
Rectangle2D rectangle2 = rectangles.get(j);
|
||||
|
||||
// we can stop early when we are too far off because of x-y-sorting
|
||||
if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) {
|
||||
if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -91,66 +92,66 @@ public class RulingCleaningService {
|
||||
}
|
||||
}
|
||||
|
||||
Map<Rectangle, List<Rectangle>> groups = new HashMap<>();
|
||||
for (Rectangle rectangle : rectangles) {
|
||||
Rectangle root = unionFind.find(rectangle);
|
||||
Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
|
||||
for (Rectangle2D rectangle : rectangles) {
|
||||
Rectangle2D root = unionFind.find(rectangle);
|
||||
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
|
||||
}
|
||||
return new ArrayList<>(groups.values());
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle getOverlapRectangle(Ruling ruling) {
|
||||
private static Rectangle2D getOverlapRectangle(Ruling ruling) {
|
||||
|
||||
float top;
|
||||
float left;
|
||||
float y;
|
||||
float x;
|
||||
float w;
|
||||
float h;
|
||||
|
||||
if (ruling.x1 < ruling.x2) {
|
||||
left = ruling.x1;
|
||||
x = ruling.x1;
|
||||
w = ruling.x2 - ruling.x1;
|
||||
} else {
|
||||
left = ruling.x2;
|
||||
x = ruling.x2;
|
||||
w = ruling.x1 - ruling.x2;
|
||||
}
|
||||
if (ruling.y1 < ruling.y2) {
|
||||
top = ruling.y1;
|
||||
y = ruling.y1;
|
||||
h = ruling.y2 - ruling.y1;
|
||||
} else {
|
||||
top = ruling.y2;
|
||||
y = ruling.y2;
|
||||
h = ruling.y1 - ruling.y2;
|
||||
}
|
||||
|
||||
if (ruling.horizontal()) {
|
||||
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
if (ruling.isHorizontal()) {
|
||||
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
} else {
|
||||
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static Ruling getXCenteredRuling(Rectangle rectangle) {
|
||||
public static Ruling getXCenteredRuling(Rectangle2D rectangle) {
|
||||
|
||||
float x = (float) rectangle.getCenterX();
|
||||
float y1 = rectangle.getTop();
|
||||
float y2 = rectangle.getBottom();
|
||||
double x = rectangle.getCenterX();
|
||||
double y1 = rectangle.getMinY();
|
||||
double y2 = rectangle.getMaxY();
|
||||
|
||||
Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL);
|
||||
Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL);
|
||||
Point2D point1 = new Point2D.Double(x, y1 + THRESHOLD_Y_VERTICAL);
|
||||
Point2D point2 = new Point2D.Double(x, y2 - THRESHOLD_Y_VERTICAL);
|
||||
|
||||
return new Ruling(point1, point2);
|
||||
}
|
||||
|
||||
|
||||
public static Ruling getYCenteredRuling(Rectangle rectangle) {
|
||||
public static Ruling getYCenteredRuling(Rectangle2D rectangle) {
|
||||
|
||||
float x1 = rectangle.getLeft();
|
||||
float x2 = rectangle.getRight();
|
||||
float y = (float) rectangle.getCenterY();
|
||||
double x1 = rectangle.getX();
|
||||
double x2 = rectangle.getMaxX();
|
||||
double y = rectangle.getCenterY();
|
||||
|
||||
Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y);
|
||||
Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y);
|
||||
Point2D point1 = new Point2D.Double(x1 + THRESHOLD_X_HORIZONTAL, y);
|
||||
Point2D point2 = new Point2D.Double(x2 - THRESHOLD_X_HORIZONTAL, y);
|
||||
|
||||
return new Ruling(point1, point2);
|
||||
}
|
||||
@ -160,14 +161,14 @@ public class RulingCleaningService {
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
for (Ruling vr : rulings) {
|
||||
if (vr.vertical()) {
|
||||
if (vr.isVertical()) {
|
||||
vrs.add(vr);
|
||||
}
|
||||
}
|
||||
|
||||
List<Ruling> hrs = new ArrayList<>();
|
||||
for (Ruling hr : rulings) {
|
||||
if (hr.horizontal()) {
|
||||
if (hr.isHorizontal()) {
|
||||
hrs.add(hr);
|
||||
}
|
||||
}
|
||||
|
||||
@ -71,7 +71,8 @@ public class SectionsBuilderService {
|
||||
chunkBlockList.add(chunkBlock);
|
||||
chunkWords = new ArrayList<>();
|
||||
if (!chunkBlock.getTables().isEmpty()) {
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
previousTable = chunkBlock.getTables()
|
||||
.get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
}
|
||||
if (current instanceof TablePageBlock table) {
|
||||
@ -106,11 +107,12 @@ public class SectionsBuilderService {
|
||||
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : document.getPages()) {
|
||||
page.getTextBlocks().forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = buildTextBlock(List.of(block), Strings.EMPTY);
|
||||
sections.add(section);
|
||||
});
|
||||
page.getTextBlocks()
|
||||
.forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = buildTextBlock(List.of(block), Strings.EMPTY);
|
||||
sections.add(section);
|
||||
});
|
||||
}
|
||||
document.setSections(sections);
|
||||
}
|
||||
@ -155,10 +157,10 @@ public class SectionsBuilderService {
|
||||
}
|
||||
}
|
||||
for (ClassificationSection section : sectionsOnPage) {
|
||||
Float xMin = null;
|
||||
Float yMin = null;
|
||||
Float xMax = null;
|
||||
Float yMax = null;
|
||||
Double xMin = null;
|
||||
Double yMin = null;
|
||||
Double xMax = null;
|
||||
Double yMax = null;
|
||||
|
||||
for (AbstractPageBlock abs : section.getPageBlocks()) {
|
||||
if (abs.getPage() != page.getPageNumber()) {
|
||||
@ -202,8 +204,14 @@ public class SectionsBuilderService {
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
|
||||
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
if (xMin != null
|
||||
&& xMax != null
|
||||
&& yMin != null
|
||||
&& yMax != null
|
||||
&& image.getPosition().getX() >= xMin
|
||||
&& image.getPosition().getX() <= xMax
|
||||
&& image.getPosition().getY() >= yMin
|
||||
&& image.getPosition().getY() <= yMax) {
|
||||
section.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
break;
|
||||
@ -226,17 +234,26 @@ public class SectionsBuilderService {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
|
||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
}).collect(Collectors.toList());
|
||||
if (previousTableNonHeaderRow.isEmpty()
|
||||
&& previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows()
|
||||
.get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = Cell.copy(cell);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
List<Cell> row = currentTable.getRows()
|
||||
.get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
@ -279,7 +296,11 @@ public class SectionsBuilderService {
|
||||
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(row -> row.stream()
|
||||
.filter(cell -> !cell.getHeaderCells().isEmpty()))
|
||||
.findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
@ -287,7 +308,8 @@ public class SectionsBuilderService {
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
List<Cell> row = table.getRows()
|
||||
.get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -11,22 +13,26 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
|
||||
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
@ -59,29 +65,31 @@ public class TableExtractionService {
|
||||
}
|
||||
}
|
||||
|
||||
var cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
for (Rectangle2D area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && area.contains(c)) {
|
||||
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) {
|
||||
containedCells.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
|
||||
tables.add(new TablePageBlock(containedCells, page.getRotation()));
|
||||
cells.removeAll(containedCells);
|
||||
}
|
||||
}
|
||||
@ -90,14 +98,18 @@ public class TableExtractionService {
|
||||
int position = -1;
|
||||
|
||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(pageBlock);
|
||||
}
|
||||
}
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
|
||||
var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
|
||||
var toBeRemoved = table.getCells()
|
||||
.stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
@ -112,7 +124,7 @@ public class TableExtractionService {
|
||||
}
|
||||
|
||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||
.map(Rectangle::getWidth)
|
||||
.map(BoundingBox::getWidth)
|
||||
.map(size -> Math.round(size / 10.0) * 10)
|
||||
.collect(Collectors.groupingBy(Long::longValue));
|
||||
|
||||
@ -122,22 +134,26 @@ public class TableExtractionService {
|
||||
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
double x = textBlock.getPdfMinX();
|
||||
double y = textBlock.getPdfMinY();
|
||||
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
|
||||
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
|
||||
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
|
||||
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
@SneakyThrows
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
|
||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
||||
/*
|
||||
switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
||||
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
||||
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
||||
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
*/
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(rect -> new Cell(rect, affineTransform))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,99 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextRulingsClassifier {
|
||||
|
||||
private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines.
|
||||
private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines.
|
||||
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline.
|
||||
|
||||
|
||||
public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) {
|
||||
|
||||
for (TextPositionSequence word : words) {
|
||||
if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) {
|
||||
handleHorizontalText(cleanRulings, word);
|
||||
} else {
|
||||
handleVerticalText(cleanRulings, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX();
|
||||
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX());
|
||||
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||
float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);
|
||||
|
||||
List<Ruling> rulingsIntersectingWord = cleanRulings.getVerticalsInXInterval(leftX, rightX)
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.y1 <= lowerY && upperY <= ruling.y2)
|
||||
.toList();
|
||||
|
||||
for (Ruling ruling : rulingsIntersectingWord) {
|
||||
if (strikethroughCenterX - strikethroughBoxHeight < ruling.x1 && ruling.x1 < strikethroughCenterX + strikethroughBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.STRIKETROUGH);
|
||||
word.setStrikethrough(true);
|
||||
}
|
||||
|
||||
if (underlineCenterX - underlineBoxHeight < ruling.x1 && ruling.x1 < underlineCenterX + underlineBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.UNDERLINE);
|
||||
word.setUnderline(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY();
|
||||
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY());
|
||||
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
||||
float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);
|
||||
|
||||
List<Ruling> rulingsIntersectingWord = cleanRulings.getHorizontalsInYInterval(lowerY, upperY)
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.filter(ruling -> ruling.x1 <= leftX && rightX <= ruling.x2)
|
||||
.toList();
|
||||
|
||||
for (Ruling ruling : rulingsIntersectingWord) {
|
||||
if (strikethroughCenterY - strikethroughBoxHeight < ruling.y1 && ruling.y1 < strikethroughCenterY + strikethroughBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.STRIKETROUGH);
|
||||
word.setStrikethrough(true);
|
||||
}
|
||||
|
||||
if (underlineCenterY - underlineBoxHeight < ruling.y1 && ruling.y1 < underlineCenterY + underlineBoxHeight) {
|
||||
ruling.setClassification(Ruling.Classification.UNDERLINE);
|
||||
word.setUnderline(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,7 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
@ -9,21 +7,17 @@ import java.util.ListIterator;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -37,48 +31,76 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions,
|
||||
CleanRulings rulings,
|
||||
boolean xyOrder,
|
||||
LayoutparsingVisualizations visualizations,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||
|
||||
if (!textPositions.isEmpty()) {
|
||||
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
||||
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
|
||||
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
|
||||
}
|
||||
|
||||
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings);
|
||||
|
||||
if (xyOrder) {
|
||||
sortPageBlocksXThenY(pageBlocks);
|
||||
}
|
||||
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
classificationPage.setCleanRulings(rulings);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
combineBlocks(classificationPage);
|
||||
}
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 6.5f);
|
||||
}
|
||||
|
||||
return classificationPage;
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, List<Ruling> horizontalRulings, List<Ruling> verticalRulings, boolean xyOrder) {
|
||||
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
pageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||
@Override
|
||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||
|
||||
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
zone.getLines().forEach(line -> {
|
||||
line.getWords().forEach(word -> {
|
||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||
});
|
||||
});
|
||||
zone.getLines()
|
||||
.forEach(line -> {
|
||||
line.getWords()
|
||||
.forEach(word -> {
|
||||
textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage()));
|
||||
});
|
||||
});
|
||||
|
||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
||||
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
|
||||
});
|
||||
|
||||
if (xyOrder) {
|
||||
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||
@Override
|
||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||
|
||||
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return abstractPageBlocks;
|
||||
}
|
||||
|
||||
@ -87,6 +109,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
|
||||
while (itty.hasNext()) {
|
||||
|
||||
AbstractPageBlock block = itty.next();
|
||||
@ -98,7 +121,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||
|
||||
if (current.getDir() != previous.getDir()) {
|
||||
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
|
||||
previous = current;
|
||||
continue;
|
||||
}
|
||||
@ -108,7 +131,7 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (previous.almostIntersects(current, 0, 0)) {
|
||||
if (previous.intersects(current)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
@ -127,15 +150,15 @@ public class DocstrumBlockificationService {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
|
||||
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
|
||||
}
|
||||
|
||||
|
||||
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
|
||||
return current.intersectsY(previous) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
||||
}
|
||||
|
||||
|
||||
@ -144,16 +167,16 @@ public class DocstrumBlockificationService {
|
||||
ClassificationPage page) {
|
||||
|
||||
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||
}
|
||||
|
||||
|
||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
|
||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||
&& previous.intersectsY(current) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
||||
&& previous.intersectsY(current) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
||||
}
|
||||
|
||||
|
||||
@ -208,12 +231,13 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks, float xThreshold, float yThreshold) {
|
||||
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||
|
||||
var blocks = page.getTextBlocks();
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if(block == null){
|
||||
if (block == null) {
|
||||
continue;
|
||||
}
|
||||
if (block instanceof TablePageBlock) {
|
||||
@ -224,7 +248,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
if(blocks.get(i) == null){
|
||||
if (blocks.get(i) == null) {
|
||||
continue;
|
||||
}
|
||||
if (blocks.get(i) == current) {
|
||||
@ -236,7 +260,11 @@ public class DocstrumBlockificationService {
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
current.getSequences().addAll(inner.getSequences());
|
||||
@ -249,181 +277,17 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
}
|
||||
var blocksIterator = blocks.iterator();
|
||||
while(blocksIterator.hasNext()){
|
||||
if(blocksIterator.next() == null){
|
||||
while (blocksIterator.hasNext()) {
|
||||
if (blocksIterator.next() == null) {
|
||||
blocksIterator.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (splitByDir || isSplitByRuling)) {
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
return chunkBlockList;
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
return new TextPageBlock(wordBlockList);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -15,11 +15,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
public class DocuMineBlockificationService {
|
||||
@ -34,15 +33,16 @@ public class DocuMineBlockificationService {
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @param textPositions The textPositions of a page.
|
||||
* @param cleanRulings All rulings on a page
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) {
|
||||
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
float minX = 1000;
|
||||
float maxX = 0;
|
||||
@ -59,23 +59,26 @@ public class DocuMineBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
.contains("bold")
|
||||
&& !prev.getFontStyle()
|
||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream()
|
||||
.collect(Collectors.joining(" ")).toString());
|
||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||
if (!textPageBlocks.isEmpty()) {
|
||||
prevOrientation = textPageBlocks.get(textPageBlocks.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||
textPageBlocks.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
@ -86,7 +89,11 @@ public class DocuMineBlockificationService {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|
||||
|| !startFromTop
|
||||
|| !splitByX
|
||||
|| !newLineAfterSplit
|
||||
|| !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
@ -114,128 +121,12 @@ public class DocuMineBlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
}
|
||||
textPageBlocks.add(new TextPageBlock(chunkWords));
|
||||
|
||||
return new ClassificationPage(chunkBlockList1);
|
||||
return new ClassificationPage(textPageBlocks);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()); //
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -13,14 +13,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@Service
|
||||
@ -34,12 +31,13 @@ public class RedactManagerBlockificationService {
|
||||
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param textPositions The words of a page.
|
||||
* @param visualizations Receives the text blocks of the page via addTextBlockVisualizations.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) {
|
||||
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
@ -57,7 +55,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
@ -67,7 +65,7 @@ public class RedactManagerBlockificationService {
|
||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
@ -81,7 +79,11 @@ public class RedactManagerBlockificationService {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|
||||
|| !startFromTop
|
||||
|| !splitByX
|
||||
|| !newLineAfterSplit
|
||||
|| !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
@ -109,8 +111,8 @@ public class RedactManagerBlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
if (!chunkWords.isEmpty()) {
|
||||
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
@ -150,8 +152,11 @@ public class RedactManagerBlockificationService {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.getMaxY())
|
||||
|| previous != null
|
||||
&& previous.getOrientation().equals(Orientation.LEFT)
|
||||
&& block.getOrientation().equals(Orientation.RIGHT)
|
||||
&& equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
@ -159,123 +164,19 @@ public class RedactManagerBlockificationService {
|
||||
|
||||
previous = block;
|
||||
}
|
||||
if (!textPositions.isEmpty()) {
|
||||
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
|
||||
.map(tb -> (TextPageBlock) tb)
|
||||
.toList(), textPositions.get(0).getPage());
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
private boolean equalsWithThreshold(double f1, double f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,7 +5,6 @@ import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -49,7 +49,6 @@ public class DocuMineClassificationService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||
|
||||
@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -15,7 +14,6 @@ import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
@ -52,6 +50,9 @@ public class DocumentGraphFactory {
|
||||
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
|
||||
|
||||
Document documentGraph = new Document();
|
||||
|
||||
documentGraph.setVisualizations(document.getVisualizations());
|
||||
|
||||
Context context = new Context(documentGraph);
|
||||
|
||||
document.getPages()
|
||||
@ -79,20 +80,21 @@ public class DocumentGraphFactory {
|
||||
}
|
||||
|
||||
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
|
||||
TextPageBlock originalTextBlock,
|
||||
Context context,
|
||||
List<TextPageBlock> textBlocksToMerge,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
Page page = context.getPage(originalTextBlock.getPage());
|
||||
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
} else if (originalTextBlock.isToDuplicate()) {
|
||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
@ -178,8 +180,7 @@ public class DocumentGraphFactory {
|
||||
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
@ -194,8 +195,7 @@ public class DocumentGraphFactory {
|
||||
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
@ -207,8 +207,7 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyFooter(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
@ -220,8 +219,7 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyHeader(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
@ -275,8 +273,7 @@ public class DocumentGraphFactory {
|
||||
return pages.keySet()
|
||||
.stream()
|
||||
.filter(page -> page.getNumber() == pageIndex)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -29,19 +30,22 @@ public class SearchTextWithTextPositionFactory {
|
||||
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
|
||||
|
||||
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||
if (sequences.isEmpty() || sequences.stream()
|
||||
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||
return SearchTextWithTextPositionDto.empty();
|
||||
}
|
||||
|
||||
Context context = new Context();
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
|
||||
.get(0);
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||
|
||||
for (TextPositionSequence word : sequences) {
|
||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||
|
||||
currentTextPosition = word.getTextPositions().get(i);
|
||||
currentTextPosition = word.getTextPositions()
|
||||
.get(i);
|
||||
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||
removeHyphenLinebreaks(context);
|
||||
context.lineBreaksStringIdx.add(context.stringIdx);
|
||||
@ -57,18 +61,21 @@ public class SearchTextWithTextPositionFactory {
|
||||
++context.positionIdx;
|
||||
}
|
||||
|
||||
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
|
||||
previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(previousTextPosition.getBBoxDirAdj()).build();
|
||||
context.stringBuilder.append(" ");
|
||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||
++context.stringIdx;
|
||||
}
|
||||
|
||||
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
||||
.map(TextPositionSequence::getTextPositions)
|
||||
.flatMap(Collection::stream)
|
||||
.map(RedTextPosition::getBBoxInitialUserSpace)
|
||||
.toList();
|
||||
|
||||
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||
|
||||
return SearchTextWithTextPositionDto.builder()
|
||||
.searchText(context.stringBuilder.toString())
|
||||
.lineBreaks(context.lineBreaksStringIdx)
|
||||
@ -153,7 +160,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
return false;
|
||||
}
|
||||
|
||||
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
||||
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
||||
return deltaY >= currentPosition.getHeightDir();
|
||||
}
|
||||
|
||||
@ -167,16 +174,16 @@ public class SearchTextWithTextPositionFactory {
|
||||
private boolean isHyphen(String unicodeCharacter) {
|
||||
|
||||
return Objects.equals(unicodeCharacter, "-") || //
|
||||
Objects.equals(unicodeCharacter, "~") || //
|
||||
Objects.equals(unicodeCharacter, "‐") || //
|
||||
Objects.equals(unicodeCharacter, "‒") || //
|
||||
Objects.equals(unicodeCharacter, "⁻") || //
|
||||
Objects.equals(unicodeCharacter, "−") || //
|
||||
Objects.equals(unicodeCharacter, "﹣") || //
|
||||
Objects.equals(unicodeCharacter, "゠") || //
|
||||
Objects.equals(unicodeCharacter, "⁓") || //
|
||||
Objects.equals(unicodeCharacter, "‑") || //
|
||||
Objects.equals(unicodeCharacter, "\u00AD");
|
||||
Objects.equals(unicodeCharacter, "~") || //
|
||||
Objects.equals(unicodeCharacter, "‐") || //
|
||||
Objects.equals(unicodeCharacter, "‒") || //
|
||||
Objects.equals(unicodeCharacter, "⁻") || //
|
||||
Objects.equals(unicodeCharacter, "−") || //
|
||||
Objects.equals(unicodeCharacter, "﹣") || //
|
||||
Objects.equals(unicodeCharacter, "゠") || //
|
||||
Objects.equals(unicodeCharacter, "⁓") || //
|
||||
Objects.equals(unicodeCharacter, "‑") || //
|
||||
Objects.equals(unicodeCharacter, "\u00AD");
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -49,8 +49,7 @@ public class SectionNodeFactory {
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet()
|
||||
@ -121,12 +120,12 @@ public class SectionNodeFactory {
|
||||
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||
}
|
||||
default -> {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
|
||||
}
|
||||
}
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
|
||||
@ -45,7 +45,10 @@ public class TableNodeFactory {
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
|
||||
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
|
||||
Table table = Table.builder()
|
||||
.documentTree(context.getDocumentTree())
|
||||
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
|
||||
.numberOfRows(mergedRows.size())
|
||||
.build();
|
||||
|
||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||
@ -128,7 +131,12 @@ public class TableNodeFactory {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
|
||||
TableCell tableCell = TableCell.builder()
|
||||
.documentTree(context.getDocumentTree())
|
||||
.row(rowIndex)
|
||||
.col(colIndex)
|
||||
.header(cell.isHeaderCell())
|
||||
.bBox(cell.getBBoxInitialUserSpace())
|
||||
.build();
|
||||
page.getMainBody().add(tableCell);
|
||||
|
||||
@ -159,7 +167,7 @@ public class TableNodeFactory {
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks()
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -13,6 +13,9 @@ import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
@ -30,7 +33,7 @@ public class FindGraphicsRaster {
|
||||
|
||||
var renderer = new PDFRenderer(doc);
|
||||
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
|
||||
var imageCtm = getImageCTM(pageInformation, img.getWidth());
|
||||
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
|
||||
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
|
||||
}
|
||||
|
||||
@ -131,42 +134,4 @@ public class FindGraphicsRaster {
|
||||
}
|
||||
|
||||
|
||||
public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) {
|
||||
|
||||
double scalingFactor = calculateScalingFactor(pageInformation, imageWidth);
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||
// scaling -> mirror -> rotation
|
||||
AffineTransform resultMatrix = new AffineTransform();
|
||||
|
||||
resultMatrix.concatenate(rotationMatrix);
|
||||
resultMatrix.concatenate(mirrorMatrix);
|
||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||
return resultMatrix;
|
||||
}
|
||||
|
||||
|
||||
private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / imageWidth;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ -9,10 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -22,6 +22,9 @@ import lombok.SneakyThrows;
|
||||
@RequiredArgsConstructor
|
||||
public class GraphicExtractorService {
|
||||
|
||||
private static final int MIN_GRAPHICS_SIDE_LENGTH = 30;
|
||||
private static final int MIN_GRAPHICS_AREA = 500;
|
||||
|
||||
private final GraphicsClusteringService graphicsClusteringService;
|
||||
private final FindGraphicsRaster findGraphicsRaster;
|
||||
|
||||
@ -32,33 +35,32 @@ public class GraphicExtractorService {
|
||||
int pageNumber,
|
||||
CleanRulings cleanRulings,
|
||||
List<TextPositionSequence> textPositionSequences,
|
||||
List<Cell> emptyTableCells,
|
||||
boolean graphicsRaster) {
|
||||
|
||||
var characterBBoxes = getCharacterBBoxes(textPositionSequences);
|
||||
var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells);
|
||||
var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes);
|
||||
var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes);
|
||||
List<Box> characterBBoxes = getCharacterBBoxes(textPositionSequences);
|
||||
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
|
||||
|
||||
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
|
||||
var graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||
|
||||
if (graphicsRaster) {
|
||||
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
|
||||
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
||||
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
|
||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||
characterBBoxes.stream()
|
||||
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
|
||||
.collect(Collectors.toList()),
|
||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||
}
|
||||
|
||||
var filteredGraphicBBoxes = graphicBBoxes.stream()
|
||||
.filter(box -> !box.intersectsAny(tableLineBBoxes, 4))
|
||||
.filter(box -> !box.intersectsAny(underLineBBoxes, 4))
|
||||
.filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4))
|
||||
List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
|
||||
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
|
||||
List<Box> clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
|
||||
|
||||
return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList();
|
||||
return clusters.stream()
|
||||
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -74,34 +76,13 @@ public class GraphicExtractorService {
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getLineBBoxesFromTableCells(List<Cell> emptyTableCells) {
|
||||
private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {
|
||||
|
||||
List<Box> expandedTableLines = new ArrayList<>();
|
||||
|
||||
emptyTableCells.forEach(cell -> {
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2)));
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2)));
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height)));
|
||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height)));
|
||||
});
|
||||
|
||||
return expandedTableLines;
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getUnderlineBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
|
||||
|
||||
return cleanRulings.getHorizontal()
|
||||
return cleanRulings.buildAll()
|
||||
.stream()
|
||||
.filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||
.map(h -> new Box(h.x1, h.y1, h.x2, h.y2))
|
||||
.filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getStrikeThroughBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
|
||||
|
||||
return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -82,7 +82,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
|
||||
private int pageRotation;
|
||||
private PDRectangle pageSize;
|
||||
private Matrix translateMatrix;
|
||||
private final GlyphList glyphList;
|
||||
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
||||
|
||||
@ -134,12 +133,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
this.pageRotation = page.getRotation();
|
||||
this.pageSize = page.getCropBox();
|
||||
|
||||
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
|
||||
translateMatrix = null;
|
||||
} else {
|
||||
// translation matrix for cropbox
|
||||
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
|
||||
}
|
||||
super.processPage(page);
|
||||
}
|
||||
|
||||
@ -265,62 +258,52 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
}
|
||||
}
|
||||
|
||||
// adjust for cropbox if needed
|
||||
Matrix translatedTextRenderingMatrix;
|
||||
if (translateMatrix == null) {
|
||||
translatedTextRenderingMatrix = textRenderingMatrix;
|
||||
} else {
|
||||
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
|
||||
nextX -= pageSize.getLowerLeftX();
|
||||
nextY -= pageSize.getLowerLeftY();
|
||||
}
|
||||
|
||||
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
|
||||
if (unicodeMapping.length() == 2) {
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(0)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
textRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(0)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(1)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
textRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
Character.toString(unicodeMapping.charAt(1)),
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
} else {
|
||||
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
translatedTextRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
unicodeMapping,
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
textRenderingMatrix,
|
||||
nextX,
|
||||
nextY,
|
||||
Math.abs(dyDisplay),
|
||||
dxDisplay,
|
||||
Math.abs(spaceWidthDisplay),
|
||||
unicodeMapping,
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1007,7 +1007,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
|
||||
/**
|
||||
* Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
|
||||
* character if there is enough space between two words. By default a space character is used. If you need and
|
||||
* character if there is enough space between two textPositions. By default a space character is used. If you need an
|
||||
* accurate count of characters that are found in a PDF document then you might want to set the word separator to
|
||||
* the empty string.
|
||||
*
|
||||
@ -1703,7 +1703,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
/**
|
||||
* Write a list of string containing a whole line of a document.
|
||||
*
|
||||
* @param line a list with the words of the given line
|
||||
* @param line a list with the textPositions of the given line
|
||||
* @throws IOException if something went wrong
|
||||
*/
|
||||
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
|
||||
@ -1744,9 +1744,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
|
||||
|
||||
/**
|
||||
* Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given
|
||||
* word. If the word is a full line, the results will be the best. If the word contains of single words or
|
||||
* characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and
|
||||
* Handles the LTR and RTL direction of the given textPositions. The whole implementation stands and falls with the given
|
||||
* word. If the word consists of single textPositions or
|
||||
* characters, the order of the characters in a word or textPositions in a line may be wrong, due to RTL and LTR marks and
|
||||
* characters!
|
||||
* <p>
|
||||
* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx
|
||||
|
||||
@ -65,12 +65,20 @@ public class LayoutGridService {
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
|
||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
|
||||
|
||||
List<Visualizations> allVisualizations;
|
||||
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
|
||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||
if (writeVisualLayoutParsingGrid) {
|
||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
|
||||
.toList();
|
||||
} else {
|
||||
allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll())
|
||||
.toList();
|
||||
}
|
||||
|
||||
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, List.of(layoutGrid, visualLayoutGrid));
|
||||
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
|
||||
}
|
||||
|
||||
|
||||
@ -130,7 +138,10 @@ public class LayoutGridService {
|
||||
}
|
||||
for (Page page : table.getPages()) {
|
||||
|
||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
|
||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
|
||||
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getRow)
|
||||
.findFirst();
|
||||
if (optionalFirstRowOnPage.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
@ -170,14 +181,17 @@ public class LayoutGridService {
|
||||
|
||||
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
|
||||
|
||||
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getBBox).map(bBoxMap -> bBoxMap.get(page));
|
||||
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getBBox)
|
||||
.map(bBoxMap -> bBoxMap.get(page));
|
||||
}
|
||||
|
||||
|
||||
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
|
||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
|
||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
|
||||
.toList();
|
||||
Page firstPage = semanticNode.getFirstPage();
|
||||
if (!subSections.isEmpty()) {
|
||||
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
|
||||
@ -196,7 +210,10 @@ public class LayoutGridService {
|
||||
}
|
||||
return;
|
||||
}
|
||||
List<Page> pagesInOrder = bBoxMap.keySet().stream().sorted(Comparator.comparingInt(Page::getNumber)).collect(Collectors.toList());
|
||||
List<Page> pagesInOrder = bBoxMap.keySet()
|
||||
.stream()
|
||||
.sorted(Comparator.comparingInt(Page::getNumber))
|
||||
.collect(Collectors.toList());
|
||||
pagesInOrder.remove(0);
|
||||
addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid);
|
||||
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
||||
@ -293,7 +310,10 @@ public class LayoutGridService {
|
||||
|
||||
private String buildTreeIdString(SemanticNode semanticNode) {
|
||||
|
||||
return semanticNode.getTreeId().stream().map(Object::toString).collect(Collectors.joining("."));
|
||||
return semanticNode.getTreeId()
|
||||
.stream()
|
||||
.map(Object::toString)
|
||||
.collect(Collectors.joining("."));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,56 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class CoordinateTransforms {
|
||||
|
||||
public AffineTransform calculateImageCoordsToInitialUserSpaceCoords(PageInformation pageInformation, double scalingFactor) {
|
||||
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||
// scaling -> mirror -> rotation
|
||||
AffineTransform resultMatrix = new AffineTransform();
|
||||
|
||||
resultMatrix.concatenate(rotationMatrix);
|
||||
resultMatrix.concatenate(mirrorMatrix);
|
||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||
return resultMatrix;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {
|
||||
|
||||
return calculateImageCoordsToInitialUserSpaceCoords(pageInformation, scalingFactor).createInverse();
|
||||
}
|
||||
|
||||
|
||||
public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / imageWidth;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class GeometricComparators {
|
||||
@ -58,7 +58,7 @@ public class GeometricComparators {
|
||||
return cell1Size.compareTo(cell2Size);
|
||||
};
|
||||
|
||||
public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
|
||||
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||
Double rect2Size = rect2.getHeight() * rect2.getWidth();
|
||||
|
||||
@ -1,12 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -14,12 +7,23 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class MarkedContentUtils {
|
||||
|
||||
public static final String HEADER = "Header";
|
||||
public static final String FOOTER = "Footer";
|
||||
|
||||
|
||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||
|
||||
if (markedContents == null) {
|
||||
@ -31,7 +35,8 @@ public class MarkedContentUtils {
|
||||
.filter(m -> m.getProperties() != null)
|
||||
.filter(m -> m.getProperties().getItem("Subtype") != null)
|
||||
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
|
||||
.map(PDMarkedContent::getContents).flatMap(Collection::stream)
|
||||
.map(PDMarkedContent::getContents)
|
||||
.flatMap(Collection::stream)
|
||||
.filter(t -> t instanceof TextPosition)
|
||||
.map(t -> (TextPosition) t)
|
||||
.filter(t -> !t.getUnicode().equals(" "))
|
||||
@ -41,16 +46,77 @@ public class MarkedContentUtils {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return markedContentByYPosition.values().stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
||||
.toList(), 0, true)
|
||||
.getRectangle())
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
||||
return markedContentByYPosition.values()
|
||||
.stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
|
||||
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents) {
|
||||
|
||||
if (markedContents == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return markedContents.stream()
|
||||
.filter(m -> !m.getContents().isEmpty())
|
||||
.map(MarkedContentPosition::fromPDMarkedContent)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
|
||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
||||
|
||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
|
||||
.stream()
|
||||
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
||||
}
|
||||
|
||||
|
||||
public record MarkedContentPosition(String type, String subType, List<Rectangle2D> textPositions) {
|
||||
|
||||
public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent) {
|
||||
|
||||
return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents()));
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> parseTextPositions(List<Object> contents) {
|
||||
|
||||
return contents.stream()
|
||||
.filter(content -> content instanceof TextPosition)
|
||||
.map(content -> (TextPosition) content)
|
||||
.filter(content -> !content.getUnicode().equals(" "))
|
||||
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private static String parseSubType(PDMarkedContent markedContent) {
|
||||
|
||||
if (markedContent == null || markedContent.getProperties() == null || markedContent.getProperties().getItem("Subtype") == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return ((COSName) markedContent.getProperties().getItem("Subtype")).getName();
|
||||
}
|
||||
|
||||
|
||||
public String formattedType() {
|
||||
|
||||
if (subType == null || subType.isEmpty()) {
|
||||
return type;
|
||||
}
|
||||
if (type.equals("Artifact")) {
|
||||
return subType;
|
||||
}
|
||||
return String.format("%s-%s", type, subType);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@ -114,7 +114,7 @@ public final class PositionUtils {
|
||||
}
|
||||
|
||||
|
||||
public Float getApproxLineCount(TextPageBlock textBlock) {
|
||||
public double getApproxLineCount(TextPageBlock textBlock) {
|
||||
|
||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||
}
|
||||
|
||||
@ -52,7 +52,10 @@ public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
||||
return atomicTextBlocks.stream()
|
||||
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
|
||||
.stream())
|
||||
.collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
|
||||
|
||||
@ -77,7 +80,10 @@ public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
||||
return atomicTextBlocks.stream()
|
||||
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
|
||||
.stream())
|
||||
.collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
|
||||
|
||||
@ -89,16 +95,18 @@ public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
||||
|
||||
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
||||
return rectangles.stream()
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
.collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) {
|
||||
|
||||
return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(),
|
||||
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
|
||||
redactionLogRectangle.getWidth(),
|
||||
-redactionLogRectangle.getHeight());
|
||||
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
|
||||
redactionLogRectangle.getWidth(),
|
||||
-redactionLogRectangle.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@ -111,15 +119,16 @@ public class RectangleTransformations {
|
||||
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
|
||||
|
||||
return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())),
|
||||
(float) rectangle2D.getWidth(),
|
||||
-(float) rectangle2D.getHeight(),
|
||||
pageNumber);
|
||||
(float) rectangle2D.getWidth(),
|
||||
-(float) rectangle2D.getHeight(),
|
||||
pageNumber);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector());
|
||||
return rectangle2DList.stream()
|
||||
.collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
|
||||
|
||||
@ -134,7 +143,8 @@ public class RectangleTransformations {
|
||||
if (rectangle2DList.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
|
||||
double splitThreshold = rectangle2DList.stream()
|
||||
.mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
|
||||
|
||||
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
|
||||
@ -171,7 +181,7 @@ public class RectangleTransformations {
|
||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||
});
|
||||
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
|
||||
return new CleanRulings(verticalRulings, horizontalRulings);
|
||||
}
|
||||
|
||||
|
||||
@ -195,9 +205,9 @@ public class RectangleTransformations {
|
||||
public BinaryOperator<BBox> combiner() {
|
||||
|
||||
return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
|
||||
Math.min(b1.lowerLeftY, b2.lowerLeftY),
|
||||
Math.max(b1.upperRightX, b2.upperRightX),
|
||||
Math.max(b1.upperRightY, b2.upperRightY));
|
||||
Math.min(b1.lowerLeftY, b2.lowerLeftY),
|
||||
Math.max(b1.upperRightX, b2.upperRightX),
|
||||
Math.max(b1.upperRightY, b2.upperRightY));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -14,23 +14,24 @@ public class RectangularIntersectionFinder {
|
||||
|
||||
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
// Fix for 211.pdf
|
||||
for (Ruling r : horizontalRulingLines) {
|
||||
if (r.getX2() < r.getX1()) {
|
||||
double a = r.getX2();
|
||||
r.x2 = (float) r.getX1();
|
||||
r.x1 = (float) a;
|
||||
}
|
||||
}
|
||||
// // Fix for 211.pdf
|
||||
// for (Ruling r : horizontalRulingLines) {
|
||||
// if (r.getX2() < r.getX1()) {
|
||||
// double a = r.getX2();
|
||||
// r.x2 = (float) r.getX1();
|
||||
// r.x1 = (float) a;
|
||||
// }
|
||||
// }
|
||||
|
||||
List<Rectangle2D> foundRectangles = new ArrayList<>();
|
||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||
Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);
|
||||
|
||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
|
||||
|
||||
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
||||
Point2D topLeft = intersectionPointsList.get(i);
|
||||
Ruling[] hv = intersectionPoints.get(topLeft);
|
||||
RulingIntersectionFinder.IntersectingRulings intersectingRulingsFromTopLeft = intersectionPoints.get(topLeft);
|
||||
|
||||
// CrossingPointsDirectlyBelow( topLeft );
|
||||
List<Point2D> xPoints = new ArrayList<>();
|
||||
@ -48,19 +49,24 @@ public class RectangularIntersectionFinder {
|
||||
outer:
|
||||
for (Point2D xPoint : xPoints) {
|
||||
// is there a vertical edge b/w topLeft and xPoint?
|
||||
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
|
||||
if (!intersectingRulingsFromTopLeft.vertical().equals(intersectionPoints.get(xPoint).vertical())) {
|
||||
continue;
|
||||
}
|
||||
for (Point2D yPoint : yPoints) {
|
||||
// is there a horizontal edge b/w topLeft and yPoint ?
|
||||
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
||||
if (!intersectingRulingsFromTopLeft.horizontal().equals(intersectionPoints.get(yPoint).horizontal())) {
|
||||
continue;
|
||||
}
|
||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
||||
if (intersectionPoints.containsKey(btmRight)
|
||||
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
|
||||
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
|
||||
&& intersectionPoints.get(btmRight).horizontal().equals(intersectionPoints.get(xPoint).horizontal())
|
||||
&& intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) {
|
||||
|
||||
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
|
||||
intersectionPoints.get(topLeft).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
intersectionPoints.get(topLeft).vertical().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
intersectionPoints.get(btmRight).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
intersectionPoints.get(btmRight).vertical().setClassification(Ruling.Classification.TABLE_LINE);
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,200 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class RulingIntersectionFinder {
|
||||
|
||||
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||
|
||||
public static final Comparator<Point2D> Y_THEN_X_POINT_COMPARATOR = Comparator.comparingDouble(Point2D::getY).thenComparing(Point2D::getX);
|
||||
|
||||
|
||||
/**
|
||||
* Implementation to find line intersection in O(P + n log n), where n is the number of lines and P the numer of intersections.
|
||||
* based on <a href="http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf">Segment Intersection by Piotr Indyk</a>
|
||||
*
|
||||
* @param horizontals a list of non-overlapping horizontal rulings
|
||||
* @param verticals a list of non-overlapping vertical rulings
|
||||
* @return a Map of each found intersection point pointing to the two lines forming the intersection.
|
||||
*/
|
||||
/*
|
||||
* The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
|
||||
* As a high level overview, the algorithm uses a sweep line advancing from left to right.
|
||||
* It dynamically updates the horizontal rulings which are intersected by the current sweep line.
|
||||
* When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
|
||||
* THe trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
|
||||
* This way the initial sorting step has the highest complexity class (O(n log n) and thus determines the complexity class of the entire algorithm
|
||||
* Unfortunately, the implementation here takes a few liberties compared to the original algorithm. The binary search tree is replaced by an ordered Set which is simply looped over.
|
||||
* Therefore, this implementation's worst case, where all horizontal lines span the entire sweep, you are essentially performing the naive approach with a bunch of overhead.
|
||||
* Since we are using this implementation to find table cells, one can expect this worst case to always be the case.
|
||||
* A simple runtime comparison for a single page with the most lines we can expect (SinglePages/AbsolutelyEnormousTable.pdf with 30 horizontals and 144 verticals) shows this implementation takes roughly 14 ms, whereas the naive approach takes 7 ms. Both are negligible, but the naive approach is two times as fast.
|
||||
* If we would like to make this faster, we would need a better data structure for 'TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep', where we can query the TreeMap for all horizontal rulings in a given interval in O(log n).
|
||||
*/
|
||||
public Map<Point2D, IntersectingRulings> find(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
List<SweepStep> sweepTrajectory = buildSweepTrajectory(horizontals, verticals);
|
||||
|
||||
TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep = new TreeMap<>(Comparator.comparingDouble(Ruling::getTop));
|
||||
|
||||
TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);
|
||||
|
||||
for (SweepStep step : sweepTrajectory) {
|
||||
switch (step.type) {
|
||||
case VERTICAL: // check for intersections with currently intersected horizontal lines
|
||||
for (Ruling horizontalRuling : horizontalRulingsInCurrentSweep.navigableKeySet()) {
|
||||
|
||||
Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontalRuling, step.ruling);
|
||||
|
||||
if (intersectionPoint.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontalRuling, step.ruling));
|
||||
}
|
||||
break;
|
||||
case HORIZONTAL_ENTRY: // sweep line now intersects this horizontal ruling
|
||||
horizontalRulingsInCurrentSweep.put(step.ruling, null);
|
||||
break;
|
||||
case HORIZONTAL_EXIT: // sweep line no longer intersects this horizontal ruling
|
||||
horizontalRulingsInCurrentSweep.remove(step.ruling);
|
||||
break;
|
||||
}
|
||||
}
|
||||
log.debug("Finished building intersections with line sweep in {} ms", System.currentTimeMillis() - start);
|
||||
|
||||
return intersections;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Naive Approach in O(n^2) of finding intersections between lines by iterating over all lines.
|
||||
*
|
||||
* @param horizontals a list of non-overlapping horizontal rulings
|
||||
* @param verticals a list of non-overlapping vertical rulings
|
||||
* @return a Map of each found intersection point pointing to the two lines forming the intersection.
|
||||
*/
|
||||
public Map<Point2D, IntersectingRulings> findNaive(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);
|
||||
|
||||
for (Ruling horizontal : horizontals) {
|
||||
for (Ruling vertical : verticals) {
|
||||
Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontal, vertical);
|
||||
|
||||
if (intersectionPoint.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontal, vertical));
|
||||
}
|
||||
}
|
||||
log.debug("Finished building intersections naively in {} ms", System.currentTimeMillis() - start);
|
||||
|
||||
return intersections;
|
||||
}
|
||||
|
||||
|
||||
private static List<SweepStep> buildSweepTrajectory(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
List<SweepStep> sweepTrajectory = new LinkedList<>();
|
||||
|
||||
for (Ruling horizontalRuling : horizontals) {
|
||||
sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_ENTRY, horizontalRuling.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling));
|
||||
sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_EXIT, horizontalRuling.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling));
|
||||
}
|
||||
|
||||
for (Ruling verticalRuling : verticals) {
|
||||
sweepTrajectory.add(new SweepStep(SweepStep.Type.VERTICAL, verticalRuling.getLeft(), verticalRuling));
|
||||
}
|
||||
|
||||
Collections.sort(sweepTrajectory);
|
||||
|
||||
return sweepTrajectory;
|
||||
}
|
||||
|
||||
|
||||
public Optional<Point2D> findIntersectionPoint(Ruling horizontal, Ruling vertical) {
|
||||
|
||||
if (!horizontal.isHorizontal() || !vertical.isVertical()) {
|
||||
log.warn("lines must be orthogonal, vertical and horizontal");
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
Ruling expanded_horizontal = horizontal.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||
Ruling expanded_vertical = vertical.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||
|
||||
if (!expanded_horizontal.intersectsLine(expanded_vertical)) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(new Point2D.Float(vertical.getLeft(), horizontal.getTop()));
|
||||
}
|
||||
|
||||
|
||||
private class SweepStep implements Comparable<SweepStep> {
|
||||
|
||||
protected Type type;
|
||||
protected float y_position;
|
||||
protected Ruling ruling;
|
||||
|
||||
private enum Type {
|
||||
VERTICAL,
|
||||
HORIZONTAL_EXIT,
|
||||
HORIZONTAL_ENTRY
|
||||
}
|
||||
|
||||
|
||||
SweepStep(Type type, float y_position, Ruling ruling) {
|
||||
|
||||
this.type = type;
|
||||
this.y_position = y_position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(SweepStep other) {
|
||||
|
||||
int rv;
|
||||
if (DoubleComparisons.feq(y_position, other.y_position)) {
|
||||
if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_ENTRY) {
|
||||
rv = 1;
|
||||
} else if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_EXIT) {
|
||||
rv = -1;
|
||||
} else if (type == SweepStep.Type.HORIZONTAL_ENTRY && other.type == SweepStep.Type.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (type == SweepStep.Type.HORIZONTAL_EXIT && other.type == SweepStep.Type.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = Double.compare(y_position, other.y_position);
|
||||
}
|
||||
} else {
|
||||
return Double.compare(y_position, other.y_position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public record IntersectingRulings(Ruling horizontal, Ruling vertical) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,6 +4,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@ -11,7 +12,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
public class SpreadsheetFinder {
|
||||
|
||||
@ -19,15 +20,15 @@ public class SpreadsheetFinder {
|
||||
private static final float AREA_TOLERANCE = 0.001f;
|
||||
|
||||
|
||||
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||
public static List<Rectangle2D> findSpreadsheetsFromCells(List<Cell> cells) {
|
||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
List<Rectangle2D> rectangles = new ArrayList<>();
|
||||
Set<Point2D> pointSet = new HashSet<>();
|
||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||
|
||||
for (Rectangle cell : cells) {
|
||||
for (Point2D pt : cell.getPoints()) {
|
||||
for (Cell cell : cells) {
|
||||
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) {
|
||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||
pointSet.remove(pt);
|
||||
} else {
|
||||
@ -116,13 +117,22 @@ public class SpreadsheetFinder {
|
||||
|
||||
// do not add polygons with too many outer points as they are unlikely to be tables
|
||||
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
|
||||
rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
|
||||
rectangles.add(new Rectangle2D.Double(left - AREA_TOLERANCE, top - AREA_TOLERANCE, right - left + (2 * AREA_TOLERANCE), bottom - top + (2 * AREA_TOLERANCE)));
|
||||
}
|
||||
}
|
||||
return rectangles;
|
||||
}
|
||||
|
||||
|
||||
public static List<Point2D> getPoints(Rectangle2D rectangle2D) {
|
||||
|
||||
return List.of(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()));
|
||||
}
|
||||
|
||||
|
||||
private enum Direction {
|
||||
HORIZONTAL,
|
||||
VERTICAL
|
||||
|
||||
@ -0,0 +1,310 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutparsingVisualizations {
|
||||
|
||||
static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||
|
||||
static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||
static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||
|
||||
static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||
static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
|
||||
static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
|
||||
static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
|
||||
static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
|
||||
static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
||||
|
||||
static final Color CELLS_COLOR = new Color(31, 214, 27);
|
||||
|
||||
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
|
||||
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
|
||||
|
||||
static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
|
||||
new Color(255, 195, 0),
|
||||
new Color(76, 175, 80),
|
||||
new Color(33, 150, 243),
|
||||
new Color(155, 89, 182),
|
||||
new Color(233, 30, 99),
|
||||
new Color(0, 188, 212),
|
||||
new Color(121, 85, 72));
|
||||
|
||||
@Setter
|
||||
boolean active;
|
||||
|
||||
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
||||
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
||||
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
||||
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
||||
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
||||
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
|
||||
final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
|
||||
|
||||
|
||||
public Stream<Visualizations> streamAll() {
|
||||
|
||||
if (!active) {
|
||||
return Stream.empty();
|
||||
}
|
||||
return Stream.of(characters, //
|
||||
neighbours,//
|
||||
words, //
|
||||
lines, //
|
||||
zones, //
|
||||
rulings, //
|
||||
clean_rulings, //
|
||||
cells, //
|
||||
mainBody, //
|
||||
markedContent //
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPositionSequences.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public void addCleanRulingVisualization(CleanRulings cleanRulings, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.clean_rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(cleanRulings.buildAll()
|
||||
.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||
.toList());
|
||||
}
|
||||
|
||||
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(rulings
|
||||
.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
private Color decideOnRulingColor(Ruling ruling) {
|
||||
|
||||
return switch (ruling.getClassification()) {
|
||||
case TABLE_LINE -> TABLE_RULINGS_COLOR;
|
||||
case HEADER_SEPARATOR -> HEADER_RULING_COLOR;
|
||||
case FOOTER_SEPARATOR -> FOOTER_RULING_COLOR;
|
||||
case UNDERLINE -> UNDERLINE_RULING_COLOR;
|
||||
case STRIKETROUGH -> STRIKETROUGH_RULING_COLOR;
|
||||
default -> RULINGS_COLOR;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(cells.stream()
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public void addZoneVisualizations(List<Zone> zones, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(zones.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
||||
.toList());
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void addLineVisualizationsFromZones(List<Zone> zones, int page) {
|
||||
|
||||
addLineVisualizations(zones.stream()
|
||||
.map(Zone::getLines)
|
||||
.flatMap(Collection::stream)
|
||||
.toList(), page);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void addLineVisualizations(List<Line> lines, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(lines.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPageBlocks.stream()
|
||||
.map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public void addMainBodyVisualization(Rectangle rectangle, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, mainBody);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.add(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY(), rectangle.getWidth(), rectangle.getHeight()),
|
||||
MAIN_BODY_COLOR,
|
||||
1));
|
||||
}
|
||||
|
||||
|
||||
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
|
||||
|
||||
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents);
|
||||
AtomicInteger count = new AtomicInteger();
|
||||
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
|
||||
var bbox = markedContentPosition.textPositions()
|
||||
.stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
String type = markedContentPosition.formattedType() + " " + count.getAndIncrement();
|
||||
|
||||
float translationAmount = ((FONT.getStringWidth(type) / 100) + 6);
|
||||
// Pushes the string to the left of the box: calculate string width, divide by font units (1000), multiply with font size (10), add small offset (6).
|
||||
|
||||
visualizationsOnPage.getPlacedTexts()
|
||||
.add(PlacedText.textFacingUp(type, new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight()), 10, Color.BLACK, FONT));
|
||||
|
||||
visualizationsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox, MARKED_CONTENT_COLOR, 1));
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void addCharactersWithNeighbours(List<Zone> zones, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
|
||||
VisualizationsOnPage characterVisualizations = getOrCreateVisualizationsOnPage(page, characters);
|
||||
VisualizationsOnPage neighbourVisualizations = getOrCreateVisualizationsOnPage(page, neighbours);
|
||||
|
||||
AtomicInteger index = new AtomicInteger(0);
|
||||
zones.forEach(zone -> zone.getLines()
|
||||
.stream()
|
||||
.map(Line::getCharacters)
|
||||
.flatMap(Collection::stream)
|
||||
.forEach(character -> {
|
||||
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
||||
Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace();
|
||||
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
|
||||
character.getNeighbors()
|
||||
.forEach(neighbor -> {
|
||||
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace();
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
|
||||
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
|
||||
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));
|
||||
});
|
||||
}));
|
||||
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
|
||||
|
||||
if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) {
|
||||
return visualizations.getVisualizationsOnPages()
|
||||
.get(page - 1);
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
|
||||
visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage);
|
||||
return visualizationsOnPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -1,10 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
@ -20,28 +30,65 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Autowired
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n"))
|
||||
.forEach(log::info);
|
||||
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithFolder() {
|
||||
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
|
||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.sorted(Comparator.comparing(Path::getFileName))
|
||||
.peek(System.out::println)
|
||||
.toList();
|
||||
|
||||
System.out.printf("Found %d pdf files to process %n", pdfFiles.size());
|
||||
AtomicInteger count = new AtomicInteger(0);
|
||||
pdfFiles.stream()
|
||||
.peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
|
||||
.forEach(path -> runForFile(path.toFile().toString()));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd_RED_8747() {
|
||||
private void runForFile(String filePath) {
|
||||
|
||||
String fileName = Path.of(filePath).getFileName().toString();
|
||||
File file;
|
||||
if (filePath.startsWith("files")) { // from resources
|
||||
file = new ClassPathResource(filePath).getFile();
|
||||
} else { // absolute path
|
||||
file = new File(filePath);
|
||||
}
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
||||
prepareStorage(layoutParsingRequest, file);
|
||||
|
||||
prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
|
||||
Arrays.stream(finishedEvent.message().split("\n"))
|
||||
.forEach(log::info);
|
||||
|
||||
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
|
||||
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
||||
|
||||
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
|
||||
}
|
||||
|
||||
|
||||
@AfterEach
|
||||
public void cleanUpTmp() {
|
||||
|
||||
((FileSystemBackedStorageService) storageService).clearStorage();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -23,6 +23,10 @@ import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
@ -31,12 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
@ -54,17 +56,17 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE_OLD,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,118 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
class CleanRulingsTest {
|
||||
|
||||
@Test
|
||||
public void testLineBetween() {
|
||||
|
||||
List<Ruling> verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)));
|
||||
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5)));
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3);
|
||||
Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3);
|
||||
Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3);
|
||||
Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3);
|
||||
Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3);
|
||||
Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3);
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(a, a));
|
||||
assertFalse(cleanRulings.lineBetween(a, b));
|
||||
assertTrue(cleanRulings.lineBetween(a, c));
|
||||
assertTrue(cleanRulings.lineBetween(a, d));
|
||||
assertTrue(cleanRulings.lineBetween(a, e));
|
||||
assertTrue(cleanRulings.lineBetween(a, f));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testSingleLineInRange() {
|
||||
|
||||
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Float(0, 1), new Point2D.Float(100, 1)));
|
||||
List<Ruling> verticals = List.of(new Ruling(new Point2D.Float(1, 0), new Point2D.Float(1, 100)));
|
||||
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(1, 10).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(100, 101).size());
|
||||
assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(1 - 1e-5f, 1 + 1e-5f).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());
|
||||
|
||||
assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
|
||||
assertEquals(1, cleanRulings.getHorizontalsInYInterval(1, 10).size());
|
||||
assertEquals(0, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testLinesInRange() {
|
||||
|
||||
List<Ruling> horizontals = IntStream.range(0, 101).boxed()
|
||||
.map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
|
||||
.toList();
|
||||
List<Ruling> verticals = IntStream.range(0, 101).boxed()
|
||||
.map(x -> new Ruling(new Point2D.Float(x, 0), new Point2D.Float(x, 100)))
|
||||
.toList();
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
|
||||
assertEquals(10, cleanRulings.getVerticalsInXInterval(1, 10).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(100, 101).size());
|
||||
assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(-1e-5f, 1e-5f).size());
|
||||
assertEquals(1, cleanRulings.getVerticalsInXInterval(0, 0).size());
|
||||
assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());
|
||||
|
||||
assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
|
||||
assertEquals(10, cleanRulings.getHorizontalsInYInterval(1, 10).size());
|
||||
assertEquals(1, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testLinesInRangePerformance() {
|
||||
|
||||
List<Ruling> horizontals = IntStream.range(0, (int) 1e6).boxed()
|
||||
.map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
|
||||
.toList();
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, Collections.emptyList());
|
||||
|
||||
float startY = 29;
|
||||
float endY = 3000;
|
||||
long start = System.currentTimeMillis();
|
||||
var result = cleanRulings.getHorizontalsInYInterval(startY, endY);
|
||||
long time = System.currentTimeMillis() - start;
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
var result2 = cleanRulings.getHorizontals()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getY1() >= startY && ruling.getY1() <= endY)
|
||||
.toList();
|
||||
long time2 = System.currentTimeMillis() - start;
|
||||
|
||||
assertEquals(result, result2);
|
||||
assertTrue(time < time2);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,62 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class RulingTest {
|
||||
|
||||
@Test
|
||||
public void testLineBetween() {
|
||||
|
||||
List<Ruling> verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)), new Ruling(new Point2D.Double(5, 0), new Point2D.Double(5, 5)));
|
||||
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5)));
|
||||
|
||||
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||
|
||||
Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3);
|
||||
Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3);
|
||||
Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3);
|
||||
Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3);
|
||||
Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3);
|
||||
Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3);
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(a, a));
|
||||
assertFalse(cleanRulings.lineBetween(a, b));
|
||||
assertTrue(cleanRulings.lineBetween(a, c));
|
||||
assertTrue(cleanRulings.lineBetween(a, d));
|
||||
assertTrue(cleanRulings.lineBetween(a, e));
|
||||
assertTrue(cleanRulings.lineBetween(a, f));
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(d, d));
|
||||
assertTrue(cleanRulings.lineBetween(d, b));
|
||||
assertTrue(cleanRulings.lineBetween(d, c));
|
||||
assertTrue(cleanRulings.lineBetween(d, a));
|
||||
assertTrue(cleanRulings.lineBetween(d, e));
|
||||
assertTrue(cleanRulings.lineBetween(d, f));
|
||||
|
||||
assertFalse(cleanRulings.lineBetween(c, c));
|
||||
assertTrue(cleanRulings.lineBetween(c, b));
|
||||
assertTrue(cleanRulings.lineBetween(c, d));
|
||||
assertTrue(cleanRulings.lineBetween(c, a));
|
||||
assertTrue(cleanRulings.lineBetween(c, e));
|
||||
assertFalse(cleanRulings.lineBetween(c, f));
|
||||
|
||||
var all = List.of(a, b, c, d, e, f);
|
||||
for (Rectangle2D r1 : all) {
|
||||
for (Rectangle2D r2 : all) {
|
||||
assertEquals(cleanRulings.lineBetween(r1, r2), cleanRulings.lineBetween(r2, r1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -52,28 +52,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Autowired
|
||||
private ObjectMapper objectMapper;
|
||||
|
||||
@Autowired
|
||||
private RedactManagerClassificationService redactManagerClassificationService;
|
||||
|
||||
@Autowired
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file","document"));
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return classificationDocument;
|
||||
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", "document"));
|
||||
}
|
||||
|
||||
|
||||
@ -133,7 +121,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.get(0).getSequences().size()).isEqualTo(8);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).toString()).isEqualTo(textToSearch);
|
||||
.get(0).toString()).contains(textToSearch);
|
||||
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
|
||||
@ -143,6 +131,17 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableAndCellRotations() {
|
||||
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
@ -157,7 +156,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
|
||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||
// We only assert that the table border is not the page border.
|
||||
@ -179,12 +182,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
|
||||
System.out.println("object");
|
||||
}
|
||||
@ -196,11 +199,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
assertThat(table.getRows()
|
||||
.stream()
|
||||
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
|
||||
|
||||
@ -373,29 +387,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
validateTable(document, 0, 8, 8, 0, 0);
|
||||
|
||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList(
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
|
||||
validateTable(document, 0, values);
|
||||
|
||||
@ -785,6 +800,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testMergedEntities_Page26() throws IOException {
|
||||
|
||||
@ -802,7 +818,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int currentPage = 1;
|
||||
@ -823,9 +843,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size();
|
||||
int emptyCellsFoundFound = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList()
|
||||
.stream()
|
||||
.filter(f -> f.toString().isEmpty())
|
||||
.toList().size();
|
||||
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
row.forEach(r -> System.out.println(r.toString()));
|
||||
@ -840,11 +870,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
|
||||
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
|
||||
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
|
||||
List<Cell> rowsFlattened = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
List<String> valuesFlattened = values.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < valuesFlattened.size(); i++) {
|
||||
Cell cell = rowsFlattened.get(i);
|
||||
@ -857,7 +896,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -28,29 +28,30 @@ class InvisibleTableDetectionServiceTest {
|
||||
|
||||
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
|
||||
.stream()
|
||||
.map(PageInformationService::build)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
int pageNumber = 1;
|
||||
Rectangle2D tableBBox = pageContents.get(0)
|
||||
.getPageContents()
|
||||
.getSortedTextPositionSequences()
|
||||
.subList(45, 152)
|
||||
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152)
|
||||
.stream()
|
||||
.map(TextPositionSequence::getRectangle)
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
.map(TextPositionSequence::getBBox)
|
||||
.map(this::mirrorY)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
|
||||
.getPageContents()
|
||||
.getSortedTextPositionSequences()
|
||||
List<TextPositionSequence> textPositionSequences = pageContents.get(0).getPageContents().getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
|
||||
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
|
||||
.toList();
|
||||
|
||||
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
|
||||
|
||||
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
|
||||
PdfDraw.drawRectanglesPerPage(fileName,
|
||||
List.of(table.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.toList(), Collections.emptyList()),
|
||||
tmpFileName);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -29,9 +29,7 @@ class PageContentExtractorTest {
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::getRectangle)
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
||||
.map(TextPositionSequence::getBBoxInitialUserSpace)
|
||||
.map(List::of)
|
||||
.toList())
|
||||
.toList(), tmpFileName);
|
||||
|
||||
@ -52,8 +52,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
rectanglesPerPage.add(rects);
|
||||
}
|
||||
|
||||
@ -72,15 +72,16 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
cleanRulingsPerPage.add(rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
}
|
||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
|
||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList());
|
||||
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testTableExtraction() {
|
||||
|
||||
@ -97,6 +98,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
|
||||
@ -0,0 +1,84 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class RulingsClassifierTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void textRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||
|
||||
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.filter(word -> word.toString().equals("Underlined"))
|
||||
.allMatch(TextPositionSequence::isUnderline));
|
||||
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.filter(word -> word.toString().equals("Striketrough"))
|
||||
.allMatch(TextPositionSequence::isStrikethrough));
|
||||
|
||||
assertEquals(4,
|
||||
cleanRulings.buildAll()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH))
|
||||
.count());
|
||||
assertEquals(4,
|
||||
cleanRulings.buildAll()
|
||||
.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE))
|
||||
.count());
|
||||
assertEquals(0, cleanRulings.withoutTextRulings().buildAll().size());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void tableRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||
|
||||
assertEquals(30, cleanRulings.getHorizontals().size());
|
||||
assertEquals(30, cleanRulings.getTableLines().getHorizontals().size());
|
||||
|
||||
assertEquals(144, cleanRulings.getVerticals().size());
|
||||
assertEquals(144, cleanRulings.getTableLines().getVerticals().size());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,6 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
@ -102,29 +105,22 @@ public abstract class AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
}
|
||||
|
||||
|
||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
|
||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||
|
||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||
return LayoutParsingRequest.builder()
|
||||
.identifier(Map.of("fileId", "1337"))
|
||||
.identifier(identifier)
|
||||
.layoutParsingType(layoutParsingType)
|
||||
.originFileStorageId(ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||
.visualLayoutParsingFileId(Optional.of(VISUAL_LAYOUT_FILE))
|
||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
||||
.pageFileStorageId(PAGES_FILE_ID)
|
||||
.simplifiedTextStorageId(SIMPLIFIED_ID)
|
||||
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
|
||||
.originFileStorageId(fileName + ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
|
||||
.visualLayoutParsingFileId(Optional.empty())
|
||||
.structureFileStorageId(fileName + STRUCTURE_FILE_ID)
|
||||
.textBlockFileStorageId(fileName + TEXT_FILE_ID)
|
||||
.positionBlockFileStorageId(fileName + POSITION_FILE_ID)
|
||||
.pageFileStorageId(fileName + PAGES_FILE_ID)
|
||||
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
||||
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -148,10 +144,28 @@ public abstract class AbstractTest {
|
||||
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
|
||||
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
|
||||
|
||||
return prepareStorage(pdfFileResource.getInputStream(),
|
||||
cvServiceResponseFileResource.getInputStream(),
|
||||
imageInfoFileResource.getInputStream(),
|
||||
visualLayoutParsingResponseResource.getInputStream());
|
||||
return prepareStorage(Path.of(file).getFileName().toString(),
|
||||
pdfFileResource.getInputStream(),
|
||||
cvServiceResponseFileResource.getInputStream(),
|
||||
imageInfoFileResource.getInputStream(),
|
||||
visualLayoutParsingResponseResource.getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, File file) {
|
||||
|
||||
ClassPathResource cvServiceResponseFileResource = new ClassPathResource("cv_table_parsing_response/empty.json");
|
||||
ClassPathResource imageInfoFileResource = new ClassPathResource("image_service_response/empty.json");
|
||||
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource("visual_layout_parsing_response/empty.json");
|
||||
|
||||
try (var in = new FileInputStream(file)) {
|
||||
prepareStorage(layoutParsingRequest,
|
||||
in,
|
||||
cvServiceResponseFileResource.getInputStream(),
|
||||
imageInfoFileResource.getInputStream(),
|
||||
visualLayoutParsingResponseResource.getInputStream());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -162,12 +176,29 @@ public abstract class AbstractTest {
|
||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
return buildDefaultLayoutParsingRequest("test", LayoutParsingType.REDACT_MANAGER_OLD, true);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream,
|
||||
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest,
|
||||
InputStream fileStream,
|
||||
InputStream cvServiceResponseFileStream,
|
||||
InputStream imageInfoStream,
|
||||
InputStream visualLayoutParsingResponseFileStream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream);
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(String fileName,
|
||||
InputStream fileStream,
|
||||
InputStream cvServiceResponseFileStream,
|
||||
InputStream imageInfoStream,
|
||||
InputStream visualLayoutParsingResponseFileStream) {
|
||||
@ -177,7 +208,7 @@ public abstract class AbstractTest {
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
|
||||
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||
return buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_OLD, true);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
@ -28,11 +30,11 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
File fileResource = new ClassPathResource(filename).getFile();
|
||||
prepareStorage(filename);
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||
fileResource,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",filename));
|
||||
fileResource,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filename, "debug", "true"));
|
||||
}
|
||||
|
||||
|
||||
@ -46,13 +48,25 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
|
||||
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
|
||||
prepareStorage(layoutParsingRequest, new File(filename));
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
|
||||
layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||
new File(filename),
|
||||
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
layoutParsingRequest.identifier()));
|
||||
} else {
|
||||
prepareStorage(filename);
|
||||
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
|
||||
} else {
|
||||
prepareStorage(filename);
|
||||
}
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
|
||||
}
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -26,6 +26,26 @@ public class ContentStreams {
|
||||
|
||||
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
||||
|
||||
public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
|
||||
|
||||
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
|
||||
|
||||
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
|
||||
|
||||
public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
|
||||
|
||||
public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
|
||||
|
||||
public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
|
||||
|
||||
public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
|
||||
|
||||
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
|
||||
|
||||
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
|
||||
|
||||
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
|
||||
|
||||
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
|
||||
KNECON_VISUAL_PARSING,
|
||||
KNECON_OCR,
|
||||
@ -33,7 +53,17 @@ public class ContentStreams {
|
||||
KNECON_OCR_TEXT_DEBUG,
|
||||
OTHER,
|
||||
ESCAPE_START,
|
||||
ESCAPE_END);
|
||||
ESCAPE_END,
|
||||
RULINGS,
|
||||
CLEAN_RULINGS,
|
||||
WORDS,
|
||||
ZONES,
|
||||
LINES,
|
||||
MAIN_BODY,
|
||||
MARKED_CONTENT,
|
||||
NEIGHBOURS,
|
||||
CHARACTERS,
|
||||
CELLS);
|
||||
|
||||
public record Identifier(String name, COSName cosName, boolean optionalContent) {
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
@ -17,7 +18,8 @@ import lombok.experimental.FieldDefaults;
|
||||
public class Visualizations {
|
||||
|
||||
ContentStreams.Identifier layer;
|
||||
Map<Integer, VisualizationsOnPage> visualizationsOnPages;
|
||||
@Builder.Default
|
||||
Map<Integer, VisualizationsOnPage> visualizationsOnPages = new LinkedHashMap<>();
|
||||
boolean layerVisibilityDefaultValue;
|
||||
|
||||
}
|
||||
|
||||
@ -53,12 +53,6 @@ public class ViewerDocumentService {
|
||||
private final ObservationRegistry registry;
|
||||
|
||||
|
||||
public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) {
|
||||
|
||||
addVisualizationsOnPage(originFile, destinationFile, List.of(visualizations));
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
|
||||
@SneakyThrows
|
||||
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
|
||||
@ -70,9 +64,14 @@ public class ViewerDocumentService {
|
||||
|
||||
PDDocument pdDocument = openPDDocument(tmpFile.toFile());
|
||||
|
||||
enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList());
|
||||
enrichObservation(pdDocument,
|
||||
visualizations.stream()
|
||||
.map(Visualizations::getLayer)
|
||||
.toList());
|
||||
|
||||
Set<ContentStreams.Identifier> allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet());
|
||||
Set<ContentStreams.Identifier> allLayers = visualizations.stream()
|
||||
.map(Visualizations::getLayer)
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
|
||||
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
|
||||
|
||||
@ -229,11 +228,11 @@ public class ViewerDocumentService {
|
||||
Matrix textMatrix;
|
||||
if (placedText.textMatrix().isEmpty()) {
|
||||
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||
(float) textDeRotationMatrix.getShearX(),
|
||||
(float) textDeRotationMatrix.getShearY(),
|
||||
(float) textDeRotationMatrix.getScaleY(),
|
||||
(float) placedText.lineStart().getX(),
|
||||
(float) placedText.lineStart().getY());
|
||||
(float) textDeRotationMatrix.getShearX(),
|
||||
(float) textDeRotationMatrix.getShearY(),
|
||||
(float) textDeRotationMatrix.getScaleY(),
|
||||
(float) placedText.lineStart().getX(),
|
||||
(float) placedText.lineStart().getY());
|
||||
} else {
|
||||
textMatrix = placedText.textMatrix().get();
|
||||
}
|
||||
|
||||
@ -12,4 +12,4 @@ commit_hash=$(git rev-parse --short=5 HEAD)
|
||||
buildName="${USER}-${branch}-${commit_hash}"
|
||||
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
||||
echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"
|
||||
echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user