diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 5f713ab..c04ccf7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -261,7 +261,7 @@ public class LayoutParsingPipeline { boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); PDRectangle cropbox = pdPage.getCropBox(); - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); + CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); @@ -293,10 +293,10 @@ public class LayoutParsingPipeline { classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage); // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. - classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents(), pdPage)); + classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. - if (pdfImages != null && pdfImages.containsKey(pageNumber)) { + if (pdfImages.containsKey(pageNumber)) { classificationPage.setImages(pdfImages.get(pageNumber)); imageServiceResponseAdapter.findOcr(classificationPage); } @@ -370,11 +370,11 @@ public class LayoutParsingPipeline { } - private Map> convertMarkedContents(List pdMarkedContents, PDPage pdPage) { + private Map> convertMarkedContents(List pdMarkedContents) { Map> markedContentBboxes = new HashMap<>(); - markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER, pdPage)); - markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER, pdPage)); + markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER)); + markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER)); return markedContentBboxes; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java index 5b1a61d..4517029 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java @@ -9,6 +9,7 @@ import java.util.Map; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; @@ -54,11 +55,12 @@ public class ImageServiceResponseAdapter { classificationPage.getImages().forEach(image -> { if (image.getImageType().equals(ImageType.OTHER)) { - classificationPage.getTextBlocks().forEach(textblock -> { - if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) { + for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) { + if (image.getPosition().contains(textblock.getBBox())) { image.setImageType(ImageType.OCR); + return; } - }); + } } }); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index 675098b..353abad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -31,7 +31,7 @@ public class RulingCleaningService { private static final float THRESHOLD_Y_HORIZONTAL = 3; - public CleanRulings getCleanRulings(List tableCells, List rulings) { + public CleanRulings deduplicateAndStraightenRulings(List tableCells, List rulings) { Rulings verticalAndHorizontalRulingLines; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java index 72579b3..26987c7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java @@ -12,9 +12,9 @@ import lombok.experimental.UtilityClass; @UtilityClass public class TextRulingsClassifier { - private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines - private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines - private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width, subtracted from word width + private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines. + private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines. + private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline. public static void classifyUnderlinedAndStrikethroughText(List words, CleanRulings cleanRulings) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 3e67cfb..618e20e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -48,7 +48,7 @@ public class DocstrumBlockificationService { visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage()); } - var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontals(), usedRulings.getVerticals(), xyOrder, usedRulings); + var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings); if (xyOrder) { sortPageBlocksXThenY(pageBlocks); @@ -77,10 +77,7 @@ public class DocstrumBlockificationService { } - private List toAbstractPageBlocks(List zones, - List horizontalRulings, - List verticalRulings, - boolean xyOrder, + private List toAbstractPageBlocks(List zones, boolean xyOrder, CleanRulings usedRulings) { List abstractPageBlocks = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index d622fc8..d0ee204 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -5,7 +5,6 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -49,7 +49,6 @@ public class DocuMineClassificationService { } } - private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { log.debug("headlineFontSizes: {}", headlineFontSizes); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index 86d79d7..b1bd1ef 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -8,7 +8,6 @@ import java.util.Map; import java.util.stream.Collectors; import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.apache.pdfbox.text.TextPosition; @@ -24,7 +23,7 @@ public class MarkedContentUtils { public static final String FOOTER = "Footer"; - public List getMarkedContentBboxPerLine(List markedContents, String subtype, PDPage pdPage) { + public List getMarkedContentBboxPerLine(List markedContents, String subtype) { if (markedContents == null) { return Collections.emptyList(); @@ -54,7 +53,7 @@ public class MarkedContentUtils { } - public List getMarkedContentPositions(List markedContents, PDPage pdPage) { + public List getMarkedContentPositions(List markedContents) { if (markedContents == null) { return Collections.emptyList(); @@ -62,7 +61,7 @@ public class MarkedContentUtils { return markedContents.stream() .filter(m -> !m.getContents().isEmpty()) - .map(markedContent -> MarkedContentPosition.fromPDMarkedContent(markedContent, pdPage)) + .map(MarkedContentPosition::fromPDMarkedContent) .toList(); } @@ -77,20 +76,20 @@ public class MarkedContentUtils { public record MarkedContentPosition(String type, String subType, List textPositions) { - public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent, PDPage pdPage) { + public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent) { - return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents(), pdPage)); + return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents())); } - private static List parseTextPositions(List contents, PDPage pdPage) { + private static List parseTextPositions(List contents) { return contents.stream() .filter(content -> content instanceof TextPosition) .map(content -> (TextPosition) content) .filter(content -> !content.getUnicode().equals(" ")) - .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true).getRectangle()) - .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))) + .map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true)) + .map(TextPositionSequence::getBoundingBox) .collect(Collectors.toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java index 98fe586..f50f753 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java @@ -234,7 +234,7 @@ public class LayoutparsingVisualizations { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent); - List markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage); + List markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents); markedContentBBoxMapBySubType.forEach(markedContentPosition -> { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 53bb180..f69a399 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Dokumente/TestFiles/tables with striketrough text.pdf"; + String filePath = "/home/kschuettler/Dokumente/TestFiles/large number of prod files/101 S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; runForFile(filePath); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 93da7e9..467b6c0 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -52,7 +52,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { RulingCleaningService rulingCleaningService = new RulingCleaningService(); List> rectanglesPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()); + CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); List rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); rectanglesPerPage.add(rects); } @@ -72,7 +72,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { RulingCleaningService rulingCleaningService = new RulingCleaningService(); List cleanRulingsPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { - cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings())); + cleanRulingsPerPage.add(rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings())); } var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList()); PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);