diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
index 86e3741..98cde15 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@@ -3,32 +3,50 @@ package com.knecon.fforesight.service.layoutparser.processor;
 import static java.lang.String.format;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.springframework.stereotype.Service;
 
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
-import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
+import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
+import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
+import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
+import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
+import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
-import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
-import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
-import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
-import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
-import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
-import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
-import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
+import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
+import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
 import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
 import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
+import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
+import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
+import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
+import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
+import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
 import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
 import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
 import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
+import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
+import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
+import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
+import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
 
 import lombok.RequiredArgsConstructor;
+import lombok.SneakyThrows;
 import lombok.extern.slf4j.Slf4j;
 
 @Slf4j
@@ -39,14 +57,18 @@ public class LayoutParsingPipeline {
     private final ImageServiceResponseAdapter imageServiceResponseAdapter;
     private final CvTableParsingAdapter cvTableParsingAdapter;
     private final LayoutParsingStorageService layoutParsingStorageService;
-    private final PdfParsingService pdfParsingService;
     private final SectionsBuilderService sectionsBuilderService;
     private final SectionGridCreatorService sectionGridCreatorService;
     private final TaasClassificationService taasClassificationService;
     private final RedactManagerClassificationService redactManagerClassificationService;
     private final DocuMineClassificationService docuMineClassificationService;
     private final SimplifiedSectionTextService simplifiedSectionTextService;
-
+    private final BodyTextFrameService bodyTextFrameService;
+    private final RulingCleaningService rulingCleaningService;
+    private final TableExtractionService tableExtractionService;
+    private final TaasBlockificationService taasBlockificationService;
+    private final DocuMineBlockificationService docuMineBlockificationService;
+    private final RedactManagerBlockificationService redactManagerBlockificationService;
 
 
     public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@@ -63,7 +85,9 @@ public class LayoutParsingPipeline {
             tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
         }
 
-        Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
+        ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
+        Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
+
         int numberOfPages = originDocument.getNumberOfPages();
 
         layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
@@ -88,15 +112,72 @@ public class LayoutParsingPipeline {
     }
 
 
-    public Document parseLayout(LayoutParsingType layoutParsingType,
+    @SneakyThrows
+    public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
             PDDocument originDocument,
             ImageServiceResponse imageServiceResponse,
             TableServiceResponse tableServiceResponse) {
 
-        ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
-                originDocument,
-                cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
-                imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
+        Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
+        Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
+
+        ClassificationDocument classificationDocument = new ClassificationDocument();
+        List<ClassificationPage> classificationPages = new ArrayList<>();
+        classificationDocument.setPages(classificationPages); // attach the (still growing) list exactly once, also when the PDF has no pages
+
+        originDocument.setAllSecurityToBeRemoved(true);
+        int pageCount = originDocument.getNumberOfPages();
+
+        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { // page numbers are 1-based, PDPage indices 0-based
+            PDFLinesTextStripper stripper = new PDFLinesTextStripper();
+            PDPage pdPage = originDocument.getPage(pageNumber - 1);
+            stripper.setPageNumber(pageNumber);
+            stripper.setStartPage(pageNumber);
+            stripper.setEndPage(pageNumber);
+            stripper.setPdpage(pdPage);
+            if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
+                stripper.setSortByPosition(true);
+            }
+            stripper.getText(originDocument);
+
+            PDRectangle pdr = pdPage.getMediaBox();
+
+            int rotation = pdPage.getRotation();
+            boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
+
+            PDRectangle cropbox = pdPage.getCropBox();
+            CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
+                    stripper.getRulings(),
+                    stripper.getMinCharWidth(),
+                    stripper.getMaxCharHeight());
+
+            ClassificationPage classificationPage = switch (layoutParsingType) {
+                case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
+                case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
+                case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
+            };
+            classificationPage.setCleanRulings(cleanRulings);
+            classificationPage.setRotation(rotation);
+            classificationPage.setLandscape(isLandscape);
+            classificationPage.setPageNumber(pageNumber);
+            classificationPage.setPageWidth(cropbox.getWidth());
+            classificationPage.setPageHeight(cropbox.getHeight());
+
+            // Whether an image is OCR must be determined before text blocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
+            if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
+                classificationPage.setImages(pdfImages.get(pageNumber));
+                imageServiceResponseAdapter.findOcr(classificationPage);
+            }
+
+            tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
+            buildPageStatistics(classificationPage);
+            increaseDocumentStatistics(classificationPage, classificationDocument);
+
+            classificationPages.add(classificationPage);
+        }
+
+
+        bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
 
         switch (layoutParsingType) {
             case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
@@ -107,40 +188,39 @@ public class LayoutParsingPipeline {
         sectionsBuilderService.buildSections(classificationDocument);
         sectionsBuilderService.addImagesToSections(classificationDocument);
 
-        return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
+        return classificationDocument;
     }
 
+    private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
 
-    public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType,
-            PDDocument originDocument,
-            ImageServiceResponse imageServiceResponse,
-            TableServiceResponse tableServiceResponse) {
-
-        long start = System.currentTimeMillis();
-
-        ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
-                originDocument,
-                cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
-                imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
-
-        System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
-
-        start = System.currentTimeMillis();
-        switch (layoutParsingType) {
-            case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
-            case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
-            case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
+        if (!classificationPage.isLandscape()) {
+            document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
         }
-        System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
-
-        start = System.currentTimeMillis();
-        sectionsBuilderService.buildSections(classificationDocument);
-        System.out.printf(", sections built %d ms", System.currentTimeMillis() - start);
-
-        start = System.currentTimeMillis();
-        Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
-        System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start);
-        return document;
+        document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
+        document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
+        document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
     }
+
+    private void buildPageStatistics(ClassificationPage classificationPage) {
+
+        // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
+        for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
+            if (textBlock instanceof TextPageBlock) {
+                if (((TextPageBlock) textBlock).getSequences() == null) {
+                    continue;
+                }
+                for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
+                    classificationPage.getTextHeightCounter().add(word.getTextHeight());
+                    classificationPage.getFontCounter().add(word.getFont());
+                    classificationPage.getFontSizeCounter().add(word.getFontSize());
+                    classificationPage.getFontStyleCounter().add(word.getFontStyle());
+                }
+            }
+        }
+
+    }
+
+
+
 
 }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java
index 21796c8..cc0a420 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java
@@ -5,6 +5,7 @@ import java.util.List;
 
 import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
 import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
+import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
 
 import lombok.Data;
@@ -35,4 +36,6 @@ public class ClassificationPage {
     private float pageWidth;
     private float pageHeight;
 
+    private CleanRulings cleanRulings;
+
 }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java
index 21d1e67..f9d00ef 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java
@@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
 import java.awt.geom.Rectangle2D;
 import java.util.List;
 
+import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
 
 import lombok.AllArgsConstructor;
@@ -17,5 +18,5 @@ public class PageContents {
     List<TextPositionSequence> sortedTextPositionSequences;
     Rectangle2D cropBox;
     Rectangle2D mediaBox;
-
+    List<Ruling> rulings;
 }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java
index 53f6bde..6acf5e1 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java
@@ -28,6 +28,7 @@ public class Ruling extends Line2D.Float {
         super(p1, p2);
     }
 
+
     public Ruling straightenVertical() {
 
         double y1 = Math.min(getY1(), getY2());
@@ -36,6 +37,7 @@ public class Ruling extends Line2D.Float {
         return new Ruling(new Point2D.Double(x, y1), new Point2D.Double(x, y2));
     }
 
+
     public Ruling straightenHorizontal() {
 
         double x1 = Math.min(getX1(), getX2());
@@ -444,6 +446,16 @@ public class Ruling extends Line2D.Float {
     }
 
 
+    public boolean almostMatches(Ruling ruling) {
+
+        final float tolerance = 1; // maximum allowed deviation per coordinate for two rulings to count as the same line
+        return Math.abs(ruling.getX1() - x1) < tolerance &&//
+                Math.abs(ruling.getY1() - y1) < tolerance &&//
+                Math.abs(ruling.getX2() - x2) < tolerance &&//
+                Math.abs(ruling.getY2() - y2) < tolerance;
+    }
+
+
     private enum SOType {
         VERTICAL,
         HRIGHT,
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java
index 49c801c..436df0c 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java
@@ -8,6 +8,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
 import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
@@ -20,6 +21,65 @@ public class BodyTextFrameService {
 
 
 
+    public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
+
+        Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
+        Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
+        for (ClassificationPage page : classificationDocument.getPages()) {
+            setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
+        }
+    }
+
+
+/* NOTE(review): disabled draft kept for reference; it uses RULING_THRESHOLD_FACTOR, which is not defined in the visible code - confirm before re-enabling.
+    private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
+
+        Map<ClassificationPage, List<Ruling>> potentialHeaderRulingsPerPage = new HashMap<>();
+        Map<ClassificationPage, List<Ruling>> potentialFooterRulingsPerPage = new HashMap<>();
+
+        for (var page : pages) {
+            potentialHeaderRulingsPerPage.put(page,
+                    page.getCleanRulings()
+                            .getHorizontal()
+                            .stream()
+                            .filter(ruling -> ruling.getY1() > page.getPageHeight() * 0.8)
+                            .filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
+                            .toList());
+            potentialFooterRulingsPerPage.put(page,
+                    page.getCleanRulings()
+                            .getHorizontal()
+                            .stream()
+                            .filter(ruling -> ruling.getY1() < page.getPageHeight() * 0.2)
+                            .filter(ruling -> ruling.getWidth() > 0.6 * page.getPageWidth())
+                            .toList());
+        }
+
+        Optional<Ruling> headerRuling = potentialHeaderRulingsPerPage.values()
+                .stream()
+                .flatMap(Collection::stream)
+                .filter(ruling -> potentialHeaderRulingsPerPage.values()
+                        .stream()
+                        .filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
+                        .count() > pages.size() * RULING_THRESHOLD_FACTOR)
+                .min(Comparator.comparingDouble(Ruling::getY1));
+
+        Optional<Ruling> footerRuling = potentialFooterRulingsPerPage.values()
+                .stream()
+                .flatMap(Collection::stream)
+                .filter(ruling -> potentialFooterRulingsPerPage.values()
+                        .stream()
+                        .filter(rulingsPerPage -> rulingsPerPage.stream().anyMatch(ruling::almostMatches))
+                        .count() > pages.size() * RULING_THRESHOLD_FACTOR)
+                .max(Comparator.comparingDouble(Ruling::getY1));
+
+        double maxY = headerRuling.isPresent() ? headerRuling.get().y1 : pages.stream().mapToDouble(ClassificationPage::getPageHeight).max().orElse(Double.MAX_VALUE);
+        double minY = footerRuling.map(ruling -> ruling.y1).orElse(0F);
+        double maxX = pages.stream().mapToDouble(ClassificationPage::getPageWidth).max().orElse(Double.MAX_VALUE);
+
+        return new Rectangle(new Point((float) maxX, (float) maxY), (float) 0, (float) minY, -1);
+    }
+*/
+
 
     /**
      * Adjusts and sets the body text frame to a page.
@@ -34,7 +94,7 @@ public class BodyTextFrameService {
      * @param bodyTextFrame frame that contains the main text on portrait pages
      * @param landscapeBodyTextFrame frame that contains the main text on landscape pages
      */
-    public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
+    private void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {
 
         Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;
 
@@ -69,7 +129,10 @@ public class BodyTextFrameService {
      * @param landscape Calculate for landscape or portrait
      * @return Rectangle of the text frame
      */
-    public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape, LayoutParsingType layoutParsingType) {
+    private Rectangle calculateBodyTextFrame(List<ClassificationPage> pages,
+            FloatFrequencyCounter documentFontSizeCounter,
+            boolean landscape,
+            LayoutParsingType layoutParsingType) {
 
         float approximateHeaderLineCount;
         if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
@@ -95,8 +158,8 @@ public class BodyTextFrameService {
                 }
 
                 float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
-                if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
-                        || !layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount){
+                if (approxLineCount < approximateHeaderLineCount // short block: skipped always, except for DOCUMINE where it must also reach into the top tenth of the page
+                        && (!layoutParsingType.equals(LayoutParsingType.DOCUMINE) || textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10))) {
                     continue;
                 }
 
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java
similarity index 93%
rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java
rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java
index 5b78fd9..82a1ad3 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java
@@ -21,9 +21,9 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTrans
 import lombok.experimental.UtilityClass;
 
 @UtilityClass
-public class TextPositionSequenceSorter {
+public class PageContentExtractor {
 
-    public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
+    public List<PageContents> getSortedPageContents(String filename) throws IOException {
 
         List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
         try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
@@ -49,7 +49,8 @@ public class TextPositionSequenceSorter {
 
                     textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
                             RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
-                            RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
+                            RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
+                            stripper.getRulings()));
                 }
             }
         }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java
deleted file mode 100644
index d2fd738..0000000
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java
+++ /dev/null
@@ -1,154 +0,0 @@
-package com.knecon.fforesight.service.layoutparser.processor.services;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-import org.springframework.stereotype.Service;
-
-import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
-import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
-import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
-import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
-import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
-import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
-import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
-import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
-import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
-import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
-import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
-import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
-import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
-
-import lombok.RequiredArgsConstructor;
-import lombok.SneakyThrows;
-import lombok.extern.slf4j.Slf4j;
-
-@Slf4j
-@Service
-@RequiredArgsConstructor
-public class PdfParsingService {
-
-    private final RulingCleaningService rulingCleaningService;
-    private final TableExtractionService tableExtractionService;
-    private final ImageServiceResponseAdapter imageServiceResponseAdapter;
-    private final TaasBlockificationService taasBlockificationService;
-    private final DocuMineBlockificationService docuMineBlockificationService;
-    private final RedactManagerBlockificationService redactManagerBlockificationService;
-
-
-    public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType,
-            PDDocument originDocument,
-            Map<Integer, List<TableCells>> pdfTableCells,
-            Map<Integer, List<ClassifiedImage>> pdfImages) {
-
-        ClassificationDocument document = new ClassificationDocument();
-        List<ClassificationPage> classificationPages = new ArrayList<>();
-
-        originDocument.setAllSecurityToBeRemoved(true);
-        long pageCount = originDocument.getNumberOfPages();
-
-        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
-            parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
-        }
-
-        document.setPages(classificationPages);
-
-        return document;
-    }
-
-
-    @SneakyThrows
-    private void parsePage(LayoutParsingType layoutParsingType,
-            Map<Integer, List<ClassifiedImage>> pdfImages,
-            PDDocument pdDocument,
-            Map<Integer, List<TableCells>> pdfTableCells,
-            ClassificationDocument document,
-            List<ClassificationPage> classificationPages,
-            int pageNumber) {
-
-        PDFLinesTextStripper stripper = new PDFLinesTextStripper();
-        PDPage pdPage = pdDocument.getPage(pageNumber - 1);
-        stripper.setPageNumber(pageNumber);
-        stripper.setStartPage(pageNumber);
-        stripper.setEndPage(pageNumber);
-        stripper.setPdpage(pdPage);
-        if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){
-            stripper.setSortByPosition(true);
-        }
-        stripper.getText(pdDocument);
-
-        PDRectangle pdr = pdPage.getMediaBox();
-
-        int rotation = pdPage.getRotation();
-        boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
-
-        PDRectangle cropbox = pdPage.getCropBox();
-        CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
-                stripper.getRulings(),
-                stripper.getMinCharWidth(),
-                stripper.getMaxCharHeight());
-
-        ClassificationPage classificationPage = switch (layoutParsingType) {
-            case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
-            case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
-            case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
-        };
-
-        classificationPage.setRotation(rotation);
-        classificationPage.setLandscape(isLandscape);
-        classificationPage.setPageNumber(pageNumber);
-        classificationPage.setPageWidth(cropbox.getWidth());
-        classificationPage.setPageHeight(cropbox.getHeight());
-
-        // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
-        if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
-            classificationPage.setImages(pdfImages.get(pageNumber));
-            imageServiceResponseAdapter.findOcr(classificationPage);
-        }
-
-        tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
-        buildPageStatistics(classificationPage);
-        increaseDocumentStatistics(classificationPage, document);
-
-        classificationPages.add(classificationPage);
-    }
-
-
-    private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
-
-        if (!classificationPage.isLandscape()) {
-            document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
-        }
-        document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
-        document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
-        document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
-    }
-
-
-    private void buildPageStatistics(ClassificationPage classificationPage) {
-
-        // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
-        for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
-            if (textBlock instanceof TextPageBlock) {
-                if (((TextPageBlock) textBlock).getSequences() == null) {
-                    continue;
-                }
-                for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
-                    classificationPage.getTextHeightCounter().add(word.getTextHeight());
-                    classificationPage.getFontCounter().add(word.getFont());
-                    classificationPage.getFontSizeCounter().add(word.getFontSize());
-                    classificationPage.getFontStyleCounter().add(word.getFontStyle());
-                }
-            }
-        }
-
-    }
-
-}
-
-
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
index 691c60c..f26f2d2 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
@@ -7,14 +7,11 @@ import java.util.regex.Pattern;
 
 import org.springframework.stereotype.Service;
 
-import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
-import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
-import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
 import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
 
 import lombok.RequiredArgsConstructor;
@@ -25,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
 @RequiredArgsConstructor
 public class DocuMineClassificationService {
 
-    private final BodyTextFrameService bodyTextFrameService;
     private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
     private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
     private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
@@ -33,14 +29,11 @@ public class DocuMineClassificationService {
 
     public void classifyDocument(ClassificationDocument document) {
 
-        Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.DOCUMINE);
-        Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.DOCUMINE);
         List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
 
         log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
 
         for (ClassificationPage page : document.getPages()) {
-            bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
             classifyPage(page, document, headlineFontSizes);
         }
     }
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 1ffa2d1..2be8f03 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -5,14 +5,11 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; -import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -23,19 +20,14 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RedactManagerClassificationService { - private final BodyTextFrameService bodyTextFrameService; - public void classifyDocument(ClassificationDocument document) { - Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.REDACT_MANAGER); - Rectangle landscapeBodyTextFrame = 
bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.REDACT_MANAGER); List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); for (ClassificationPage page : document.getPages()) { - bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); classifyPage(page, document, headlineFontSizes); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java index 5c3c725..c177efd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java @@ -5,8 +5,6 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; -import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; @@ -28,14 +26,13 @@ public class TaasClassificationService { public void classifyDocument(ClassificationDocument document) { - Rectangle bodyTextFrame = 
bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.TAAS); - Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.TAAS); + List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); for (ClassificationPage page : document.getPages()) { - bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + classifyPage(page, document, headlineFontSizes); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java index ccd76fc..b950340 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java @@ -152,7 +152,7 @@ public class PdfVisualisationUtility { @SneakyThrows - public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List line2DS, Options options) { + public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List line2DS, Options options) { var pdPage = pdDocument.getPage(pageNumber - 1); var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 2f0b4bf..cfa17e8 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -22,10 +22,11 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -46,7 +47,7 @@ public class BdrJsonBuildTest extends AbstractTest { try (InputStream inputStream = new FileInputStream(filename)) { try (PDDocument pdDocument = Loader.loadPDF(inputStream)) { - return 
layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse()); + return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse())); } } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index e4ae1f2..868034d 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.tenantcommons.TenantsClient; import lombok.AllArgsConstructor; @@ -94,10 +95,10 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); - Document documentGraph = 
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Loader.loadPDF(pdfFileResource.getInputStream()), new ImageServiceResponse(), - new TableServiceResponse()); + new TableServiceResponse())); var foundHeadlines = documentGraph.streamAllSubNodes() .map(SemanticNode::getHeadline) diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java index 3844c3d..06d3861 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java @@ -15,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; import lombok.SneakyThrows; @@ -29,7 +30,7 @@ public class BuildDocumentGraphTest extends AbstractTest { @Disabled public void buildMetolachlor() { - Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); assertEquals(221, 
documentGraph.getPages().size()); assertEquals(220, documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count()); assertEquals(0, documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count()); @@ -47,10 +48,10 @@ public class BuildDocumentGraphTest extends AbstractTest { ClassPathResource fileResource = new ClassPathResource(filename); try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) { - return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), - new TableServiceResponse()); + new TableServiceResponse())); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index c7be92c..96e6402 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; 
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import lombok.SneakyThrows; @@ -54,10 +55,10 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Loader.loadPDF(new FileInputStream(filename.toFile())), new ImageServiceResponse(), - new TableServiceResponse()); + new TableServiceResponse())); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); ObjectMapper mapper = ObjectMapperFactory.create(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index a02a95b..48e4002 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.awt.Color; import java.io.File; import java.io.IOException; +import java.nio.file.Path; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; @@ -23,7 +24,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { @Disabled public void visualizeMetolachlor() { - String filename = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"; + String filename = 
"files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; visualizePdf(filename); } @@ -33,7 +34,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { @Disabled public void visualizeRotatedTestDocument() { - String filename = "files/211"; + String filename = "files/new/RotateTestFile.pdf"; visualizePdf(filename); } @@ -43,7 +44,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { @Disabled public void visualizeCraftedDocument() { - String filename = "files/crafted document"; + String filename = "files/crafted document.pdf"; visualizePdf(filename); } @@ -60,8 +61,8 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { private static void visualizeSemanticNodes(String filename, Document documentGraph, TextBlock textBlock) throws IOException { - File tmpFile = File.createTempFile(filename, "SEMANTIC_NODES_BBOX.pdf"); - ClassPathResource fileResource = new ClassPathResource(filename + ".pdf"); + File tmpFile = new File("/tmp/" + Path.of(filename).getFileName().toString() + "_SEMANTIC_NODES_BBOX.pdf"); + ClassPathResource fileResource = new ClassPathResource(filename); try (var fileStream = fileResource.getInputStream();// PDDocument pdDocument = Loader.loadPDF(fileStream)// diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 854f142..93fde51 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -21,16 +21,16 @@ import 
org.springframework.core.io.ClassPathResource; import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; @@ -40,7 +40,7 @@ import lombok.SneakyThrows; 
public class PdfSegmentationServiceTest extends AbstractTest { @Autowired - private PdfParsingService pdfParsingService; + private LayoutParsingPipeline layoutParsingPipeline; @Autowired private ObjectMapper objectMapper; @@ -57,12 +57,13 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Autowired private SectionsBuilderService sectionsBuilderService; + public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { - ClassificationDocument classificationDocument = pdfParsingService.parseDocument(LayoutParsingType.REDACT_MANAGER, + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, originDocument, - cvTableParsingAdapter.buildCvParsedTablesPerPage(new TableServiceResponse()), - imageServiceResponseAdapter.buildClassifiedImagesPerPage(new ImageServiceResponse())); + new ImageServiceResponse(), + new TableServiceResponse()); redactManagerClassificationService.classifyDocument(classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java index 4add2a3..db26f8a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java @@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.DividingCol import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService; import 
com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService; import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; -import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; @@ -30,7 +30,7 @@ class GapAcrossLinesDetectionServiceTest { var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); - List pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList(); + List pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList(); List> columnsPerPage = new LinkedList<>(); System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); System.out.println("start column detection"); @@ -56,7 +56,7 @@ class GapAcrossLinesDetectionServiceTest { var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); - List sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename); + List sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename); List> columnsPerPage = new LinkedList<>(); System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); System.out.println("start column detection"); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java index 795585c..7eb50e2 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java @@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.PageInformatio import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService; import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; -import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -28,7 +28,7 @@ class InvisibleTableDetectionServiceTest { String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString(); - List pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList()); + List pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList()); int pageNumber = 1; Rectangle2D tableBBox = pageContents.get(0) diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java index b9f04bc..84c3ba2 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java @@ -7,7 +7,7 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; -import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import lombok.SneakyThrows; @@ -20,7 +20,7 @@ class MainBodyTextFrameExtractionServiceTest { String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString(); - List sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); + List sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java similarity index 89% rename from 
layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java rename to layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index a4b2442..1ea53f3 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -9,12 +9,12 @@ import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; -class TextPositionSequenceSorterTest { +class PageContentExtractorTest { @Test @Disabled @@ -24,7 +24,7 @@ class TextPositionSequenceSorterTest { String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); - List textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); + List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, textPositionPerPage.stream() diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java index 6a5582f..15d8243 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java @@ -8,7 +8,7 @@ import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; -import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; @@ -24,7 +24,7 @@ class PageInformationServiceTest { var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); - List pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList(); + List pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList(); System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); System.out.println("start gap detection"); start = System.currentTimeMillis(); @@ -47,7 +47,7 @@ class PageInformationServiceTest { var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; 
System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); - List pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList(); + List pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList(); System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); System.out.println("start gap detection"); start = System.currentTimeMillis(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java new file mode 100644 index 0000000..0fdcf29 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -0,0 +1,38 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; +import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +public class RulingCleaningServiceTest { + + @Test + @Disabled + @SneakyThrows + public void textRulingExtraction() { + + String fileName = 
"files/BASF/2013-1110704.pdf"; + String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; + List<PageContents> pageContents = PageContentExtractor.getSortedPageContents("files/BASF/2013-1110704.pdf"); + PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); + + RulingCleaningService rulingCleaningService = new RulingCleaningService(); + List<CleanRulings> cleanRulingsPerPage = new LinkedList<>(); + for (PageContents pageContent : pageContents) { + cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20)); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index 6158031..876c98c 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -24,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; @@ -206,6 +207,21 @@ public class PdfDraw { } + @SneakyThrows +
public static void drawLinesPerPage(String filename, List<List<Ruling>> linesPerPage, String tmpFileName) { + + try (InputStream inputStream = new ClassPathResource(filename).getInputStream();// + PDDocument pdDocument = Loader.loadPDF(inputStream);// + var out = new FileOutputStream(tmpFileName)// + ) { + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + PdfVisualisationUtility.drawLine2DList(pdDocument, pageNumber, linesPerPage.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build()); + } + pdDocument.save(out); + } + } + + @Builder @AllArgsConstructor @Getter diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/BASF/2013-1110704.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/BASF/2013-1110704.pdf new file mode 100644 index 0000000..18dc29f Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/BASF/2013-1110704.pdf differ