diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index dfa0537..9372075 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -260,8 +260,10 @@ public class LayoutParsingPipeline { case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, true, cleanRulings); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(layoutParsingType, stripper.getTextPositionSequences(), emptyTableCells, false, cleanRulings); }; classificationPage.setCleanRulings(cleanRulings); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index c3666a6..12b00d3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -5,13 +5,12 @@ import static java.util.stream.Collectors.toSet; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashSet; import java.util.List; import java.util.ListIterator; -import java.util.Set; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; @@ -19,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; @@ -38,7 +38,11 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage blockify(List textPositions, List cells, boolean xyOrder) { + public ClassificationPage blockify(LayoutParsingType layoutParsingType, + List textPositions, + List cells, + boolean xyOrder, + CleanRulings cleanRulings) { // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. List usedHorizonalRulings = new ArrayList<>(); @@ -52,11 +56,18 @@ public class DocstrumBlockificationService { }); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); - var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder); + + List pageBlocks; + + if (layoutParsingType == LayoutParsingType.CLARIFYND) { + pageBlocks = toAbstractPageBlocks(zones, cleanRulings.getHorizontal(), cleanRulings.getVertical(), xyOrder); + } else { + pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder); + } var classificationPage = new ClassificationPage(pageBlocks); - mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0); + mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, layoutParsingType == LayoutParsingType.CLARIFYND ? 10 : 0); return classificationPage; } @@ -223,7 +234,7 @@ public class DocstrumBlockificationService { ListIterator itty = blocks.listIterator(); while (itty.hasNext()) { AbstractPageBlock block = itty.next(); - if(block == null){ + if (block == null) { continue; } if (block instanceof TablePageBlock) { @@ -234,7 +245,7 @@ public class DocstrumBlockificationService { for (int i = 0; i < blocks.size(); i++) { - if(blocks.get(i) == null){ + if (blocks.get(i) == null) { continue; } if (blocks.get(i) == current) { @@ -259,8 +270,8 @@ public class DocstrumBlockificationService { } } var blocksIterator = blocks.iterator(); - while(blocksIterator.hasNext()){ - if(blocksIterator.next() == null){ + while (blocksIterator.hasNext()) { + if (blocksIterator.next() == null) { blocksIterator.remove(); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 580961e..3da1d8a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/new/ScrambledTextAfterSorting.pdf"; + String fileName = "files/WEF Global Risks Report 2017 - Part 1 (2).pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); @@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); long start = System.currentTimeMillis(); - Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); + Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/WEF Global Risks Report 2017 - Part 1 (2).pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/WEF Global Risks Report 2017 - Part 1 (2).pdf new file mode 100644 index 0000000..37db481 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/WEF Global Risks Report 2017 - Part 1 (2).pdf differ