diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 3ec8d47..d628681 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -187,10 +187,7 @@ public class LayoutParsingPipeline { boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); PDRectangle cropbox = pdPage.getCropBox(); - CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), - stripper.getRulings(), - stripper.getMinCharWidth(), - stripper.getMaxCharHeight()); + CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); @@ -213,7 +210,8 @@ public class LayoutParsingPipeline { imageServiceResponseAdapter.findOcr(classificationPage); } - tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType); + tableExtractionService.extractTables(cleanRulings, classificationPage); + buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, classificationDocument); @@ -246,8 +244,8 @@ public class LayoutParsingPipeline { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { -// if (!classificationPage.isLandscape()) { - document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); + // if (!classificationPage.isLandscape()) { + document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); // } document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index bd33f6d..d62b4cf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -34,7 +34,6 @@ public class Table implements SemanticNode { int numberOfRows; int numberOfCols; - TextBlock textBlock; @Builder.Default @@ -208,7 +207,6 @@ public class Table implements SemanticNode { return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col)); } - /** * Streams all TableCells row-wise and filters them with header == true. * diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 0ecf5d3..1295424 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -1,12 +1,14 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeMap; +import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; @@ -252,7 +254,8 @@ public class TablePageBlock extends AbstractPageBlock { if (prevY != null && prevX != null) { var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); - var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst(); + var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst(); + intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks())); if (cell.hasMinimumSize()) { row.add(cell); @@ -273,6 +276,21 @@ public class TablePageBlock extends AbstractPageBlock { } + + public boolean intersects(Cell cell1, Cell cell2) { + if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) { + return false; + } + double x0 = cell1.getX() + 2; + double y0 = cell1.getY() + 2; + return (cell2.x + cell2.width > x0 && + cell2.y + cell2.height > y0 && + cell2.x < x0 + cell1.getWidth() -2 && + cell2.y < y0 + cell1.getHeight() -2); + } + + + @Override public String getText() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index 92059ae..ccea113 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -17,7 +17,6 @@ import lombok.SneakyThrows; @AllArgsConstructor public class RedTextPosition { - private String textMatrix; private float[] position; @JsonIgnore @@ -56,8 +55,6 @@ public class RedTextPosition { pos.setFontSizeInPt(textPosition.getFontSizeInPt()); - pos.setTextMatrix(textPosition.getTextMatrix().toString()); - var position = new float[4]; position[0] = textPosition.getXDirAdj(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index f18cfee..b24157c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -12,9 +12,9 @@ import java.util.Map; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import lombok.RequiredArgsConstructor; @@ -25,10 +25,13 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RulingCleaningService { - public CleanRulings getCleanRulings(List tableCells, List rulings, float minCharWidth, float maxCharHeight) { + private static final float THRESHOLD = 6; + + + public CleanRulings getCleanRulings(List tableCells, List rulings) { if (!rulings.isEmpty()) { - snapPoints(rulings, minCharWidth, maxCharHeight); + snapPoints(rulings); } List vrs = new ArrayList<>(); @@ -53,14 +56,11 @@ public class RulingCleaningService { } List horizontalRulingLines = collapseOrientedRulings(hrs); - return CleanRulings.builder() - .vertical(verticalRulingLines) - .horizontal(horizontalRulingLines) - .build(); + return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build(); } - public void snapPoints(List rulings, float xThreshold, float yThreshold) { + public void snapPoints(List rulings) { // collect points and keep a Line -> p1,p2 map Map linesToPoints = new HashMap<>(); @@ -81,7 +81,7 @@ public class RulingCleaningService { for (Point2D p : points.subList(1, points.size() - 1)) { List last = groupedPoints.get(groupedPoints.size() - 1); - if (Math.abs(p.getX() - last.get(0).getX()) < xThreshold) { + if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) { groupedPoints.get(groupedPoints.size() - 1).add(p); } else { groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); @@ -108,7 +108,7 @@ public class RulingCleaningService { for (Point2D p : points.subList(1, points.size() - 1)) { List last = groupedPoints.get(groupedPoints.size() - 1); - if (Math.abs(p.getY() - last.get(0).getY()) < yThreshold) { + if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) { groupedPoints.get(groupedPoints.size() - 1).add(p); } else { groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 4fe5b10..dd6bcc8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -12,7 +12,6 @@ import java.util.Set; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; @@ -66,6 +65,17 @@ public class TableExtractionService { }; + public boolean contains(Cell cell, double x, double y, double w, double h) { + + if (cell.isEmpty() || w <= 0 || h <= 0) { + return false; + } + double x0 = cell.getX(); + double y0 = cell.getY(); + return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2); + } + + /** * Finds tables on a page and moves textblocks into cells of the found tables. * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation. @@ -79,16 +89,17 @@ public class TableExtractionService { * @param cleanRulings The lines used to build the table. * @param page Page object that contains textblocks and statistics. */ - public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) { + public void extractTables(CleanRulings cleanRulings, ClassificationPage page) { - List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType); + List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); List toBeRemoved = new ArrayList<>(); for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; for (Cell cell : cells) { - if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(), + if (cell.hasMinimumSize() && contains(cell, + textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getPdfMaxX() - textBlock.getPdfMinX(), textBlock.getPdfMaxY() - textBlock.getPdfMinY())) { @@ -102,7 +113,7 @@ public class TableExtractionService { cells = new ArrayList<>(new HashSet<>(cells)); DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); - List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList(); + List spreadsheetAreas = findSpreadsheetsFromCells(cells); List tables = new ArrayList<>(); for (Rectangle area : spreadsheetAreas) { @@ -135,16 +146,14 @@ public class TableExtractionService { } - public List findCells(List horizontalRulingLines, List verticalRulingLines, LayoutParsingType layoutParsingType) { + public List findCells(List horizontalRulingLines, List verticalRulingLines) { - if (layoutParsingType.equals(LayoutParsingType.TAAS)) { - // TODO: breaks some tables, for example "1 Abamectin Prr.pdf" try to fix this upstream in RulingCleaningService - for (Ruling r : horizontalRulingLines) { - if (r.getX2() < r.getX1()) { - double a = r.getX2(); - r.x2 = (float) r.getX1(); - r.x1 = (float) a; - } + // Fix for 211.pdf + for (Ruling r : horizontalRulingLines) { + if (r.getX2() < r.getX1()) { + double a = r.getX2(); + r.x2 = (float) r.getX1(); + r.x1 = (float) a; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java index f6c66cb..329bd40 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper; import java.awt.geom.Rectangle2D; +import java.util.Collections; import java.util.HashMap; import java.util.Locale; import java.util.Map; @@ -8,6 +9,7 @@ import java.util.Map; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index d3309bd..f7b523f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -1,18 +1,34 @@ package com.knecon.fforesight.service.layoutparser.processor.services.parsing; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import lombok.Getter; -import lombok.Setter; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; +import java.awt.color.CMMException; +import java.awt.geom.Point2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; -import org.apache.pdfbox.contentstream.operator.color.*; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor; import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties; import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence; -import org.apache.pdfbox.contentstream.operator.state.*; +import org.apache.pdfbox.contentstream.operator.state.SetFlatness; +import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle; +import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern; +import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle; +import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit; +import org.apache.pdfbox.contentstream.operator.state.SetLineWidth; +import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent; import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSNumber; @@ -21,11 +37,14 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.text.TextPosition; -import java.awt.color.CMMException; -import java.awt.geom.Point2D; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.Getter; +import lombok.Setter; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; @Getter @Slf4j @@ -36,11 +55,6 @@ public class PDFLinesTextStripper extends PDFTextStripper { private final List graphicsPath = new ArrayList<>(); @Setter protected PDPage pdpage; - private int minCharWidth; - private int maxCharWidth; - private int minCharHeight; - private int maxCharHeight; - private float path_x; private float path_y; @@ -73,7 +87,6 @@ public class PDFLinesTextStripper extends PDFTextStripper { this.addOperator(new SetFontAndSize(this)); this.addOperator(new SetLineWidth(this)); - addOperator(new BeginMarkedContentSequenceWithProperties(this)); // addOperator(new BeginMarkedContentSequence(this)); addOperator(new EndMarkedContentSequence(this)); @@ -232,33 +245,15 @@ public class PDFLinesTextStripper extends PDFTextStripper { .get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1); } - int charWidth = (int) textPositions.get(i).getWidthDirAdj(); - if (charWidth < minCharWidth) { - minCharWidth = charWidth; - } - if (charWidth > maxCharWidth) { - maxCharWidth = charWidth; - } - - int charHeight = (int) textPositions.get(i).getHeightDir(); - if (charHeight < minCharHeight) { - minCharHeight = charHeight; - } - if (charWidth > maxCharHeight) { - maxCharHeight = charHeight; - } - if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { startIndex++; continue; } // Strange but sometimes this is happening, for example: Metolachlor2.pdf - if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) { + if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) - .getUnicode() - .equals("\t")))) { + if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; @@ -266,9 +261,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) - .getUnicode() - .equals("\t")))) { + if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; @@ -278,13 +271,10 @@ public class PDFLinesTextStripper extends PDFTextStripper { .getUnicode() .equals("\t")) && i <= textPositions.size() - 2) { List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) - .getUnicode() - .equals("\t")))) { + if (checkIfSequenceContainsOnlyWhitespaces(sublist)) { // Remove false sequence ends (whitespaces) - if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) - .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { + if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) { for (TextPosition t : sublist) { textPositionSequences.get(textPositionSequences.size() - 1).add(t); } @@ -319,13 +309,34 @@ public class PDFLinesTextStripper extends PDFTextStripper { } + public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List textPositions) { + + return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj(); + } + + + public boolean checkIfSequenceContainsOnlyWhitespaces(List sublist) { + + return !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + .getUnicode() + .equals("\t"))); + } + + + public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List sublist, float maximumGapSize) { + + return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) + .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; + } + + // !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) + // .getUnicode() + // .equals("\t"))) + + @Override public String getText(PDDocument doc) throws IOException { - minCharWidth = Integer.MAX_VALUE; - maxCharWidth = 0; - minCharHeight = Integer.MAX_VALUE; - maxCharHeight = 0; textPositionSequences.clear(); rulings.clear(); graphicsPath.clear(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index 2fd6aae..5f150e2 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -47,7 +47,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest { @Disabled public void visualizeCraftedDocument() { - String filename = "files/crafted document.pdf"; + String filename = "files/1 Abamectin_prr.pdf"; visualizePdf(filename); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 8875e01..e8dd8d6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.io.FileOutputStream; import java.nio.file.Path; +import java.util.List; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; @@ -18,19 +35,38 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentTest { + @Autowired + private SectionsBuilderService sectionsBuilderService; + + @Autowired + private RedactManagerClassificationService redactManagerClassificationService; + @Test - @Disabled @SneakyThrows public void testViewerDocument() { + String fileName = "files/bdr/notMergedParagraphs.pdf"; + String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; LayoutGridService layoutGridService = new LayoutGridService(); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - String fileName = "files/bdr/notMergedParagraphs.pdf"; Document document = buildGraph(fileName, LayoutParsingType.TAAS); - String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { viewerDocumentService.createViewerDocument(pdDocument, document, out, true); } } + public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { + + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + originDocument, + new ImageServiceResponse(), + new TableServiceResponse()); + + redactManagerClassificationService.classifyDocument(classificationDocument); + + sectionsBuilderService.buildSections(classificationDocument); + + return classificationDocument; + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index be893a2..a35b401 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -1,5 +1,27 @@ package com.knecon.fforesight.service.layoutparser.server.segmentation; +import static org.assertj.core.api.Assertions.assertThat; + +import java.awt.geom.Rectangle2D; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; @@ -15,19 +37,8 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; + import lombok.SneakyThrows; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.core.io.ClassPathResource; - -import java.awt.geom.Rectangle2D; -import java.io.IOException; -import java.util.*; -import java.util.stream.Collectors; - -import static org.assertj.core.api.Assertions.assertThat; public class PdfSegmentationServiceTest extends AbstractTest { @@ -52,7 +63,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, originDocument, new ImageServiceResponse(), new TableServiceResponse()); @@ -65,6 +76,18 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test + public void tablesToHtmlDebugger() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf"); + + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + + toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html"); + + } + + @Test @SneakyThrows public void testMapping() { @@ -155,7 +178,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { } - @Test + @Test // Non-sense test public void testDoc56Page170() throws IOException { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf"); @@ -166,8 +189,25 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 1, 1, 0, 0); validateTable(document, 1, 2, 2, 0, 0); - validateTable(document, 2, 7, 20, 0, 140); - validateTable(document, 3, 8, 31, 0, 170); + validateTable(document, 2, 6, 20, 0, 0); + validateTable(document, 3, 7, 31, 0, 0); + + } + + + @Test + public void testDoc211() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/211.pdf"); + + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + + validateTableSize(document, 4); + + validateTable(document, 0, 5, 4, 0, 0); + validateTable(document, 1, 5, 15, 14, 0); + validateTable(document, 2, 5, 14, 11, 0); + validateTable(document, 3, 5, 3, 0, 0); } @@ -181,7 +221,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 8, 8, 0, 2); + validateTable(document, 0, 8, 8, 0, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", "Author, date", @@ -191,18 +231,18 @@ public class PdfSegmentationServiceTest extends AbstractTest { "Method meets analytical validation criteria", "Remarks (in case validation criteria are not met)", "Acceptability of the method"), - Arrays.asList("", + Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), Arrays.asList("CA 7.1.2.1.1 DAR (2009)", "Evans P.G. 2001 TMJ4569B, VV-323245", "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", - "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried", + "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", "Y", "N/A", @@ -220,6 +260,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + toHtml(document, "/tmp/html.html"); + validateTableSize(document, 4); validateTable(document, 0, 3, 2, 0, 0); @@ -231,17 +273,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Test + @Disabled // FIXME Fake Redactions leads to more cells, no solution for this currently public void testDocA20622APartB9Page185() throws IOException { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf"); ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); - validateTableSize(document, 2); + validateTableSize(document, 1); - validateTable(document, 0, 5, 5, 0, 23); - validateTable(document, 1, 11, 9, 0, 36); + validateTable(document, 0, 7, 4, 0, 0); + } + + @Test + public void testDocA20622APartB9Page185FixedDoc() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf"); + + ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + + validateTableSize(document, 1); + + validateTable(document, 0, 7, 4, 0, 0); } @@ -328,7 +382,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); validateTableSize(document, 1); - validateTable(document, 0, 10, 6, 0, 1); + validateTable(document, 0, 10, 6, 0, 0); } @@ -450,8 +504,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 6, 8, 0, 2); - validateTable(document, 1, 6, 8, 0, 1); + validateTable(document, 0, 6, 8, 0, 0); + validateTable(document, 1, 6, 8, 0, 0); } @@ -484,12 +538,37 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @SneakyThrows + private void toHtml(ClassificationDocument document, String filename) { + + var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); + StringBuilder sb = new StringBuilder(); + + int currentPage = 1; + for (var table : tables) { + if (currentPage != table.getPage()) { + currentPage = table.getPage(); + sb.append("---------------------- Page ").append(currentPage).append("--------------\n"); + } + sb.append("\n\n"); + sb.append(table.getTextAsHtml()); + } + + try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(filename).toFile())) { + fileOutputStream.write(sb.toString().getBytes()); + } + } + + private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); List> rows = table.getRows(); int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size(); + for (List row : table.getRows()) { + row.forEach(r -> System.out.println(r.toString())); + } assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect); assertThat(table.getColCount()).isEqualTo(colCount); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index cceec48..ae7e418 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -1,21 +1,39 @@ package com.knecon.fforesight.service.layoutparser.server.services; +import java.nio.file.Files; import java.nio.file.Path; import java.util.Collections; import java.util.LinkedList; import java.util.List; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; +import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; +import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; -public class RulingCleaningServiceTest { +public class RulingCleaningServiceTest extends BuildDocumentTest { @Test // @Disabled @@ -25,13 +43,96 @@ public class RulingCleaningServiceTest { String fileName = "files/211.pdf"; String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; List pageContents = PageContentExtractor.getSortedPageContents(fileName); - PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); - RulingCleaningService rulingCleaningService = new RulingCleaningService(); + PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); List cleanRulingsPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { - cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20)); + cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings())); + } + + } + + + @Test + @SneakyThrows + public void testTableExtraction() { + + LayoutGridService layoutGridService = new LayoutGridService(); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); + + ClassPathResource resource = new ClassPathResource("files"); + List pdfFileNames = Files.walk(resource.getFile().toPath()) + .filter(path -> path.getFileName().toString().endsWith(".pdf")) + .map(Path::toAbsolutePath) + .map(Path::toString) + .toList(); + + for (int i = 0; i < pdfFileNames.size(); i++) { + writeJsons(Path.of(pdfFileNames.get(i))); } } + + @SneakyThrows + private void writeJsons(Path filename) { + + Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Loader.loadPDF(filename.toFile()), + new ImageServiceResponse(), + new TableServiceResponse())); + Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Loader.loadPDF(filename.toFile()), + new ImageServiceResponse(), + new TableServiceResponse())); + DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); + DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter); + if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) { + String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString(); + try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) { + PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore); + pdDocument.save(tmpFileNameBefore); + } + String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString(); + try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) { + PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter); + pdDocument.save(tmpFileNameAfter); + + } + } + } + + + @SneakyThrows + private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) { + + List listStructure1 = structure1.streamAllEntries() + .filter(entryData -> entryData.getType().equals(NodeType.TABLE)) + .map(DocumentStructure.EntryData::getProperties) + .map(properties -> { + var builder = Table.builder(); + PropertiesMapper.parseTableProperties(properties, builder); + return builder.build(); + }) + .toList(); + + List listStructure2 = structure2.streamAllEntries() + .filter(entryData -> entryData.getType().equals(NodeType.TABLE)) + .map(DocumentStructure.EntryData::getProperties) + .map(properties -> { + var builder = Table.builder(); + PropertiesMapper.parseTableProperties(properties, builder); + return builder.build(); + }) + .toList(); + + for (int i = 0; i < listStructure1.size(); i++) { + Table tableNode1 = (Table) listStructure1.get(i); + Table tableNode2 = (Table) listStructure2.get(i); + if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) { + return false; + } + } + return true; + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf new file mode 100644 index 0000000..9610cb5 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf differ