Merge branch 'RED-8747-fp' into 'main'
RED-8747 - Entities not merged properly - fp See merge request fforesight/layout-parser!131
This commit is contained in:
commit
c4d9c5df02
@ -258,7 +258,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER_OLD ->
|
case REDACT_MANAGER_OLD ->
|
||||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||||
|
|||||||
@ -2,13 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
|
|||||||
|
|
||||||
import static java.util.stream.Collectors.toSet;
|
import static java.util.stream.Collectors.toSet;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ListIterator;
|
import java.util.ListIterator;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
@ -19,11 +16,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -40,19 +39,10 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
||||||
|
|
||||||
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
|
||||||
List<Ruling> usedVerticalRulings = new ArrayList<>();
|
|
||||||
|
|
||||||
cells.forEach(cell -> {
|
|
||||||
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y)));
|
|
||||||
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
|
|
||||||
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height)));
|
|
||||||
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
|
|
||||||
});
|
|
||||||
|
|
||||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||||
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
|
||||||
|
|
||||||
var classificationPage = new ClassificationPage(pageBlocks);
|
var classificationPage = new ClassificationPage(pageBlocks);
|
||||||
|
|
||||||
|
|||||||
@ -13,10 +13,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
@ -31,12 +34,12 @@ public class RedactManagerBlockificationService {
|
|||||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
*
|
*
|
||||||
* @param textPositions The words of a page.
|
* @param textPositions The words of a page.
|
||||||
* @param horizontalRulingLines Horizontal table lines.
|
|
||||||
* @param verticalRulingLines Vertical table lines.
|
|
||||||
* @return Page object that contains the Textblock and text statistics.
|
* @return Page object that contains the Textblock and text statistics.
|
||||||
*/
|
*/
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
||||||
|
|
||||||
|
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||||
|
|
||||||
int indexOnPage = 0;
|
int indexOnPage = 0;
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
@ -54,7 +57,7 @@ public class RedactManagerBlockificationService {
|
|||||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
|
|
||||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||||
|
|||||||
@ -2,8 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
|||||||
|
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.awt.geom.RectangularShape;
|
import java.awt.geom.RectangularShape;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -19,6 +21,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
@ -147,7 +151,27 @@ public class RectangleTransformations {
|
|||||||
previousRectangle = currentRectangle;
|
previousRectangle = currentRectangle;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList();
|
return rectangleListsWithGaps.stream()
|
||||||
|
.map(RectangleTransformations::rectangle2DBBox)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static CleanRulings extractRulings(List<? extends Rectangle2D.Float> rectangles) {
|
||||||
|
|
||||||
|
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
||||||
|
List<Ruling> horizontalRulings = new ArrayList<>();
|
||||||
|
List<Ruling> verticalRulings = new ArrayList<>();
|
||||||
|
|
||||||
|
rectangles.forEach(rectangle -> {
|
||||||
|
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y)));
|
||||||
|
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y + rectangle.height),
|
||||||
|
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||||
|
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x, rectangle.y + rectangle.height)));
|
||||||
|
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||||
|
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||||
|
});
|
||||||
|
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -26,9 +26,22 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||||
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
|
Arrays.stream(finishedEvent.message().split("\n"))
|
||||||
|
.forEach(log::info);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testLayoutParserEndToEnd_RED_8747() {
|
||||||
|
|
||||||
|
prepareStorage("files/SinglePages/MergedEntities.pdf");
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
Arrays.stream(finishedEvent.message().split("\n"))
|
||||||
|
.forEach(log::info);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -667,6 +667,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMergedEntities_Page26() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/MergedEntities.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
|
validateTable(document, 0, 6, 6, 5, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void toHtml(ClassificationDocument document, String filename) {
|
private void toHtml(ClassificationDocument document, String filename) {
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user