RED-8747 - Entities not merged properly - fp

- use the rulings from the found tables instead of all rulings as splitting rulings in the blockification service
This commit is contained in:
Corina Olariu 2024-04-08 09:42:32 +03:00
parent 990c376ce6
commit f185b13f2b
6 changed files with 50 additions and 16 deletions

View File

@ -258,7 +258,7 @@ public class LayoutParsingPipeline {
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);

View File

@ -0,0 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import java.awt.geom.Point2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import lombok.experimental.UtilityClass;
/**
 * Helper methods shared by the blockification services.
 */
@UtilityClass
public class BlockificationUtils {

    /**
     * Derives the border lines of the given cells and appends them to the supplied ruling lists.
     * <p>
     * For every cell, two horizontal rulings (at {@code y} and at {@code y + height}, both spanning
     * {@code x} to {@code x + width}) are appended to {@code usedHorizonalRulings}, and two vertical
     * rulings (at {@code x} and at {@code x + width}, both spanning {@code y} to {@code y + height})
     * are appended to {@code usedVerticalRulings}. Both lists are modified in place.
     *
     * @param cells                the cells whose borders are converted into rulings
     * @param usedHorizonalRulings list receiving the horizontal borders of each cell
     * @param usedVerticalRulings  list receiving the vertical borders of each cell
     */
    public void extractedRullingsFromCells(List<Cell> cells, List<Ruling> usedHorizonalRulings, List<Ruling> usedVerticalRulings) {

        for (Cell cell : cells) {
            // Corner coordinates of the cell rectangle.
            float x0 = cell.x;
            float y0 = cell.y;
            float x1 = cell.x + cell.width;
            float y1 = cell.y + cell.height;

            // Borders at y0 and y1. Fresh point instances are created for every ruling.
            usedHorizonalRulings.add(new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x1, y0)));
            usedHorizonalRulings.add(new Ruling(new Point2D.Float(x0, y1), new Point2D.Float(x1, y1)));

            // Borders at x0 and x1.
            usedVerticalRulings.add(new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x0, y1)));
            usedVerticalRulings.add(new Ruling(new Point2D.Float(x1, y0), new Point2D.Float(x1, y1)));
        }
    }

}

View File

@ -2,13 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service;
@ -44,12 +41,7 @@ public class DocstrumBlockificationService {
List<Ruling> usedHorizonalRulings = new ArrayList<>();
List<Ruling> usedVerticalRulings = new ArrayList<>();
cells.forEach(cell -> {
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y)));
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height)));
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
});
BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings);
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);

View File

@ -13,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -31,12 +32,16 @@ public class RedactManagerBlockificationService {
* This method must use text direction adjusted positions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @param textPositions The words of a page.
* @param cells The table cells whose borders are used as the rulings that split blocks.
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
// Underlines and strikethroughs also appear in the rulings, but we don't want to split blocks on them, so the rulings are derived from the cells instead.
List<Ruling> usedHorizonalRulings = new ArrayList<>();
List<Ruling> usedVerticalRulings = new ArrayList<>();
BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings);
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
@ -54,7 +59,7 @@ public class RedactManagerBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
// Pass the horizontal cell borders first and the vertical ones second; passing the vertical list twice would ignore all horizontal rulings.
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedHorizonalRulings, usedVerticalRulings);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {

View File

@ -26,9 +26,22 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
public void testLayoutParserEndToEnd() {

    // Stage the sample document, then run the pipeline with the REDACT_MANAGER layout type.
    prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
    LayoutParsingRequest request = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
    LayoutParsingFinishedEvent event = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(request);

    // Log the finished event's message one line at a time.
    for (String messageLine : event.message().split("\n")) {
        log.info(messageLine);
    }
}
/**
 * End-to-end run for RED-8747: parses the sample document with the REDACT_MANAGER_OLD layout type
 * and logs each line of the finished event's message once.
 */
@Test
@SneakyThrows
public void testLayoutParserEndToEnd_RED_8747() {

    prepareStorage("files/localTests/MergedEntities.pdf");
    LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
    LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
    // Log the message a single time; a second identical stream over the same message only duplicates the output.
    Arrays.stream(finishedEvent.message().split("\n"))
            .forEach(log::info);
}
}