RED-8747 - Entities not merged properly - fp
- rework the extraction of rulings from the table cells
This commit is contained in:
parent
319268c53d
commit
976f408237
@ -1,24 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class BlockificationUtils {
|
||||
|
||||
public void extractedRullingsFromCells(List<Cell> cells, List<Ruling> usedHorizonalRulings, List<Ruling> usedVerticalRulings) {
|
||||
|
||||
cells.forEach(cell -> {
|
||||
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y)));
|
||||
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
|
||||
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height)));
|
||||
usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height)));
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@ -16,11 +16,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -37,14 +39,10 @@ public class DocstrumBlockificationService {
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
||||
|
||||
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||
List<Ruling> usedVerticalRulings = new ArrayList<>();
|
||||
|
||||
BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings);
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
|
||||
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
|
||||
|
||||
@ -14,10 +14,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@ -37,11 +39,7 @@ public class RedactManagerBlockificationService {
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
||||
|
||||
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||
List<Ruling> usedVerticalRulings = new ArrayList<>();
|
||||
|
||||
BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings);
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
@ -59,7 +57,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedHorizonalRulings, usedVerticalRulings);
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
@ -2,8 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -19,6 +21,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -147,7 +151,27 @@ public class RectangleTransformations {
|
||||
previousRectangle = currentRectangle;
|
||||
}
|
||||
}
|
||||
return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList();
|
||||
return rectangleListsWithGaps.stream()
|
||||
.map(RectangleTransformations::rectangle2DBBox)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public static CleanRulings extractRulings(List<? extends Rectangle2D.Float> rectangles) {
|
||||
|
||||
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
||||
List<Ruling> horizontalRulings = new ArrayList<>();
|
||||
List<Ruling> verticalRulings = new ArrayList<>();
|
||||
|
||||
rectangles.forEach(rectangle -> {
|
||||
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y)));
|
||||
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y + rectangle.height),
|
||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x, rectangle.y + rectangle.height)));
|
||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||
});
|
||||
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user