RED-8825: general layoutparsing improvements
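* fix several coordinate mix-ups: distinguish direction-adjusted bounding boxes (getBBoxDirAdj) from initial-user-space bounding boxes (getBBoxInitialUserSpace), and correct the swapped X/Y thresholds in RulingCleaningService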
This commit is contained in:
parent
a3decd292d
commit
60acbac53f
@ -263,6 +263,7 @@ public class LayoutParsingPipeline {
|
|||||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||||
|
|
||||||
PDRectangle cropbox = pdPage.getCropBox();
|
PDRectangle cropbox = pdPage.getCropBox();
|
||||||
|
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||||
|
|
||||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
|
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
|
||||||
|
|||||||
@ -27,8 +27,8 @@ public class Character {
|
|||||||
|
|
||||||
public Character(RedTextPosition chunk) {
|
public Character(RedTextPosition chunk) {
|
||||||
|
|
||||||
this.x = chunk.getDirectionAdjustedPosition().getCenterX();
|
this.x = chunk.getBBoxDirAdj().getCenterX();
|
||||||
this.y = chunk.getDirectionAdjustedPosition().getCenterY();
|
this.y = chunk.getBBoxDirAdj().getCenterY();
|
||||||
this.textPosition = chunk;
|
this.textPosition = chunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -21,7 +21,8 @@ import lombok.SneakyThrows;
|
|||||||
public class RedTextPosition extends BoundingBox {
|
public class RedTextPosition extends BoundingBox {
|
||||||
|
|
||||||
public final static int HEIGHT_PADDING = 2;
|
public final static int HEIGHT_PADDING = 2;
|
||||||
private Rectangle2D.Float directionAdjustedPosition; // adjusted to text rotation
|
|
||||||
|
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private int rotation;
|
private int rotation;
|
||||||
@ -72,7 +73,7 @@ public class RedTextPosition extends BoundingBox {
|
|||||||
textPosition.getYDirAdj() - textHeight,
|
textPosition.getYDirAdj() - textHeight,
|
||||||
textPosition.getWidthDirAdj(),
|
textPosition.getWidthDirAdj(),
|
||||||
textHeight + HEIGHT_PADDING);
|
textHeight + HEIGHT_PADDING);
|
||||||
pos.setDirectionAdjustedPosition(dirAdjPosition);
|
pos.setBBoxDirAdj(dirAdjPosition);
|
||||||
|
|
||||||
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
||||||
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
||||||
@ -83,6 +84,7 @@ public class RedTextPosition extends BoundingBox {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
|
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
|
||||||
|
|
||||||
AffineTransform transform = new AffineTransform();
|
AffineTransform transform = new AffineTransform();
|
||||||
@ -105,28 +107,28 @@ public class RedTextPosition extends BoundingBox {
|
|||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getXDirAdj() {
|
public float getXDirAdj() {
|
||||||
|
|
||||||
return this.directionAdjustedPosition.x;
|
return this.bBoxDirAdj.x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getYDirAdj() {
|
public float getYDirAdj() {
|
||||||
|
|
||||||
return this.directionAdjustedPosition.y;
|
return this.bBoxDirAdj.y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getWidthDirAdj() {
|
public float getWidthDirAdj() {
|
||||||
|
|
||||||
return this.directionAdjustedPosition.width;
|
return this.bBoxDirAdj.width;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getHeightDir() {
|
public float getHeightDir() {
|
||||||
|
|
||||||
return this.directionAdjustedPosition.height;
|
return this.bBoxDirAdj.height;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
@ -8,6 +9,7 @@ import java.util.stream.Collectors;
|
|||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
@ -32,6 +34,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
|||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||||
|
|
||||||
|
private Rectangle2D bBoxDirAdj;
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private TextDirection dir;
|
private TextDirection dir;
|
||||||
private int rotation;
|
private int rotation;
|
||||||
@ -53,6 +56,15 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
|||||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
this.isParagraphStart = isParagraphStart;
|
this.isParagraphStart = isParagraphStart;
|
||||||
|
calculateBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void calculateBBox() {
|
||||||
|
|
||||||
|
this.bBoxDirAdj = textPositions.stream()
|
||||||
|
.map(RedTextPosition::getBBoxDirAdj)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
setToBBoxOfComponents(getTextPositions());
|
setToBBoxOfComponents(getTextPositions());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,7 +77,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
|||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
setToBBoxOfComponents(getTextPositions());
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -133,7 +145,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
|||||||
this.rotation = textPositionSequence.getRotation();
|
this.rotation = textPositionSequence.getRotation();
|
||||||
this.pageHeight = textPositionSequence.getPageHeight();
|
this.pageHeight = textPositionSequence.getPageHeight();
|
||||||
this.pageWidth = textPositionSequence.getPageWidth();
|
this.pageWidth = textPositionSequence.getPageWidth();
|
||||||
setToBBoxOfComponents(getTextPositions());
|
calculateBBox();
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -145,7 +157,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
|||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
setToBBoxOfComponents(getTextPositions());
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -56,7 +56,7 @@ public class ImageServiceResponseAdapter {
|
|||||||
classificationPage.getImages().forEach(image -> {
|
classificationPage.getImages().forEach(image -> {
|
||||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||||
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||||
if (image.getPosition().contains(textblock.getBBox())) {
|
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
|
||||||
image.setImageType(ImageType.OCR);
|
image.setImageType(ImageType.OCR);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -124,9 +124,9 @@ public class RulingCleaningService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (ruling.isHorizontal()) {
|
if (ruling.isHorizontal()) {
|
||||||
return new Rectangle2D.Double(x - THRESHOLD_Y_HORIZONTAL, y - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||||
} else {
|
} else {
|
||||||
return new Rectangle2D.Double(x - THRESHOLD_Y_VERTICAL, y - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -142,14 +142,14 @@ public class TableExtractionService {
|
|||||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||||
|
|
||||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
||||||
|
/*
|
||||||
switch (pageInformation.rotationDegrees()) {
|
switch (pageInformation.rotationDegrees()) {
|
||||||
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
||||||
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
||||||
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
||||||
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||||
.stream()
|
.stream()
|
||||||
.map(rect -> new Cell(rect, affineTransform))
|
.map(rect -> new Cell(rect, affineTransform))
|
||||||
|
|||||||
@ -31,14 +31,14 @@ public class TextRulingsClassifier {
|
|||||||
|
|
||||||
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||||
|
|
||||||
float lowerY = (float) (word.getBBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
float upperY = (float) (word.getBBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
|
|
||||||
float strikethroughCenterX = (float) word.getBBox().getCenterX();
|
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX();
|
||||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||||
|
|
||||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBox().getMaxX() : word.getBBox().getMinX());
|
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX());
|
||||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||||
|
|
||||||
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||||
float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);
|
float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);
|
||||||
@ -65,14 +65,14 @@ public class TextRulingsClassifier {
|
|||||||
|
|
||||||
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||||
|
|
||||||
float leftX = (float) (word.getBBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
float rightX = (float) (word.getBBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
|
|
||||||
float strikethroughCenterY = (float) word.getBBox().getCenterY();
|
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY();
|
||||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||||
|
|
||||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBox().getMinY() : word.getBBox().getMaxY());
|
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY());
|
||||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||||
|
|
||||||
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
||||||
float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);
|
float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);
|
||||||
|
|||||||
@ -39,7 +39,7 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
|
|
||||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
|
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
|
||||||
.get(0);
|
.get(0);
|
||||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(currentTextPosition.getDirectionAdjustedPosition()).build();
|
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||||
|
|
||||||
for (TextPositionSequence word : sequences) {
|
for (TextPositionSequence word : sequences) {
|
||||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||||
@ -61,7 +61,7 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
++context.positionIdx;
|
++context.positionIdx;
|
||||||
}
|
}
|
||||||
|
|
||||||
previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(previousTextPosition.getDirectionAdjustedPosition()).build();
|
previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(previousTextPosition.getBBoxDirAdj()).build();
|
||||||
context.stringBuilder.append(" ");
|
context.stringBuilder.append(" ");
|
||||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||||
++context.stringIdx;
|
++context.stringIdx;
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import org.apache.pdfbox.cos.COSName;
|
|||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
@ -47,7 +48,7 @@ public class MarkedContentUtils {
|
|||||||
|
|
||||||
return markedContentByYPosition.values()
|
return markedContentByYPosition.values()
|
||||||
.stream()
|
.stream()
|
||||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBox())
|
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
|
||||||
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
@ -89,7 +90,7 @@ public class MarkedContentUtils {
|
|||||||
.map(content -> (TextPosition) content)
|
.map(content -> (TextPosition) content)
|
||||||
.filter(content -> !content.getUnicode().equals(" "))
|
.filter(content -> !content.getUnicode().equals(" "))
|
||||||
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
||||||
.map(TextPositionSequence::getBBox)
|
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -76,6 +76,7 @@ public class LayoutparsingVisualizations {
|
|||||||
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
||||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
||||||
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||||
|
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||||
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
||||||
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
||||||
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
||||||
@ -94,6 +95,7 @@ public class LayoutparsingVisualizations {
|
|||||||
lines, //
|
lines, //
|
||||||
zones, //
|
zones, //
|
||||||
rulings, //
|
rulings, //
|
||||||
|
clean_rulings, //
|
||||||
cells, //
|
cells, //
|
||||||
mainBody, //
|
mainBody, //
|
||||||
markedContent //
|
markedContent //
|
||||||
@ -120,11 +122,24 @@ public class LayoutparsingVisualizations {
|
|||||||
if (!active) {
|
if (!active) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.clean_rulings);
|
||||||
visualizationsOnPage.getColoredLines()
|
visualizationsOnPage.getColoredLines()
|
||||||
.addAll(cleanRulings.buildAll()
|
.addAll(cleanRulings.buildAll()
|
||||||
.stream()
|
.stream()
|
||||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1))
|
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||||
|
visualizationsOnPage.getColoredLines()
|
||||||
|
.addAll(rulings
|
||||||
|
.stream()
|
||||||
|
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||||
.toList());
|
.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
String filePath = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
|
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||||
|
|
||||||
runForFile(filePath);
|
runForFile(filePath);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,7 +29,7 @@ class PageContentExtractorTest {
|
|||||||
textPositionPerPage.stream()
|
textPositionPerPage.stream()
|
||||||
.map(t -> t.getSortedTextPositionSequences()
|
.map(t -> t.getSortedTextPositionSequences()
|
||||||
.stream()
|
.stream()
|
||||||
.map(TextPositionSequence::getBBox)
|
.map(TextPositionSequence::getBBoxInitialUserSpace)
|
||||||
.map(List::of)
|
.map(List::of)
|
||||||
.toList())
|
.toList())
|
||||||
.toList(), tmpFileName);
|
.toList(), tmpFileName);
|
||||||
|
|||||||
@ -26,6 +26,8 @@ public class ContentStreams {
|
|||||||
|
|
||||||
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
||||||
|
|
||||||
|
public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
|
||||||
|
|
||||||
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
|
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
|
||||||
|
|
||||||
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
|
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
|
||||||
@ -53,6 +55,7 @@ public class ContentStreams {
|
|||||||
ESCAPE_START,
|
ESCAPE_START,
|
||||||
ESCAPE_END,
|
ESCAPE_END,
|
||||||
RULINGS,
|
RULINGS,
|
||||||
|
CLEAN_RULINGS,
|
||||||
WORDS,
|
WORDS,
|
||||||
ZONES,
|
ZONES,
|
||||||
LINES,
|
LINES,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user