Merge branch 'TAAS-103' into 'main'
TAAS-103: Table Detection and rotated text See merge request fforesight/layout-parser!81
This commit is contained in:
commit
09ee90222e
@ -187,10 +187,7 @@ public class LayoutParsingPipeline {
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
@ -213,7 +210,8 @@ public class LayoutParsingPipeline {
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
|
||||
@ -246,8 +244,8 @@ public class LayoutParsingPipeline {
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
// if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
// if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
// }
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
|
||||
@ -34,7 +34,6 @@ public class Table implements SemanticNode {
|
||||
|
||||
int numberOfRows;
|
||||
int numberOfCols;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@Builder.Default
|
||||
@ -208,7 +207,6 @@ public class Table implements SemanticNode {
|
||||
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells row-wise and filters them with header == true.
|
||||
*
|
||||
|
||||
@ -1,12 +1,14 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
@ -252,7 +254,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (prevY != null && prevX != null) {
|
||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
|
||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
||||
var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst();
|
||||
|
||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||
if (cell.hasMinimumSize()) {
|
||||
row.add(cell);
|
||||
@ -273,6 +276,21 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public boolean intersects(Cell cell1, Cell cell2) {
|
||||
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell1.getX() + 2;
|
||||
double y0 = cell1.getY() + 2;
|
||||
return (cell2.x + cell2.width > x0 &&
|
||||
cell2.y + cell2.height > y0 &&
|
||||
cell2.x < x0 + cell1.getWidth() -2 &&
|
||||
cell2.y < y0 + cell1.getHeight() -2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
|
||||
|
||||
@ -17,7 +17,6 @@ import lombok.SneakyThrows;
|
||||
@AllArgsConstructor
|
||||
public class RedTextPosition {
|
||||
|
||||
private String textMatrix;
|
||||
private float[] position;
|
||||
|
||||
@JsonIgnore
|
||||
@ -56,8 +55,6 @@ public class RedTextPosition {
|
||||
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
|
||||
pos.setTextMatrix(textPosition.getTextMatrix().toString());
|
||||
|
||||
var position = new float[4];
|
||||
|
||||
position[0] = textPosition.getXDirAdj();
|
||||
|
||||
@ -12,9 +12,9 @@ import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -25,10 +25,13 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RulingCleaningService {
|
||||
|
||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
|
||||
private static final float THRESHOLD = 6;
|
||||
|
||||
|
||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||
|
||||
if (!rulings.isEmpty()) {
|
||||
snapPoints(rulings, minCharWidth, maxCharHeight);
|
||||
snapPoints(rulings);
|
||||
}
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
@ -53,14 +56,11 @@ public class RulingCleaningService {
|
||||
}
|
||||
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
|
||||
|
||||
return CleanRulings.builder()
|
||||
.vertical(verticalRulingLines)
|
||||
.horizontal(horizontalRulingLines)
|
||||
.build();
|
||||
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
|
||||
}
|
||||
|
||||
|
||||
public void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
|
||||
public void snapPoints(List<? extends Line2D.Float> rulings) {
|
||||
|
||||
// collect points and keep a Line -> p1,p2 map
|
||||
Map<Line2D.Float, Point2D[]> linesToPoints = new HashMap<>();
|
||||
@ -81,7 +81,7 @@ public class RulingCleaningService {
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getX() - last.get(0).getX()) < xThreshold) {
|
||||
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
@ -108,7 +108,7 @@ public class RulingCleaningService {
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getY() - last.get(0).getY()) < yThreshold) {
|
||||
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
|
||||
@ -12,7 +12,6 @@ import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
@ -66,6 +65,17 @@ public class TableExtractionService {
|
||||
};
|
||||
|
||||
|
||||
public boolean contains(Cell cell, double x, double y, double w, double h) {
|
||||
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Finds tables on a page and moves textblocks into cells of the found tables.
|
||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
@ -79,16 +89,17 @@ public class TableExtractionService {
|
||||
* @param cleanRulings The lines used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) {
|
||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
||||
if (cell.hasMinimumSize() && contains(cell,
|
||||
textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||
@ -102,7 +113,7 @@ public class TableExtractionService {
|
||||
cells = new ArrayList<>(new HashSet<>(cells));
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
@ -135,16 +146,14 @@ public class TableExtractionService {
|
||||
}
|
||||
|
||||
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, LayoutParsingType layoutParsingType) {
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
|
||||
// TODO: breaks some tables, for example "1 Abamectin Prr.pdf" try to fix this upstream in RulingCleaningService
|
||||
for (Ruling r : horizontalRulingLines) {
|
||||
if (r.getX2() < r.getX1()) {
|
||||
double a = r.getX2();
|
||||
r.x2 = (float) r.getX1();
|
||||
r.x1 = (float) a;
|
||||
}
|
||||
// Fix for 211.pdf
|
||||
for (Ruling r : horizontalRulingLines) {
|
||||
if (r.getX2() < r.getX1()) {
|
||||
double a = r.getX2();
|
||||
r.x2 = (float) r.getX1();
|
||||
r.x1 = (float) a;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
@ -8,6 +9,7 @@ import java.util.Map;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
|
||||
|
||||
@ -1,18 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.parsing;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.awt.color.CMMException;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.contentstream.operator.color.*;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor;
|
||||
import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor;
|
||||
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
|
||||
import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;
|
||||
import org.apache.pdfbox.contentstream.operator.state.*;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetFlatness;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetLineWidth;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent;
|
||||
import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
@ -21,11 +37,14 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.awt.color.CMMException;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Getter
|
||||
@Slf4j
|
||||
@ -36,11 +55,6 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
private final List<Ruling> graphicsPath = new ArrayList<>();
|
||||
@Setter
|
||||
protected PDPage pdpage;
|
||||
private int minCharWidth;
|
||||
private int maxCharWidth;
|
||||
private int minCharHeight;
|
||||
private int maxCharHeight;
|
||||
|
||||
|
||||
private float path_x;
|
||||
private float path_y;
|
||||
@ -73,7 +87,6 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
this.addOperator(new SetFontAndSize(this));
|
||||
this.addOperator(new SetLineWidth(this));
|
||||
|
||||
|
||||
addOperator(new BeginMarkedContentSequenceWithProperties(this));
|
||||
// addOperator(new BeginMarkedContentSequence(this));
|
||||
addOperator(new EndMarkedContentSequence(this));
|
||||
@ -232,33 +245,15 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
.get(textPositionSequences.get(textPositionSequences.size() - 1).getTextPositions().size() - 1);
|
||||
}
|
||||
|
||||
int charWidth = (int) textPositions.get(i).getWidthDirAdj();
|
||||
if (charWidth < minCharWidth) {
|
||||
minCharWidth = charWidth;
|
||||
}
|
||||
if (charWidth > maxCharWidth) {
|
||||
maxCharWidth = charWidth;
|
||||
}
|
||||
|
||||
int charHeight = (int) textPositions.get(i).getHeightDir();
|
||||
if (charHeight < minCharHeight) {
|
||||
minCharHeight = charHeight;
|
||||
}
|
||||
if (charWidth > maxCharHeight) {
|
||||
maxCharHeight = charHeight;
|
||||
}
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||
if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) {
|
||||
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
startIndex = i;
|
||||
@ -266,9 +261,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
startIndex = i;
|
||||
@ -278,13 +271,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
.getUnicode()
|
||||
.equals("\t")) && i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||
|
||||
// Remove false sequence ends (whitespaces)
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous, sublist, 0.01f)) {
|
||||
for (TextPosition t : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||
}
|
||||
@ -319,13 +309,34 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
|
||||
|
||||
public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {
|
||||
|
||||
return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public boolean checkIfSequenceContainsOnlyWhitespaces(List<TextPosition> sublist) {
|
||||
|
||||
return !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")));
|
||||
}
|
||||
|
||||
|
||||
public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {
|
||||
|
||||
return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||
}
|
||||
|
||||
// !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
// .getUnicode()
|
||||
// .equals("\t")))
|
||||
|
||||
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
minCharWidth = Integer.MAX_VALUE;
|
||||
maxCharWidth = 0;
|
||||
minCharHeight = Integer.MAX_VALUE;
|
||||
maxCharHeight = 0;
|
||||
textPositionSequences.clear();
|
||||
rulings.clear();
|
||||
graphicsPath.clear();
|
||||
|
||||
@ -47,7 +47,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest {
|
||||
@Disabled
|
||||
public void visualizeCraftedDocument() {
|
||||
|
||||
String filename = "files/crafted document.pdf";
|
||||
String filename = "files/1 Abamectin_prr.pdf";
|
||||
visualizePdf(filename);
|
||||
}
|
||||
|
||||
|
||||
@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
@ -18,19 +35,38 @@ import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
@Autowired
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
@Autowired
|
||||
private RedactManagerClassificationService redactManagerClassificationService;
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
}
|
||||
}
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,27 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.segmentation;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
@ -15,19 +37,8 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
@ -52,7 +63,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
@ -65,6 +76,18 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void tablesToHtmlDebugger() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMapping() {
|
||||
@ -155,7 +178,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Test // Non-sense test
|
||||
public void testDoc56Page170() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
@ -166,8 +189,25 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 2, 0, 0);
|
||||
validateTable(document, 2, 7, 20, 0, 140);
|
||||
validateTable(document, 3, 8, 31, 0, 170);
|
||||
validateTable(document, 2, 6, 20, 0, 0);
|
||||
validateTable(document, 3, 7, 31, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDoc211() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/211.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 5, 4, 0, 0);
|
||||
validateTable(document, 1, 5, 15, 14, 0);
|
||||
validateTable(document, 2, 5, 14, 11, 0);
|
||||
validateTable(document, 3, 5, 3, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -181,7 +221,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 8, 8, 0, 2);
|
||||
validateTable(document, 0, 8, 8, 0, 0);
|
||||
|
||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
@ -191,18 +231,18 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList("",
|
||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
@ -220,6 +260,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
toHtml(document, "/tmp/html.html");
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 3, 2, 0, 0);
|
||||
@ -231,17 +273,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // FIXME Fake Redactions leads to more cells, no solution for this currently
|
||||
public void testDocA20622APartB9Page185() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 2);
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 5, 5, 0, 23);
|
||||
validateTable(document, 1, 11, 9, 0, 36);
|
||||
validateTable(document, 0, 7, 4, 0, 0);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDocA20622APartB9Page185FixedDoc() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 7, 4, 0, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -328,7 +382,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -450,8 +504,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 6, 8, 0, 2);
|
||||
validateTable(document, 1, 6, 8, 0, 1);
|
||||
validateTable(document, 0, 6, 8, 0, 0);
|
||||
validateTable(document, 1, 6, 8, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -484,12 +538,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int currentPage = 1;
|
||||
for (var table : tables) {
|
||||
if (currentPage != table.getPage()) {
|
||||
currentPage = table.getPage();
|
||||
sb.append("---------------------- Page ").append(currentPage).append("--------------\n");
|
||||
}
|
||||
sb.append("\n\n");
|
||||
sb.append(table.getTextAsHtml());
|
||||
}
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(filename).toFile())) {
|
||||
fileOutputStream.write(sb.toString().getBytes());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
row.forEach(r -> System.out.println(r.toString()));
|
||||
}
|
||||
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
|
||||
|
||||
assertThat(table.getColCount()).isEqualTo(colCount);
|
||||
|
||||
@ -1,21 +1,39 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class RulingCleaningServiceTest {
|
||||
public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
// @Disabled
|
||||
@ -25,13 +43,96 @@ public class RulingCleaningServiceTest {
|
||||
String fileName = "files/211.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableExtraction() {
|
||||
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
|
||||
ClassPathResource resource = new ClassPathResource("files");
|
||||
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.map(Path::toAbsolutePath)
|
||||
.map(Path::toString)
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < pdfFileNames.size(); i++) {
|
||||
writeJsons(Path.of(pdfFileNames.get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString();
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
||||
pdDocument.save(tmpFileNameBefore);
|
||||
}
|
||||
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString();
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
|
||||
pdDocument.save(tmpFileNameAfter);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
|
||||
|
||||
List listStructure1 = structure1.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return builder.build();
|
||||
})
|
||||
.toList();
|
||||
|
||||
List listStructure2 = structure2.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return builder.build();
|
||||
})
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < listStructure1.size(); i++) {
|
||||
Table tableNode1 = (Table) listStructure1.get(i);
|
||||
Table tableNode2 = (Table) listStructure2.get(i);
|
||||
if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user