TAAS-103: Table Detection and rotated text
* Added a page property to DocumentStructure so that the page of a found table can be retrieved.
* Added a method to TableExtractionService that returns the table areas.
* Added calculateMinCharWidthAndMaxCharHeightInsideTable to LayoutParsingPipeline to calculate both values based on the table areas.
* Refactored PDFLinesTextStripper for better readability.
* Removed textMatrix from RedTextPosition, as it is no longer needed.
This commit is contained in:
parent
459e0c8be7
commit
01493dc033
@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||||
@ -189,8 +190,15 @@ public class LayoutParsingPipeline {
|
|||||||
PDRectangle cropbox = pdPage.getCropBox();
|
PDRectangle cropbox = pdPage.getCropBox();
|
||||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||||
stripper.getRulings(),
|
stripper.getRulings(),
|
||||||
stripper.getMinCharWidth(),
|
1,
|
||||||
stripper.getMaxCharHeight());
|
1);
|
||||||
|
|
||||||
|
List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
|
||||||
|
|
||||||
|
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f);
|
||||||
|
|
||||||
|
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight"));
|
||||||
|
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
@ -234,6 +242,43 @@ public class LayoutParsingPipeline {
|
|||||||
return classificationDocument;
|
return classificationDocument;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Finds the smallest character by width
|
||||||
|
* and the largest character by height
|
||||||
|
* inside a table area
|
||||||
|
*
|
||||||
|
* @param stripper the stripper containing the words
|
||||||
|
* @param spreedSheetArea the table area
|
||||||
|
* @param initialMinCharWidth an initial value for a minimum char width
|
||||||
|
* @param initialMaxCharHeight an initial value for a maximum char heigth
|
||||||
|
*
|
||||||
|
* @return Map with both values
|
||||||
|
*/
|
||||||
|
|
||||||
|
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) {
|
||||||
|
|
||||||
|
float newMinCharWidth = initialMinCharWidth;
|
||||||
|
float newMaxCharHeight = initialMaxCharHeight;
|
||||||
|
Map<String,Float> result = new HashMap<>();
|
||||||
|
for(var textPositionSequence: stripper.getTextPositionSequences() ) {
|
||||||
|
for(var redTextPosition: textPositionSequence.getTextPositions()) {
|
||||||
|
for(var area: spreedSheetArea) {
|
||||||
|
if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) {
|
||||||
|
if(redTextPosition.getHeightDir() > newMaxCharHeight) {
|
||||||
|
newMaxCharHeight = redTextPosition.getHeightDir();
|
||||||
|
}
|
||||||
|
if(redTextPosition.getWidthDirAdj() < newMinCharWidth) {
|
||||||
|
newMinCharWidth = redTextPosition.getWidthDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.put("minCharWidth",newMinCharWidth);
|
||||||
|
result.put("maxCharHeight",newMaxCharHeight);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||||
|
|
||||||
@ -246,7 +291,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
// if (!classificationPage.isLandscape()) {
|
// if (!classificationPage.isLandscape()) {
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
// }
|
// }
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
|
|||||||
@ -34,7 +34,7 @@ public class Table implements SemanticNode {
|
|||||||
|
|
||||||
int numberOfRows;
|
int numberOfRows;
|
||||||
int numberOfCols;
|
int numberOfCols;
|
||||||
|
int page;
|
||||||
TextBlock textBlock;
|
TextBlock textBlock;
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
|
|||||||
@ -17,7 +17,6 @@ import lombok.SneakyThrows;
|
|||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class RedTextPosition {
|
public class RedTextPosition {
|
||||||
|
|
||||||
private String textMatrix;
|
|
||||||
private float[] position;
|
private float[] position;
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
@ -56,8 +55,6 @@ public class RedTextPosition {
|
|||||||
|
|
||||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||||
|
|
||||||
pos.setTextMatrix(textPosition.getTextMatrix().toString());
|
|
||||||
|
|
||||||
var position = new float[4];
|
var position = new float[4];
|
||||||
|
|
||||||
position[0] = textPosition.getXDirAdj();
|
position[0] = textPosition.getXDirAdj();
|
||||||
|
|||||||
@ -79,10 +79,12 @@ public class TableExtractionService {
|
|||||||
* @param cleanRulings The lines used to build the table.
|
* @param cleanRulings The lines used to build the table.
|
||||||
* @param page Page object that contains textblocks and statistics.
|
* @param page Page object that contains textblocks and statistics.
|
||||||
*/
|
*/
|
||||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) {
|
public void extractTables(CleanRulings cleanRulings, ClassificationPage page, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
|
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||||
|
|
||||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||||
@ -134,6 +136,14 @@ public class TableExtractionService {
|
|||||||
page.getTextBlocks().removeAll(toBeRemoved);
|
page.getTextBlocks().removeAll(toBeRemoved);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
|
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
|
||||||
|
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
|
||||||
|
return spreadsheetAreas;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, LayoutParsingType layoutParsingType) {
|
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -8,6 +9,7 @@ import java.util.Map;
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
|
||||||
@ -44,6 +46,8 @@ public class PropertiesMapper {
|
|||||||
public static Map<String, String> buildTableProperties(Table table) {
|
public static Map<String, String> buildTableProperties(Table table) {
|
||||||
|
|
||||||
Map<String, String> properties = new HashMap<>();
|
Map<String, String> properties = new HashMap<>();
|
||||||
|
Page page = table.getFirstPage();
|
||||||
|
properties.put(DocumentStructure.TableProperties.PAGE, String.valueOf(page.getNumber()));
|
||||||
properties.put(DocumentStructure.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
|
properties.put(DocumentStructure.TableProperties.NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows()));
|
||||||
properties.put(DocumentStructure.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
|
properties.put(DocumentStructure.TableProperties.NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols()));
|
||||||
return properties;
|
return properties;
|
||||||
@ -69,6 +73,7 @@ public class PropertiesMapper {
|
|||||||
|
|
||||||
public static void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
|
public static void parseTableProperties(Map<String, String> properties, Table.TableBuilder builder) {
|
||||||
|
|
||||||
|
builder.page(Integer.parseInt(properties.get(DocumentStructure.TableProperties.PAGE)));
|
||||||
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
|
builder.numberOfRows(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_ROWS)));
|
||||||
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
|
builder.numberOfCols(Integer.parseInt(properties.get(DocumentStructure.TableProperties.NUMBER_OF_COLS)));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -254,11 +254,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||||
if (i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj()) {
|
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i,textPositions)) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
.getUnicode()
|
|
||||||
.equals("\t")))) {
|
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
@ -266,9 +264,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
|
|
||||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
.getUnicode()
|
|
||||||
.equals("\t")))) {
|
|
||||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||||
}
|
}
|
||||||
startIndex = i;
|
startIndex = i;
|
||||||
@ -278,13 +274,10 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
.getUnicode()
|
.getUnicode()
|
||||||
.equals("\t")) && i <= textPositions.size() - 2) {
|
.equals("\t")) && i <= textPositions.size() - 2) {
|
||||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
if (checkIfSequenceContainsOnlyWhitespaces(sublist)) {
|
||||||
.getUnicode()
|
|
||||||
.equals("\t")))) {
|
|
||||||
|
|
||||||
// Remove false sequence ends (whitespaces)
|
// Remove false sequence ends (whitespaces)
|
||||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
if (checkIfGapSizeBetweenCharactersSmallerThanMaximum(previous,sublist,0.01f)) {
|
||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
|
||||||
for (TextPosition t : sublist) {
|
for (TextPosition t : sublist) {
|
||||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||||
}
|
}
|
||||||
@ -318,6 +311,25 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
super.writeString(text);
|
super.writeString(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean checkIfCurrentPositionIsToTheRightOfPreviousPosition(int i, List<TextPosition> textPositions) {
|
||||||
|
return i > 0 && textPositions.get(i).getXDirAdj() < textPositions.get(i - 1).getXDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean checkIfSequenceContainsOnlyWhitespaces(List<TextPosition> sublist) {
|
||||||
|
return !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
|
.getUnicode()
|
||||||
|
.equals("\t")));
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean checkIfGapSizeBetweenCharactersSmallerThanMaximum(RedTextPosition previous, List<TextPosition> sublist, float maximumGapSize) {
|
||||||
|
return previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||||
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
// !(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||||
|
// .getUnicode()
|
||||||
|
// .equals("\t")))
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getText(PDDocument doc) throws IOException {
|
public String getText(PDDocument doc) throws IOException {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user