RED-7375 table extractor prototype

This commit is contained in:
yhampe 2024-02-01 08:23:53 +01:00
parent 0c3b910088
commit aa1e77dda3
2 changed files with 10 additions and 18 deletions

View File

@ -50,7 +50,7 @@ public class TableExtractorResponseAdapter {
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
tableCells.setLabel(t.getTable().getLabel());
log.info("Parsed table cell {}",tableCells);
log.info("Parsed table cell {} with label {}",tableCells, tableCells.getLabel());
parsedTableCells.add(tableCells);
t.getObjects().forEach(o -> {
TableExtractorCells objectCell = new TableExtractorCells();
@ -61,7 +61,7 @@ public class TableExtractorResponseAdapter {
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
objectCell.setLabel(o.getLabel());
log.info("Parsed object cell {}",objectCell);
log.info("Parsed object cell {} with label {}",objectCell, objectCell.getLabel());
parsedTableCells.add(objectCell);
});
});

View File

@ -5,7 +5,6 @@ import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@ -28,19 +27,13 @@ import org.apache.pdfbox.util.Matrix;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@ -58,17 +51,19 @@ public class ViewerDocumentService {
private final LayoutGridService layoutGridService;
@SneakyThrows
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, Map<Integer, List<TableExtractorCells>> extractedTableCells, boolean layerVisibilityDefaultValue) {
public void createViewerDocument(PDDocument pdDocument,
Document document,
OutputStream outputStream,
Map<Integer,List<TableExtractorCells>> extractedTableCells,
boolean layerVisibilityDefaultValue) {
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
// PDDocument.save() is very slow because it traverses the entire PDF and writes a new one.
// If we collect all the COSDictionaries we changed and explicitly tell it to add only those by using saveIncremental, it is very fast.
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
PDOptionalContentGroup tableExtractorLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
PDOptionalContentGroup visualLayoutParsingLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
@ -85,9 +80,7 @@ public class ViewerDocumentService {
assert pageNumber == visualizationsOnPage.getPageNumber();
// We need to append to the content stream; otherwise, the content could be overlapped by subsequent content.
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
contentStream.beginMarkedContent(COSName.OC, tableExtractorLayer);
contentStream.beginMarkedContent(COSName.OC, visualLayoutParsingLayer);
contentStream.saveGraphicsState();
contentStream.setLineWidth(LINE_WIDTH);
@ -110,7 +103,6 @@ public class ViewerDocumentService {
}
contentStream.restoreGraphicsState();
contentStream.endMarkedContent();
}
dictionariesToUpdate.add(pdPage.getCOSObject());
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());