RED-7375 table extractor prototype
This commit is contained in:
parent
0c3b910088
commit
aa1e77dda3
@ -50,7 +50,7 @@ public class TableExtractorResponseAdapter {
|
||||
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
|
||||
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
|
||||
tableCells.setLabel(t.getTable().getLabel());
|
||||
log.info("Parsed table cell {}",tableCells);
|
||||
log.info("Parsed table cell {} with label {}",tableCells, tableCells.getLabel());
|
||||
parsedTableCells.add(tableCells);
|
||||
t.getObjects().forEach(o -> {
|
||||
TableExtractorCells objectCell = new TableExtractorCells();
|
||||
@ -61,7 +61,7 @@ public class TableExtractorResponseAdapter {
|
||||
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
|
||||
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
|
||||
objectCell.setLabel(o.getLabel());
|
||||
log.info("Parsed object cell {}",objectCell);
|
||||
log.info("Parsed object cell {} with label {}",objectCell, objectCell.getLabel());
|
||||
parsedTableCells.add(objectCell);
|
||||
});
|
||||
});
|
||||
|
||||
@ -5,7 +5,6 @@ import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -28,19 +27,13 @@ import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -58,17 +51,19 @@ public class ViewerDocumentService {
|
||||
private final LayoutGridService layoutGridService;
|
||||
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, Map<Integer, List<TableExtractorCells>> extractedTableCells, boolean layerVisibilityDefaultValue) {
|
||||
public void createViewerDocument(PDDocument pdDocument,
|
||||
Document document,
|
||||
OutputStream outputStream,
|
||||
Map<Integer,List<TableExtractorCells>> extractedTableCells,
|
||||
boolean layerVisibilityDefaultValue) {
|
||||
|
||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||
// If we collect all COSDictionaries we changed and explicitly tell saveIncremental to write only those, saving is very fast.
|
||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||
|
||||
PDOptionalContentGroup tableExtractorLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
|
||||
|
||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
||||
PDOptionalContentGroup visualLayoutParsingLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
|
||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||
|
||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
@ -85,9 +80,7 @@ public class ViewerDocumentService {
|
||||
assert pageNumber == visualizationsOnPage.getPageNumber();
|
||||
// We need to append to the content stream; otherwise our content could be overlapped by content that follows it.
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
|
||||
contentStream.beginMarkedContent(COSName.OC, tableExtractorLayer);
|
||||
contentStream.beginMarkedContent(COSName.OC, visualLayoutParsingLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
|
||||
contentStream.setLineWidth(LINE_WIDTH);
|
||||
@ -110,7 +103,6 @@ public class ViewerDocumentService {
|
||||
}
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
|
||||
}
|
||||
dictionariesToUpdate.add(pdPage.getCOSObject());
|
||||
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user