RED-7375 table extractor prototype
added label, changed colour of bounding boxes
This commit is contained in:
parent
b47b187c8a
commit
b4e5f2da2f
@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.I
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.TableExtractorResponseAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.TableExtractorResponseAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||||
@ -92,6 +93,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
TableExtractorResponse tableExtractorResponse = new TableExtractorResponse();
|
TableExtractorResponse tableExtractorResponse = new TableExtractorResponse();
|
||||||
if (layoutParsingRequest.tableExtractorFileId().isPresent()) {
|
if (layoutParsingRequest.tableExtractorFileId().isPresent()) {
|
||||||
|
log.info("TABLEEXTRACTORRESPONSE:"+tableExtractorResponse);
|
||||||
tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get());
|
tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,7 +114,7 @@ public class LayoutParsingPipeline {
|
|||||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||||
|
|
||||||
Map<Integer, List<TableCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
||||||
try (var out = new ByteArrayOutputStream()) {
|
try (var out = new ByteArrayOutputStream()) {
|
||||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out,extractedTableCells ,false);
|
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out,extractedTableCells ,false);
|
||||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
||||||
@ -175,7 +177,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
Map<Integer, List<TableCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
||||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
|
||||||
|
|||||||
@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableData;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -24,8 +25,8 @@ import lombok.RequiredArgsConstructor;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class TableExtractorResponseAdapter {
|
public class TableExtractorResponseAdapter {
|
||||||
|
|
||||||
public Map<Integer, List<TableCells>> buildExtractedTablesPerPage(TableExtractorResponse tableExtractorResponse) {
|
public Map<Integer, List<TableExtractorCells>> buildExtractedTablesPerPage(TableExtractorResponse tableExtractorResponse) {
|
||||||
Map<Integer, List<TableCells>> tableCells = new HashMap<>();
|
Map<Integer, List<TableExtractorCells>> tableCells = new HashMap<>();
|
||||||
tableExtractorResponse.getData()
|
tableExtractorResponse.getData()
|
||||||
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPage_number(), tableCell -> new ArrayList<>())
|
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPage_number(), tableCell -> new ArrayList<>())
|
||||||
.addAll(convertTableCells(tableData.getTables())));
|
.addAll(convertTableCells(tableData.getTables())));
|
||||||
@ -34,27 +35,29 @@ public class TableExtractorResponseAdapter {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<TableCells> convertTableCells(List<ExtractedTable> tableObjects) {
|
public List<TableExtractorCells> convertTableCells(List<ExtractedTable> tableObjects) {
|
||||||
|
|
||||||
List<TableCells> parsedTableCells = new ArrayList<>();
|
List<TableExtractorCells> parsedTableCells = new ArrayList<>();
|
||||||
|
|
||||||
tableObjects.stream().forEach(t -> {
|
tableObjects.stream().forEach(t -> {
|
||||||
TableCells tableCells = new TableCells();
|
TableExtractorCells tableCells = new TableExtractorCells();
|
||||||
tableCells.setX0(t.getTable().getBbox().get(0));
|
tableCells.setX0(t.getTable().getBbox().get(0));
|
||||||
tableCells.setX1(t.getTable().getBbox().get(2));
|
tableCells.setX1(t.getTable().getBbox().get(2));
|
||||||
tableCells.setY0(t.getTable().getBbox().get(1));
|
tableCells.setY0(t.getTable().getBbox().get(1));
|
||||||
tableCells.setY1(t.getTable().getBbox().get(3));
|
tableCells.setY1(t.getTable().getBbox().get(3));
|
||||||
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
|
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
|
||||||
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
|
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
|
||||||
|
tableCells.setLabel(t.getTable().getLabel());
|
||||||
parsedTableCells.add(tableCells);
|
parsedTableCells.add(tableCells);
|
||||||
t.getObjects().forEach(o -> {
|
t.getObjects().forEach(o -> {
|
||||||
TableCells objectCell = new TableCells();
|
TableExtractorCells objectCell = new TableExtractorCells();
|
||||||
objectCell.setX0(t.getTable().getBbox().get(0));
|
objectCell.setX0(t.getTable().getBbox().get(0));
|
||||||
objectCell.setX1(t.getTable().getBbox().get(2));
|
objectCell.setX1(t.getTable().getBbox().get(2));
|
||||||
objectCell.setY0(t.getTable().getBbox().get(1));
|
objectCell.setY0(t.getTable().getBbox().get(1));
|
||||||
objectCell.setY1(t.getTable().getBbox().get(3));
|
objectCell.setY1(t.getTable().getBbox().get(3));
|
||||||
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
|
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
|
||||||
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
|
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
|
||||||
|
objectCell.setLabel(o.getLabel());
|
||||||
parsedTableCells.add(objectCell);
|
parsedTableCells.add(objectCell);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@ -0,0 +1,21 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
/**
 * Labelled bounding box for one entry of a table-extractor response.
 *
 * TableExtractorResponseAdapter.convertTableCells creates one instance per extracted table and
 * one per object of that table; ViewerDocumentService.createViewerDocument draws a rectangle from
 * (x0, y0, width, height) and prints the label next to it.
 *
 * Accessors, builder and constructors are generated by the Lombok annotations below.
 */
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TableExtractorCells {
|
||||||
|
// Filled from bbox element 0 by the adapter; used as the rectangle's x origin by the viewer.
private float x0;
|
||||||
|
// Filled from bbox element 1 by the adapter; used as the rectangle's y origin by the viewer.
private float y0;
|
||||||
|
// Filled from bbox element 2 by the adapter.
private float x1;
|
||||||
|
// Filled from bbox element 3 by the adapter.
private float y1;
|
||||||
|
// Set by the adapter to x1 - x0.
private float width;
|
||||||
|
// Set by the adapter to y1 - y0.
private float height;
|
||||||
|
// Label of the table or table object this box belongs to; rendered as text by the viewer.
private String label;
|
||||||
|
|
||||||
|
}
|
||||||
@ -38,6 +38,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization.
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||||
|
|
||||||
@ -59,14 +60,16 @@ public class ViewerDocumentService {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, Map<Integer, List<TableCells>> extractedTableCells, boolean layerVisibilityDefaultValue) {
|
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, Map<Integer, List<TableExtractorCells>> extractedTableCells, boolean layerVisibilityDefaultValue) {
|
||||||
|
|
||||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
||||||
|
|
||||||
PDOptionalContentGroup tableExtractorLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
|
PDOptionalContentGroup tableExtractorLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
|
||||||
|
|
||||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||||
|
|
||||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||||
@ -125,12 +128,13 @@ public class ViewerDocumentService {
|
|||||||
}
|
}
|
||||||
contentStream.restoreGraphicsState();
|
contentStream.restoreGraphicsState();
|
||||||
contentStream.endMarkedContent();
|
contentStream.endMarkedContent();
|
||||||
|
|
||||||
contentStream.beginMarkedContent(COSName.OC, tableExtractorLayer);
|
contentStream.beginMarkedContent(COSName.OC, tableExtractorLayer);
|
||||||
contentStream.saveGraphicsState();
|
contentStream.saveGraphicsState();
|
||||||
|
|
||||||
contentStream.setLineWidth(LINE_WIDTH);
|
contentStream.setLineWidth(LINE_WIDTH);
|
||||||
for (TableCells tableCells : extractedTableCells.get(pageNumber)) {
|
for (TableExtractorCells tableCells : extractedTableCells.get(pageNumber)) {
|
||||||
contentStream.setStrokingColor(new Color(0xFF0000));
|
contentStream.setStrokingColor(new Color(0xB700FF));
|
||||||
contentStream.addRect((float) tableCells.getX0(), (float) tableCells.getY0(), (float) tableCells.getWidth(), (float) tableCells.getHeight());
|
contentStream.addRect((float) tableCells.getX0(), (float) tableCells.getY0(), (float) tableCells.getWidth(), (float) tableCells.getHeight());
|
||||||
contentStream.stroke();
|
contentStream.stroke();
|
||||||
contentStream.setFont(font, FONT_SIZE);
|
contentStream.setFont(font, FONT_SIZE);
|
||||||
@ -141,9 +145,9 @@ public class ViewerDocumentService {
|
|||||||
(float) textDeRotationMatrix.getScaleY(),
|
(float) textDeRotationMatrix.getScaleY(),
|
||||||
tableCells.getX0() ,
|
tableCells.getX0() ,
|
||||||
tableCells.getY0());
|
tableCells.getY0());
|
||||||
// textMatrix.translate(-((font.getStringWidth(tableCells.getLabel()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
textMatrix.translate(-((font.getStringWidth(tableCells.getLabel()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
||||||
// contentStream.setTextMatrix(textMatrix);
|
contentStream.setTextMatrix(textMatrix);
|
||||||
// contentStream.showText(tableCells.getLabel());
|
contentStream.showText(tableCells.getLabel());
|
||||||
contentStream.endText();
|
contentStream.endText();
|
||||||
}
|
}
|
||||||
contentStream.restoreGraphicsState();
|
contentStream.restoreGraphicsState();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user