RED-7375:
* using ViewerDocumentService to draw TableExtractorResponse into documents
This commit is contained in:
parent
207d9dec97
commit
f4e93ef03b
@ -20,6 +20,9 @@ public record LayoutParsingRequest(
|
|||||||
@NonNull String originFileStorageId,//
|
@NonNull String originFileStorageId,//
|
||||||
@Schema(description = "Optional Path to the table extraction file.")//
|
@Schema(description = "Optional Path to the table extraction file.")//
|
||||||
Optional<String> tablesFileStorageId,//
|
Optional<String> tablesFileStorageId,//
|
||||||
|
|
||||||
|
@Schema(description= "Optional Path to the the table parsing service file")
|
||||||
|
Optional<String> tableExtractorFileId,
|
||||||
@Schema(description = "Optional Path to the image classification file.")//
|
@Schema(description = "Optional Path to the image classification file.")//
|
||||||
Optional<String> imagesFileStorageId,//
|
Optional<String> imagesFileStorageId,//
|
||||||
|
|
||||||
|
|||||||
@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.C
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
@ -87,12 +88,17 @@ public class LayoutParsingPipeline {
|
|||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TableExtractorResponse tableExtractorResponse = new TableExtractorResponse();
|
||||||
|
if(layoutParsingRequest.tableExtractorFileId().isPresent()) {
|
||||||
|
tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get());
|
||||||
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse, tableExtractorResponse);
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
|
|
||||||
int numberOfPages = originDocument.getNumberOfPages();
|
int numberOfPages = originDocument.getNumberOfPages();
|
||||||
@ -102,6 +108,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
try (var out = new ByteArrayOutputStream()) {
|
try (var out = new ByteArrayOutputStream()) {
|
||||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
||||||
|
viewerDocumentService.drawExtractedTables(originDocument,documentGraph,out,tableExtractorResponse.getExtractedTableData());
|
||||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -157,10 +164,11 @@ public class LayoutParsingPipeline {
|
|||||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||||
PDDocument originDocument,
|
PDDocument originDocument,
|
||||||
ImageServiceResponse imageServiceResponse,
|
ImageServiceResponse imageServiceResponse,
|
||||||
TableServiceResponse tableServiceResponse) {
|
TableServiceResponse tableServiceResponse, TableExtractorResponse tableExtractorResponse) {
|
||||||
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
|
// TODO: the table cells have to be read in here
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
|||||||
@ -23,6 +23,8 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Si
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
|
|
||||||
@ -62,6 +64,14 @@ public class LayoutParsingStorageService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TableExtractorResponse getExtractedTableFile(String storageId) throws IOException {
|
||||||
|
try (InputStream inputStream = getObject(storageId)) {
|
||||||
|
TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream,TableExtractorResponse.class);
|
||||||
|
inputStream.close();
|
||||||
|
return tableExtractorResponse;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,17 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class ExtractedTable {
|
||||||
|
private boolean rotated;
|
||||||
|
private ExtractedTableData extractedTableValue;
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class ExtractedTableData {
|
||||||
|
private String label;
|
||||||
|
private float score;
|
||||||
|
private List<Float> boundingBox;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,25 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TableExtractorData {
|
||||||
|
|
||||||
|
private int pageNumber;
|
||||||
|
private int pageRotation;
|
||||||
|
private int imageHeigth;
|
||||||
|
private int imageWidth;
|
||||||
|
private float pdfHeight;
|
||||||
|
private float pdfWidth;
|
||||||
|
private int dpi;
|
||||||
|
private List<ExtractedTable> tables;
|
||||||
|
private List<ExtractedTableData> objects;
|
||||||
|
}
|
||||||
@ -0,0 +1,22 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TableExtractorResponse { private String dossierId;
|
||||||
|
private String fileId;
|
||||||
|
private String targetFileExtension;
|
||||||
|
private String responseFileExtension;
|
||||||
|
private String X_TENANT_ID;
|
||||||
|
private List<TableExtractorData> extractedTableData;
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,12 +1,22 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
import javax.print.Doc;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
@ -23,13 +33,21 @@ import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
|
|||||||
import org.apache.pdfbox.util.Matrix;
|
import org.apache.pdfbox.util.Matrix;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -47,6 +65,45 @@ public class ViewerDocumentService {
|
|||||||
|
|
||||||
private final LayoutGridService layoutGridService;
|
private final LayoutGridService layoutGridService;
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void drawExtractedTables(PDDocument pdDocument, Document document, OutputStream outputStream, List<TableExtractorData> tableExtractorData) {
|
||||||
|
|
||||||
|
for (TableExtractorData tableExtractorDatum : tableExtractorData) {
|
||||||
|
int pageNumber = tableExtractorDatum.getPageNumber();
|
||||||
|
List<Rectangle2D> tableRectangles = new ArrayList<>();
|
||||||
|
List<Rectangle2D> objectRectangles = new ArrayList<>();
|
||||||
|
for (ExtractedTable table : tableExtractorDatum.getTables()) {
|
||||||
|
List<Float> boundingBox = table.getExtractedTableValue().getBoundingBox();
|
||||||
|
float x0 = boundingBox.get(0);
|
||||||
|
float x1 = boundingBox.get(2);
|
||||||
|
float y0 = boundingBox.get(1);
|
||||||
|
float y1 = boundingBox.get(3);
|
||||||
|
Rectangle2D tableRectangle = new Rectangle(y0, x0, x1 - x0, y1 - y0);
|
||||||
|
tableRectangles.add(tableRectangle);
|
||||||
|
}
|
||||||
|
for (ExtractedTableData object : tableExtractorDatum.getObjects()) {
|
||||||
|
List<Float> boundingBox = object.getBoundingBox();
|
||||||
|
float x0 = boundingBox.get(0);
|
||||||
|
float x1 = boundingBox.get(2);
|
||||||
|
float y0 = boundingBox.get(1);
|
||||||
|
float y1 = boundingBox.get(3);
|
||||||
|
Rectangle2D objectRectangle = new Rectangle(y0, x0, x1 - x0, y1 - y0);
|
||||||
|
objectRectangles.add(objectRectangle);
|
||||||
|
}
|
||||||
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||||
|
pageNumber,
|
||||||
|
tableRectangles,
|
||||||
|
PdfVisualisationUtility.Options.builder().strokeColor(Color.PINK).strokeWidth(1).stroke(true).build());
|
||||||
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||||
|
pageNumber,
|
||||||
|
objectRectangles,
|
||||||
|
PdfVisualisationUtility.Options.builder().strokeColor(Color.CYAN).strokeWidth(1).stroke(true).build());
|
||||||
|
}
|
||||||
|
pdDocument.save(outputStream);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
||||||
|
|||||||
@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
@ -50,7 +51,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
|||||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||||
pdDocument,
|
pdDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||||
@ -98,7 +99,7 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(pdfFileResource.getFile()),
|
Loader.loadPDF(pdfFileResource.getFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
|
|
||||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||||
.map(SemanticNode::getHeadline)
|
.map(SemanticNode::getHeadline)
|
||||||
|
|||||||
@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
@ -58,7 +59,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
|||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
Loader.loadPDF(filename.toFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
|
|
||||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||||
|
|||||||
@ -1,27 +1,50 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw.drawRectangle2DList;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.awt.geom.RectangularShape;
|
||||||
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONObject;
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.LineDetectionService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
@ -29,7 +52,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
@ -55,12 +80,61 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Manual visual check: reads an EXTRACTED_TABLES json file, draws the bounding boxes of all
 * tables with a score of at least 0.99 (plus the boxes of their objects) in green onto the PDF
 * and writes the result into the temp directory as {@code rectangles.<pdf file name>}.
 * <p>
 * Disabled because the input files are referenced by absolute paths that only exist on a
 * developer machine; adjust the paths and remove the annotation to run it locally.
 */
@Test
@Disabled("Depends on input files that only exist on a developer machine")
@SneakyThrows
public void testLayoutParsingServiceResults() {

    File tableSourceFile = new File("C:\\Users\\YannikHampe\\Downloads\\3875a78f1db6ff94b05e38446e65ba9a.EXTRACTED_TABLES.json\\3875a78f1db6ff94b05e38446e65ba9a.EXTRACTED_TABLES.json");
    Path pdfFileResource = Path.of("C:\\Users\\YannikHampe\\Downloads\\2009-1048395_50pages_tables.pdf");
    // Resolve the temp directory of the current user instead of a hard-coded profile path.
    Path outputFile = Path.of(System.getProperty("java.io.tmpdir"), "rectangles." + pdfFileResource.getFileName());

    // Let Jackson read the file directly so the encoding is detected instead of
    // decoding the bytes with the platform default charset.
    JsonNode dataNode = new ObjectMapper().readTree(tableSourceFile).get("data");

    // The document is closed again once the annotated copy has been written.
    try (PDDocument pdDocument = Loader.loadPDF(pdfFileResource.toFile())) {
        for (JsonNode node : dataNode) {
            List<Rectangle2D> rectangles = new ArrayList<>();
            // The test shifts the json page number by one before drawing.
            int pageNumber = node.get("page_number").asInt() + 1;

            for (JsonNode entry : node.get("tables")) {
                JsonNode table = entry.get("table");
                // Skip low-score tables together with their rows and columns.
                if (table.get("score").floatValue() < 0.99) {
                    continue;
                }
                // table bounding box
                rectangles.add(toRectangle(table.get("bbox")));
                // columns and rows
                for (JsonNode rowOrColumn : entry.get("objects")) {
                    rectangles.add(toRectangle(rowOrColumn.get("bbox")));
                }
            }

            PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                    pageNumber,
                    rectangles,
                    PdfVisualisationUtility.Options.builder().strokeColor(Color.GREEN).strokeWidth(2).stroke(true).build());
        }

        try (var out = Files.newOutputStream(outputFile)) {
            pdDocument.save(out);
        }
    }
}


/**
 * Converts a json bounding box array [x0, y0, x1, y1] into a rectangle.
 */
private static Rectangle2D toRectangle(JsonNode bbox) {

    float x0 = bbox.get(0).floatValue();
    float y0 = bbox.get(1).floatValue();
    float x1 = bbox.get(2).floatValue();
    float y1 = bbox.get(3).floatValue();
    // Same argument order as the original inline code: y0, x0, width, height.
    return new Rectangle(y0, x0, x1 - x0, y1 - y0);
}
|
||||||
|
|
||||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse());
|
new TableServiceResponse(), new TableExtractorResponse());
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
|
|||||||
@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
@ -66,7 +67,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse());
|
new TableServiceResponse(), new TableExtractorResponse());
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
|
|||||||
@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
@ -79,11 +80,11 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
Loader.loadPDF(filename.toFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
Loader.loadPDF(filename.toFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
|
|
||||||
@ -28,7 +29,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
|||||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||||
prepareStorage(filename);
|
prepareStorage(filename);
|
||||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
||||||
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new TableExtractorResponse());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user