RED-8156: refactor ViewerDocumentService as a dependency for ocr-service

* various improvements to experimental parsing steps
* added embed fonts functionality to viewer doc
This commit is contained in:
Kilian Schuettler 2024-02-06 16:59:51 +01:00
parent 1b4aaf4454
commit 23eb0c40a3
32 changed files with 1033 additions and 295 deletions

View File

@ -10,6 +10,7 @@ val pdfBoxVersion = "3.0.0"
dependencies {
implementation(project(":layoutparser-service-internal-api"))
implementation(project(":viewer-doc-processor"))
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.144.0") {
exclude("org.springframework.boot", "spring-boot-starter-security")

View File

@ -51,7 +51,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import io.micrometer.observation.Observation;
@ -84,7 +84,7 @@ public class LayoutParsingPipeline {
TaasBlockificationService taasBlockificationService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
ViewerDocumentService viewerDocumentService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
@ -94,7 +94,7 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = File.createTempFile("viewer_document", ".pdf");
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
@ -111,25 +111,31 @@ public class LayoutParsingPipeline {
imageServiceResponse,
tableServiceResponse,
layoutParsingRequest.identifier().toString());
log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, false);
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
log.info("Building research document data for {}", layoutParsingRequest.identifier());
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
}
if (!viewerDocumentFile.equals(originFile)) {
viewerDocumentFile.delete();
}
originFile.delete();
viewerDocumentFile.delete();
return LayoutParsingFinishedEvent.builder()
.identifier(layoutParsingRequest.identifier())

View File

@ -1,10 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
/**
 * Spring configuration of the layout parsing processor module.
 * <p>
 * Enables component scanning starting at this package and registers the
 * {@link ViewerDocumentService}, which is constructed explicitly here from the
 * {@link ObservationRegistry}.
 */
@Configuration
@ComponentScan
public class LayoutParsingServiceProcessorConfiguration {

    /**
     * Creates the {@link ViewerDocumentService} bean.
     * <p>
     * The parameter is resolved by Spring like any other {@code @Bean} method argument, so no
     * additional {@code @Autowired} is required on the method.
     *
     * @param registry the observation registry handed to the service
     * @return the viewer document service
     */
    @Bean
    public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {

        return new ViewerDocumentService(registry);
    }

}

View File

@ -8,6 +8,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Optional;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
@ -36,13 +37,6 @@ public class LayoutParsingStorageService {
private final StorageService storageService;
private final ObjectMapper objectMapper;
public PDDocument getOriginDocument(String storageId) throws IOException {
return Loader.loadPDF(getOriginFile(storageId));
}
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
public File getOriginFile(String storageId) throws IOException {
@ -52,6 +46,18 @@ public class LayoutParsingStorageService {
}
/**
 * Downloads the viewer document stored under {@code storageId} into a new temp file.
 * <p>
 * The returned file is owned by the caller, who is responsible for deleting it.
 *
 * @param storageId storage id of the viewer document
 * @return the downloaded temp file, or an empty Optional if no object exists under that id for the current tenant
 * @throws IOException if an I/O error occurs while providing the file
 */
@Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file")
public Optional<File> getViewerDocFile(String storageId) throws IOException {

    if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
        return Optional.empty();
    }
    File tempFile = createTempFile("viewerDocument", ".pdf");
    boolean downloaded = false;
    try {
        storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
        downloaded = true;
    } finally {
        if (!downloaded) {
            // a failed download must not leave an orphaned temp file behind
            tempFile.delete();
        }
    }
    return Optional.of(tempFile);
}
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
try (InputStream inputStream = getObject(storageId)) {
@ -137,7 +143,6 @@ public class LayoutParsingStorageService {
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
try (var in = new FileInputStream(out)) {
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
}
}

View File

@ -1,8 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
import java.awt.Color;
import java.awt.geom.Line2D;
/**
 * A line segment together with the color it is drawn in.
 *
 * @param line  geometry of the segment
 * @param color color of the line
 */
public record ColoredLine(Line2D line, Color color) {
}

View File

@ -1,7 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
import java.awt.geom.Point2D;
/**
 * A text together with the point on the page at which its line starts.
 *
 * @param text      the string to show
 * @param lineStart the point at which the text line starts
 */
public record PlacedText(String text, Point2D lineStart) {
}

View File

@ -1,12 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import static com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService.LINE_WIDTH;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@ -24,25 +23,59 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.LayoutGrid;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
@Service
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public class LayoutGridService {
private static final Color INNER_LINES_COLOR = new Color(255, 175, 175);
private static final Color PARAGRAPH_COLOR = new Color(70, 130, 180);
public static final Color TABLE_COLOR = new Color(102, 205, 170);
public static final Color SECTION_COLOR = new Color(50, 50, 50);
public static final Color HEADLINE_COLOR = new Color(162, 56, 56);
public static final Color HEADER_COLOR = new Color(171, 131, 6);
public static final Color IMAGE_COLOR = new Color(253, 63, 146);
ViewerDocumentService viewerDocumentService;
static float FONT_SIZE = 10f;
static float LINE_WIDTH = 1f;
static Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
static Color INNER_LINES_COLOR = new Color(255, 175, 175);
static Color PARAGRAPH_COLOR = new Color(70, 130, 180);
static Color TABLE_COLOR = new Color(102, 205, 170);
static Color SECTION_COLOR = new Color(50, 50, 50);
static Color HEADLINE_COLOR = new Color(162, 56, 56);
static Color HEADER_COLOR = new Color(171, 131, 6);
static Color IMAGE_COLOR = new Color(253, 63, 146);
public LayoutGrid createLayoutGrid(Document document) {
/**
 * Builds the layout grid for the given document graph and hands it to the
 * {@link ViewerDocumentService}, which adds it on the {@link ContentStreams#KNECON_LAYOUT} layer.
 *
 * @param originFile                  the PDF the visualizations are added to
 * @param document                    the document graph the layout grid is derived from
 * @param destinationFile             the file the resulting PDF is written to
 * @param layerVisibilityDefaultValue default visibility passed on for the layer
 */
@SneakyThrows
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {

    LayoutGrid grid = createLayoutGrid(document);
    Visualizations layoutGridVisualizations = Visualizations.builder()
            .layer(ContentStreams.KNECON_LAYOUT)
            .visualizationsOnPages(grid.getVisualizationsPerPages())
            .layerVisibilityDefaultValue(layerVisibilityDefaultValue)
            .build();

    viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, layoutGridVisualizations);
}
private LayoutGrid createLayoutGrid(Document document) {
LayoutGrid layoutGrid = new LayoutGrid(document.getNumberOfPages());
document.streamAllSubNodes().forEach(semanticNode -> {
@ -103,11 +136,11 @@ public class LayoutGridService {
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines();
xs.forEach(x -> {
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR));
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
});
ys.forEach(y -> {
Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y));
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR));
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
});
}
}
@ -135,9 +168,9 @@ public class LayoutGridService {
List<Line2D> lines = createLinesFromRectangle(r, firstPage.getRotation());
// add string to top line
var firstLine = lines.remove(0);
coloredLines.add(new ColoredLine(firstLine, color));
coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : lines) {
coloredLines.add(new ColoredLine(line, color));
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
}
return;
}
@ -152,6 +185,7 @@ public class LayoutGridService {
}
@SneakyThrows
private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) {
Point2D.Float upperLeftCorner = switch (page.getRotation()) {
@ -161,7 +195,8 @@ public class LayoutGridService {
default -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMaxY());
};
var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts();
placedTexts.add(new PlacedText(s, upperLeftCorner));
upperLeftCorner.setLocation(upperLeftCorner.getX() - ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), upperLeftCorner.getY() - FONT_SIZE);
placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT));
}
@ -176,9 +211,9 @@ public class LayoutGridService {
midPageLines.remove(1);
// add string to left line
var leftLine = midPageLines.remove(1);
coloredLines.add(new ColoredLine(leftLine, color));
coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : midPageLines) {
coloredLines.add(new ColoredLine(line, color));
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
}
}
@ -192,9 +227,9 @@ public class LayoutGridService {
lastPageLines.remove(0);
// add string to left line
var leftLine = lastPageLines.remove(2);
coloredLines.add(new ColoredLine(leftLine, color));
coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : lastPageLines) {
coloredLines.add(new ColoredLine(line, color));
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
}
}
@ -208,9 +243,9 @@ public class LayoutGridService {
firstPageLines.remove(2);
// add string to top line
var firstLine = firstPageLines.remove(0);
coloredLines.add(new ColoredLine(firstLine, color));
coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : firstPageLines) {
coloredLines.add(new ColoredLine(line, color));
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
}
}
@ -276,7 +311,10 @@ public class LayoutGridService {
private void addAsRectangle(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
semanticNode.getBBox()
.forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredRectangles().add(new ColoredRectangle(textBBox, color)));
.forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1)
.getColoredRectangles()
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
}
}

View File

@ -1,217 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
import org.apache.pdfbox.util.Matrix;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Creates the viewer document: the origin PDF with the layout grid of the parsed document drawn
 * into an optional-content layer named {@value #LAYER_NAME}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ViewerDocumentService {

    private static final String LAYER_NAME = "Layout grid";
    private static final int FONT_SIZE = 10;
    public static final float LINE_WIDTH = 1f;

    private final LayoutGridService layoutGridService;
    private final ObservationRegistry observationRegistry;


    /**
     * Draws the layout grid of {@code document} onto every page of {@code originFile} and saves the
     * result to {@code destinationFile}.
     * <p>
     * The PDF document and the intermediate temp file are released even if processing fails.
     * Checked exceptions are rethrown unchecked via {@code @SneakyThrows}.
     *
     * @param originFile                  the PDF to draw on
     * @param document                    the document graph the layout grid is created from
     * @param destinationFile             the file the resulting PDF is saved to
     * @param layerVisibilityDefaultValue whether the layout grid layer is enabled by default
     */
    @SneakyThrows
    @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
    public void createViewerDocument(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {

        Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
        PDDocument pdDocument = null;
        try {
            pdDocument = openPDDocument(originFile);
            LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);

            PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
            PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);

            for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {

                drawLayoutGridOnPage(pdDocument, pageNumber, layoutGrid, layer, font);

                if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
                    log.info("Incremental save after {} pages", pageNumber);
                    observedIncrementalSave(pdDocument, destinationFile);
                    pdDocument.close();
                    Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
                    pdDocument = openPDDocument(tmpFile.toFile());
                    layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
                }
            }
            observedIncrementalSave(pdDocument, destinationFile);
        } finally {
            try {
                if (pdDocument != null) {
                    pdDocument.close();
                }
            } finally {
                // delete only after closing, the document may have been re-opened from the temp file
                tmpFile.toFile().delete();
            }
        }
    }


    /**
     * Draws all visualizations of a single page (lines, rectangles, filled rectangles and texts)
     * as marked content of the given layer.
     */
    private void drawLayoutGridOnPage(PDDocument pdDocument, int pageNumber, LayoutGrid layoutGrid, PDOptionalContentGroup layer, PDFont font) throws IOException {

        PDPage pdPage = pdDocument.getPage(pageNumber);

        AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);

        addLayerToPageResources(pdPage);

        // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
        // e.g. not escaped matrix transformations.
        escapePreviousContents(pdDocument, pdPage);

        VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages().get(pageNumber);
        assert pageNumber == visualizationsOnPage.getPageNumber();

        // We need to append to the content stream, otherwise the content could be overlapped by following content.
        try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {

            contentStream.beginMarkedContent(COSName.OC, layer);
            contentStream.saveGraphicsState();

            contentStream.setLineWidth(LINE_WIDTH);

            for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
                contentStream.setStrokingColor(coloredLine.color());
                contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1());
                contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
                contentStream.stroke();
            }

            for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
                contentStream.setStrokingColor(coloredRectangle.color());
                Rectangle2D r = coloredRectangle.rectangle2D();
                contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
                contentStream.stroke();
            }

            for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
                contentStream.setNonStrokingColor(filledRectangle.color());
                PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
                graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
                contentStream.setGraphicsStateParameters(graphicsState);
                Rectangle2D r = filledRectangle.rectangle2D();
                contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
                contentStream.fill();
            }

            for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
                contentStream.setFont(font, FONT_SIZE);
                contentStream.beginText();
                Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
                        (float) textDeRotationMatrix.getShearX(),
                        (float) textDeRotationMatrix.getShearY(),
                        (float) textDeRotationMatrix.getScaleY(),
                        (float) placedText.lineStart().getX(),
                        (float) placedText.lineStart().getY());
                // shift the text left by its own width plus a small margin and down by one font size
                textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
                contentStream.setTextMatrix(textMatrix);
                contentStream.showText(placedText.text());
                contentStream.endText();
            }

            contentStream.restoreGraphicsState();
            contentStream.endMarkedContent();
        }
    }


    /**
     * Loads the PDF and marks all security to be removed.
     */
    private static PDDocument openPDDocument(File tmpFile) throws IOException {

        PDDocument pdDocument = Loader.loadPDF(tmpFile);
        pdDocument.setAllSecurityToBeRemoved(true);
        return pdDocument;
    }


    /**
     * Saves the document to {@code outputFile} inside an observation named "incremental-save".
     * An {@link IOException} during saving is wrapped in a {@link RuntimeException}.
     */
    @SneakyThrows
    private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {

        Observation.createNotStarted("ViewerDocumentService", observationRegistry).contextualName("incremental-save").observe(() -> {
            try {
                pdDocument.save(outputFile);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
    }


    /**
     * Makes sure the page has a resources dictionary, creating an empty one if it is missing.
     */
    private static void addLayerToPageResources(PDPage pdPage) {

        PDResources resources = pdPage.getResources();
        if (resources == null) {
            resources = new PDResources();
            pdPage.setResources(resources);
        }
    }


    /**
     * Wraps the existing page content in a save/restore graphics state pair by prepending a
     * stream that saves the state and appending one that restores it.
     */
    private static void escapePreviousContents(PDDocument pdDocument, PDPage pdPage) throws IOException {

        try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
            contentStream.saveGraphicsState();
        }
        try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
            contentStream.restoreGraphicsState();
        }
    }


    /**
     * Returns the optional content group named {@value #LAYER_NAME}, creating it (and the
     * document's optional content properties) if necessary, and sets its enabled state.
     */
    private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) {

        PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
        PDOptionalContentProperties ocprops = catalog.getOCProperties();
        if (ocprops == null) {
            ocprops = new PDOptionalContentProperties();
            catalog.setOCProperties(ocprops);
        }
        PDOptionalContentGroup layer;
        if (ocprops.hasGroup(LAYER_NAME)) {
            layer = ocprops.getGroup(LAYER_NAME);
        } else {
            layer = new PDOptionalContentGroup(LAYER_NAME);
            ocprops.addGroup(layer);
        }
        ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
        return layer;
    }


    /**
     * Maps the page rotation (90, 180, 270, anything else) to a quadrant rotation of 3, 2, 1 or 0
     * quadrants, which is used as the linear part of the text matrix.
     */
    private static AffineTransform getTextDeRotationTransform(PDPage page) {

        return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
            case 90 -> 3;
            case 180 -> 2;
            case 270 -> 1;
            default -> 0;
        });
    }

}

View File

@ -39,6 +39,7 @@ dependencies {
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
// for integration testing only
testImplementation(project(":viewer-doc-processor"))
testImplementation(project(":layoutparser-service-internal-api"))
testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")

View File

@ -6,11 +6,15 @@ import java.nio.file.Path;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import lombok.SneakyThrows;
@ -22,11 +26,35 @@ public class ViewerDocumentTest extends BuildDocumentTest {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
var documentFile = new ClassPathResource(fileName).getFile();
viewerDocumentService.createViewerDocument(documentFile, document, new File(tmpFileName), true);
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
long start = System.currentTimeMillis();
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
/**
 * Smoke test: parses an OCRed PDF together with its table service response and writes the
 * viewer document including the layout grid to the temp directory. Nothing is asserted, the
 * test only fails if an exception is thrown.
 */
@Test
@SneakyThrows
public void testViewerDocumentWithTables() {

    String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
    String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
    Path pdfName = Path.of(fileName).getFileName();
    String tmpFileName = "/tmp/" + pdfName + "_VIEWER.pdf";

    var objectMapper = ObjectMapperFactory.create();
    var tableServiceResponse = objectMapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
    var documentFile = new ClassPathResource(fileName).getFile();

    var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
            documentFile,
            new ImageServiceResponse(),
            tableServiceResponse,
            pdfName.toFile().toString());

    LayoutGridService layoutGridService = new LayoutGridService(new ViewerDocumentService(null));
    Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);

    File viewerFile = new File(tmpFileName);
    layoutGridService.addLayoutGrid(documentFile, document, viewerFile, true);
}
}

View File

@ -0,0 +1,19 @@
// Build script of the viewer-doc library module.
plugins {
id("com.knecon.fforesight.java-conventions")
id("io.freefair.lombok") version "8.2.2"
}
description = "Library for adding/removing layers in the viewer document"
// single version shared by both PDFBox artifacts below
var pdfBoxVersion = "3.0.0"
dependencies {
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("org.slf4j:slf4j-api:1.7.25")
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
// test-only dependencies; the JUnit BOM provides the version of junit-jupiter
testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1")
testImplementation("org.junit.jupiter:junit-jupiter")
testImplementation platform('org.junit:junit-bom:5.10.0')
}

View File

@ -0,0 +1,33 @@
package com.knecon.fforesight.service.viewerdoc;
import java.util.List;
import org.apache.pdfbox.cos.COSName;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
/**
 * Catalogue of the content stream identifiers known to the viewer document library.
 * <p>
 * Each {@link Identifier} combines a display name, the {@link COSName} used as its key and a
 * flag telling whether it is optional content. The constants are declared {@code final}
 * explicitly so they cannot be reassigned.
 */
@FieldDefaults(makeFinal = true, level = AccessLevel.PUBLIC)
public class ContentStreams {

    public static final Identifier KNECON_LAYOUT = new Identifier("Layout grid", COSName.getPDFName("KNECON_LAYOUT"), true);

    public static final Identifier KNECON_OCR = new Identifier("OCR", COSName.getPDFName("KNECON_OCR"), false);

    public static final Identifier KNECON_OCR_TEXT_DEBUG = new Identifier("OCR Text", COSName.getPDFName("KNECON_OCR_TEXT_DEBUG"), true);

    public static final Identifier KNECON_OCR_BBOX_DEBUG = new Identifier("OCR Boxes", COSName.getPDFName("KNECON_OCR_BBOX_DEBUG"), true);

    public static final Identifier OTHER = new Identifier("other", COSName.getPDFName("OTHER"), false);

    public static final Identifier ESCAPE_START = new Identifier("escape start", COSName.getPDFName("ESCAPE_START"), false);

    // previously carried the copy-pasted name "escape start"
    public static final Identifier ESCAPE_END = new Identifier("escape end", COSName.getPDFName("ESCAPE_END"), false);

    /** All identifiers declared above, as an unmodifiable list. */
    public static final List<Identifier> allContentStreams = List.of(KNECON_LAYOUT, KNECON_OCR, KNECON_OCR_BBOX_DEBUG, KNECON_OCR_TEXT_DEBUG, OTHER, ESCAPE_START, ESCAPE_END);

    /**
     * Identifies one kind of content stream.
     *
     * @param name            human-readable name
     * @param cosName         PDF name object associated with the content stream
     * @param optionalContent whether the content stream is optional content
     */
    public record Identifier(String name, COSName cosName, boolean optionalContent) {

    }

}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.awt.Color;
import java.awt.geom.Line2D;
/**
 * A line segment together with its color and line width.
 *
 * @param line      geometry of the segment
 * @param color     color of the line
 * @param lineWidth width of the line
 */
public record ColoredLine(Line2D line, Color color, float lineWidth) {
}

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
package com.knecon.fforesight.service.viewerdoc.model;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
public record ColoredRectangle(Rectangle2D rectangle2D, Color color) {
public record ColoredRectangle(Rectangle2D rectangle2D, Color color, float lineWidth) {
}

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.service.viewerdoc.model;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
/**
 * A font that can provide a {@link PDFont} for use in a given {@link PDDocument}.
 */
public interface EmbeddableFont {
/**
 * Provides the font for the given document.
 * NOTE(review): presumably implementations embed the font data into the document where needed; confirm with the implementations.
 *
 * @param document the document the font is going to be used in
 * @return the font to use when writing to {@code document}
 */
PDFont embed(PDDocument document);
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
package com.knecon.fforesight.service.viewerdoc.model;
import java.awt.Color;
import java.awt.geom.Rectangle2D;

View File

@ -1,7 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import lombok.AccessLevel;
import lombok.Getter;
@ -12,15 +12,15 @@ import lombok.experimental.FieldDefaults;
public class LayoutGrid {
int numberOfPages;
List<VisualizationsOnPage> visualizationsPerPages;
Map<Integer, VisualizationsOnPage> visualizationsPerPages;
public LayoutGrid(int numberOfPages) {
this.numberOfPages = numberOfPages;
this.visualizationsPerPages = new ArrayList<>(numberOfPages);
this.visualizationsPerPages = new HashMap<>();
for (int i = 0; i < numberOfPages; i++) {
this.visualizationsPerPages.add(VisualizationsOnPage.builder().pageNumber(i).build());
this.visualizationsPerPages.put(i, VisualizationsOnPage.builder().build());
}
}

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.List;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
/**
 * Pairs a content stream {@link Operator} with the list of its arguments.
 *
 * @param operator  the operator
 * @param arguments the arguments belonging to the operator
 */
public record OperatorWithArguments(Operator operator, List<COSBase> arguments) {
}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.awt.Color;
import java.awt.geom.Point2D;
import java.util.Optional;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.apache.pdfbox.util.Matrix;
/**
 * A text to place on a page, together with its position and appearance.
 *
 * @param text          the string to show
 * @param lineStart     the point at which the text line starts
 * @param color         color of the text
 * @param fontSize      font size of the text
 * @param font          font of the text
 * @param textMatrix    optional explicit text matrix
 * @param renderingMode optional explicit rendering mode
 */
public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional<Matrix> textMatrix, Optional<RenderingMode> renderingMode) {
/**
 * Creates a text without an explicit text matrix or rendering mode.
 * Note that the order of {@code fontSize} and {@code color} differs from the record components.
 */
public static PlacedText textFacingUp(String text, Point2D lineStart, float fontSize, Color color, EmbeddableFont font) {
return new PlacedText(text, lineStart, color, fontSize, font, Optional.empty(), Optional.empty());
}
}

View File

@ -0,0 +1,37 @@
package com.knecon.fforesight.service.viewerdoc.model;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
/**
 * {@link EmbeddableFont} backed by a {@link PDType1Font}; {@link #embed(PDDocument)} hands out
 * the wrapped font as is.
 */
@RequiredArgsConstructor
public class Standard14EmbeddableFont implements EmbeddableFont {

    private final PDType1Font font;


    /**
     * @return a new instance wrapping the standard Helvetica font
     */
    public static Standard14EmbeddableFont helvetica() {

        PDType1Font helvetica = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
        return new Standard14EmbeddableFont(helvetica);
    }


    /**
     * Delegates to the wrapped font's {@code getStringWidth}; checked exceptions are rethrown
     * unchecked.
     *
     * @param text the text to measure
     * @return the width as reported by the wrapped font
     */
    @SneakyThrows
    public float getStringWidth(String text) {

        float width = font.getStringWidth(text);
        return width;
    }


    /**
     * Returns the wrapped font unchanged, nothing is added to the document.
     */
    @Override
    public PDFont embed(PDDocument document) {

        return this.font;
    }

}

View File

@ -0,0 +1,23 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.Map;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * All visualizations of one layer, grouped by page.
 */
@Builder
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Visualizations {
// Identifier of the layer the visualizations are written to.
ContentStreams.Identifier layer;
// Visualizations per page; ViewerDocumentService looks entries up by zero-based page index.
Map<Integer, VisualizationsOnPage> visualizationsOnPages;
// Passed on as the enabled state of the layer's optional content group when the layer is added to a document.
boolean layerVisibilityDefaultValue;
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.LinkedList;
import java.util.List;
@ -13,7 +13,8 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class VisualizationsOnPage {
int pageNumber;
@Builder.Default
boolean makePathsInvisible = false;
@Builder.Default
List<PlacedText> placedTexts = new LinkedList<>();
@Builder.Default

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.viewerdoc.pdf;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
/**
 * Pairs a single content stream with the classification assigned to it.
 *
 * @param contentStream  the content stream
 * @param classification the identifier the content stream was classified as
 */
public record ClassifiedContentStream(SinglePDContentStream contentStream, ContentStreams.Identifier classification) {
}

View File

@ -0,0 +1,61 @@
package com.knecon.fforesight.service.viewerdoc.pdf;
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.io.RandomAccessInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.Matrix;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * {@link PDContentStream} view on exactly one {@link PDStream}. Only the stream contents are
 * provided; resources, bounding box and matrix are reported as {@code null}.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SinglePDContentStream implements PDContentStream {

    PDStream pdStream;


    /**
     * @return an input stream over the random access view of the wrapped stream
     */
    @Override
    public InputStream getContents() throws IOException {

        RandomAccessRead randomAccessView = getContentsForRandomAccess();
        return new RandomAccessInputStream(randomAccessView);
    }


    /**
     * @return a view created from the COS object of the wrapped stream
     */
    @Override
    public RandomAccessRead getContentsForRandomAccess() throws IOException {

        var cosStream = pdStream.getCOSObject();
        return cosStream.createView();
    }


    /** Always {@code null}. */
    @Override
    public PDResources getResources() {

        return null;
    }


    /** Always {@code null}. */
    @Override
    public PDRectangle getBBox() {

        return null;
    }


    /** Always {@code null}. */
    @Override
    public Matrix getMatrix() {

        return null;
    }

}

View File

@ -0,0 +1,121 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Classifies the content streams of a page by looking at their leading operators.
 */
@UtilityClass
public class ContentStreamClassifier {

    /**
     * Wraps every content stream of the page into a {@link SinglePDContentStream} and classifies it.
     *
     * @param page the page whose content streams are classified
     * @return the classified content streams in page order
     */
    public List<ClassifiedContentStream> getClassifiedContentStreams(PDPage page) {

        List<SinglePDContentStream> streams = new LinkedList<>();
        page.getContentStreams().forEachRemaining(stream -> streams.add(new SinglePDContentStream(stream)));
        return classifySingleContentStreams(page, streams);
    }


    /**
     * Classifies each of the given content streams.
     *
     * @param page    the page the streams belong to; used to resolve names from the page resources
     * @param streams the content streams to classify
     * @return one classified content stream per input stream, in the same order
     */
    public List<ClassifiedContentStream> classifySingleContentStreams(PDPage page, List<SinglePDContentStream> streams) {

        return streams.stream().map(stream -> classifySingleContentStream(page, stream)).toList();
    }


    private ClassifiedContentStream classifySingleContentStream(PDPage page, SinglePDContentStream singlePDContentStream) {

        ContentStreams.Identifier classification = classifyContentStream(singlePDContentStream, page);
        return new ClassifiedContentStream(singlePDContentStream, classification);
    }


    /**
     * We assume all of our layers are written escaped, so only unknown content streams need to be escaped.
     *
     * @param classifiers List of all content streams of a page with their classification
     * @return false, if any content stream with classification other is not prefixed with an ESCAPE_START and suffixed with an ESCAPE_END,
     * or if the ESCAPE_START and ESCAPE_END streams are not balanced
     */
    public boolean areAllContentStreamsEscaped(List<ClassifiedContentStream> classifiers) {

        int escapeDepth = 0;
        for (ClassifiedContentStream classifier : classifiers) {
            ContentStreams.Identifier classification = classifier.classification();
            if (classification.equals(ContentStreams.ESCAPE_START)) {
                escapeDepth++;
            } else if (classification.equals(ContentStreams.ESCAPE_END)) {
                escapeDepth--;
                if (escapeDepth < 0) {
                    // An ESCAPE_END without a preceding ESCAPE_START: following content would otherwise
                    // pass the depth check below although it is not enclosed by an escape.
                    return false;
                }
            } else if (classification.equals(ContentStreams.OTHER) && escapeDepth == 0) {
                return false;
            }
        }
        return escapeDepth == 0;
    }


    /**
     * Classifies a content stream based on its first (at most two) operators.
     *
     * @param contentStream the content stream to classify
     * @param page          the page the content stream belongs to
     * @return the matching identifier, or {@link ContentStreams#OTHER} if the stream is not recognized
     */
    @SneakyThrows
    public ContentStreams.Identifier classifyContentStream(PDContentStream contentStream, PDPage page) {

        // Two operators are enough to tell a stream with exactly one operator apart from a longer one.
        List<OperatorWithArguments> operatorsWithArguments = ContentStreamUtility.parseLeadingOperators(contentStream, 2);
        if (operatorsWithArguments.isEmpty()) {
            return ContentStreams.OTHER;
        }

        OperatorWithArguments firstOperator = operatorsWithArguments.get(0);
        String firstOperatorName = firstOperator.operator().getName();

        // If we wrap the content streams we append and prepend a content stream with exactly one operator "q" or "Q".
        if (operatorsWithArguments.size() == 1) {
            if (firstOperatorName.equals(OperatorName.SAVE)) {
                return ContentStreams.ESCAPE_START;
            }
            if (firstOperatorName.equals(OperatorName.RESTORE)) {
                return ContentStreams.ESCAPE_END;
            }
        }

        // In previous versions we did not set a marked content with an explicit name. Instead, we wrote an optional content group (OCG) with the name "Layout grid".
        // This OCG is then assigned a COSName by PDFBox. Usually its "oc1".
        // Thus, in order to find this name we need to look in the page resources to find the COSName assigned to the OCG.
        // This COSName can then be found as an argument for the first operator in the content stream.
        if (firstOperatorName.equals(OperatorName.BEGIN_MARKED_CONTENT_SEQ)) {
            Optional<COSName> layoutGridOCGName = ContentStreamUtility.findLayoutGridOCGName(page);
            if (layoutGridOCGName.isPresent() && argumentsContainLayoutGridOCG(firstOperator, layoutGridOCGName.get())) {
                return ContentStreams.KNECON_LAYOUT;
            }
        }

        if (!firstOperatorName.equals(OperatorName.BEGIN_MARKED_CONTENT)) {
            return ContentStreams.OTHER;
        }

        Optional<COSName> firstCOSNameFromArguments = firstOperator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).findFirst();
        if (firstCOSNameFromArguments.isEmpty()) {
            return ContentStreams.OTHER;
        }

        COSName cosName = firstCOSNameFromArguments.get();
        return ContentStreams.allContentStreams.stream().filter(identifier -> identifier.cosName().equals(cosName)).findAny().orElse(ContentStreams.OTHER);
    }


    /**
     * @return true, if any COSName argument of the operator equals the given OCG name
     */
    private static boolean argumentsContainLayoutGridOCG(OperatorWithArguments operator, COSName layoutGridOCGName) {

        return operator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).anyMatch(cosName -> cosName.equals(layoutGridOCGName));
    }

}

View File

@ -0,0 +1,78 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
import lombok.experimental.UtilityClass;
/**
 * Helpers for parsing and filtering the content streams of a page.
 */
@UtilityClass
public class ContentStreamUtility {

    /**
     * Parses the beginning of a content stream until the requested number of operators has been read
     * or the stream ends. Operands are attached to the operator that follows them; operands that are
     * not followed by an operator are dropped.
     *
     * @param contentStream           the content stream to parse
     * @param numberOfOperatorsToRead maximum number of operators to read; nothing is read if it is not positive
     * @return the leading operators with their operands, in stream order
     * @throws IOException if the stream cannot be parsed
     */
    public static List<OperatorWithArguments> parseLeadingOperators(PDContentStream contentStream, int numberOfOperatorsToRead) throws IOException {

        PDFStreamParser parser = new PDFStreamParser(contentStream);
        List<OperatorWithArguments> operatorsWithArguments = new LinkedList<>();
        List<COSBase> pendingArguments = new ArrayList<>();

        while (operatorsWithArguments.size() < numberOfOperatorsToRead) {
            Object token = parser.parseNextToken();
            if (token == null) {
                break;
            }
            if (token instanceof Operator operator) {
                operatorsWithArguments.add(new OperatorWithArguments(operator, pendingArguments));
                // each operator gets its own list, so start a new one for the next operator
                pendingArguments = new ArrayList<>();
            } else {
                pendingArguments.add((COSBase) token);
            }
        }
        return operatorsWithArguments;
    }


    /**
     * Searches the property lists of the page resources for an entry whose name string equals the name of
     * {@link ContentStreams#KNECON_LAYOUT}.
     *
     * @param page the page whose resources are searched
     * @return the resource name of the matching entry (the last one, if several match), or empty if the page
     * has no resources or no entry matches
     */
    public static Optional<COSName> findLayoutGridOCGName(PDPage page) {

        var resources = page.getResources();
        if (resources == null) {
            // a page without a resource dictionary cannot reference an optional content group
            return Optional.empty();
        }

        Optional<COSName> layoutGridOCGName = Optional.empty();
        for (COSName cosName : resources.getPropertiesNames()) {
            var propertyList = resources.getProperties(cosName);
            if (propertyList == null) {
                // the entry is not a property list dictionary
                continue;
            }
            COSBase cosBase = propertyList.getCOSObject().getDictionaryObject(COSName.NAME);
            if (cosBase instanceof COSString string && ContentStreams.KNECON_LAYOUT.name().equals(string.getString())) {
                layoutGridOCGName = Optional.of(cosName);
            }
        }
        return layoutGridOCGName;
    }


    /**
     * Drops all content streams that are classified as one of the given layers.
     *
     * @param layers      the layers to remove
     * @param classifiers the classified content streams of a page
     * @return the streams of all remaining content streams, in their original order
     */
    public static List<PDStream> removeLayerFromContentStreams(Set<ContentStreams.Identifier> layers, List<ClassifiedContentStream> classifiers) {

        return classifiers.stream()
                .filter(classifiedContentStream -> !layers.contains(classifiedContentStream.classification()))
                .map(ClassifiedContentStream::contentStream)
                .map(SinglePDContentStream::getPdStream)
                .toList();
    }

}

View File

@ -0,0 +1,316 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
public class ViewerDocumentService {
private final ObservationRegistry registry;
/**
 * Convenience overload that adds the visualizations of a single layer.
 *
 * @param originFile      the PDF to read
 * @param destinationFile the file the result is written to
 * @param visualizations  the visualizations of the layer to add
 */
public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) {

    List<Visualizations> singleLayer = List.of(visualizations);
    addVisualizationsOnPage(originFile, destinationFile, singleLayer);
}
/**
 * Writes the given visualizations into the PDF, one marked content section per layer and page.
 * Content streams that were previously written for any of the given layers are removed first.
 *
 * @param originFile      the PDF to read
 * @param destinationFile the file the result is written to; may be the same file as {@code originFile}
 * @param visualizations  the layers to add, each with its visualizations per zero-based page index
 */
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
@SneakyThrows
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {

    // originFile and destinationFile might be the same, so we use a temp file.
    // Otherwise, saving the document might corrupt the file
    Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
    PDDocument pdDocument = null;
    try {
        Files.copy(originFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
        pdDocument = openPDDocument(tmpFile.toFile());

        enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList());

        Set<ContentStreams.Identifier> allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet());
        Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);

        for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {

            PDPage pdPage = pdDocument.getPage(pageNumber);
            createPageResourcesIfNotPresent(pdPage); // needed for optionalContentGroups

            List<ClassifiedContentStream> classifiers = ContentStreamClassifier.getClassifiedContentStreams(pdPage);
            pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers));

            AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);

            if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) {
                // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
                // e.g. not escaped matrix transformations.
                wrapContentStreams(pdDocument, pdPage);
            }

            for (Visualizations visualization : visualizations) {
                if (!visualization.getVisualizationsOnPages().containsKey(pageNumber)) {
                    continue;
                }
                appendLayerToPage(pdDocument, pdPage, visualization, pageNumber, optionalContentGroupMap, textDeRotationMatrix);
            }

            if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
                log.info("Incremental save after {}/{} pages", pageNumber, pdDocument.getNumberOfPages());
                observedIncrementalSave(pdDocument, destinationFile);
                pdDocument.close();
                Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
                pdDocument = openPDDocument(tmpFile.toFile());
                // The optional content groups collected so far belong to the document that was just closed,
                // so they have to be looked up again in the re-opened document.
                optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
            }
        }

        observedIncrementalSave(pdDocument, destinationFile);
    } finally {
        // Clean up unconditionally: an assert would be skipped when assertions are disabled,
        // and neither the document nor the temp file may leak if anything above fails.
        try {
            if (pdDocument != null) {
                pdDocument.close();
            }
        } finally {
            Files.deleteIfExists(tmpFile);
        }
    }
}


/**
 * Appends one content stream to the page that contains the visualizations of the layer for this page.
 */
private static void appendLayerToPage(PDDocument pdDocument,
                                      PDPage pdPage,
                                      Visualizations visualization,
                                      int pageNumber,
                                      Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap,
                                      AffineTransform textDeRotationMatrix) throws IOException {

    boolean hasOptionalContentGroup = optionalContentGroupMap.containsKey(visualization.getLayer());

    // We need to append to the content stream, otherwise the content could be overlapped by following content.
    try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {

        contentStream.beginMarkedContent(visualization.getLayer().cosName());
        if (hasOptionalContentGroup) {
            contentStream.beginMarkedContent(COSName.OC, optionalContentGroupMap.get(visualization.getLayer()));
        }
        contentStream.saveGraphicsState();

        drawVisualizationsToContentStream(pdDocument, visualization.getVisualizationsOnPages().get(pageNumber), contentStream, textDeRotationMatrix);

        contentStream.restoreGraphicsState();
        if (hasOptionalContentGroup) {
            contentStream.endMarkedContent();
        }
        contentStream.endMarkedContent();
    }
}
/**
 * Adds the layer of every visualization to the document.
 *
 * @return the optional content group per layer, for those layers that got one
 */
private static Map<ContentStreams.Identifier, PDOptionalContentGroup> addLayersToDocument(List<Visualizations> visualizations, PDDocument pdDocument) {

    Map<ContentStreams.Identifier, PDOptionalContentGroup> groupsByLayer = new HashMap<>();
    for (Visualizations visualization : visualizations) {
        ContentStreams.Identifier layer = visualization.getLayer();
        Optional<PDOptionalContentGroup> group = addLayerToDocument(layer, pdDocument, visualization.isLayerVisibilityDefaultValue());
        if (group.isPresent()) {
            groupsByLayer.put(layer, group.get());
        }
    }
    return groupsByLayer;
}
/**
 * Draws everything contained in {@code visualizationsOnPage} into the content stream, in this order:
 * optional clipping, lines, rectangle outlines, filled rectangles, texts.
 *
 * @param pdDocument           the document, needed to embed the fonts of the texts
 * @param visualizationsOnPage the visualizations to draw
 * @param contentStream        the content stream to draw into
 * @param textDeRotationMatrix transform used for texts that carry no text matrix of their own
 */
private static void drawVisualizationsToContentStream(PDDocument pdDocument,
                                                      VisualizationsOnPage visualizationsOnPage,
                                                      PDPageContentStream contentStream,
                                                      AffineTransform textDeRotationMatrix) throws IOException {

    if (visualizationsOnPage.isMakePathsInvisible()) {
        // restrict all following drawing to a 1x1 rectangle at the origin
        contentStream.addRect(0, 0, 1, 1);
        contentStream.clip();
    }
    strokeLines(visualizationsOnPage.getColoredLines(), contentStream);
    strokeRectangles(visualizationsOnPage.getColoredRectangles(), contentStream);
    fillRectangles(visualizationsOnPage.getFilledRectangles(), contentStream);
    showTexts(pdDocument, visualizationsOnPage.getPlacedTexts(), contentStream, textDeRotationMatrix);
}


/** Strokes every line with its own width and color. */
private static void strokeLines(Iterable<ColoredLine> lines, PDPageContentStream contentStream) throws IOException {

    for (ColoredLine line : lines) {
        contentStream.setLineWidth(line.lineWidth());
        contentStream.setStrokingColor(line.color());
        var segment = line.line();
        contentStream.moveTo((float) segment.getX1(), (float) segment.getY1());
        contentStream.lineTo((float) segment.getX2(), (float) segment.getY2());
        contentStream.stroke();
    }
}


/** Strokes the outline of every rectangle with its own width and color. */
private static void strokeRectangles(Iterable<ColoredRectangle> rectangles, PDPageContentStream contentStream) throws IOException {

    for (ColoredRectangle rectangle : rectangles) {
        contentStream.setLineWidth(rectangle.lineWidth());
        contentStream.setStrokingColor(rectangle.color());
        Rectangle2D bounds = rectangle.rectangle2D();
        contentStream.addRect((float) bounds.getX(), (float) bounds.getY(), (float) bounds.getWidth(), (float) bounds.getHeight());
        contentStream.stroke();
    }
}


/** Fills every rectangle with its own color and alpha. */
private static void fillRectangles(Iterable<FilledRectangle> rectangles, PDPageContentStream contentStream) throws IOException {

    for (FilledRectangle rectangle : rectangles) {
        contentStream.setNonStrokingColor(rectangle.color());
        PDExtendedGraphicsState alphaState = new PDExtendedGraphicsState();
        alphaState.setNonStrokingAlphaConstant(rectangle.alpha());
        contentStream.setGraphicsStateParameters(alphaState);
        Rectangle2D bounds = rectangle.rectangle2D();
        contentStream.addRect((float) bounds.getX(), (float) bounds.getY(), (float) bounds.getWidth(), (float) bounds.getHeight());
        contentStream.fill();
    }
}


/** Shows every text with its own font, color, rendering mode and text matrix. */
private static void showTexts(PDDocument pdDocument,
                              Iterable<PlacedText> texts,
                              PDPageContentStream contentStream,
                              AffineTransform textDeRotationMatrix) throws IOException {

    for (PlacedText text : texts) {
        PDFont embeddedFont = text.font().embed(pdDocument);
        contentStream.setFont(embeddedFont, text.fontSize());
        contentStream.beginText();
        contentStream.setNonStrokingColor(text.color());
        // texts without an explicit rendering mode are filled
        contentStream.setRenderingMode(text.renderingMode().orElse(RenderingMode.FILL));
        contentStream.setTextMatrix(getTextMatrix(text, textDeRotationMatrix));
        contentStream.showText(text.text());
        contentStream.endText();
    }
}
/**
 * Adds the number of pages and the names of the layers as high cardinality key values to the current
 * observation. Does nothing if there is no registry, no current observation, or the registry is a no-op.
 */
private void enrichObservation(PDDocument pdDocument, List<ContentStreams.Identifier> layers) {

    boolean observationAvailable = registry != null && registry.getCurrentObservation() != null && !registry.isNoop();
    if (!observationAvailable) {
        return;
    }

    Observation currentObservation = registry.getCurrentObservation();
    currentObservation.highCardinalityKeyValue("numberOfPages", String.valueOf(pdDocument.getNumberOfPages()));

    int layerIndex = 0;
    for (ContentStreams.Identifier layer : layers) {
        currentObservation.highCardinalityKeyValue("layer_" + layerIndex, String.valueOf(layer.name()));
        layerIndex++;
    }
}
/**
 * Encloses the existing content of the page in a saved graphics state: a content stream that only saves
 * the graphics state is prepended and one that only restores it is appended.
 */
private static void wrapContentStreams(PDDocument pdDocument, PDPage pdPage) throws IOException {

    final boolean compress = false;

    try (PDPageContentStream escapeStart = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, compress)) {
        escapeStart.saveGraphicsState();
    }

    try (PDPageContentStream escapeEnd = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, compress)) {
        escapeEnd.restoreGraphicsState();
    }
}
/**
 * Determines the text matrix for a text: its own matrix if it has one, otherwise a matrix built from
 * the components of {@code textDeRotationMatrix} and translated to the line start of the text.
 */
private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) {

    if (placedText.textMatrix().isPresent()) {
        return placedText.textMatrix().get();
    }

    float a = (float) textDeRotationMatrix.getScaleX();
    float b = (float) textDeRotationMatrix.getShearX();
    float c = (float) textDeRotationMatrix.getShearY();
    float d = (float) textDeRotationMatrix.getScaleY();
    float translateX = (float) placedText.lineStart().getX();
    float translateY = (float) placedText.lineStart().getY();
    return new Matrix(a, b, c, d, translateX, translateY);
}
/**
 * Adds the layer to the document, if the layer is flagged as optional content.
 *
 * @return the optional content group of the layer, or empty if the layer is not optional content
 */
private static Optional<PDOptionalContentGroup> addLayerToDocument(ContentStreams.Identifier layer, PDDocument pdDocument, boolean layerVisibilityDefaultValue) {

    if (!layer.optionalContent()) {
        return Optional.empty();
    }
    PDOptionalContentGroup group = addLayerToDocument(pdDocument, layer.name(), layerVisibilityDefaultValue);
    return Optional.of(group);
}
/**
 * Looks up the optional content group with the given name in the document catalog, creating the
 * optional content properties and the group where missing, and sets the enabled state of the group.
 *
 * @return the existing or newly created group
 */
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, String layerName, boolean layerVisibilityDefaultValue) {

    PDDocumentCatalog documentCatalog = pdDocument.getDocumentCatalog();

    PDOptionalContentProperties contentProperties = documentCatalog.getOCProperties();
    if (contentProperties == null) {
        contentProperties = new PDOptionalContentProperties();
        documentCatalog.setOCProperties(contentProperties);
    }

    PDOptionalContentGroup group;
    if (!contentProperties.hasGroup(layerName)) {
        group = new PDOptionalContentGroup(layerName);
        contentProperties.addGroup(group);
    } else {
        group = contentProperties.getGroup(layerName);
    }

    contentProperties.setGroupEnabled(group, layerVisibilityDefaultValue);
    return group;
}
/**
 * Loads the PDF and flags it so that all security is removed.
 */
private static PDDocument openPDDocument(File tmpFile) throws IOException {

    PDDocument loadedDocument = Loader.loadPDF(tmpFile);
    loadedDocument.setAllSecurityToBeRemoved(true);
    return loadedDocument;
}
/**
 * Saves the document to the output file inside its own observation named "incremental-save".
 * An {@link IOException} during saving is rethrown wrapped in a {@link RuntimeException}.
 */
@SneakyThrows
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {

    Runnable saveToOutputFile = () -> {
        try (var out = new FileOutputStream(outputFile)) {
            pdDocument.save(out);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    };

    Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(saveToOutputFile);
}
/**
 * Gives the page an empty resources object, if it has none yet.
 */
private static void createPageResourcesIfNotPresent(PDPage pdPage) {

    if (pdPage.getResources() != null) {
        return;
    }
    pdPage.setResources(new PDResources());
}
/**
 * Maps the rotation of the page to a quadrant rotation: 90 gives 3 quadrants, 180 gives 2, 270 gives 1,
 * and every other value gives none.
 */
private static AffineTransform getTextDeRotationTransform(PDPage page) {

    int quadrants = switch (page.getRotation()) {
        case 90 -> 3;
        case 180 -> 2;
        case 270 -> 1;
        default -> 0;
    };
    return AffineTransform.getQuadrantRotateInstance(quadrants);
}
}

View File

@ -0,0 +1,124 @@
package com.knecon.fforesight.service.viewerdoc.service;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.File;
import java.nio.file.Files;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Tests the classification of content streams on PDFs that already contain viewer document layers.
 */
@Slf4j
public class ContentStreamClassifierTest {

    private static final String CURRENT_LAYERS_PDF = "viewerDocLayers.pdf";
    private static final String OLD_LAYERS_PDF = "oldViewerDocLayers.pdf";


    @Test
    @SneakyThrows
    public void testClassification() {

        assertEscapedContentFollowedByLayoutLayer(CURRENT_LAYERS_PDF);
    }


    @Test
    @SneakyThrows
    public void testRemoveLayoutLayer() {

        File pdfFile = resourceFile(CURRENT_LAYERS_PDF);
        File tmpFile = Files.createTempFile("removedLayout", ".pdf").toFile();

        try {
            try (PDDocument document = Loader.loadPDF(pdfFile)) {
                PDPage page = document.getPage(0);
                List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
                page.setContents(ContentStreamUtility.removeLayerFromContentStreams(Set.of(ContentStreams.KNECON_LAYOUT), classifieds));
                document.save(tmpFile);
            }

            try (PDDocument document2 = Loader.loadPDF(tmpFile)) {
                PDPage page2 = document2.getPage(0);
                List<ClassifiedContentStream> classifieds2 = ContentStreamClassifier.getClassifiedContentStreams(page2);

                logContentStreamClassifications(classifieds2);

                assertEquals(10, classifieds2.size());
                assertEscapedOriginalContent(classifieds2);
                assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds2));
            }
        } finally {
            // Clean up even if an assertion above fails; a statement inside "assert" would not run
            // at all when assertions are disabled.
            Files.deleteIfExists(tmpFile.toPath());
        }
    }


    @Test
    @SneakyThrows
    public void testClassificationForOldLayers() {

        assertEscapedContentFollowedByLayoutLayer(OLD_LAYERS_PDF);
    }


    /**
     * Asserts that the first page of the resource consists of the escaped original content followed by
     * exactly one layout layer.
     */
    @SneakyThrows
    private static void assertEscapedContentFollowedByLayoutLayer(String resourceName) {

        try (PDDocument document = Loader.loadPDF(resourceFile(resourceName))) {
            PDPage page = document.getPage(0);
            List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);

            logContentStreamClassifications(classifieds);

            assertEquals(11, classifieds.size());
            assertEscapedOriginalContent(classifieds);
            assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification());
            assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds));
        }
    }


    /**
     * Asserts that the first ten content streams are an escape start, eight other streams and an escape end.
     */
    private static void assertEscapedOriginalContent(List<ClassifiedContentStream> classifieds) {

        assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification());
        for (int i = 1; i < 9; i++) {
            assertEquals(ContentStreams.OTHER, classifieds.get(i).classification());
        }
        assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification());
    }


    /**
     * Resolves a test resource to a file. Going through the URI keeps paths with spaces or other
     * URL-encoded characters intact.
     */
    @SneakyThrows
    private static File resourceFile(String resourceName) {

        var url = ContentStreamClassifierTest.class.getClassLoader().getResource(resourceName);
        assertTrue(url != null, "missing test resource: " + resourceName);
        return new File(url.toURI());
    }


    private static void logContentStreamClassifications(List<ClassifiedContentStream> classifieds) {

        log.info("number of content streams: {}", classifieds.size());
        log.info("Classifications: {}", classifieds.stream()//
                .map(ClassifiedContentStream::classification)//
                .map(ContentStreams.Identifier::cosName)//
                .map(COSName::getName)//
                .collect(Collectors.joining(", ")));
    }

}

View File

@ -0,0 +1,16 @@
<!--
Logging configuration (the element structure looks like Log4j 2 - confirm against the file name):
everything is written to standard out through the CONSOLE appender; the root logger is set to warn,
loggers below com.knecon are set to info.
-->
<Configuration>
<Appenders>
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<Logger name="com.knecon" level="info"/>
</Loggers>
</Configuration>

View File

@ -1,13 +1,9 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
rootProject.name = "layoutparser"

// Projects that are part of this build.
include(":layoutparser-service-server")
include(":layoutparser-service-processor")
include(":layoutparser-service-internal-api")
include(":viewer-doc-processor")

// The project directories are set explicitly, since they are located below layoutparser-service/.
project(":layoutparser-service-server").projectDir = file("layoutparser-service/layoutparser-service-server")
project(":layoutparser-service-processor").projectDir = file("layoutparser-service/layoutparser-service-processor")
project(":layoutparser-service-internal-api").projectDir = file("layoutparser-service/layoutparser-service-internal-api")
project(":viewer-doc-processor").projectDir = file("layoutparser-service/viewer-doc-processor")