RED-8156: refactor ViewerDocumentService as a dependency for ocr-service
* various improvements to experimental parsing steps * added embed fonts functionality to viewer doc
This commit is contained in:
parent
1b4aaf4454
commit
23eb0c40a3
@ -10,6 +10,7 @@ val pdfBoxVersion = "3.0.0"
|
||||
|
||||
dependencies {
|
||||
implementation(project(":layoutparser-service-internal-api"))
|
||||
implementation(project(":viewer-doc-processor"))
|
||||
|
||||
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.144.0") {
|
||||
exclude("org.springframework.boot", "spring-boot-starter-security")
|
||||
|
||||
@ -51,7 +51,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
@ -84,7 +84,7 @@ public class LayoutParsingPipeline {
|
||||
TaasBlockificationService taasBlockificationService;
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
ViewerDocumentService viewerDocumentService;
|
||||
LayoutGridService layoutGridService;
|
||||
ObservationRegistry observationRegistry;
|
||||
|
||||
|
||||
@ -94,7 +94,7 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = File.createTempFile("viewer_document", ".pdf");
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
@ -111,25 +111,31 @@ public class LayoutParsingPipeline {
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
layoutParsingRequest.identifier().toString());
|
||||
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
Document documentGraph = observeBuildDocumentGraph(classificationDocument);
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, false);
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
if (!viewerDocumentFile.equals(originFile)) {
|
||||
viewerDocumentFile.delete();
|
||||
}
|
||||
originFile.delete();
|
||||
viewerDocumentFile.delete();
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
|
||||
@ -1,10 +1,24 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
|
||||
@Configuration
|
||||
@ComponentScan
|
||||
public class LayoutParsingServiceProcessorConfiguration {
|
||||
|
||||
|
||||
@Bean
|
||||
@Autowired
|
||||
public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
||||
|
||||
return new ViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
@ -36,13 +37,6 @@ public class LayoutParsingStorageService {
|
||||
private final StorageService storageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
|
||||
public PDDocument getOriginDocument(String storageId) throws IOException {
|
||||
|
||||
return Loader.loadPDF(getOriginFile(storageId));
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||
public File getOriginFile(String storageId) throws IOException {
|
||||
|
||||
@ -52,6 +46,18 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file")
|
||||
public Optional<File> getViewerDocFile(String storageId) throws IOException {
|
||||
|
||||
if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
|
||||
return Optional.empty();
|
||||
}
|
||||
File tempFile = createTempFile("viewerDocument", ".pdf");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
return Optional.of(tempFile);
|
||||
}
|
||||
|
||||
|
||||
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
@ -137,7 +143,6 @@ public class LayoutParsingStorageService {
|
||||
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
|
||||
|
||||
try (var in = new FileInputStream(out)) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,8 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
|
||||
/**
 * A line segment paired with the color it is stroked in.
 *
 * @param line  geometry of the segment
 * @param color stroking color
 */
public record ColoredLine(Line2D line, Color color) {

}
|
||||
@ -1,7 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
/**
 * A text label together with the point it is anchored to.
 *
 * @param text      the string to show
 * @param lineStart anchor point from which the renderer positions the text
 */
public record PlacedText(String text, Point2D lineStart) {

}
|
||||
@ -1,12 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService.LINE_WIDTH;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
@ -24,25 +23,59 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.LayoutGrid;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class LayoutGridService {
|
||||
|
||||
private static final Color INNER_LINES_COLOR = new Color(255, 175, 175);
|
||||
private static final Color PARAGRAPH_COLOR = new Color(70, 130, 180);
|
||||
public static final Color TABLE_COLOR = new Color(102, 205, 170);
|
||||
public static final Color SECTION_COLOR = new Color(50, 50, 50);
|
||||
public static final Color HEADLINE_COLOR = new Color(162, 56, 56);
|
||||
public static final Color HEADER_COLOR = new Color(171, 131, 6);
|
||||
public static final Color IMAGE_COLOR = new Color(253, 63, 146);
|
||||
ViewerDocumentService viewerDocumentService;
|
||||
|
||||
static float FONT_SIZE = 10f;
|
||||
static float LINE_WIDTH = 1f;
|
||||
static Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||
|
||||
static Color INNER_LINES_COLOR = new Color(255, 175, 175);
|
||||
static Color PARAGRAPH_COLOR = new Color(70, 130, 180);
|
||||
static Color TABLE_COLOR = new Color(102, 205, 170);
|
||||
static Color SECTION_COLOR = new Color(50, 50, 50);
|
||||
static Color HEADLINE_COLOR = new Color(162, 56, 56);
|
||||
static Color HEADER_COLOR = new Color(171, 131, 6);
|
||||
static Color IMAGE_COLOR = new Color(253, 63, 146);
|
||||
|
||||
|
||||
public LayoutGrid createLayoutGrid(Document document) {
|
||||
@SneakyThrows
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
LayoutGrid layoutGrid = createLayoutGrid(document);
|
||||
|
||||
viewerDocumentService.addVisualizationsOnPage(originFile,
|
||||
destinationFile,
|
||||
Visualizations.builder()
|
||||
.layer(ContentStreams.KNECON_LAYOUT)
|
||||
.visualizationsOnPages(layoutGrid.getVisualizationsPerPages())
|
||||
.layerVisibilityDefaultValue(layerVisibilityDefaultValue)
|
||||
.build());
|
||||
}
|
||||
|
||||
|
||||
private LayoutGrid createLayoutGrid(Document document) {
|
||||
|
||||
LayoutGrid layoutGrid = new LayoutGrid(document.getNumberOfPages());
|
||||
document.streamAllSubNodes().forEach(semanticNode -> {
|
||||
@ -103,11 +136,11 @@ public class LayoutGridService {
|
||||
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines();
|
||||
xs.forEach(x -> {
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
|
||||
});
|
||||
ys.forEach(y -> {
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -135,9 +168,9 @@ public class LayoutGridService {
|
||||
List<Line2D> lines = createLinesFromRectangle(r, firstPage.getRotation());
|
||||
// add string to top line
|
||||
var firstLine = lines.remove(0);
|
||||
coloredLines.add(new ColoredLine(firstLine, color));
|
||||
coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH));
|
||||
for (Line2D line : lines) {
|
||||
coloredLines.add(new ColoredLine(line, color));
|
||||
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
return;
|
||||
}
|
||||
@ -152,6 +185,7 @@ public class LayoutGridService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) {
|
||||
|
||||
Point2D.Float upperLeftCorner = switch (page.getRotation()) {
|
||||
@ -161,7 +195,8 @@ public class LayoutGridService {
|
||||
default -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMaxY());
|
||||
};
|
||||
var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts();
|
||||
placedTexts.add(new PlacedText(s, upperLeftCorner));
|
||||
upperLeftCorner.setLocation(upperLeftCorner.getX() - ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), upperLeftCorner.getY() - FONT_SIZE);
|
||||
placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT));
|
||||
}
|
||||
|
||||
|
||||
@ -176,9 +211,9 @@ public class LayoutGridService {
|
||||
midPageLines.remove(1);
|
||||
// add string to left line
|
||||
var leftLine = midPageLines.remove(1);
|
||||
coloredLines.add(new ColoredLine(leftLine, color));
|
||||
coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH));
|
||||
for (Line2D line : midPageLines) {
|
||||
coloredLines.add(new ColoredLine(line, color));
|
||||
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
@ -192,9 +227,9 @@ public class LayoutGridService {
|
||||
lastPageLines.remove(0);
|
||||
// add string to left line
|
||||
var leftLine = lastPageLines.remove(2);
|
||||
coloredLines.add(new ColoredLine(leftLine, color));
|
||||
coloredLines.add(new ColoredLine(leftLine, color, LINE_WIDTH));
|
||||
for (Line2D line : lastPageLines) {
|
||||
coloredLines.add(new ColoredLine(line, color));
|
||||
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
@ -208,9 +243,9 @@ public class LayoutGridService {
|
||||
firstPageLines.remove(2);
|
||||
// add string to top line
|
||||
var firstLine = firstPageLines.remove(0);
|
||||
coloredLines.add(new ColoredLine(firstLine, color));
|
||||
coloredLines.add(new ColoredLine(firstLine, color, LINE_WIDTH));
|
||||
for (Line2D line : firstPageLines) {
|
||||
coloredLines.add(new ColoredLine(line, color));
|
||||
coloredLines.add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
@ -276,7 +311,10 @@ public class LayoutGridService {
|
||||
private void addAsRectangle(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
|
||||
|
||||
semanticNode.getBBox()
|
||||
.forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredRectangles().add(new ColoredRectangle(textBBox, color)));
|
||||
.forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages()
|
||||
.get(page.getNumber() - 1)
|
||||
.getColoredRectangles()
|
||||
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,217 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ViewerDocumentService {
|
||||
|
||||
private static final String LAYER_NAME = "Layout grid";
|
||||
private static final int FONT_SIZE = 10;
|
||||
public static final float LINE_WIDTH = 1f;
|
||||
|
||||
private final LayoutGridService layoutGridService;
|
||||
private final ObservationRegistry observationRegistry;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||
public void createViewerDocument(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
|
||||
PDDocument pdDocument = openPDDocument(originFile);
|
||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||
|
||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
|
||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||
|
||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber);
|
||||
//
|
||||
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);
|
||||
addLayerToPageResources(pdPage);
|
||||
|
||||
// We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
|
||||
// e.g. not escaped matrix transformations.
|
||||
escapePreviousContents(pdDocument, pdPage);
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages().get(pageNumber);
|
||||
assert pageNumber == visualizationsOnPage.getPageNumber();
|
||||
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
contentStream.beginMarkedContent(COSName.OC, layer);
|
||||
contentStream.saveGraphicsState();
|
||||
|
||||
contentStream.setLineWidth(LINE_WIDTH);
|
||||
for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
|
||||
contentStream.setStrokingColor(coloredLine.color());
|
||||
contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1());
|
||||
contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
|
||||
contentStream.stroke();
|
||||
}
|
||||
for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
|
||||
contentStream.setStrokingColor(coloredRectangle.color());
|
||||
Rectangle2D r = coloredRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.stroke();
|
||||
}
|
||||
for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
|
||||
contentStream.setNonStrokingColor(filledRectangle.color());
|
||||
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
|
||||
graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
|
||||
contentStream.setGraphicsStateParameters(graphicsState);
|
||||
Rectangle2D r = filledRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.fill();
|
||||
}
|
||||
for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
|
||||
contentStream.setFont(font, FONT_SIZE);
|
||||
contentStream.beginText();
|
||||
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||
(float) textDeRotationMatrix.getShearX(),
|
||||
(float) textDeRotationMatrix.getShearY(),
|
||||
(float) textDeRotationMatrix.getScaleY(),
|
||||
(float) placedText.lineStart().getX(),
|
||||
(float) placedText.lineStart().getY());
|
||||
textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
||||
contentStream.setTextMatrix(textMatrix);
|
||||
contentStream.showText(placedText.text());
|
||||
contentStream.endText();
|
||||
}
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
|
||||
if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
|
||||
log.info("Incremental save after {} pages", pageNumber);
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
pdDocument.close();
|
||||
Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
pdDocument = openPDDocument(tmpFile.toFile());
|
||||
layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
|
||||
}
|
||||
|
||||
}
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
|
||||
tmpFile.toFile().delete();
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
|
||||
private static PDDocument openPDDocument(File tmpFile) throws IOException {
|
||||
|
||||
PDDocument pdDocument;
|
||||
pdDocument = Loader.loadPDF(tmpFile);
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
return pdDocument;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
||||
|
||||
Observation.createNotStarted("ViewerDocumentService", observationRegistry).contextualName("incremental-save").observe(() -> {
|
||||
try {
|
||||
pdDocument.save(outputFile);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static void addLayerToPageResources(PDPage pdPage) {
|
||||
|
||||
PDResources resources = pdPage.getResources();
|
||||
if (resources == null) {
|
||||
resources = new PDResources();
|
||||
pdPage.setResources(resources);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void escapePreviousContents(PDDocument pdDocument, PDPage pdPage) throws IOException {
|
||||
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
|
||||
contentStream.saveGraphicsState();
|
||||
}
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
|
||||
contentStream.restoreGraphicsState();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||
if (ocprops == null) {
|
||||
ocprops = new PDOptionalContentProperties();
|
||||
catalog.setOCProperties(ocprops);
|
||||
}
|
||||
PDOptionalContentGroup layer = null;
|
||||
if (ocprops.hasGroup(LAYER_NAME)) {
|
||||
layer = ocprops.getGroup(LAYER_NAME);
|
||||
} else {
|
||||
layer = new PDOptionalContentGroup(LAYER_NAME);
|
||||
ocprops.addGroup(layer);
|
||||
}
|
||||
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
||||
// dictionariesToUpdate.add(catalog.getCOSObject());
|
||||
return layer;
|
||||
}
|
||||
|
||||
|
||||
private static AffineTransform getTextDeRotationTransform(PDPage page) {
|
||||
|
||||
return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
|
||||
case 90 -> 3;
|
||||
case 180 -> 2;
|
||||
case 270 -> 1;
|
||||
default -> 0;
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@ -39,6 +39,7 @@ dependencies {
|
||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||
|
||||
// for integration testing only
|
||||
testImplementation(project(":viewer-doc-processor"))
|
||||
testImplementation(project(":layoutparser-service-internal-api"))
|
||||
|
||||
testImplementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
||||
|
||||
@ -6,11 +6,15 @@ import java.nio.file.Path;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -22,11 +26,35 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
viewerDocumentService.createViewerDocument(documentFile, document, new File(tmpFileName), true);
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
long start = System.currentTimeMillis();
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocumentWithTables() {
|
||||
|
||||
String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
|
||||
String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var mapper = ObjectMapperFactory.create();
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
19
layoutparser-service/viewer-doc-processor/build.gradle
Normal file
19
layoutparser-service/viewer-doc-processor/build.gradle
Normal file
@ -0,0 +1,19 @@
|
||||
plugins {
|
||||
id("com.knecon.fforesight.java-conventions")
|
||||
id("io.freefair.lombok") version "8.2.2"
|
||||
}
|
||||
|
||||
description = "Library for adding/removing layers in the viewer document"
|
||||
|
||||
var pdfBoxVersion = "3.0.0"
|
||||
|
||||
dependencies {
|
||||
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
implementation("org.slf4j:slf4j-api:1.7.25")
|
||||
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
|
||||
|
||||
testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter")
|
||||
testImplementation platform('org.junit:junit-bom:5.10.0')
|
||||
}
|
||||
@ -0,0 +1,33 @@
|
||||
package com.knecon.fforesight.service.viewerdoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PUBLIC)
|
||||
public class ContentStreams {
|
||||
|
||||
public static Identifier KNECON_LAYOUT = new Identifier("Layout grid", COSName.getPDFName("KNECON_LAYOUT"), true);
|
||||
|
||||
public static Identifier KNECON_OCR = new Identifier("OCR", COSName.getPDFName("KNECON_OCR"), false);
|
||||
|
||||
public static Identifier KNECON_OCR_TEXT_DEBUG = new Identifier("OCR Text", COSName.getPDFName("KNECON_OCR_TEXT_DEBUG"), true);
|
||||
|
||||
public static Identifier KNECON_OCR_BBOX_DEBUG = new Identifier("OCR Boxes", COSName.getPDFName("KNECON_OCR_BBOX_DEBUG"), true);
|
||||
|
||||
public static Identifier OTHER = new Identifier("other", COSName.getPDFName("OTHER"), false);
|
||||
|
||||
public static Identifier ESCAPE_START = new Identifier("escape start", COSName.getPDFName("ESCAPE_START"), false);
|
||||
|
||||
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
||||
|
||||
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT, KNECON_OCR, KNECON_OCR_BBOX_DEBUG, KNECON_OCR_TEXT_DEBUG, OTHER, ESCAPE_START, ESCAPE_END);
|
||||
|
||||
public record Identifier(String name, COSName cosName, boolean optionalContent) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
|
||||
/**
 * A line together with the color and the line width it is stroked with.
 *
 * @param line      geometry of the line
 * @param color     stroking color
 * @param lineWidth width of the stroke
 */
public record ColoredLine(
        Line2D line,
        Color color,
        float lineWidth) {

}
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
public record ColoredRectangle(Rectangle2D rectangle2D, Color color) {
|
||||
public record ColoredRectangle(Rectangle2D rectangle2D, Color color, float lineWidth) {
|
||||
|
||||
}
|
||||
@ -0,0 +1,10 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
public interface EmbeddableFont {
|
||||
|
||||
PDFont embed(PDDocument document);
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -1,7 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
@ -12,15 +12,15 @@ import lombok.experimental.FieldDefaults;
|
||||
public class LayoutGrid {
|
||||
|
||||
int numberOfPages;
|
||||
List<VisualizationsOnPage> visualizationsPerPages;
|
||||
Map<Integer, VisualizationsOnPage> visualizationsPerPages;
|
||||
|
||||
|
||||
public LayoutGrid(int numberOfPages) {
|
||||
|
||||
this.numberOfPages = numberOfPages;
|
||||
this.visualizationsPerPages = new ArrayList<>(numberOfPages);
|
||||
this.visualizationsPerPages = new HashMap<>();
|
||||
for (int i = 0; i < numberOfPages; i++) {
|
||||
this.visualizationsPerPages.add(VisualizationsOnPage.builder().pageNumber(i).build());
|
||||
this.visualizationsPerPages.put(i, VisualizationsOnPage.builder().build());
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,10 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
|
||||
public record OperatorWithArguments(Operator operator, List<COSBase> arguments) {
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional<Matrix> textMatrix, Optional<RenderingMode> renderingMode) {
|
||||
|
||||
public static PlacedText textFacingUp(String text, Point2D lineStart, float fontSize, Color color, EmbeddableFont font) {
|
||||
|
||||
return new PlacedText(text, lineStart, color, fontSize, font, Optional.empty(), Optional.empty());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,37 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class Standard14EmbeddableFont implements EmbeddableFont {
|
||||
|
||||
private final PDType1Font font;
|
||||
|
||||
|
||||
public static Standard14EmbeddableFont helvetica() {
|
||||
|
||||
return new Standard14EmbeddableFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public float getStringWidth(String text) {
|
||||
|
||||
return font.getStringWidth(text);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PDFont embed(PDDocument document) {
|
||||
|
||||
// no need to embed anything
|
||||
return font;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,23 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Visualizations {
|
||||
|
||||
ContentStreams.Identifier layer;
|
||||
Map<Integer, VisualizationsOnPage> visualizationsOnPages;
|
||||
boolean layerVisibilityDefaultValue;
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.visualization;
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -13,7 +13,8 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class VisualizationsOnPage {
|
||||
|
||||
int pageNumber;
|
||||
@Builder.Default
|
||||
boolean makePathsInvisible = false;
|
||||
@Builder.Default
|
||||
List<PlacedText> placedTexts = new LinkedList<>();
|
||||
@Builder.Default
|
||||
@ -0,0 +1,7 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.pdf;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
|
||||
public record ClassifiedContentStream(SinglePDContentStream contentStream, ContentStreams.Identifier classification) {
|
||||
|
||||
}
|
||||
@ -0,0 +1,61 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.pdf;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDContentStream;
|
||||
import org.apache.pdfbox.io.RandomAccessInputStream;
|
||||
import org.apache.pdfbox.io.RandomAccessRead;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SinglePDContentStream implements PDContentStream {
|
||||
|
||||
PDStream pdStream;
|
||||
|
||||
|
||||
@Override
|
||||
public InputStream getContents() throws IOException {
|
||||
|
||||
return new RandomAccessInputStream(getContentsForRandomAccess());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public RandomAccessRead getContentsForRandomAccess() throws IOException {
|
||||
|
||||
return pdStream.getCOSObject().createView();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PDResources getResources() {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PDRectangle getBBox() {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Matrix getMatrix() {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,121 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDContentStream;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ContentStreamClassifier {
|
||||
|
||||
public List<ClassifiedContentStream> getClassifiedContentStreams(PDPage page) {
|
||||
|
||||
List<SinglePDContentStream> streams = new LinkedList<>();
|
||||
page.getContentStreams().forEachRemaining(stream -> streams.add(new SinglePDContentStream(stream)));
|
||||
return ContentStreamClassifier.classifySingleContentStreams(page, streams);
|
||||
}
|
||||
|
||||
|
||||
public List<ClassifiedContentStream> classifySingleContentStreams(PDPage page, List<SinglePDContentStream> streams) {
|
||||
|
||||
return streams.stream().map(singlePDContentStream -> classifySingleContentStream(page, singlePDContentStream)).toList();
|
||||
}
|
||||
|
||||
|
||||
private ClassifiedContentStream classifySingleContentStream(PDPage page, SinglePDContentStream singlePDContentStream) {
|
||||
|
||||
ContentStreams.Identifier classification = classifyContentStream(singlePDContentStream, page);
|
||||
return new ClassifiedContentStream(singlePDContentStream, classification);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* We assume all of our layers are written escaped, so only unknown content streams need to be escaped.
|
||||
*
|
||||
* @param classifiers List of all content streams of a page with their classification
|
||||
* @return false, if any content stream with classification other is not prefixed with an ESCAPE_START and suffixed with an ESCAPE_END
|
||||
*/
|
||||
public boolean areAllContentStreamsEscaped(List<ClassifiedContentStream> classifiers) {
|
||||
|
||||
int escapeDepth = 0;
|
||||
for (ClassifiedContentStream classifier : classifiers) {
|
||||
if (classifier.classification().equals(ContentStreams.OTHER) && escapeDepth == 0) {
|
||||
return false;
|
||||
}
|
||||
if (classifier.classification().equals(ContentStreams.ESCAPE_START)) {
|
||||
escapeDepth++;
|
||||
}
|
||||
if (classifier.classification().equals(ContentStreams.ESCAPE_END)) {
|
||||
escapeDepth--;
|
||||
}
|
||||
}
|
||||
return escapeDepth == 0;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ContentStreams.Identifier classifyContentStream(PDContentStream contentStream, PDPage page) {
|
||||
|
||||
List<OperatorWithArguments> operatorsWithArguments = ContentStreamUtility.parseLeadingOperators(contentStream, 2);
|
||||
if (operatorsWithArguments.isEmpty()) {
|
||||
return ContentStreams.OTHER;
|
||||
}
|
||||
OperatorWithArguments firstOperator = operatorsWithArguments.get(0);
|
||||
|
||||
// If we wrap the content streams we append and prepend a content stream with exactly one operator "q" or "Q".
|
||||
if (operatorsWithArguments.size() == 1) {
|
||||
if (firstOperator.operator().getName().equals(OperatorName.SAVE)) {
|
||||
return ContentStreams.ESCAPE_START;
|
||||
}
|
||||
if (firstOperator.operator().getName().equals(OperatorName.RESTORE)) {
|
||||
return ContentStreams.ESCAPE_END;
|
||||
}
|
||||
}
|
||||
|
||||
// In previous versions we did not set a marked content with an explicit name. Instead, we wrote an optional content group (OCG) with the name "Layout grid".
|
||||
// This OCG is then assigned a COSName by PDFBox. Usually its "oc1".
|
||||
// Thus, in order to find this name we need to look in the page resources to find the COSName assigned to the OCG.
|
||||
// This COSName can then be found as an argument for the first operator in the content stream.
|
||||
if (firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT_SEQ)) {
|
||||
Optional<COSName> layoutGridOCGName = ContentStreamUtility.findLayoutGridOCGName(page);
|
||||
if (layoutGridOCGName.isPresent()) {
|
||||
if (arumentsContainLayoutGridOCG(firstOperator, layoutGridOCGName.get())) {
|
||||
return ContentStreams.KNECON_LAYOUT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT)) {
|
||||
return ContentStreams.OTHER;
|
||||
}
|
||||
|
||||
Optional<COSName> firstCOSNameFromArguments = firstOperator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).findFirst();
|
||||
|
||||
if (firstCOSNameFromArguments.isEmpty()) {
|
||||
return ContentStreams.OTHER;
|
||||
}
|
||||
|
||||
var cosName = firstCOSNameFromArguments.get();
|
||||
|
||||
return ContentStreams.allContentStreams.stream().filter(identifier -> identifier.cosName().equals(cosName)).findAny().orElse(ContentStreams.OTHER);
|
||||
}
|
||||
|
||||
|
||||
private static boolean arumentsContainLayoutGridOCG(OperatorWithArguments operator, COSName layoutGridOCGName) {
|
||||
|
||||
return operator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).anyMatch(cosName -> cosName.equals(layoutGridOCGName));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,78 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDContentStream;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdfparser.PDFStreamParser;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ContentStreamUtility {
|
||||
|
||||
public static List<OperatorWithArguments> parseLeadingOperators(PDContentStream contentStream,
|
||||
int numberOfOperatorsToRead) throws IOException {
|
||||
|
||||
List<COSBase> arguments = new ArrayList<>();
|
||||
PDFStreamParser parser = new PDFStreamParser(contentStream);
|
||||
List<OperatorWithArguments> operatorsWithArguments = new LinkedList<>();
|
||||
for (int i = 0; i < numberOfOperatorsToRead; ) {
|
||||
Object token = parser.parseNextToken();
|
||||
if (token == null) {
|
||||
break;
|
||||
}
|
||||
if (token instanceof Operator operator) {
|
||||
operatorsWithArguments.add(new OperatorWithArguments(operator, arguments));
|
||||
arguments = new ArrayList<>();
|
||||
i++;
|
||||
} else {
|
||||
arguments.add((COSBase) token);
|
||||
}
|
||||
|
||||
}
|
||||
return operatorsWithArguments;
|
||||
}
|
||||
|
||||
|
||||
public static Optional<COSName> findLayoutGridOCGName(PDPage page) {
|
||||
|
||||
Optional<COSName> layoutGridOCGName = Optional.empty();
|
||||
var resourceIterator = page.getResources().getPropertiesNames();
|
||||
for (COSName cosName : resourceIterator) {
|
||||
COSBase cosBase = page.getResources().getProperties(cosName).getCOSObject().getDictionaryObject(COSName.NAME);
|
||||
if (cosBase instanceof COSString string) {
|
||||
if (ContentStreams.KNECON_LAYOUT.name().equals(string.getString())) {
|
||||
layoutGridOCGName = Optional.of(cosName);
|
||||
}
|
||||
}
|
||||
}
|
||||
return layoutGridOCGName;
|
||||
}
|
||||
|
||||
|
||||
public static List<PDStream> removeLayerFromContentStreams(Set<ContentStreams.Identifier> layers, List<ClassifiedContentStream> classifiers) {
|
||||
|
||||
return classifiers.stream()
|
||||
.filter(classifiedContentStream -> !layers.contains(classifiedContentStream.classification()))
|
||||
.map(ClassifiedContentStream::contentStream)
|
||||
.map(SinglePDContentStream::getPdStream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,316 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class ViewerDocumentService {
|
||||
|
||||
private final ObservationRegistry registry;
|
||||
|
||||
|
||||
public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) {
|
||||
|
||||
addVisualizationsOnPage(originFile, destinationFile, List.of(visualizations));
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
|
||||
@SneakyThrows
|
||||
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
|
||||
|
||||
// originFile and destinationFile might be the same, so we use a temp file.
|
||||
// Otherwise, saving the document might corrupt the file
|
||||
Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
|
||||
Files.copy(originFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
PDDocument pdDocument = openPDDocument(tmpFile.toFile());
|
||||
|
||||
enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList());
|
||||
|
||||
Set<ContentStreams.Identifier> allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet());
|
||||
|
||||
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
|
||||
|
||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber);
|
||||
createPageResourcesIfNotPresent(pdPage); // needed for optionalContentGroups
|
||||
|
||||
List<ClassifiedContentStream> classifiers = ContentStreamClassifier.getClassifiedContentStreams(pdPage);
|
||||
|
||||
pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers));
|
||||
|
||||
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);
|
||||
|
||||
if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) {
|
||||
// We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
|
||||
// e.g. not escaped matrix transformations.
|
||||
wrapContentStreams(pdDocument, pdPage);
|
||||
}
|
||||
|
||||
for (Visualizations visualization : visualizations) {
|
||||
if (!visualization.getVisualizationsOnPages().containsKey(pageNumber)) {
|
||||
continue;
|
||||
}
|
||||
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
contentStream.beginMarkedContent(visualization.getLayer().cosName());
|
||||
|
||||
if (optionalContentGroupMap.containsKey(visualization.getLayer())) {
|
||||
contentStream.beginMarkedContent(COSName.OC, optionalContentGroupMap.get(visualization.getLayer()));
|
||||
}
|
||||
|
||||
contentStream.saveGraphicsState();
|
||||
|
||||
drawVisualizationsToContentStream(pdDocument, visualization.getVisualizationsOnPages().get(pageNumber), contentStream, textDeRotationMatrix);
|
||||
|
||||
contentStream.restoreGraphicsState();
|
||||
|
||||
if (optionalContentGroupMap.containsKey(visualization.getLayer())) {
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
|
||||
}
|
||||
if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
|
||||
log.info("Incremental save after {}/{} pages", pageNumber, pdDocument.getNumberOfPages());
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
pdDocument.close();
|
||||
Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
pdDocument = openPDDocument(tmpFile.toFile());
|
||||
}
|
||||
}
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
|
||||
pdDocument.close();
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
|
||||
private static Map<ContentStreams.Identifier, PDOptionalContentGroup> addLayersToDocument(List<Visualizations> visualizations, PDDocument pdDocument) {
|
||||
|
||||
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = new HashMap<>();
|
||||
for (Visualizations visualization : visualizations) {
|
||||
addLayerToDocument(visualization.getLayer(), pdDocument, visualization.isLayerVisibilityDefaultValue())//
|
||||
.ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg));
|
||||
}
|
||||
return optionalContentGroupMap;
|
||||
}
|
||||
|
||||
|
||||
private static void drawVisualizationsToContentStream(PDDocument pdDocument,
|
||||
VisualizationsOnPage visualizationsOnPage,
|
||||
PDPageContentStream contentStream,
|
||||
AffineTransform textDeRotationMatrix) throws IOException {
|
||||
|
||||
if (visualizationsOnPage.isMakePathsInvisible()) {
|
||||
contentStream.addRect(0, 0, 1, 1);
|
||||
contentStream.clip();
|
||||
}
|
||||
|
||||
for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
|
||||
contentStream.setLineWidth(coloredLine.lineWidth());
|
||||
contentStream.setStrokingColor(coloredLine.color());
|
||||
contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1());
|
||||
contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
|
||||
contentStream.stroke();
|
||||
}
|
||||
|
||||
for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
|
||||
contentStream.setLineWidth(coloredRectangle.lineWidth());
|
||||
contentStream.setStrokingColor(coloredRectangle.color());
|
||||
Rectangle2D r = coloredRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.stroke();
|
||||
}
|
||||
|
||||
for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
|
||||
contentStream.setNonStrokingColor(filledRectangle.color());
|
||||
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
|
||||
graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
|
||||
contentStream.setGraphicsStateParameters(graphicsState);
|
||||
Rectangle2D r = filledRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.fill();
|
||||
}
|
||||
|
||||
for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
|
||||
PDFont font = placedText.font().embed(pdDocument);
|
||||
contentStream.setFont(font, placedText.fontSize());
|
||||
contentStream.beginText();
|
||||
contentStream.setNonStrokingColor(placedText.color());
|
||||
if (placedText.renderingMode().isPresent()) {
|
||||
contentStream.setRenderingMode(placedText.renderingMode().get());
|
||||
} else {
|
||||
contentStream.setRenderingMode(RenderingMode.FILL);
|
||||
}
|
||||
Matrix textMatrix = getTextMatrix(placedText, textDeRotationMatrix);
|
||||
contentStream.setTextMatrix(textMatrix);
|
||||
contentStream.showText(placedText.text());
|
||||
contentStream.endText();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void enrichObservation(PDDocument pdDocument, List<ContentStreams.Identifier> layers) {
|
||||
|
||||
if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) {
|
||||
return;
|
||||
}
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(pdDocument.getNumberOfPages()));
|
||||
for (int i = 0; i < layers.size(); i++) {
|
||||
ContentStreams.Identifier layer = layers.get(i);
|
||||
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void wrapContentStreams(PDDocument pdDocument, PDPage pdPage) throws IOException {
|
||||
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
|
||||
contentStream.saveGraphicsState();
|
||||
}
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
|
||||
contentStream.restoreGraphicsState();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) {
|
||||
|
||||
Matrix textMatrix;
|
||||
if (placedText.textMatrix().isEmpty()) {
|
||||
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||
(float) textDeRotationMatrix.getShearX(),
|
||||
(float) textDeRotationMatrix.getShearY(),
|
||||
(float) textDeRotationMatrix.getScaleY(),
|
||||
(float) placedText.lineStart().getX(),
|
||||
(float) placedText.lineStart().getY());
|
||||
} else {
|
||||
textMatrix = placedText.textMatrix().get();
|
||||
}
|
||||
return textMatrix;
|
||||
}
|
||||
|
||||
|
||||
private static Optional<PDOptionalContentGroup> addLayerToDocument(ContentStreams.Identifier layer, PDDocument pdDocument, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
if (layer.optionalContent()) {
|
||||
return Optional.of(addLayerToDocument(pdDocument, layer.name(), layerVisibilityDefaultValue));
|
||||
|
||||
}
|
||||
return Optional.empty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, String layerName, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||
if (ocprops == null) {
|
||||
ocprops = new PDOptionalContentProperties();
|
||||
catalog.setOCProperties(ocprops);
|
||||
}
|
||||
PDOptionalContentGroup layer = null;
|
||||
if (ocprops.hasGroup(layerName)) {
|
||||
layer = ocprops.getGroup(layerName);
|
||||
} else {
|
||||
layer = new PDOptionalContentGroup(layerName);
|
||||
ocprops.addGroup(layer);
|
||||
}
|
||||
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
||||
return layer;
|
||||
}
|
||||
|
||||
|
||||
private static PDDocument openPDDocument(File tmpFile) throws IOException {
|
||||
|
||||
PDDocument pdDocument;
|
||||
pdDocument = Loader.loadPDF(tmpFile);
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
return pdDocument;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
||||
|
||||
Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> {
|
||||
try (var out = new FileOutputStream(outputFile)) {
|
||||
pdDocument.save(out);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private static void createPageResourcesIfNotPresent(PDPage pdPage) {
|
||||
|
||||
PDResources resources = pdPage.getResources();
|
||||
if (resources == null) {
|
||||
resources = new PDResources();
|
||||
pdPage.setResources(resources);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static AffineTransform getTextDeRotationTransform(PDPage page) {
|
||||
|
||||
return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
|
||||
case 90 -> 3;
|
||||
case 180 -> 2;
|
||||
case 270 -> 1;
|
||||
default -> 0;
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,124 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class ContentStreamClassifierTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testClassification() {
|
||||
|
||||
File pdfFile = new File(this.getClass().getClassLoader().getResource("viewerDocLayers.pdf").getFile());
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||||
|
||||
PDPage page = document.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
|
||||
|
||||
logContentStreamClassifications(classifieds);
|
||||
|
||||
assertEquals(11, classifieds.size());
|
||||
assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification());
|
||||
for (int i = 1; i < 9; i++) {
|
||||
assertEquals(ContentStreams.OTHER, classifieds.get(i).classification());
|
||||
}
|
||||
assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification());
|
||||
assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification());
|
||||
assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testRemoveLayoutLayer() {
|
||||
|
||||
File pdfFile = new File(this.getClass().getClassLoader().getResource("viewerDocLayers.pdf").getFile());
|
||||
File tmpFile = Files.createTempFile("removedLayout", ".pdf").toFile();
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||||
|
||||
PDPage page = document.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
|
||||
page.setContents(ContentStreamUtility.removeLayerFromContentStreams(Set.of(ContentStreams.KNECON_LAYOUT), classifieds));
|
||||
|
||||
document.save(tmpFile);
|
||||
}
|
||||
try (PDDocument document2 = Loader.loadPDF(tmpFile)) {
|
||||
|
||||
PDPage page2 = document2.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds2 = ContentStreamClassifier.getClassifiedContentStreams(page2);
|
||||
|
||||
logContentStreamClassifications(classifieds2);
|
||||
|
||||
assertEquals(10, classifieds2.size());
|
||||
assertEquals(ContentStreams.ESCAPE_START, classifieds2.get(0).classification());
|
||||
for (int i = 1; i < 9; i++) {
|
||||
assertEquals(ContentStreams.OTHER, classifieds2.get(i).classification());
|
||||
}
|
||||
assertEquals(ContentStreams.ESCAPE_END, classifieds2.get(9).classification());
|
||||
assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds2));
|
||||
}
|
||||
assert tmpFile.delete();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testClassificationForOldLayers() {
|
||||
|
||||
File pdfFile = new File(this.getClass().getClassLoader().getResource("oldViewerDocLayers.pdf").getFile());
|
||||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||||
|
||||
PDPage page = document.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
|
||||
|
||||
logContentStreamClassifications(classifieds);
|
||||
|
||||
assertEquals(11, classifieds.size());
|
||||
assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification());
|
||||
for (int i = 1; i < 9; i++) {
|
||||
assertEquals(ContentStreams.OTHER, classifieds.get(i).classification());
|
||||
}
|
||||
assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification());
|
||||
assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification());
|
||||
assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void logContentStreamClassifications(List<ClassifiedContentStream> classifieds) {
|
||||
|
||||
log.info("number of content streams: {}", classifieds.size());
|
||||
log.info("Classifications: {}", classifieds.stream()//
|
||||
.map(ClassifiedContentStream::classification)//
|
||||
.map(ContentStreams.Identifier::cosName)//
|
||||
.map(COSName::getName)//
|
||||
.collect(Collectors.joining(", ")));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
<Configuration>
|
||||
|
||||
<Appenders>
|
||||
<Console name="CONSOLE" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<Root level="warn">
|
||||
<AppenderRef ref="CONSOLE"/>
|
||||
</Root>
|
||||
<Logger name="com.knecon" level="info"/>
|
||||
</Loggers>
|
||||
|
||||
</Configuration>
|
||||
Binary file not shown.
Binary file not shown.
@ -1,13 +1,9 @@
|
||||
/*
|
||||
* This file was generated by the Gradle 'init' task.
|
||||
*
|
||||
* This project uses @Incubating APIs which are subject to change.
|
||||
*/
|
||||
|
||||
// Name of the root build.
rootProject.name = "layoutparser"

// Modules of the build. Every module is referenced by its absolute project path (leading colon),
// matching the project(":...") lookups below.
include(":layoutparser-service-server")
include(":layoutparser-service-processor")
include(":layoutparser-service-internal-api")
include(":viewer-doc-processor")

// The module directories are located under layoutparser-service/ instead of directly below the
// root directory, so each project directory is set explicitly.
project(":layoutparser-service-server").projectDir = file("layoutparser-service/layoutparser-service-server")
project(":layoutparser-service-processor").projectDir = file("layoutparser-service/layoutparser-service-processor")
project(":layoutparser-service-internal-api").projectDir = file("layoutparser-service/layoutparser-service-internal-api")
project(":viewer-doc-processor").projectDir = file("layoutparser-service/viewer-doc-processor")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user