Merge branch 'RED-9353' into 'main'

RED-9353: refactor PDFTronViewerDocumentService

See merge request fforesight/layout-parser!178
This commit is contained in:
Kilian Schüttler 2024-07-15 12:54:17 +02:00
commit 033279e261
78 changed files with 2130 additions and 2078 deletions

View File

@ -116,29 +116,14 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
// File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = originFile;
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
}
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
@ -151,16 +136,12 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile,
documentGraph,
viewerDocumentFile,
false,
layoutParsingRequest.visualLayoutParsingFileId()
.isPresent());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
@ -249,10 +230,11 @@ public class LayoutParsingPipeline {
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument();
if (settings.isDebug() || identifier.containsKey("debug")) {
classificationDocument.getVisualizations().setActive(true);
classificationDocument.getLayoutDebugLayer().setActive(true);
}
List<ClassificationPage> classificationPages = new ArrayList<>();
@ -290,7 +272,7 @@ public class LayoutParsingPipeline {
}
stripper.getText(originDocument);
List<TextPositionSequence> words = stripper.getTextPositionSequences();
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox();
@ -298,32 +280,34 @@ public class LayoutParsingPipeline {
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
PDRectangle cropbox = pdPage.getCropBox();
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
classificationDocument.getLayoutDebugLayer().addRulingVisualization(stripper.getRulings(), pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(),
false);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber(), ""))
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
ImageType.GRAPHIC,
false,
stripper.getPageNumber(),
""))
.toList());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getLayoutDebugLayer());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
};
classificationPage.setCleanRulings(cleanRulings);
@ -347,7 +331,7 @@ public class LayoutParsingPipeline {
}
}
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
@ -378,7 +362,7 @@ public class LayoutParsingPipeline {
log.info("Calculating BodyTextFrame for {}", identifier);
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
}
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {

View File

@ -5,10 +5,7 @@ import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import com.google.common.base.Strings;
import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.pdftron.PDFTronViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
@ -18,14 +15,9 @@ public class LayoutParsingServiceProcessorConfiguration {
@Bean
@Autowired
public IViewerDocumentService viewerDocumentService(ObservationRegistry registry, LayoutparserSettings settings) {
if (!Strings.isNullOrEmpty(settings.getPdftronLicense())) {
return new PDFTronViewerDocumentService(registry);
} else {
return new ViewerDocumentService(registry);
}
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
return new PDFTronViewerDocumentService(registry);
}
}

View File

@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
import com.knecon.fforesight.tenantcommons.TenantContext;
import io.micrometer.observation.annotation.Observed;
@ -36,6 +37,7 @@ public class LayoutParsingStorageService {
private final StorageService storageService;
private final ObjectMapper objectMapper;
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
public File getOriginFile(String storageId) throws IOException {
@ -53,11 +55,18 @@ public class LayoutParsingStorageService {
}
File tempFile = createTempFile("viewerDocument", ".pdf");
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
    // The downloaded viewer document is not the current version: discard it and report "absent".
    // The deletion must not live inside an `assert`: assertions are disabled unless the JVM is
    // started with -ea, in which case the statement is never evaluated and the temp file leaks.
    if (!tempFile.delete()) {
        // Best effort fallback: let the JVM remove the file on shutdown instead of failing the request.
        tempFile.deleteOnExit();
    }
    return Optional.empty();
}
return Optional.of(tempFile);
}
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
@SneakyThrows
public ImageServiceResponse getImagesFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {
@ -68,7 +77,8 @@ public class LayoutParsingStorageService {
}
public TableServiceResponse getTablesFile(String storageId) throws IOException {
@SneakyThrows
public TableServiceResponse getTablesFile(String storageId) {
try (var tableClassificationStream = getObject(storageId)) {
@ -78,11 +88,12 @@ public class LayoutParsingStorageService {
}
}
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) throws IOException {
@SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {
VisualLayoutParsingResponse visualLayoutParsingResponse = objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
return visualLayoutParsingResponse;
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
}
}

View File

@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.RequiredArgsConstructor;
@ -33,7 +33,7 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
@ -45,7 +45,7 @@ public class DocstrumSegmentationService {
}
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
List<RedTextPosition> positions = textPositions.stream()
.filter(t -> t.getDir() == direction)

View File

@ -7,7 +7,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -25,7 +25,7 @@ public class ClassificationDocument {
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
private LayoutDebugLayer layoutDebugLayer = new LayoutDebugLayer();
private boolean headlines;
private long rulesVersion;

View File

@ -13,13 +13,13 @@ import lombok.Setter;
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class Boundary implements Comparable<Boundary> {
public class TextRange implements Comparable<TextRange> {
private int start;
private int end;
public Boundary(int start, int end) {
public TextRange(int start, int end) {
if (start > end) {
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
@ -47,15 +47,15 @@ public class Boundary implements Comparable<Boundary> {
}
public boolean contains(Boundary boundary) {
public boolean contains(TextRange textRange) {
return start <= boundary.start() && boundary.end() <= end;
return start <= textRange.start() && textRange.end() <= end;
}
public boolean containedBy(Boundary boundary) {
public boolean containedBy(TextRange textRange) {
return boundary.contains(this);
return textRange.contains(this);
}
@ -83,18 +83,18 @@ public class Boundary implements Comparable<Boundary> {
}
public boolean intersects(Boundary boundary) {
public boolean intersects(TextRange textRange) {
return boundary.start() < this.end && this.start < boundary.end();
return textRange.start() < this.end && this.start < textRange.end();
}
public List<Boundary> split(List<Integer> splitIndices) {
public List<TextRange> split(List<Integer> splitIndices) {
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
}
List<Boundary> splitBoundaries = new LinkedList<>();
List<TextRange> splitBoundaries = new LinkedList<>();
int previousIndex = start;
for (int splitIndex : splitIndices) {
@ -102,10 +102,10 @@ public class Boundary implements Comparable<Boundary> {
if (splitIndex == previousIndex) {
continue;
}
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
previousIndex = splitIndex;
}
splitBoundaries.add(new Boundary(previousIndex, end));
splitBoundaries.add(new TextRange(previousIndex, end));
return splitBoundaries;
}
@ -114,11 +114,11 @@ public class Boundary implements Comparable<Boundary> {
return IntStream.range(start, end);
}
public static Boundary merge(Collection<Boundary> boundaries) {
public static TextRange merge(Collection<TextRange> boundaries) {
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
return new Boundary(minStart, maxEnd);
int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new);
int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new);
return new TextRange(minStart, maxEnd);
}
@ -130,12 +130,12 @@ public class Boundary implements Comparable<Boundary> {
@Override
public int compareTo(Boundary boundary) {
public int compareTo(TextRange textRange) {
if (end < boundary.end() && start < boundary.start()) {
if (end < textRange.end() && start < textRange.start()) {
return -1;
}
if (start > boundary.start() && end > boundary.end()) {
if (start > textRange.start() && end > textRange.end()) {
return 1;
}

View File

@ -11,7 +11,7 @@ import java.util.Map;
import java.util.Set;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
@ -28,11 +28,11 @@ import lombok.experimental.FieldDefaults;
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class RedactionEntity {
public class TextEntity {
// initial values
@EqualsAndHashCode.Include
final Boundary boundary;
final TextRange textRange;
@EqualsAndHashCode.Include
final String type;
@EqualsAndHashCode.Include
@ -47,7 +47,7 @@ public class RedactionEntity {
boolean dictionaryEntry;
boolean dossierDictionaryEntry;
Set<Engine> engines;
Set<RedactionEntity> references;
Set<TextEntity> references;
@Builder.Default
Deque<Integer> matchedRules = new LinkedList<>();
String redactionReason;
@ -66,9 +66,9 @@ public class RedactionEntity {
SemanticNode deepestFullyContainingNode;
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) {
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
return TextEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build();
}
@ -132,7 +132,7 @@ public class RedactionEntity {
public List<RedactionPosition> getRedactionPositionsPerPage() {
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
Page firstPage = rectanglesPerLinePerPage.keySet()
.stream()
@ -155,21 +155,21 @@ public class RedactionEntity {
}
public boolean containedBy(RedactionEntity redactionEntity) {
public boolean containedBy(TextEntity textEntity) {
return this.boundary.containedBy(redactionEntity.getBoundary());
return this.textRange.containedBy(textEntity.getTextRange());
}
public boolean contains(RedactionEntity redactionEntity) {
public boolean contains(TextEntity textEntity) {
return this.boundary.contains(redactionEntity.getBoundary());
return this.textRange.contains(textEntity.getTextRange());
}
public boolean intersects(RedactionEntity redactionEntity) {
public boolean intersects(TextEntity textEntity) {
return this.boundary.intersects(redactionEntity.getBoundary());
return this.textRange.intersects(textEntity.getTextRange());
}
@ -185,13 +185,13 @@ public class RedactionEntity {
}
public void addReference(RedactionEntity reference) {
public void addReference(TextEntity reference) {
references.add(reference);
}
public void addReferences(List<RedactionEntity> references) {
public void addReferences(List<TextEntity> references) {
this.references.addAll(references);
}
@ -210,7 +210,7 @@ public class RedactionEntity {
sb.append("Entity[\"");
sb.append(value);
sb.append("\", ");
sb.append(boundary);
sb.append(textRange);
sb.append(", pages[");
pages.forEach(page -> {
sb.append(page.getNumber());

View File

@ -8,7 +8,7 @@ import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
@ -39,7 +39,7 @@ public abstract class AbstractSemanticNode implements GenericSemanticNode {
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
Set<TextEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;

View File

@ -12,7 +12,7 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -33,7 +33,7 @@ public class Document extends AbstractSemanticNode {
Set<Page> pages;
Integer numberOfPages;
LayoutparsingVisualizations visualizations;
LayoutDebugLayer layoutDebugLayer;
@Override

View File

@ -92,4 +92,16 @@ public class Image extends AbstractSemanticNode {
return true;
}
/**
 * Computes the area covered by this image.
 *
 * @return the width of {@code position} multiplied by its height
 */
public double getArea() {
    var width = position.getWidth();
    var height = position.getHeight();
    return width * height;
}
/**
 * Decides whether this image is treated as a full-page image.
 *
 * @return {@code true} if the image type is {@code ImageType.OCR}, or if the area of this image
 * is at least half of the area of its page
 */
public boolean isFullPageImage() {
    // OCR images count as full-page images regardless of their size.
    if (imageType.equals(ImageType.OCR)) {
        return true;
    }
    return getArea() >= 0.5 * page.getArea();
}
}

View File

@ -6,7 +6,6 @@ public enum ImageType {
LOGO,
FORMULA,
SIGNATURE,
SIGNATURE_VISUAL,
OTHER,
OCR,

View File

@ -6,7 +6,7 @@ import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
@ -39,7 +39,7 @@ public class Page {
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
Set<TextEntity> entities = new HashSet<>();
@Builder.Default
@EqualsAndHashCode.Exclude
@ -60,7 +60,10 @@ public class Page {
public TextBlock getMainBodyTextBlock() {
return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
return mainBody.stream()
.filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
}
@ -84,4 +87,10 @@ public class Page {
return o instanceof Page && o.hashCode() == this.hashCode();
}
/**
 * Computes the area of this page.
 *
 * @return the product of the page's {@code height} and {@code width}
 */
public double getArea() {
    var area = height * width;
    return area;
}
}

View File

@ -14,13 +14,14 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import com.knecon.fforesight.service.layoutparser.processor.utils.BBoxMergingUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
public interface SemanticNode {
@ -42,7 +43,9 @@ public interface SemanticNode {
*/
default TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
return streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getTextBlock)
.collect(new TextBlockCollector());
}
@ -52,7 +55,7 @@ public interface SemanticNode {
*
* @return Set of all Entities associated with this Node
*/
Set<RedactionEntity> getEntities();
Set<TextEntity> getEntities();
/**
@ -68,7 +71,10 @@ public interface SemanticNode {
default Page getFirstPage() {
return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
return getTextBlock().getPages()
.stream()
.min(Comparator.comparingInt(Page::getNumber))
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
}
@ -77,18 +83,19 @@ public interface SemanticNode {
*
* @return Set of PageNodes this node appears on.
*/
default Set<Page> getPages(Boundary boundary) {
default Set<Page> getPages(TextRange textRange) {
if (!getBoundary().contains(boundary)) {
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
if (!getBoundary().contains(textRange)) {
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary()));
}
return getTextBlock().getPages(boundary);
return getTextBlock().getPages(textRange);
}
default boolean isOnPage(int pageNumber) {
return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber);
return getPages().stream()
.anyMatch(page -> page.getNumber() == pageNumber);
}
@ -203,7 +210,9 @@ public interface SemanticNode {
*/
default boolean hasEntitiesOfType(String type) {
return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
return getEntities().stream()
.filter(entity -> entity.getEntityType().equals(EntityType.ENTITY))
.anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
}
@ -213,9 +222,11 @@ public interface SemanticNode {
* @param type string representing the type of entities to return
* @return List of TextEntities of the given type
*/
default List<RedactionEntity> getEntitiesOfType(String type) {
default List<TextEntity> getEntitiesOfType(String type) {
return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList();
return getEntities().stream()
.filter(redactionEntity -> redactionEntity.getType().equals(type))
.toList();
}
@ -225,9 +236,11 @@ public interface SemanticNode {
* @param types A list of strings representing the types of entities to return
* @return List of TextEntities matching any of the provided types
*/
default List<RedactionEntity> getEntitiesOfType(List<String> types) {
default List<TextEntity> getEntitiesOfType(List<String> types) {
return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList();
return getEntities().stream()
.filter(redactionEntity -> redactionEntity.isAnyType(types))
.toList();
}
@ -241,7 +254,8 @@ public interface SemanticNode {
TextBlock textBlock = getTextBlock();
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
return getTextBlock().getAtomicTextBlocks()
.get(0).getNumberOnPage();
} else {
return -1;
}
@ -279,7 +293,8 @@ public interface SemanticNode {
*/
default boolean containsStrings(List<String> strings) {
return strings.stream().allMatch(this::containsString);
return strings.stream()
.allMatch(this::containsString);
}
@ -303,7 +318,8 @@ public interface SemanticNode {
*/
default boolean containsAnyString(List<String> strings) {
return strings.stream().anyMatch(this::containsString);
return strings.stream()
.anyMatch(this::containsString);
}
@ -315,7 +331,8 @@ public interface SemanticNode {
*/
default boolean containsAnyStringIgnoreCase(List<String> strings) {
return strings.stream().anyMatch(this::containsStringIgnoreCase);
return strings.stream()
.anyMatch(this::containsStringIgnoreCase);
}
@ -323,19 +340,19 @@ public interface SemanticNode {
* This function is used during insertion of a TextEntity into the graph. It checks whether the text range of this node intersects or even fully contains the text range of the TextEntity.
* It sets the fields accordingly and recursively calls this function on all its children.
*
* @param redactionEntity RedactionEntity, which is being inserted into the graph
* @param textEntity TextEntity, which is being inserted into the graph
*/
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
default void addThisToEntityIfIntersects(TextEntity textEntity) {
TextBlock textBlock = getTextBlock();
if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
if (textBlock.containsBoundary(redactionEntity.getBoundary())) {
redactionEntity.setDeepestFullyContainingNode(this);
if (textBlock.getTextRange().intersects(textEntity.getTextRange())) {
if (textBlock.containsBoundary(textEntity.getTextRange())) {
textEntity.setDeepestFullyContainingNode(this);
}
redactionEntity.addIntersectingNode(this);
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary()))
.forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
textEntity.addIntersectingNode(this);
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(textEntity.getTextRange()))
.forEach(node -> node.addThisToEntityIfIntersects(textEntity));
}
}
@ -386,7 +403,8 @@ public interface SemanticNode {
*/
default Stream<SemanticNode> streamAllSubNodes() {
// Maps every entry returned by DocumentTree.allSubEntriesInOrder for this node's tree id to its node.
// The second return is the reformatted post-change version of the first.
return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode);
return getDocumentTree().allSubEntriesInOrder(getTreeId())
.map(DocumentTree.Entry::getNode);
}
@ -397,7 +415,9 @@ public interface SemanticNode {
*/
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
// Same traversal as allSubEntriesInOrder(getTreeId()), keeping only entries whose type equals nodeType
// before mapping them to their nodes. The second return is the reformatted post-change version of the first.
return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode);
return getDocumentTree().allSubEntriesInOrder(getTreeId())
.filter(entry -> entry.getType().equals(nodeType))
.map(DocumentTree.Entry::getNode);
}
@ -406,9 +426,9 @@ public interface SemanticNode {
*
* @return Boundary of this Node's TextBlock
*/
default Boundary getBoundary() {
default TextRange getBoundary() {
// Delegates to this node's TextBlock. The method keeps the name getBoundary although the post-change
// version returns a TextRange obtained from TextBlock.getTextRange().
return getTextBlock().getBoundary();
return getTextBlock().getTextRange();
}
@ -453,17 +473,19 @@ public interface SemanticNode {
*/
private Map<Page, Rectangle2D> getBBoxFromChildren() {
// Pre-change body: collect each child's per-page bounding boxes, determine the set of pages covered by any child,
// and for every page union the boxes of the children present on that page.
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
Set<Page> pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
for (Page page : pages) {
Rectangle2D bBoxOnPage = childrenBBoxes.stream()
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
.map(childBboxPerPage -> childBboxPerPage.get(page))
.collect(RectangleTransformations.collectBBox());
bBoxPerPage.put(page, bBoxOnPage);
// Post-change body: children for which isFullPageImage returns true are left out, and the merge itself
// is delegated to BBoxMergingUtility.mergeBBoxes.
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().filter(child -> !isFullPageImage(child))
.map(SemanticNode::getBBox)
.toList();
return BBoxMergingUtility.mergeBBoxes(childrenBBoxes);
}
// Helper added by this change: only nodes whose type is NodeType.IMAGE are cast to Image and asked
// whether they are a full-page image; every other node type yields false.
private static boolean isFullPageImage(SemanticNode child) {
if (!child.getType().equals(NodeType.IMAGE)) {
return false;
}
// Removed line: the final return of the pre-change getBBoxFromChildren.
return bBoxPerPage;
return ((Image) child).isFullPageImage();
}
@ -473,7 +495,9 @@ public interface SemanticNode {
// Computes one bounding box per page from this node's TextBlock: its AtomicTextBlocks are grouped by page and
// each group is reduced to a single rectangle via RectangleTransformations.bBoxUnionAtomicTextBlock.
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
// Pre-change single-line grouping, followed by the same pipeline reformatted across three lines.
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
.stream()
.collect(Collectors.groupingBy(AtomicTextBlock::getPage));
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
return bBoxPerPage;
}

View File

@ -15,7 +15,7 @@ import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
@ -43,7 +43,7 @@ public class Table implements SemanticNode {
@Builder.Default
@EqualsAndHashCode.Exclude
Set<RedactionEntity> entities = new HashSet<>();
Set<TextEntity> entities = new HashSet<>();
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@ -54,7 +54,7 @@ public class Table implements SemanticNode {
* @param strings Strings to check whether a row contains them
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
*/
public Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
public Stream<TextEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
return IntStream.range(0, numberOfRows).boxed()
.filter(row -> rowContainsStringsIgnoreCase(row, strings))
@ -88,7 +88,7 @@ public class Table implements SemanticNode {
* @param value the string which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
*/
public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
List<Integer> vertebrateStudyCols = streamHeaders().filter(headerNode -> headerNode.containsString(header))
.map(TableCell::getCol)
@ -107,7 +107,7 @@ public class Table implements SemanticNode {
* @param values the strings which the table cell should contain
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
*/
public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
.map(TableCell::getCol)
@ -125,7 +125,7 @@ public class Table implements SemanticNode {
* @param types type strings to check whether a row contains an entity like them
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
*/
public Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {
public Stream<TextEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {
List<Integer> rowsWithEntityOfType = IntStream.range(0, numberOfRows).boxed()
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream()
@ -145,7 +145,7 @@ public class Table implements SemanticNode {
* @param types type strings to check whether a row doesn't contain an entity like it
* @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
*/
public Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {
public Stream<TextEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {
List<Integer> rowsWithNoEntityOfType = IntStream.range(0, numberOfRows).boxed()
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream()
@ -163,7 +163,7 @@ public class Table implements SemanticNode {
return streamRow(rowNumber).map(TableCell::getEntities)
.flatMap(Collection::stream)
.map(RedactionEntity::getType)
.map(TextEntity::getType)
.distinct();
}
@ -304,12 +304,12 @@ public class Table implements SemanticNode {
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
*
* @param type the type of entities to search for
* @param redactionEntity the entity, which appears in the row to search
* @param textEntity the entity, which appears in the row to search
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
*/
public List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity) {
public List<TextEntity> getEntitiesOfTypeInSameRow(String type, TextEntity textEntity) {
return redactionEntity.getIntersectingNodes()
return textEntity.getIntersectingNodes()
.stream()
.filter(node -> node instanceof TableCell)
.map(node -> (TableCell) node)

View File

@ -13,7 +13,7 @@ import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -36,14 +36,14 @@ public class AtomicTextBlock implements TextBlock {
Page page;
//string coordinates
Boundary boundary;
TextRange textRange;
String searchText;
@Builder.Default
List<Integer> lineBreaks = new ArrayList<>();
@Builder.Default
List<Boundary> boldTextBoundaries = new ArrayList<>();
List<TextRange> boldTextBoundaries = new ArrayList<>();
@Builder.Default
List<Boundary> italicTextBoundaries = new ArrayList<>();
List<TextRange> italicTextBoundaries = new ArrayList<>();
String orientation;
int textDirection;
@ -66,8 +66,8 @@ public class AtomicTextBlock implements TextBlock {
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
List<Integer> lineBreaks,
List<Boundary> boldTextBoundaries,
List<Boundary> italicTextBoundaries,
List<TextRange> boldTextBoundaries,
List<TextRange> italicTextBoundaries,
List<Rectangle2D> positions,
List<Integer> stringIdxToPositionIdx,
long idx,
@ -89,7 +89,7 @@ public class AtomicTextBlock implements TextBlock {
.italicTextBoundaries(italicTextBoundaries)
.positions(positions)
.stringIdxToPositionIdx(stringIdxToPositionIdx)
.boundary(new Boundary(offset, offset + searchText.length()))
.textRange(new TextRange(offset, offset + searchText.length()))
.textDirection(textDirection)
.orientation(orientation)
.build();
@ -100,7 +100,7 @@ public class AtomicTextBlock implements TextBlock {
return AtomicTextBlock.builder()
.id(textBlockIdx)
.boundary(new Boundary(stringOffset, stringOffset))
.textRange(new TextRange(stringOffset, stringOffset))
.searchText("")
.page(page)
.numberOnPage(numberOnPage)
@ -118,7 +118,7 @@ public class AtomicTextBlock implements TextBlock {
.id(documentTextData.getId())
.numberOnPage(documentTextData.getNumberOnPage())
.page(page)
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
@ -140,11 +140,11 @@ public class AtomicTextBlock implements TextBlock {
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
}
if (lineNumber == 0) {
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
} else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
}
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
}
@ -159,9 +159,9 @@ public class AtomicTextBlock implements TextBlock {
// Returns the position of the first stored line break that lies strictly after fromIndex.
// lineBreaks are compared against fromIndex minus the block start, i.e. they are offsets relative to this block;
// the result is shifted back by adding the block start. When no later line break exists, the length of
// searchText is used instead. findFirst picks the first match in list order
// (assumes lineBreaks is sorted ascending - TODO confirm).
public int getNextLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
.findFirst() //
.orElse(searchText.length()) + boundary.start();
.orElse(searchText.length()) + textRange.start();
}
@ -169,43 +169,43 @@ public class AtomicTextBlock implements TextBlock {
// Returns the position of the last stored line break that is at or before fromIndex.
// The comparison is done in block-relative coordinates (fromIndex minus the block start);
// reduce((a, b) -> b) keeps the last matching element of the stream. When no line break matches,
// 0 is used, so the result is the block start itself.
public int getPreviousLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
.reduce((a, b) -> b)//
.orElse(0) + boundary.start();
.orElse(0) + textRange.start();
}
// Looks up the rectangle for a single string index: stringIdx is made relative to this block's start,
// mapped through stringIdxToPositionIdx, and the resulting index is read from positions.
@Override
public Rectangle2D getPosition(int stringIdx) {
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
}
// Returns the sub-list of positions covering the given string range.
// Throws IndexOutOfBoundsException when the range is not contained in this block; a zero-length range yields an empty list.
// The returned list is a subList view of positions, not a copy.
@Override
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
if (!containsBoundary(stringBoundary)) {
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
if (!containsBoundary(stringTextRange)) {
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
}
if (stringBoundary.length() == 0) {
if (stringTextRange.length() == 0) {
return Collections.emptyList();
}
// Block-relative start index, translated into an index into positions.
int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());
// A range ending exactly at the block end takes everything up to the end of positions
// (presumably because stringIdxToPositionIdx has no entry for the index one past the last character - TODO confirm).
if (stringBoundary.end() == this.boundary.end()) {
if (stringTextRange.end() == this.textRange.end()) {
return positions.subList(startPositionIdx, positions.size());
}
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
}
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
.stream()
.map(this::getPositions)
.map(RectangleTransformations::rectangleBBoxWithGaps)
@ -217,9 +217,9 @@ public class AtomicTextBlock implements TextBlock {
}
// Returns the line breaks of this block that fall inside the given range.
// Stored line breaks are shifted by the block start before being tested with contains, so the result uses the same
// coordinates as the argument. The change also widens the visibility from private to protected.
private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList();
}

View File

@ -11,7 +11,7 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.AccessLevel;
@ -24,7 +24,7 @@ public class ConcatenatedTextBlock implements TextBlock {
List<AtomicTextBlock> atomicTextBlocks;
String searchText;
Boundary boundary;
TextRange textRange;
public static ConcatenatedTextBlock empty() {
@ -37,12 +37,12 @@ public class ConcatenatedTextBlock implements TextBlock {
this.atomicTextBlocks = new LinkedList<>();
if (atomicTextBlocks.isEmpty()) {
boundary = new Boundary(-1, -1);
textRange = new TextRange(-1, -1);
return;
}
var firstTextBlock = atomicTextBlocks.get(0);
this.atomicTextBlocks.add(firstTextBlock);
boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
}
@ -50,16 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock {
// Appends the atomic text blocks of textBlock to this block, mutating it in place, and returns this.
// An empty block adopts the start and end of the argument; otherwise the argument must start exactly where this
// block ends, or an UnsupportedOperationException is thrown.
public ConcatenatedTextBlock concat(TextBlock textBlock) {
int start = textBlock.getBoundary().start();
int end = textBlock.getBoundary().end();
int start = textBlock.getTextRange().start();
int end = textBlock.getTextRange().end();
if (this.atomicTextBlocks.isEmpty()) {
boundary.setStart(start);
boundary.setEnd(end);
} else if (boundary.end() != start) {
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
textRange.setStart(start);
textRange.setEnd(end);
} else if (textRange.end() != start) {
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
}
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
// The end of this block is extended to the end of the appended block.
boundary.setEnd(end);
textRange.setEnd(end);
// searchText is cleared; presumably it is a cached concatenation that is rebuilt on demand - the getter is not visible here.
this.searchText = null;
return this;
}
@ -67,13 +67,13 @@ public class ConcatenatedTextBlock implements TextBlock {
// Finds an atomic text block whose range contains stringIdx; throws IndexOutOfBoundsException when none does.
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getTextRange().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
}
// Returns, in list order, every atomic text block whose range intersects the given range; the list may be empty.
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
return atomicTextBlocks.stream().filter(tb -> tb.getTextRange().intersects(textRange)).toList();
}
@ -125,47 +125,47 @@ public class ConcatenatedTextBlock implements TextBlock {
// Collects the positions for a string range that may span several atomic text blocks.
// A single intersecting block is queried directly. Otherwise the first block contributes from the range start to
// its own end, the blocks in between contribute all of their positions, and the last block contributes from its
// own start to the range end.
// NOTE(review): when no atomic block intersects the range, textBlocks is empty and get(0) throws
// IndexOutOfBoundsException - confirm that callers never pass such a range.
@Override
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
if (textBlocks.size() == 1) {
return textBlocks.get(0).getPositions(stringBoundary);
return textBlocks.get(0).getPositions(stringTextRange);
}
AtomicTextBlock firstTextBlock = textBlocks.get(0);
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
// subList(1, size - 1) is exactly the blocks strictly between the first and the last one.
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
positions.addAll(textBlock.getPositions());
}
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
return positions;
}
// Per-page variant of getPositions for a string range that may span several atomic text blocks.
// A single intersecting block is queried directly. Otherwise the per-page results of the first block (range start
// to block end), the complete blocks in between and the last block (block start to range end) are combined
// step by step with mergeEntityPositionsWithSamePageNode.
// NOTE(review): when no atomic block intersects the range, textBlocks is empty and get(0) throws
// IndexOutOfBoundsException - confirm that callers never pass such a range.
@Override
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
if (textBlocks.size() == 1) {
return textBlocks.get(0).getPositionsPerPage(stringBoundary);
return textBlocks.get(0).getPositionsPerPage(stringTextRange);
}
AtomicTextBlock firstTextBlock = textBlocks.get(0);
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));
// subList(1, size - 1) is exactly the blocks strictly between the first and the last one.
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
}
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
return rectanglesPerLinePerPage;
}
@ -187,14 +187,14 @@ public class ConcatenatedTextBlock implements TextBlock {
// Flattens the bold text ranges of all atomic text blocks into one list, in block order.
// Only the element type of the returned list changes in this hunk (Boundary to TextRange).
@Override
public List<Boundary> getBoldTextBoundaries() {
public List<TextRange> getBoldTextBoundaries() {
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
}
// Flattens the italic text ranges of all atomic text blocks into one list, in block order.
// Only the element type of the returned list changes in this hunk (Boundary to TextRange).
@Override
public List<Boundary> getItalicTextBoundaries() {
public List<TextRange> getItalicTextBoundaries() {
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
}

View File

@ -10,7 +10,7 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
public interface TextBlock extends CharSequence {
@ -21,10 +21,10 @@ public interface TextBlock extends CharSequence {
List<AtomicTextBlock> getAtomicTextBlocks();
List<Boundary> getBoldTextBoundaries();
List<TextRange> getBoldTextBoundaries();
List<Boundary> getItalicTextBoundaries();
List<TextRange> getItalicTextBoundaries();
String getOrientation();
@ -33,7 +33,7 @@ public interface TextBlock extends CharSequence {
int getTextDirection();
Boundary getBoundary();
TextRange getTextRange();
int getNextLinebreak(int fromIndex);
@ -48,10 +48,10 @@ public interface TextBlock extends CharSequence {
Rectangle2D getPosition(int stringIdx);
List<Rectangle2D> getPositions(Boundary stringBoundary);
List<Rectangle2D> getPositions(TextRange stringTextRange);
Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);
Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);
int numberOfLines();
@ -59,7 +59,7 @@ public interface TextBlock extends CharSequence {
// Searches for searchTerm from the start of this block by delegating to indexOf(searchTerm, startOffset)
// with the block's own start index.
default int indexOf(String searchTerm) {
return indexOf(searchTerm, getBoundary().start());
return indexOf(searchTerm, getTextRange().start());
}
@ -69,10 +69,10 @@ public interface TextBlock extends CharSequence {
}
// Returns the pages of all atomic text blocks whose range intersects the given range, as an unmodifiable set.
default Set<Page> getPages(Boundary boundary) {
default Set<Page> getPages(TextRange textRange) {
return getAtomicTextBlocks().stream()
.filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary))
.filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
.map(AtomicTextBlock::getPage)
.collect(Collectors.toUnmodifiableSet());
}
@ -80,38 +80,38 @@ public interface TextBlock extends CharSequence {
// Finds searchTerm at or after startOffset. startOffset and the result use the same coordinates as the block's
// range: the block start is subtracted before searching getSearchText() and added back to the hit.
// Returns -1 when the term is not found.
default int indexOf(String searchTerm, int startOffset) {
int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
if (start == -1) {
return -1;
}
return start + getBoundary().start();
return start + getTextRange().start();
}
// Returns the text from the block start up to the position reported by getNextLinebreak for the block start.
default CharSequence getFirstLine() {
return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
}
// Checks whether the given range lies within this block's range. Throws IllegalArgumentException when the
// argument's end is before its start. The method keeps the name containsBoundary after the TextRange rename.
// NOTE(review): the message says StartIndex must be smaller than EndIndex, but the check only rejects
// end < start, so a range with equal start and end is accepted.
default boolean containsBoundary(Boundary boundary) {
default boolean containsBoundary(TextRange textRange) {
if (boundary.end() < boundary.start()) {
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
if (textRange.end() < textRange.start()) {
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
}
return getBoundary().contains(boundary);
return getTextRange().contains(textRange);
}
// Checks whether stringIndex lies within this block's range, as decided by the range's contains(int).
default boolean containsIndex(int stringIndex) {
return getBoundary().contains(stringIndex);
return getTextRange().contains(stringIndex);
}
// Convenience overload: forwards the start and end of the given range to subSequence(int, int).
default CharSequence subSequence(Boundary boundary) {
default CharSequence subSequence(TextRange textRange) {
return subSequence(boundary.start(), boundary.end());
return subSequence(textRange.start(), textRange.end());
}
@ -128,21 +128,21 @@ public interface TextBlock extends CharSequence {
// CharSequence.subSequence in this block's own coordinates: both indices are reduced by the block start
// before being applied to getSearchText(), so callers pass indices from the block's range rather than 0-based ones.
@Override
default CharSequence subSequence(int start, int end) {
return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
}
// CharSequence.length: reported as the length of this block's range rather than computed from getSearchText().
@Override
default int length() {
return getBoundary().length();
return getTextRange().length();
}
// CharSequence.charAt in this block's own coordinates: index is reduced by the block start before
// reading from getSearchText().
@Override
default char charAt(int index) {
return getSearchText().charAt(index - getBoundary().start());
return getSearchText().charAt(index - getTextRange().start());
}
}

View File

@ -12,12 +12,14 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import io.micrometer.observation.annotation.Observed;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineValidationService {
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
public TableOfContents createToC(List<TextPageBlock> headlines) {
List<TableOfContentItem> mainSections = new ArrayList<>();

View File

@ -33,7 +33,7 @@ public class BodyTextFrameService {
for (ClassificationPage page : classificationDocument.getPages()) {
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
classificationDocument.getLayoutDebugLayer().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
}
}

View File

@ -17,7 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.RequiredArgsConstructor;
@ -35,7 +35,7 @@ public class DocstrumBlockificationService {
public ClassificationPage blockify(List<TextPositionSequence> textPositions,
CleanRulings rulings,
boolean xyOrder,
LayoutparsingVisualizations visualizations,
LayoutDebugLayer visualizations,
LayoutParsingType layoutParsingType) {
CleanRulings usedRulings = rulings.withoutTextRulings();

View File

@ -1,9 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
@ -11,13 +8,11 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
@SuppressWarnings("all")
@Service
@ -35,7 +30,7 @@ public class RedactManagerBlockificationService {
* @param visualizations
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
CleanRulings usedRulings = cleanRulings.withoutTextRulings();

View File

@ -22,6 +22,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
@ -31,7 +32,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
@ -55,7 +55,7 @@ public class DocumentGraphFactory {
Document documentGraph = new Document();
documentGraph.setVisualizations(document.getVisualizations());
documentGraph.setLayoutDebugLayer(document.getLayoutDebugLayer());
Context context = new Context(documentGraph);
@ -280,7 +280,8 @@ public class DocumentGraphFactory {
return pages.keySet()
.stream()
.filter(page -> page.getNumber() == pageIndex)
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
.findFirst()
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
}
}

View File

@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import lombok.AccessLevel;
import lombok.Builder;
@ -19,8 +19,8 @@ public class SearchTextWithTextPositionDto {
String searchText;
List<Integer> lineBreaks;
List<Integer> stringIdxToPositionIdx;
List<Boundary> boldTextBoundaries;
List<Boundary> italicTextBoundaries;
List<TextRange> boldTextBoundaries;
List<TextRange> italicTextBoundaries;
List<Rectangle2D> positions;

View File

@ -9,7 +9,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -118,23 +118,23 @@ public class SearchTextWithTextPositionFactory {
}
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
private static List<TextRange> mergeToBoundaries(List<Integer> integers) {
if (integers.isEmpty()) {
return Collections.emptyList();
}
List<Boundary> boundaries = new LinkedList<>();
List<TextRange> boundaries = new LinkedList<>();
int start = integers.get(0);
int end = integers.get(0) + 1;
for (int current : integers) {
if (current > end + 1) {
boundaries.add(new Boundary(start, end));
boundaries.add(new TextRange(start, end));
start = current;
}
end = current + 1;
}
if (boundaries.isEmpty()) {
boundaries.add(new Boundary(start, end));
boundaries.add(new TextRange(start, end));
}
return boundaries;
}

View File

@ -116,8 +116,8 @@ public class DocumentDataMapper {
.page(atomicTextBlock.getPage().getNumber().longValue())
.searchText(atomicTextBlock.getSearchText())
.numberOnPage(atomicTextBlock.getNumberOnPage())
.start(atomicTextBlock.getBoundary().start())
.end(atomicTextBlock.getBoundary().end())
.start(atomicTextBlock.getTextRange().start())
.end(atomicTextBlock.getTextRange().end())
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
.build();
}

View File

@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
@ -82,15 +82,15 @@ public class TaasDocumentDataMapper {
}
private static Range toRange(Boundary boundary) {
private static Range toRange(TextRange textRange) {
return new Range(boundary.start(), boundary.end());
return new Range(textRange.start(), textRange.end());
}
private static List<Range> toRange(List<Boundary> boundary) {
private static List<Range> toRange(List<TextRange> textRange) {
return boundary.stream().map(TaasDocumentDataMapper::toRange).toList();
return textRange.stream().map(TaasDocumentDataMapper::toRange).toList();
}

View File

@ -1,41 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.LayoutGrid;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
@ -48,451 +24,41 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public class LayoutGridService {
IViewerDocumentService viewerDocumentService;
static float FONT_SIZE = 10f;
static float LINE_WIDTH = 1f;
static Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
static Color INNER_LINES_COLOR = new Color(255, 175, 175);
static Color PARAGRAPH_COLOR = new Color(70, 130, 180);
static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101);
static Color TABLE_COLOR = new Color(102, 205, 170);
static Color SECTION_COLOR = new Color(50, 50, 50);
static Color HEADLINE_COLOR = new Color(162, 56, 56);
static Color HEADER_COLOR = new Color(171, 131, 6);
static Color IMAGE_COLOR = new Color(253, 63, 146);
private record RectangleIdentifier(List<Integer> treeId, Integer pageNumber) {
}
HashMap<RectangleIdentifier, Rectangle2D> rectangleMap = new HashMap<>();
PDFTronViewerDocumentService viewerDocumentService;
@SneakyThrows
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
List<Visualizations> allVisualizations;
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
if (writeVisualLayoutParsingGrid) {
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
.toList();
LayoutGrid layoutGrid = createLayoutGrid(document);
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
// Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
if (document.getLayoutDebugLayer().isActive()) {
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()));
} else {
allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll())
.toList();
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid));
}
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
}
@SneakyThrows
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
public Visualizations addLayoutGrid(Document document, boolean layerVisibilityDefaultValue, boolean visualParsingGrid) {
private LayoutGrid createLayoutGrid(Document document) {
LayoutGrid layoutGrid = createLayoutGrid(document, visualParsingGrid);
return Visualizations.builder()
.layer(visualParsingGrid ? ContentStreams.KNECON_VISUAL_PARSING : ContentStreams.KNECON_LAYOUT)
.visualizationsOnPages(layoutGrid.getVisualizationsPerPages())
.layerVisibilityDefaultValue(layerVisibilityDefaultValue)
.build();
}
private LayoutGrid createLayoutGrid(Document document, boolean visualParsingGrid) {
LayoutGrid layoutGrid = new LayoutGrid(document.getNumberOfPages());
LayoutGrid layoutGrid = new LayoutGrid();
document.streamAllSubNodes()
.filter(node -> (node.getEngines().contains(LayoutEngine.AI) && visualParsingGrid) || (node.getEngines().contains(LayoutEngine.ALGORITHM) && !visualParsingGrid))
.peek(layoutGrid::addTreeId)
.forEach(semanticNode -> {
Color color = switch (semanticNode.getType()) {
case PARAGRAPH -> PARAGRAPH_COLOR;
case TABLE -> TABLE_COLOR;
case SECTION, SUPER_SECTION -> SECTION_COLOR;
case HEADLINE -> HEADLINE_COLOR;
case HEADER, FOOTER -> HEADER_COLOR;
case IMAGE -> IMAGE_COLOR;
default -> null;
};
if (semanticNode instanceof DuplicatedParagraph) {
color = DUPLICATE_PARAGRAPH_COLOR;
}
if (isNotSectionOrTableCellOrDocument(semanticNode)) {
addAsRectangle(semanticNode, layoutGrid, color);
}
if (semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) {
addSection(semanticNode, layoutGrid, color);
}
if (semanticNode.getType().equals(NodeType.TABLE)) {
Table table = (Table) semanticNode;
addInnerTableLines(table, layoutGrid);
switch (semanticNode.getType()) {
case SECTION, SUPER_SECTION -> layoutGrid.addSection(semanticNode);
case HEADLINE -> layoutGrid.addHeadline((Headline) semanticNode);
case PARAGRAPH -> layoutGrid.addParagraph((Paragraph) semanticNode);
case TABLE -> layoutGrid.addTable((Table) semanticNode);
case IMAGE -> layoutGrid.addImage((Image) semanticNode);
case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode);
}
});
return layoutGrid;
}
private void addInnerTableLines(Table table, LayoutGrid layoutGrid) {
if (table.getNumberOfCols() < 1 || table.getNumberOfRows() < 1) {
return;
}
for (Page page : table.getPages()) {
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
.map(TableCell::getRow)
.findFirst();
if (optionalFirstRowOnPage.isEmpty()) {
continue;
}
int firstRowOnPage = optionalFirstRowOnPage.get();
Stream<Double> xStream = switch (page.getRotation()) {
case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
case 270 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxX);
default -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinX);
};
List<Double> xs = xStream.collect(Collectors.toList());
xs.remove(0);
Stream<Double> yStream = switch (page.getRotation()) {
case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
case 270 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxY);
default -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxY);
};
List<Double> ys = yStream.collect(Collectors.toList());
ys.remove(0);
Rectangle2D tableBBox = table.getBBox()
.get(page);
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1).getColoredLines();
xs.forEach(x -> {
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
});
ys.forEach(y -> {
Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y));
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
});
}
}
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
.map(TableCell::getBBox)
.map(bBoxMap -> bBoxMap.get(page));
}
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
.toList();
Integer maxChildDepth = subSections.stream()
.map(node -> node.getTreeId().size())
.max(Integer::compareTo)
.orElse(semanticNode.getTreeId().size());
int ownDepth = semanticNode.getTreeId().size();
Page firstPage = semanticNode.getFirstPage();
String treeIdString = buildTreeIdString(semanticNode);
if (bBoxMap.values().size() == 1) {
handleSinglePage(semanticNode, layoutGrid, color, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
return;
}
List<Page> pagesInOrder = bBoxMap.keySet()
.stream()
.sorted(Comparator.comparingInt(Page::getNumber))
.collect(Collectors.toList());
pagesInOrder.remove(0);
handleFirstPageOfSection(semanticNode, color, firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
if (semanticNode instanceof SuperSection) {
return;
}
for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
handleForMiddlePageOfSection(semanticNode, color, middlePage, bBoxMap.get(middlePage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
}
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
handleLastPageOfSection(semanticNode, color, lastPage, bBoxMap.get(lastPage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
}
@SneakyThrows
private void addPlacedText(Page page, Rectangle2D textBBox, Rectangle2D highestParentRect, String s, LayoutGrid layoutGrid, Integer maxChildDepth) {
// translates text, such that its right edge is a bit to the left of the drawn box
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + LINE_WIDTH + 2 * maxChildDepth);
Point2D upperLeftCorner;
Point2D translationVector;
switch (page.getRotation()) {
case 90 -> {
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMinY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
}
translationVector = new Point2D.Double(FONT_SIZE, -translationAmount);
}
case 180 -> {
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMinY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
}
translationVector = new Point2D.Double(translationAmount, FONT_SIZE);
}
case 270 -> {
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMaxY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
}
translationVector = new Point2D.Double(-FONT_SIZE, translationAmount);
}
default -> {
if (highestParentRect != null) {
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMaxY());
} else {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
}
translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE);
}
}
upperLeftCorner = add(upperLeftCorner, translationVector);
List<PlacedText> placedTexts = layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1).getPlacedTexts();
PlacedText newText = PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT);
Optional<PlacedText> conflictingText = placedTexts.stream()
.filter(pt -> Math.abs(pt.lineStart().getY() - newText.lineStart().getY()) <= FONT_SIZE)
.findFirst();
if (conflictingText.isPresent()) {
PlacedText existingText = conflictingText.get();
if (newText.text().length() > existingText.text().length()) {
placedTexts.remove(existingText);
placedTexts.add(newText);
}
} else {
placedTexts.add(newText);
}
}
private void handleSinglePage(SemanticNode semanticNode,
LayoutGrid layoutGrid,
Color color,
Page page,
Rectangle2D rectangle2D,
String treeIdString,
Integer maxChildDepth,
Integer ownDepth) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
// add string to top line
var firstLine = result.pageLines().remove(0);
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void handleFirstPageOfSection(SemanticNode semanticNode,
Color color,
Page firstPage,
Rectangle2D rectangle2D,
String treeIdString,
LayoutGrid layoutGrid,
Integer maxChildDepth,
Integer ownDepth) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
// remove bottom line
result.pageLines().remove(2);
// add string to top line
var firstLine = result.pageLines().remove(0);
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void handleForMiddlePageOfSection(SemanticNode semanticNode,
Color color,
Page middlePage,
Rectangle2D rectangle2D,
String treeIdString,
LayoutGrid layoutGrid,
Integer maxChildDepth,
Integer ownDepth) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
// remove top line
result.pageLines().remove(0);
// remove bottom line
result.pageLines().remove(1);
// add string to left line
var leftLine = result.pageLines().remove(1);
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private void handleLastPageOfSection(SemanticNode semanticNode,
Color color,
Page lastPage,
Rectangle2D rectangle2D,
String treeIdString,
LayoutGrid layoutGrid,
Integer maxChildDepth,
Integer ownDepth) {
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
// remove top line
result.pageLines().remove(0);
// add string to left line
var leftLine = result.pageLines().remove(2);
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
for (Line2D line : result.pageLines()) {
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
}
}
private RectangleAndLinesResult createLinesAndPlaceText(SemanticNode semanticNode,
Page page,
Rectangle2D rectangle2D,
String treeIdString,
LayoutGrid layoutGrid,
Integer maxChildDepth,
Integer ownDepth) {
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1).getColoredLines();
int lineWidthModifier = maxChildDepth - ownDepth;
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
var lastPageLines = createLinesFromRectangle(r, page.getRotation());
SemanticNode highestParent = semanticNode.getHighestParent();
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
addPlacedText(page, rectangle2D, highestParentRect, treeIdString, layoutGrid, maxChildDepth);
if (semanticNode instanceof SuperSection) {
rectangleMap.put(new RectangleIdentifier(semanticNode.getTreeId(), page.getNumber()), r);
}
return new RectangleAndLinesResult(coloredLines, r, lastPageLines);
}
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
}
private String buildTreeIdString(SemanticNode semanticNode) {
return semanticNode.getTreeId()
.stream()
.map(Object::toString)
.collect(Collectors.joining("."));
}
/*
A __________________ B
| |
| |
| |
| |
D|__________________| C
The returned List are the lines [AB, BC, DC, AD]
The List is reordered, such that the order of the returned lines are always as viewed on the page.
*/
private List<Line2D> createLinesFromRectangle(Rectangle2D r, int pageRotation) {
// +0.5 to join the lines
List<Line2D> lines = new ArrayList<>(4);
float lineWidthCorrection = LINE_WIDTH * 0.5f;
Point2D.Float a = new Point2D.Float((float) r.getMinX(), (float) r.getMaxY());
Point2D.Float a1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMaxY());
Point2D.Float b = new Point2D.Float((float) r.getMaxX(), (float) r.getMaxY());
Point2D.Float b1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMaxY());
Point2D.Float c = new Point2D.Float((float) r.getMaxX(), (float) r.getMinY());
Point2D.Float c1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMinY());
Point2D.Float d = new Point2D.Float((float) r.getMinX(), (float) r.getMinY());
Point2D.Float d1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMinY());
lines.add(new Line2D.Float(a1, b1));
lines.add(new Line2D.Float(b, c));
lines.add(new Line2D.Float(d1, c1));
lines.add(new Line2D.Float(a, d));
return switch (pageRotation) {
case 90 -> {
Collections.rotate(lines, 1);
yield lines;
}
case 180 -> {
Collections.rotate(lines, 2);
yield lines;
}
case 270 -> {
Collections.rotate(lines, 3);
yield lines;
}
default -> lines;
};
}
private static boolean isNotSectionOrTableCellOrDocument(SemanticNode semanticNode) {
return !(semanticNode.getType().equals(NodeType.DOCUMENT)
|| semanticNode.getType().equals(NodeType.SECTION)
|| semanticNode.getType().equals(NodeType.SUPER_SECTION)
|| semanticNode.getType().equals(NodeType.TABLE_CELL));
}
private void addAsRectangle(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
semanticNode.getBBox()
.forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages()
.get(page.getNumber() - 1).getColoredRectangles().add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
}
private Point2D add(Point2D a, Point2D b) {
return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY());
}
}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.experimental.UtilityClass;
@UtilityClass
public class BBoxMergingUtility {
public Map<Page, Rectangle2D> mergeBBoxes(List<Map<Page, Rectangle2D>> bboxesToMerge) {
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
Set<Page> pages = bboxesToMerge.stream()
.flatMap(map -> map.keySet()
.stream())
.collect(Collectors.toSet());
for (Page page : pages) {
Rectangle2D bBoxOnPage = bboxesToMerge.stream()
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
.map(childBboxPerPage -> childBboxPerPage.get(page))
.collect(RectangleTransformations.collectBBox());
bBoxPerPage.put(page, bBoxOnPage);
}
return bBoxPerPage;
}
}

View File

@ -0,0 +1,111 @@
package com.knecon.fforesight.service.layoutparser.processor.visualization;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import lombok.experimental.UtilityClass;
@UtilityClass
public class ConnectionLineUtil {

    /** Angle between the shaft and each barb of an arrow head (30 degrees). */
    private static final double ARROW_HEAD_ANGLE = Math.PI / 6;


    /**
     * Decomposes a rectangle into its four edges.
     * <p>
     * The edges are returned in the order: edge at {@code y}, edge at {@code x + width},
     * edge at {@code y + height}, edge at {@code x}. Each edge starts where the previous one ends,
     * so the four lines trace the outline once.
     *
     * @param rect the rectangle to split
     * @return an array of exactly four lines
     */
    public static Line2D[] splitRectangleIntoLines(Rectangle2D rect) {

        double left = rect.getX();
        double top = rect.getY();
        double right = left + rect.getWidth();
        double bottom = top + rect.getHeight();

        return new Line2D[]{new Line2D.Double(left, top, right, top),      // edge at y
                new Line2D.Double(right, top, right, bottom),               // edge at x + width
                new Line2D.Double(right, bottom, left, bottom),             // edge at y + height
                new Line2D.Double(left, bottom, left, top)};                // edge at x
    }


    /**
     * Applies an affine transform to both end points of a line.
     *
     * @param line2D          the line to transform
     * @param affineTransform the transform to apply
     * @return a new line connecting the transformed end points
     */
    public static Line2D transform(Line2D line2D, AffineTransform affineTransform) {

        Point2D movedStart = affineTransform.transform(line2D.getP1(), null);
        Point2D movedEnd = affineTransform.transform(line2D.getP2(), null);
        return new Line2D.Double(movedStart, movedEnd);
    }


    /**
     * @param line2D the line to measure
     * @return the Euclidean distance between the line's end points
     */
    public static double length(Line2D line2D) {

        Point2D from = line2D.getP1();
        return from.distance(line2D.getP2());
    }


    /**
     * Finds the pair of edges (one per rectangle) whose midpoints are closest to each other and
     * returns the line connecting those two midpoints, running from {@code rect1} to {@code rect2}.
     * On ties the first pair in edge order wins.
     *
     * @param rect1 the rectangle the connection starts at
     * @param rect2 the rectangle the connection ends at
     * @return the connecting line between the two closest edge midpoints
     * @throws IllegalStateException if no pair with a distance below {@code Double.MAX_VALUE} exists
     */
    public static Line2D findClosestMidpointLine(Rectangle2D rect1, Rectangle2D rect2) {

        Point2D[] midpoints1 = edgeMidpoints(rect1);
        Point2D[] midpoints2 = edgeMidpoints(rect2);

        Point2D bestFrom = null;
        Point2D bestTo = null;
        double shortest = Double.MAX_VALUE;

        for (Point2D from : midpoints1) {
            for (Point2D to : midpoints2) {
                double gap = from.distance(to);
                if (gap < shortest) {
                    shortest = gap;
                    bestFrom = from;
                    bestTo = to;
                }
            }
        }

        if (bestFrom == null || bestTo == null) {
            throw new IllegalStateException("Could not find closest lines");
        }
        return new Line2D.Double(bestFrom, bestTo);
    }


    /**
     * Returns the midpoints of the four edges of a rectangle, in the edge order of
     * {@link #splitRectangleIntoLines(Rectangle2D)}.
     */
    private static Point2D[] edgeMidpoints(Rectangle2D rect) {

        Line2D[] edges = splitRectangleIntoLines(rect);
        Point2D[] midpoints = new Point2D[edges.length];
        for (int i = 0; i < edges.length; i++) {
            midpoints[i] = midpointOf(edges[i]);
        }
        return midpoints;
    }


    /**
     * Returns the point halfway between the end points of a line.
     */
    private static Point2D midpointOf(Line2D line) {

        return new Point2D.Double((line.getX1() + line.getX2()) / 2, (line.getY1() + line.getY2()) / 2);
    }


    /**
     * Builds the two barbs of an arrow head located at the end point (P2) of the given line.
     * Each barb starts at P2 and points back towards P1, rotated by 30 degrees to either side.
     *
     * @param line        the shaft of the arrow; the head is placed at its second point
     * @param arrowLength the length of each barb
     * @return an array of exactly two lines, both starting at the end point of {@code line}
     */
    public static Line2D[] createArrowHead(Line2D line, double arrowLength) {

        Point2D tail = line.getP1();
        Point2D tip = line.getP2();

        // Direction the shaft is pointing in, measured from tail to tip.
        double direction = Math.atan2(tip.getY() - tail.getY(), tip.getX() - tail.getX());

        Line2D firstBarb = new Line2D.Double(tip, barbEnd(tip, direction - ARROW_HEAD_ANGLE, arrowLength));
        Line2D secondBarb = new Line2D.Double(tip, barbEnd(tip, direction + ARROW_HEAD_ANGLE, arrowLength));
        return new Line2D[]{firstBarb, secondBarb};
    }


    /**
     * Walks {@code arrowLength} backwards from {@code tip} along the given heading.
     */
    private static Point2D barbEnd(Point2D tip, double heading, double arrowLength) {

        return new Point2D.Double(tip.getX() - arrowLength * Math.cos(heading), tip.getY() - arrowLength * Math.sin(heading));
    }

}

View File

@ -7,7 +7,6 @@ import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@ -21,12 +20,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import lombok.AccessLevel;
@ -36,72 +33,15 @@ import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
@Setter
@Getter
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutparsingVisualizations {
public class LayoutDebugLayer extends LayoutDebugLayerConfig {
static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
static final Color WORDS_COLOR = new Color(68, 84, 147);
static final Color LINES_COLOR = new Color(152, 45, 179);
static final Color ZONES_COLOR = new Color(131, 38, 38);
static final Color RULINGS_COLOR = new Color(21, 221, 174);
static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
static final Color CELLS_COLOR = new Color(31, 214, 27);
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
new Color(255, 195, 0),
new Color(76, 175, 80),
new Color(33, 150, 243),
new Color(155, 89, 182),
new Color(233, 30, 99),
new Color(0, 188, 212),
new Color(121, 85, 72));
@Setter
boolean active;
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
public Stream<Visualizations> streamAll() {
if (!active) {
return Stream.empty();
}
return Stream.of(characters, //
neighbours,//
words, //
lines, //
zones, //
rulings, //
clean_rulings, //
cells, //
mainBody, //
markedContent //
);
}
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
@ -130,6 +70,7 @@ public class LayoutparsingVisualizations {
.toList());
}
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {
if (!active) {
@ -137,8 +78,7 @@ public class LayoutparsingVisualizations {
}
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
visualizationsOnPage.getColoredLines()
.addAll(rulings
.stream()
.addAll(rulings.stream()
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
.toList());
}
@ -295,16 +235,4 @@ public class LayoutparsingVisualizations {
}
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) {
return visualizations.getVisualizationsOnPages()
.get(page - 1);
}
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage);
return visualizationsOnPage;
}
}

View File

@ -0,0 +1,430 @@
package com.knecon.fforesight.service.layoutparser.processor.visualization;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutGrid extends LayoutGridLayerConfig {
@Getter
@Setter
boolean visibleByDefault;
final HashMap<RectangleIdentifier, Rectangle2D> rectangleMap = new HashMap<>();
/**
 * Draws the bounding boxes of a paragraph on the paragraphs layer.
 * A {@link DuplicatedParagraph} is drawn in its own color so it can be told apart.
 *
 * @param paragraph the paragraph to outline
 */
public void addParagraph(Paragraph paragraph) {

    Color outline = paragraph instanceof DuplicatedParagraph ? DUPLICATE_PARAGRAPH_COLOR : PARAGRAPH_COLOR;
    addAsRectangle(paragraph, paragraphs, outline);
}
/**
 * Draws the bounding boxes of an image. Full-page images go to the images layer,
 * every other image to the figures layer; both use the same color.
 *
 * @param image the image to outline
 */
public void addImage(Image image) {

    Visualizations layer = image.isFullPageImage() ? images : figures;
    addAsRectangle(image, layer, IMAGE_COLOR);
}
/**
 * Draws the bounding boxes of a headline on the headlines layer.
 *
 * @param headline the headline to outline
 */
public void addHeadline(Headline headline) {

    addAsRectangle(headline, headlines, HEADLINE_COLOR);
}
/**
 * Draws the bounding boxes of a header or footer node on the shared header/footer layer.
 * Both kinds of node are drawn in {@code HEADER_COLOR}.
 *
 * @param header the header or footer node to outline
 */
public void addHeaderOrFooter(SemanticNode header) {

    addAsRectangle(header, headerFooter, HEADER_COLOR);
}
/**
 * Writes the dotted tree id of a node next to its bounding box on the node's first page.
 * The node's own box serves both as the text box and as the parent box, with a fixed depth of 1.
 *
 * @param semanticNode the node whose tree id is written to the tree-id layer
 */
public void addTreeId(SemanticNode semanticNode) {

    Page firstPage = semanticNode.getFirstPage();
    Rectangle2D box = semanticNode.getBBox().get(firstPage);
    addPlacedText(firstPage, box, box, buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
}
/**
 * Visualizes a table on the tables layer: first its outline, then the inner row and column
 * separators, and finally a filled overlay for every header cell.
 *
 * @param table the table to draw
 */
public void addTable(Table table) {

    // outline
    addAsRectangle(table, tables, TABLE_COLOR);
    // grid inside the outline
    addInnerTableLines(table);
    // header highlighting
    addHeaderCells(table);
}
/**
 * Adds a filled rectangle to the tables layer for every (page, bounding box) pair of every
 * header cell of the table.
 *
 * @param table the table whose header cells are highlighted
 */
private void addHeaderCells(Table table) {

    table.streamHeaders()
            .flatMap(headerCell -> headerCell.getBBox().entrySet().stream())
            .forEach(boxOnPage -> getOrCreateVisualizationsOnPage(boxOnPage.getKey().getNumber(), tables)
                    .getFilledRectangles()
                    .add(new FilledRectangle(boxOnPage.getValue(), HEADER_CELL_COLOR, 0.1f)));
}
/**
 * Draws the frame of a section on the sections layer.
 * <p>
 * A section confined to one page gets a closed frame. A section spanning several pages gets a
 * frame that is open at the bottom on its first page, open at top and bottom on the pages in
 * between, and open at the top on its last page. For a {@link SuperSection} only the first page
 * is drawn.
 *
 * @param section the section (or super section) to draw
 */
public void addSection(SemanticNode section) {

    Map<Page, Rectangle2D> boxesByPage = section.getBBox();
    int ownDepth = section.getTreeId().size();
    // Deepest tree id among the nested sections; falls back to the own depth without any.
    Integer maxChildDepth = section.streamAllSubNodesOfType(NodeType.SECTION)
            .map(nested -> nested.getTreeId().size())
            .max(Comparator.naturalOrder())
            .orElse(ownDepth);
    Page firstPage = section.getFirstPage();
    String treeIdString = buildTreeIdString(section);

    if (boxesByPage.size() == 1) {
        handleSinglePage(section, firstPage, boxesByPage.get(firstPage), treeIdString, maxChildDepth, ownDepth);
        return;
    }

    // Pages after the first one, in ascending page-number order.
    List<Page> followingPages = boxesByPage.keySet()
            .stream()
            .sorted(Comparator.comparingInt(Page::getNumber))
            .collect(Collectors.toList());
    followingPages.remove(0);

    handleFirstPageOfSection(section, firstPage, boxesByPage.get(firstPage), treeIdString, maxChildDepth, ownDepth);
    if (section instanceof SuperSection) {
        return;
    }

    int lastIndex = followingPages.size() - 1;
    for (int i = 0; i < lastIndex; i++) {
        Page middlePage = followingPages.get(i);
        handleForMiddlePageOfSection(section, middlePage, boxesByPage.get(middlePage), treeIdString, maxChildDepth, ownDepth);
    }
    Page lastPage = followingPages.get(lastIndex);
    handleLastPageOfSection(section, lastPage, boxesByPage.get(lastPage), treeIdString, maxChildDepth, ownDepth);
}
/**
 * Renders the tree id of a node as its elements joined by dots, e.g. {@code 1.2.3}.
 *
 * @param semanticNode the node whose tree id is rendered
 * @return the dotted representation; an empty string for an empty tree id
 */
private String buildTreeIdString(SemanticNode semanticNode) {

    List<String> parts = new ArrayList<>();
    for (Object index : semanticNode.getTreeId()) {
        parts.add(index.toString());
    }
    return String.join(".", parts);
}
/**
 * Places a label next to a box on the given layer, taking the page rotation into account.
 * <p>
 * The anchor takes one coordinate from {@code highestParentRect} (or from {@code textBBox} when
 * no parent rectangle is given) and the other one from {@code textBBox}; it is then shifted by
 * the rendered text width plus line width plus {@code 2 * maxChildDepth}, and by the font size.
 * If a text whose line start lies within one font size (in y) of the new one already exists on
 * that page, only the longer of the two strings is kept.
 *
 * @param page              page the text is placed on
 * @param textBBox          box the text belongs to
 * @param highestParentRect rectangle of the outermost parent, may be {@code null}
 * @param s                 the text to place
 * @param maxChildDepth     depth value that pushes the text further away from the box
 * @param visualizations    layer receiving the text
 * @param color             text color
 */
@SneakyThrows
private void addPlacedText(Page page, Rectangle2D textBBox, Rectangle2D highestParentRect, String s, Integer maxChildDepth, Visualizations visualizations, Color color) {

    // translates text, such that its right edge is a bit to the left of the drawn box
    float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + LINE_WIDTH + 2 * maxChildDepth);

    // One anchor coordinate follows the outermost parent if there is one, the other always the text box.
    Rectangle2D parentOrOwnBox = highestParentRect != null ? highestParentRect : textBBox;

    Point2D anchor;
    Point2D shift;
    switch (page.getRotation()) {
        case 90 -> {
            anchor = new Point2D.Double(parentOrOwnBox.getMinX(), textBBox.getMinY());
            shift = new Point2D.Double(FONT_SIZE, -translationAmount);
        }
        case 180 -> {
            anchor = new Point2D.Double(parentOrOwnBox.getMaxX(), textBBox.getMinY());
            shift = new Point2D.Double(translationAmount, FONT_SIZE);
        }
        case 270 -> {
            anchor = new Point2D.Double(parentOrOwnBox.getMaxX(), textBBox.getMaxY());
            shift = new Point2D.Double(-FONT_SIZE, translationAmount);
        }
        default -> {
            anchor = new Point2D.Double(parentOrOwnBox.getMinX(), textBBox.getMaxY());
            shift = new Point2D.Double(-translationAmount, -FONT_SIZE);
        }
    }
    Point2D upperLeftCorner = add(anchor, shift);

    List<PlacedText> textsOnPage = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getPlacedTexts();
    PlacedText candidate = PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, color, FONT);

    // First already placed text that sits on (nearly) the same height as the candidate.
    PlacedText clash = null;
    for (PlacedText placed : textsOnPage) {
        if (Math.abs(placed.lineStart().getY() - candidate.lineStart().getY()) <= FONT_SIZE) {
            clash = placed;
            break;
        }
    }

    if (clash == null) {
        textsOnPage.add(candidate);
        return;
    }
    // On a clash the longer string wins; an equally long or shorter candidate is dropped.
    if (candidate.text().length() > clash.text().length()) {
        textsOnPage.remove(clash);
        textsOnPage.add(candidate);
    }
}
/**
 * Draws a section that lives on a single page: the label is placed and all four border lines
 * are added to the sections layer in the order they are returned.
 */
private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {

    RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, maxChildDepth, ownDepth);
    List<ColoredLine> target = result.coloredLines();
    // closed frame: every border is drawn
    result.pageLines()
            .stream()
            .map(border -> new ColoredLine(border, SECTION_COLOR, LINE_WIDTH))
            .forEach(target::add);
}
/**
 * Draws the first page of a multi-page section: the label is placed and the frame is drawn
 * without its bottom border (third entry of the border list), leaving it open towards the next page.
 */
private void handleFirstPageOfSection(SemanticNode semanticNode, Page firstPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {

    RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
    List<Line2D> borders = result.pageLines();
    // drop the bottom border, keep top, right and left in that order
    borders.remove(2);
    for (Line2D border : borders) {
        result.coloredLines().add(new ColoredLine(border, SECTION_COLOR, LINE_WIDTH));
    }
}
/**
 * Draws a page between the first and the last page of a multi-page section: the label is placed
 * and only the two side borders are drawn, the left one (fourth entry) before the right one
 * (second entry).
 */
private void handleForMiddlePageOfSection(SemanticNode semanticNode, Page middlePage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {

    RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
    List<Line2D> borders = result.pageLines();
    List<ColoredLine> target = result.coloredLines();
    // top (index 0) and bottom (index 2) stay open
    target.add(new ColoredLine(borders.get(3), SECTION_COLOR, LINE_WIDTH));
    target.add(new ColoredLine(borders.get(1), SECTION_COLOR, LINE_WIDTH));
}
/**
 * Draws the last page of a multi-page section: the label is placed and the frame is drawn
 * without its top border. The left border (fourth entry) is added first, followed by the right
 * (second entry) and the bottom one (third entry).
 */
private void handleLastPageOfSection(SemanticNode semanticNode, Page lastPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {

    RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
    List<Line2D> borders = result.pageLines();
    List<ColoredLine> target = result.coloredLines();
    // top (index 0) stays open towards the previous page
    target.add(new ColoredLine(borders.get(3), SECTION_COLOR, LINE_WIDTH));
    target.add(new ColoredLine(borders.get(1), SECTION_COLOR, LINE_WIDTH));
    target.add(new ColoredLine(borders.get(2), SECTION_COLOR, LINE_WIDTH));
}
/**
 * Shared preparation for drawing a section on one page: places the tree-id label and computes
 * the four border lines of the section's padded bounding box.
 * <p>
 * The padding grows with {@code maxChildDepth - ownDepth}. The label is aligned with the
 * rectangle remembered for the node's highest parent on this page, if one was remembered. The
 * padded rectangle of a {@link SuperSection} is remembered only after that lookup.
 *
 * @return the colored-line list of the sections layer for this page, the padded rectangle and
 *         its border lines in page-view order
 */
private RectangleAndLinesResult createLinesAndPlaceText(SemanticNode semanticNode,
                                                        Page page,
                                                        Rectangle2D rectangle2D,
                                                        String treeIdString,
                                                        Integer maxChildDepth,
                                                        Integer ownDepth) {

    List<ColoredLine> sectionLines = getOrCreateVisualizationsOnPage(page.getNumber(), sections).getColoredLines();

    int depthBelow = maxChildDepth - ownDepth;
    var padding = LINE_WIDTH * (1 + depthBelow);
    Rectangle2D paddedBox = RectangleTransformations.pad(semanticNode.getBBox().get(page), padding, padding);

    RectangleIdentifier parentKey = new RectangleIdentifier(semanticNode.getHighestParent().getTreeId(), page.getNumber());
    Rectangle2D highestParentRect = rectangleMap.get(parentKey);
    addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, sections, SECTION_COLOR);

    List<Line2D> borders = createLinesFromRectangle(paddedBox, page.getRotation());

    if (semanticNode instanceof SuperSection) {
        // remembered so that nested sections can align their labels with this frame
        rectangleMap.put(new RectangleIdentifier(semanticNode.getTreeId(), page.getNumber()), paddedBox);
    }
    return new RectangleAndLinesResult(sectionLines, paddedBox, borders);
}
/**
 * Draws the row and column separators of a table on every page it appears on.
 * <p>
 * Per page, the separator positions are read from the bounding boxes of the cells in column 0
 * and in the first row that has a column-0 cell on that page. The page rotation decides which of
 * the two supplies x positions and which supplies y positions, and which edge of each box is
 * used. The first position of each axis is skipped, and every separator spans the full table
 * bounding box on that page.
 *
 * @param table the table whose inner lines are drawn; tables without rows or columns are ignored
 */
private void addInnerTableLines(Table table) {

    if (table.getNumberOfCols() < 1 || table.getNumberOfRows() < 1) {
        return;
    }

    for (Page page : table.getPages()) {

        Optional<Integer> firstRow = table.streamCol(0)
                .filter(cell -> cell.isOnPage(page.getNumber()))
                .map(TableCell::getRow)
                .findFirst();
        if (firstRow.isEmpty()) {
            // no column-0 cell on this page, nothing to derive separators from
            continue;
        }
        int firstRowOnPage = firstRow.get();

        Stream<Double> columnEdges = switch (page.getRotation()) {
            case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
            case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
            case 270 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxX);
            default -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinX);
        };
        List<Double> verticalSeparatorXs = columnEdges.collect(Collectors.toList());
        // the first position is skipped
        verticalSeparatorXs.remove(0);

        Stream<Double> rowEdges = switch (page.getRotation()) {
            case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
            case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
            case 270 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxY);
            default -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxY);
        };
        List<Double> horizontalSeparatorYs = rowEdges.collect(Collectors.toList());
        horizontalSeparatorYs.remove(0);

        Rectangle2D tableBBox = table.getBBox().get(page);
        List<ColoredLine> target = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();

        for (double x : verticalSeparatorXs) {
            Line2D separator = new Line2D.Double(x, tableBBox.getMaxY(), x, tableBBox.getMinY());
            target.add(new ColoredLine(separator, INNER_LINES_COLOR, LINE_WIDTH));
        }
        for (double y : horizontalSeparatorYs) {
            Line2D separator = new Line2D.Double(tableBBox.getMinX(), y, tableBBox.getMaxX(), y);
            target.add(new ColoredLine(separator, INNER_LINES_COLOR, LINE_WIDTH));
        }
    }
}
/**
 * Narrows a stream of cells down to those on the given page and maps each to its bounding box
 * on that page.
 *
 * @param table the cells to inspect (despite the name, a stream of cells, not a table)
 * @param page  the page of interest
 * @return the bounding boxes on {@code page}, in the encounter order of the input stream
 */
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {

    int pageNumber = page.getNumber();
    return table.filter(cell -> cell.isOnPage(pageNumber))
            .map(cell -> cell.getBBox().get(page));
}
/**
 * Outlines every per-page bounding box of a node on the given layer.
 *
 * @param semanticNode   node whose bounding boxes are drawn
 * @param visualizations layer receiving the rectangles
 * @param color          outline color
 */
private void addAsRectangle(SemanticNode semanticNode, Visualizations visualizations, Color color) {

    Map<Page, Rectangle2D> boxesByPage = semanticNode.getBBox();
    addAsRectangle(boxesByPage, visualizations, color);
}
/**
 * Adds one outlined rectangle per map entry to the given layer, on the page of that entry.
 *
 * @param bbox           bounding boxes keyed by the page they lie on
 * @param visualizations layer receiving the rectangles
 * @param color          outline color; the stroke width is {@code LINE_WIDTH}
 */
private void addAsRectangle(Map<Page, Rectangle2D> bbox, Visualizations visualizations, Color color) {

    for (Map.Entry<Page, Rectangle2D> boxOnPage : bbox.entrySet()) {
        int pageNumber = boxOnPage.getKey().getNumber();
        getOrCreateVisualizationsOnPage(pageNumber, visualizations).getColoredRectangles()
                .add(new ColoredRectangle(boxOnPage.getValue(), color, LINE_WIDTH));
    }
}
/**
 * Bundle returned when preparing a section frame for one page.
 *
 * @param coloredLines the colored-line list of the layer the frame is drawn on
 * @param rectangle    the padded rectangle the border lines were derived from
 * @param pageLines    the border lines of {@code rectangle}
 */
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {

}
/**
 * Map key identifying the rectangle remembered for a node on one page.
 *
 * @param treeId     tree id of the node
 * @param pageNumber number of the page the rectangle belongs to
 */
private record RectangleIdentifier(List<Integer> treeId, Integer pageNumber) {

}
/*
A __________________ B
| |
| |
| |
| |
D|__________________| C
The returned list contains the lines [AB, BC, DC, AD].
The list is rotated according to the page rotation, such that the order of the returned lines is always as viewed on the page.
*/
/**
 * Builds the four border lines of a rectangle. The two lines along minY and maxY are extended
 * by half a line width at both ends so that they join the side lines.
 *
 * @param r            the rectangle to outline
 * @param pageRotation page rotation in degrees; 90, 180 and 270 rotate the list by one, two and
 *                     three positions respectively, any other value leaves it as built
 * @return a mutable list of four lines
 */
private List<Line2D> createLinesFromRectangle(Rectangle2D r, int pageRotation) {

    // half a line width, used to make the corners join
    float overhang = LINE_WIDTH * 0.5f;

    float left = (float) r.getMinX();
    float right = (float) r.getMaxX();
    float upper = (float) r.getMaxY();
    float lower = (float) r.getMinY();

    List<Line2D> borders = new ArrayList<>(4);
    borders.add(new Line2D.Float(left - overhang, upper, right + overhang, upper));
    borders.add(new Line2D.Float(right, upper, right, lower));
    borders.add(new Line2D.Float(left - overhang, lower, right + overhang, lower));
    borders.add(new Line2D.Float(left, upper, left, lower));

    int shift = switch (pageRotation) {
        case 90 -> 1;
        case 180 -> 2;
        case 270 -> 3;
        default -> 0;
    };
    // a shift of 0 leaves the list untouched
    Collections.rotate(borders, shift);
    return borders;
}
/**
 * Adds two points component-wise.
 *
 * @param a first point
 * @param b second point, used as an offset
 * @return a new point {@code (a.x + b.x, a.y + b.y)}; neither argument is modified
 */
private Point2D add(Point2D a, Point2D b) {

    double x = a.getX() + b.getX();
    double y = a.getY() + b.getY();
    return new Point2D.Double(x, y);
}
}

View File

@ -38,7 +38,7 @@ dependencies {
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("com.pdftron:PDFNet:10.5.0")
implementation("com.pdftron:PDFNet:10.7.0")
// for integration testing only
testImplementation(project(":viewer-doc-processor"))

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.server;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.google.common.base.Strings;
@ -17,7 +18,8 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class PDFNetInitializer {
private final LayoutparserSettings settings;
@Value("${pdftron.license:}")
private String pdftronLicense;
@SneakyThrows
@ -25,13 +27,13 @@ public class PDFNetInitializer {
// Do not change this back to an application runner: as an application runner, messages are taken from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
public void init() {
if (Strings.isNullOrEmpty(settings.getPdftronLicense())) {
if (Strings.isNullOrEmpty(pdftronLicense)) {
return;
}
log.info("Initializing Native Libraries");
log.info("Setting pdftron license: {}", settings.getPdftronLicense());
log.info("Setting pdftron license: {}", pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.initialize(settings.getPdftronLicense());
PDFNet.initialize(pdftronLicense);
}

View File

@ -50,7 +50,7 @@ public class BdrJsonBuildTest extends AbstractTest {
protected Document buildGraph(File file) {
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
file,
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -97,7 +97,7 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE,
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -12,6 +12,7 @@ import java.util.Map;
import java.util.function.Predicate;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
@ -32,18 +33,29 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import jakarta.annotation.PostConstruct;
import lombok.SneakyThrows;
public class OutlineDetectionTest extends AbstractTest {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
@Autowired
protected LayoutParsingPipeline layoutParsingPipeline;
@Autowired
PDFNetInitializer pdfNetInitializer;
@BeforeEach
public void init() {

    // Delegates to the injected initializer before each test.
    pdfNetInitializer.init();
}
@Test
@SneakyThrows
@ -60,28 +72,17 @@ public class OutlineDetectionTest extends AbstractTest {
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(1).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(3).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(4).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(5).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(6).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(7).size(), 3);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(8).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(10).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(11).size(), 4);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(12).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
.get(13).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(1).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(3).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 3);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(10).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 4);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 2);
assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
.stream()
.flatMap(Collection::stream)
@ -98,29 +99,15 @@ public class OutlineDetectionTest extends AbstractTest {
.stream()
.map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
.toList());
assertEquals(tableOfContents.getMainSections()
.get(5).getChildren().size(), 6);
assertEquals(tableOfContents.getMainSections()
.get(7).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren()
.get(2).getChildren().size(), 1);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren()
.get(2).getChildren()
.get(0).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections().get(5).getChildren().size(), 6);
assertEquals(tableOfContents.getMainSections().get(7).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().size(), 1);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().size(), 3);
assertEquals(tableOfContents.getMainSections()
.get(0).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections()
.get(6).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections()
.get(8).getChildren()
.get(2).getChildren()
.get(0).getChildren()
.get(2).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections().get(0).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections().get(6).getImages().size(), 1);
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().get(2).getImages().size(), 1);
Document document = buildGraph(fileName, classificationDocument);
@ -159,17 +146,14 @@ public class OutlineDetectionTest extends AbstractTest {
.count(), 3 + 1);
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
.filter(isSectionOrSuperSection)
.toList()
.get(3).streamChildren()
.toList().get(3).streamChildren()
.filter(isSectionOrSuperSection)
.count(), 1 + 1);
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
.filter(isSectionOrSuperSection)
.toList()
.get(3).streamChildren()
.toList().get(3).streamChildren()
.filter(isSectionOrSuperSection)
.toList()
.get(1).streamChildren()
.toList().get(1).streamChildren()
.filter(isSectionOrSuperSection)
.count(), 3 + 1);

View File

@ -3,11 +3,8 @@ package com.knecon.fforesight.service.layoutparser.server;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
@ -16,12 +13,8 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorageService;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
@ -30,7 +23,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedS
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import io.micrometer.observation.Observation;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;

View File

@ -1,71 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
/**
 * Unit tests for {@link Boundary}; every test runs against a fixture created as {@code new Boundary(10, 100)}.
 */
class BoundaryTest {

    Boundary startBoundary;


    @BeforeEach
    void setUp() {

        startBoundary = new Boundary(10, 100);
    }


    @Test
    void testContains() {

        // single indices
        for (int inside : new int[]{11, 50}) {
            assertTrue(startBoundary.contains(inside));
        }
        for (int outside : new int[]{9, 100, 150, -123}) {
            assertFalse(startBoundary.contains(outside));
        }

        // whole boundaries
        for (Boundary enclosed : List.of(new Boundary(11, 99), new Boundary(10, 100), new Boundary(11, 11))) {
            assertTrue(startBoundary.contains(enclosed));
        }

        // start/end pairs
        assertTrue(startBoundary.contains(100, 100));
        assertFalse(startBoundary.contains(9, 100));
        assertFalse(startBoundary.contains(100, 101));
        assertFalse(startBoundary.contains(150, 151));
    }


    @Test
    void testIntersects() {

        for (Boundary overlapping : List.of(new Boundary(1, 11), new Boundary(11, 12), new Boundary(11, 100), new Boundary(99, 101))) {
            assertTrue(startBoundary.intersects(overlapping));
        }
        // only touches the end of the fixture
        assertFalse(startBoundary.intersects(new Boundary(100, 101)));
    }


    @Test
    void testSplit() {

        List<Integer> threeCuts = List.of(12, 40, 90);
        assertEquals(4, startBoundary.split(threeCuts).size());
        assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(threeCuts));
        assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40)));

        // no cut, or a cut at the start, yields a single boundary
        assertEquals(1, startBoundary.split(Collections.emptyList()).size());
        assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size());

        // these cut lists are rejected
        assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0)));
        assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100)));
        assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100)));
    }

}

View File

@ -57,7 +57,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
private void writeJsons(Path filename) {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -0,0 +1,71 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
/**
 * Unit tests for {@link TextRange}; every test runs against a fixture created as {@code new TextRange(10, 100)}.
 */
class TextRangeTest {

    TextRange startTextRange;


    @BeforeEach
    void setUp() {

        startTextRange = new TextRange(10, 100);
    }


    @Test
    void testContains() {

        // single indices
        for (int inside : new int[]{11, 50}) {
            assertTrue(startTextRange.contains(inside));
        }
        for (int outside : new int[]{9, 100, 150, -123}) {
            assertFalse(startTextRange.contains(outside));
        }

        // whole ranges
        for (TextRange enclosed : List.of(new TextRange(11, 99), new TextRange(10, 100), new TextRange(11, 11))) {
            assertTrue(startTextRange.contains(enclosed));
        }

        // start/end pairs
        assertTrue(startTextRange.contains(100, 100));
        assertFalse(startTextRange.contains(9, 100));
        assertFalse(startTextRange.contains(100, 101));
        assertFalse(startTextRange.contains(150, 151));
    }


    @Test
    void testIntersects() {

        for (TextRange overlapping : List.of(new TextRange(1, 11), new TextRange(11, 12), new TextRange(11, 100), new TextRange(99, 101))) {
            assertTrue(startTextRange.intersects(overlapping));
        }
        // only touches the end of the fixture
        assertFalse(startTextRange.intersects(new TextRange(100, 101)));
    }


    @Test
    void testSplit() {

        List<Integer> threeCuts = List.of(12, 40, 90);
        assertEquals(4, startTextRange.split(threeCuts).size());
        assertEquals(List.of(new TextRange(10, 12), new TextRange(12, 40), new TextRange(40, 90), new TextRange(90, 100)), startTextRange.split(threeCuts));
        assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));

        // no cut, or a cut at the start, yields a single range
        assertEquals(1, startTextRange.split(Collections.emptyList()).size());
        assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());

        // these cut lists are rejected
        assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
        assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
        assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
    }

}

View File

@ -4,11 +4,18 @@ import java.io.File;
import java.nio.file.Path;
import java.util.Map;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.mockito.MockitoAnnotations;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
@ -16,17 +23,30 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import com.pdftron.pdf.PDFNet;
import jakarta.annotation.PostConstruct;
import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
@Autowired
PDFNetInitializer pdfNetInitializer;
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
@BeforeEach
public void init() {
pdfNetInitializer.init();
}
@Test
@SneakyThrows
public void testViewerDocument() {
@ -63,7 +83,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);

View File

@ -28,6 +28,11 @@ spring:
max-interval: 15000
prefetch: 1
layoutparser:
debug: true
pdftron.license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a
management:
endpoint:
metrics.enabled: ${monitoring.enabled:false}

View File

@ -12,7 +12,7 @@ dependencies {
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("org.slf4j:slf4j-api:1.7.25")
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("com.pdftron:PDFNet:10.5.0")
implementation("com.pdftron:PDFNet:10.7.0")
testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1")
testImplementation("org.junit.jupiter:junit-jupiter")

View File

@ -1,72 +0,0 @@
package com.knecon.fforesight.service.viewerdoc;
import java.util.List;
import org.apache.pdfbox.cos.COSName;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
/**
 * Registry of the identifiers used to classify the content streams of a page.
 * <p>
 * Every {@link Identifier} pairs a display name with the {@link COSName} written into the PDF and a flag
 * telling whether the stream is handled as optional content.
 * <p>
 * NOTE(review): the constants are declared {@code final} explicitly; Lombok's {@code @FieldDefaults}
 * appears to apply only to instance fields, so it should not be relied on for the static ones - confirm.
 */
@FieldDefaults(makeFinal = true, level = AccessLevel.PUBLIC)
public class ContentStreams {

    public static final Identifier KNECON_LAYOUT = new Identifier("Layout grid", COSName.getPDFName("KNECON_LAYOUT"), true);
    public static final Identifier KNECON_VISUAL_PARSING = new Identifier("Layout grid - visual", COSName.getPDFName("KNECON_VISUAL_PARSING"), true);
    public static final Identifier KNECON_OCR = new Identifier("OCR", COSName.getPDFName("KNECON_OCR"), false);
    public static final Identifier KNECON_OCR_TEXT_DEBUG = new Identifier("OCR Text", COSName.getPDFName("KNECON_OCR_TEXT_DEBUG"), true);
    public static final Identifier KNECON_OCR_BBOX_DEBUG = new Identifier("OCR Boxes", COSName.getPDFName("KNECON_OCR_BBOX_DEBUG"), true);
    public static final Identifier OTHER = new Identifier("other", COSName.getPDFName("OTHER"), false);
    public static final Identifier ESCAPE_START = new Identifier("escape start", COSName.getPDFName("ESCAPE_START"), false);
    // The display name used to be a copy of ESCAPE_START's ("escape start").
    public static final Identifier ESCAPE_END = new Identifier("escape end", COSName.getPDFName("ESCAPE_END"), false);
    public static final Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
    public static final Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
    public static final Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
    public static final Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
    public static final Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
    public static final Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
    public static final Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
    public static final Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
    public static final Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
    public static final Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);

    /** Every identifier declared above, as an immutable list. */
    public static final List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
            KNECON_VISUAL_PARSING,
            KNECON_OCR,
            KNECON_OCR_BBOX_DEBUG,
            KNECON_OCR_TEXT_DEBUG,
            OTHER,
            ESCAPE_START,
            ESCAPE_END,
            RULINGS,
            CLEAN_RULINGS,
            WORDS,
            ZONES,
            LINES,
            MAIN_BODY,
            MARKED_CONTENT,
            NEIGHBOURS,
            CHARACTERS,
            CELLS);

    /**
     * Classification of a content stream.
     *
     * @param name            human-readable name of the stream
     * @param cosName         name under which the stream is marked in the PDF
     * @param optionalContent whether the stream is treated as optional content
     */
    public record Identifier(String name, COSName cosName, boolean optionalContent) {

    }

}

View File

@ -0,0 +1,77 @@
package com.knecon.fforesight.service.viewerdoc;
import org.apache.pdfbox.cos.COSName;
/**
 * Identifier used to mark content in the PDF such that it may be found again later.
 * <p>
 * The marked-content name must therefore be unique. The display {@code name} is only used to list the layer
 * in the optional content view of the PDF; it may be {@code null} for layers that are not written as
 * optional content groups (see the OCR identifiers below), since those are never listed in that view.
 * <p>
 * Note that the canonical constructor takes the <em>unprefixed</em> marked-content name, while the
 * {@link #markedContentName()} accessor and {@link #cosName()} both expose it with
 * {@link #KNECON_IDENTIFIER_PREFIX} prepended. Record equality is still based on the unprefixed component.
 *
 * @param name              display name of the layer, may be {@code null} for non-optional content
 * @param markedContentName unique marked-content name, without the {@code KNECON_} prefix
 */
public record LayerIdentifier(String name, String markedContentName) {

    /** Prefix used to identify marked contents as knecon contents later on. */
    public static final String KNECON_IDENTIFIER_PREFIX = "KNECON_";


    /**
     * @return the marked-content name including the {@link #KNECON_IDENTIFIER_PREFIX}
     */
    public String markedContentName() {

        return KNECON_IDENTIFIER_PREFIX + markedContentName;
    }


    /**
     * @return the prefixed marked-content name as a {@link COSName}, so that it agrees with {@link #markedContentName()}
     */
    public COSName cosName() {

        // Use the accessor rather than the raw component: the raw component lacks the KNECON_ prefix.
        return COSName.getPDFName(markedContentName());
    }


    public static final LayerIdentifier KNECON_OCR = new LayerIdentifier(null, "OCR");
    public static final LayerIdentifier KNECON_OCR_TEXT = new LayerIdentifier(null, "OCR_TEXT");
    public static final LayerIdentifier KNECON_OCR_LINES = new LayerIdentifier(null, "OCR_LINES");

    // layers

    // layout grid
    public static final LayerIdentifier KNECON_LAYOUT = new LayerIdentifier("Layout grid", "LAYOUT");
    public static final LayerIdentifier KNECON_LAYOUT_SECTION = new LayerIdentifier("Section", "LAYOUT_SECTION");
    // NOTE(review): the two display names below end with a blank - looks unintentional, confirm before changing.
    public static final LayerIdentifier KNECON_LAYOUT_PARAGRAPH = new LayerIdentifier("Paragraph ", "LAYOUT_PARAGRAPH");
    public static final LayerIdentifier KNECON_LAYOUT_KEY_VALUE = new LayerIdentifier("Key-Value Pairs ", "LAYOUT_KEY_VALUE");
    public static final LayerIdentifier KNECON_LAYOUT_HEADLINE = new LayerIdentifier("Headline", "LAYOUT_HEADLINE");
    public static final LayerIdentifier KNECON_LAYOUT_HEADER_FOOTER = new LayerIdentifier("Header/Footer", "LAYOUT_HEADER_FOOTER");
    public static final LayerIdentifier KNECON_LAYOUT_TABLE = new LayerIdentifier("Tables", "LAYOUT_TABLE");
    public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
    public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
    public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");

    // layout grid debug
    public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
    public static final LayerIdentifier CLEAN_RULINGS = new LayerIdentifier("Classified Rulings", "CLEAN_RULINGS");
    public static final LayerIdentifier RULINGS = new LayerIdentifier("Rulings", "RULINGS");
    public static final LayerIdentifier WORDS = new LayerIdentifier("Words", "WORDS");
    public static final LayerIdentifier ZONES = new LayerIdentifier("Text Zones", "ZONES");
    public static final LayerIdentifier LINES = new LayerIdentifier("Text Lines", "LINES");
    public static final LayerIdentifier CELLS = new LayerIdentifier("Cells", "CELLS");
    public static final LayerIdentifier MAIN_BODY = new LayerIdentifier("Main Text Body", "MAIN_BODY");
    public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT");
    public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
    public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");

    public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");

    // ocr
    public static final LayerIdentifier KNECON_OCR_DEBUG = new LayerIdentifier("OCR", "OCR_DEBUG");
    public static final LayerIdentifier KNECON_OCR_TEXT_DEBUG = new LayerIdentifier("OCR Text", "OCR_TEXT_DEBUG");
    public static final LayerIdentifier KNECON_OCR_BBOX_DEBUG = new LayerIdentifier("OCR Words", "OCR_BBOX_DEBUG");
    public static final LayerIdentifier KNECON_OCR_LINE_DEBUG = new LayerIdentifier("OCR Lines", "OCR_LINE_DEBUG");
    public static final LayerIdentifier KNECON_OCR_OVERLAPPED_TEXT = new LayerIdentifier("OCR overlapped Text", "OCR_OVERLAPPED_TEXT_DEBUG");

    // azure idp
    public static final LayerIdentifier KNECON_AZURE_IDP = new LayerIdentifier("IDP", "IDP");
    public static final LayerIdentifier IDP_FIGURES = new LayerIdentifier("IDP Figures", "IDP_FIGURES");
    public static final LayerIdentifier IDP_TABLES = new LayerIdentifier("IDP Tables", "IDP_TABLES");
    public static final LayerIdentifier IDP_KV_PAIRS = new LayerIdentifier("IDP Key Value Pair", "IDP_KV_PAIRS");
    public static final LayerIdentifier IDP_SECTIONS = new LayerIdentifier("IDP Sections", "IDP_SECTIONS");
    public static final LayerIdentifier IDP_LINES = new LayerIdentifier("IDP Lines", "IDP_LINES");
    public static final LayerIdentifier IDP_PARAGRAPHS = new LayerIdentifier("IDP Paragraphs", "IDP_PARAGRAPHS");
    public static final LayerIdentifier IDP_LIST = new LayerIdentifier("IDP Lists", "IDP_LISTS");
    public static final LayerIdentifier IDP_BARCODES = new LayerIdentifier("IDP Barcodes", "IDP_BARCODES");

}

View File

@ -0,0 +1,19 @@
package com.knecon.fforesight.service.viewerdoc.layers;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
/**
 * Base class for {@link LayerGroup} implementations, providing shared helpers for filling
 * {@link Visualizations}.
 */
public abstract class AbstractLayerGroup implements LayerGroup {

    /**
     * Returns the {@link VisualizationsOnPage} registered for the given page, creating and registering an
     * empty one if the page has no entry yet.
     *
     * @param page           key of the page in {@link Visualizations#getVisualizationsOnPages()}
     * @param visualizations the layer whose per-page map is read and, if necessary, extended
     * @return the existing or freshly created entry for {@code page}
     */
    protected VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {

        // Single lookup instead of containsKey + get + put.
        return visualizations.getVisualizationsOnPages()
                .computeIfAbsent(page, ignored -> VisualizationsOnPage.builder().build());
    }

}

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.viewerdoc.layers;
import java.awt.Color;
import java.util.List;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import lombok.Getter;
public class IdpLayerConfig extends AbstractLayerGroup {
@Getter
public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_AZURE_IDP;
public static final LayerGroup CONFIG_INSTANCE = new IdpLayerConfig();
protected final Visualizations figures = Visualizations.builder().layer(LayerIdentifier.IDP_FIGURES).visibleByDefault(true).build();
protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.IDP_TABLES).visibleByDefault(true).build();
protected final Visualizations keyValuePairs = Visualizations.builder().layer(LayerIdentifier.IDP_KV_PAIRS).visibleByDefault(true).build();
protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.IDP_PARAGRAPHS).build();
protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.IDP_SECTIONS).build();
protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.IDP_LINES).build();
protected final Visualizations lists = Visualizations.builder().layer(LayerIdentifier.IDP_LIST).visibleByDefault(true).build();
protected final Visualizations barcodes = Visualizations.builder().layer(LayerIdentifier.IDP_BARCODES).visibleByDefault(true).build();
protected static final Color TABLE_COLOR = new Color(102, 205, 170);
protected static final Color INNER_LINES_COLOR = new Color(255, 175, 175);
protected static final Color SECTION_COLOR = new Color(50, 50, 50);
protected static final Color SECTION_HEADING_COLOR = new Color(162, 56, 56);
protected static final Color TITLE_COLOR = new Color(221, 25, 25);
protected static final Color HEADER_FOOTER_COLOR = new Color(171, 131, 6);
protected static final Color FOOTNOTE_COLOR = new Color(6, 64, 171);
protected static final Color FORMULA_COLOR = new Color(80, 171, 6);
protected static final Color PARAGRAPH_COLOR = new Color(70, 130, 180);
protected static final Color IMAGE_COLOR = new Color(253, 63, 146);
protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85);
protected static final Color KEY_COLOR = new Color(30, 92, 172);
protected static final Color VALUE_COLOR = new Color(30, 172, 146);
protected static final Color LINES_COLOR = new Color(152, 45, 179);
@Override
public List<Visualizations> getVisualizations() {
return List.of(paragraphs, sections, figures, tables, keyValuePairs, lines, lists, barcodes);
}
}

View File

@ -0,0 +1,62 @@
package com.knecon.fforesight.service.viewerdoc.layers;
import java.util.List;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
/**
 * A group of visualization layers that is identified by its own {@link LayerIdentifier} and consists of
 * one {@link Visualizations} per sub layer.
 */
public interface LayerGroup {

    /**
     * @return the identifier of the group itself
     */
    LayerIdentifier getGroupIdentifier();


    /**
     * @return the visualizations belonging to this group
     */
    List<Visualizations> getVisualizations();


    /**
     * @return the layer identifiers of all visualizations of this group, in the order of {@link #getVisualizations()}
     */
    default List<LayerIdentifier> getSubLayers() {

        List<Visualizations> groupVisualizations = getVisualizations();
        return groupVisualizations.stream()
                .map(Visualizations::getLayer)
                .toList();
    }


    /**
     * @return whether the group is visible by default; {@code false} unless overridden
     */
    default boolean isVisibleByDefault() {

        return false;
    }


    /**
     * Indicates that the sub layers are all optional content.
     *
     * @return {@code true} unless overridden
     */
    default boolean subLayersAreOptionalContent() {

        return true;
    }


    /**
     * Indicates that the LayerGroup is also an optional content group, and should be displayed as such:
     * <pre>
     * layer
     *  - sublayer0
     *  - sublayer1
     * </pre>
     * See the note in specification 8.11.4.3 (presumably of the PDF specification).
     *
     * @return {@code true} unless overridden
     */
    default boolean isOptionalContent() {

        return true;
    }


    /**
     * @return {@code true} if {@link #getVisualizations()} returns an empty list
     */
    default boolean isEmpty() {

        List<Visualizations> groupVisualizations = getVisualizations();
        return groupVisualizations.isEmpty();
    }

}

View File

@ -0,0 +1,73 @@
package com.knecon.fforesight.service.viewerdoc.layers;
import java.awt.Color;
import java.util.List;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import lombok.Getter;
/**
 * Layer group for the layout debug visualizations, identified by {@link LayerIdentifier#KNECON_LAYOUT_DEBUG}.
 * <p>
 * Declares one {@link Visualizations} per debug sub layer together with the font and colours for them.
 * Only the words layer is marked visible by default.
 */
public class LayoutDebugLayerConfig extends AbstractLayerGroup {

    @Getter
    public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_LAYOUT_DEBUG;

    /** Shared, unpopulated instance describing the layer structure of this group. */
    public static final LayerGroup CONFIG_INSTANCE = new LayoutDebugLayerConfig();

    protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();

    protected static final Color WORDS_COLOR = new Color(68, 84, 147);
    protected static final Color LINES_COLOR = new Color(152, 45, 179);
    protected static final Color ZONES_COLOR = new Color(131, 38, 38);

    protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
    protected static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
    protected static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
    protected static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
    protected static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
    // Spelling (sic) kept: the constant is protected and may be referenced by subclasses.
    protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);

    protected static final Color CELLS_COLOR = new Color(31, 214, 27);

    protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
    protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);

    /** Palette of eight colours; the name suggests they are cycled through per character. */
    protected static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
            new Color(255, 195, 0),
            new Color(76, 175, 80),
            new Color(33, 150, 243),
            new Color(155, 89, 182),
            new Color(233, 30, 99),
            new Color(0, 188, 212),
            new Color(121, 85, 72));

    protected final Visualizations words = Visualizations.builder().layer(LayerIdentifier.WORDS).visibleByDefault(true).build();
    protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.LINES).build();
    protected final Visualizations zones = Visualizations.builder().layer(LayerIdentifier.ZONES).build();
    protected final Visualizations mainBody = Visualizations.builder().layer(LayerIdentifier.MAIN_BODY).build();
    protected final Visualizations clean_rulings = Visualizations.builder().layer(LayerIdentifier.CLEAN_RULINGS).build();
    protected final Visualizations rulings = Visualizations.builder().layer(LayerIdentifier.RULINGS).build();
    protected final Visualizations cells = Visualizations.builder().layer(LayerIdentifier.CELLS).build();
    protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build();
    protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
    protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();


    /**
     * @return the sub layers of this group, from characters up to marked content
     */
    @Override
    public List<Visualizations> getVisualizations() {

        return List.of(characters, //
                neighbours,//
                words, //
                lines, //
                zones, //
                rulings, //
                clean_rulings, //
                cells, //
                mainBody, //
                markedContent //
        );
    }

}

View File

@ -0,0 +1,55 @@
package com.knecon.fforesight.service.viewerdoc.layers;
import java.awt.Color;
import java.util.List;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import lombok.Getter;
/**
 * Layer group for the layout grid, identified by {@link LayerIdentifier#KNECON_LAYOUT}.
 * <p>
 * Declares one {@link Visualizations} per layout grid sub layer together with the font, sizes and
 * colours for them.
 */
public class LayoutGridLayerConfig extends AbstractLayerGroup {

    @Getter
    public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_LAYOUT;

    /** Shared, unpopulated instance describing the layer structure of this group. */
    public static final LayerGroup CONFIG_INSTANCE = new LayoutGridLayerConfig();

    // --- text and stroke settings ---
    protected static final float FONT_SIZE = 10f;
    protected static final float LINE_WIDTH = 1f;
    protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();

    // --- colour palette ---
    protected static final Color INNER_LINES_COLOR = new Color(255, 175, 175);
    protected static final Color HEADER_CELL_COLOR = new Color(156, 21, 48);
    protected static final Color PARAGRAPH_COLOR = new Color(70, 130, 180);
    protected static final Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101);
    protected static final Color TABLE_COLOR = new Color(102, 205, 170);
    protected static final Color SECTION_COLOR = new Color(50, 50, 50);
    protected static final Color HEADLINE_COLOR = new Color(162, 56, 56);
    protected static final Color HEADER_COLOR = new Color(171, 131, 6);
    protected static final Color IMAGE_COLOR = new Color(253, 63, 146);
    protected static final Color TREEID_COLOR = new Color(53, 53, 53);
    protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85);
    protected static final Color KEY_COLOR = new Color(30, 92, 172);
    protected static final Color VALUE_COLOR = new Color(30, 172, 146);

    // --- sub layers marked visible by default ---
    protected final Visualizations sections = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_SECTION)
            .visibleByDefault(true)
            .build();
    protected final Visualizations paragraphs = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_PARAGRAPH)
            .visibleByDefault(true)
            .build();
    protected final Visualizations headlines = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_HEADLINE)
            .visibleByDefault(true)
            .build();
    protected final Visualizations tables = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_TABLE)
            .visibleByDefault(true)
            .build();
    protected final Visualizations figures = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_FIGURES)
            .visibleByDefault(true)
            .build();
    protected final Visualizations headerFooter = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_HEADER_FOOTER)
            .visibleByDefault(true)
            .build();

    // --- sub layers without an explicit visibility setting ---
    protected final Visualizations images = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_IMAGES)
            .build();
    protected final Visualizations keyValue = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE)
            .build();
    protected final Visualizations treeIds = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs)
            .build();


    /**
     * @return the sub layers of this group in the order headlines, paragraphs, tables, sections,
     * header/footer, key-value, figures, images, tree ids
     */
    @Override
    public List<Visualizations> getVisualizations() {

        return List.of(headlines,
                paragraphs,
                tables,
                sections,
                headerFooter,
                keyValue,
                figures,
                images,
                treeIds);
    }

}

View File

@ -0,0 +1,38 @@
package com.knecon.fforesight.service.viewerdoc.layers;
import java.awt.Color;
import java.util.List;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import lombok.Getter;
/**
 * Layer group for the OCR debug visualizations, identified by {@link LayerIdentifier#KNECON_OCR_DEBUG}.
 * <p>
 * Declares one {@link Visualizations} per OCR debug sub layer together with the colours for them.
 */
public class OcrDebugLayerConfig extends AbstractLayerGroup {

    @Getter
    public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_OCR_DEBUG;

    /** Shared, unpopulated instance describing the layer structure of this group. */
    public static final LayerGroup CONFIG_INSTANCE = new OcrDebugLayerConfig();

    // --- colour palette ---
    protected static final Color REGULAR_COLOR = new Color(6, 39, 171);
    protected static final Color BOLD_COLOR = new Color(50, 246, 246);
    protected static final Color ITALIC_COLOR = new Color(171, 105, 6);
    protected static final Color BOLD_ITALIC_COLOR = new Color(6, 171, 102);
    protected static final Color HANDWRITTEN_COLOR = new Color(171, 64, 6);
    protected static final Color OVERLAPPED_COLOR = new Color(142, 8, 8);
    protected static final Color TABLE_LINES_COLOR = new Color(21, 221, 174);

    // --- sub layers shown by default ---
    protected final Visualizations debugText = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_OCR_TEXT_DEBUG)
            .visibleByDefault(true)
            .build();
    protected final Visualizations tableLines = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_OCR_LINE_DEBUG)
            .visibleByDefault(true)
            .build();

    // --- sub layers hidden by default ---
    protected final Visualizations overlappedText = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_OCR_OVERLAPPED_TEXT)
            .visibleByDefault(false)
            .build();
    protected final Visualizations debugBBox = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_OCR_BBOX_DEBUG)
            .visibleByDefault(false)
            .build();


    /**
     * @return the sub layers of this group in the order debug text, table lines, bounding boxes, overlapped text
     */
    @Override
    public List<Visualizations> getVisualizations() {

        return List.of(debugText,
                tableLines,
                debugBBox,
                overlappedText);
    }

}

View File

@ -0,0 +1,40 @@
package com.knecon.fforesight.service.viewerdoc.layers;
import java.util.List;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
/**
 * Layer group for the OCR text and OCR table lines, identified by {@link LayerIdentifier#KNECON_OCR}.
 * <p>
 * Unlike the default of {@link LayerGroup}, neither the group nor its sub layers are optional content.
 */
public class OcrTextLayerConfig extends AbstractLayerGroup {

    protected final Visualizations ocrText = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_OCR_TEXT)
            .build();
    protected final Visualizations tableLines = Visualizations.builder()
            .layer(LayerIdentifier.KNECON_OCR_LINES)
            .build();


    /**
     * @return {@link LayerIdentifier#KNECON_OCR}
     */
    @Override
    public LayerIdentifier getGroupIdentifier() {

        return LayerIdentifier.KNECON_OCR;
    }


    /**
     * @return the OCR text layer followed by the table lines layer
     */
    @Override
    public List<Visualizations> getVisualizations() {

        return List.of(ocrText,
                tableLines);
    }


    /**
     * @return always {@code false}: the sub layers are not optional content
     */
    @Override
    public boolean subLayersAreOptionalContent() {

        return false;
    }


    /**
     * @return always {@code false}: the group itself is not optional content
     */
    @Override
    public boolean isOptionalContent() {

        return false;
    }

}

View File

@ -1,27 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.HashMap;
import java.util.Map;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Holds one {@link VisualizationsOnPage} for every page of a document, keyed by the indices
 * {@code 0} to {@code numberOfPages - 1}.
 */
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LayoutGrid {

    int numberOfPages;
    Map<Integer, VisualizationsOnPage> visualizationsPerPages;


    /**
     * Creates a grid with an empty {@link VisualizationsOnPage} for each page index.
     *
     * @param numberOfPages number of page entries to create; no entries are created for values below one
     */
    public LayoutGrid(int numberOfPages) {

        Map<Integer, VisualizationsOnPage> perPage = new HashMap<>();
        for (int pageIndex = 0; pageIndex < numberOfPages; pageIndex++) {
            VisualizationsOnPage emptyPage = VisualizationsOnPage.builder().build();
            perPage.put(pageIndex, emptyPage);
        }

        this.numberOfPages = numberOfPages;
        this.visualizationsPerPages = perPage;
    }

}

View File

@ -1,10 +1,12 @@
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
public class MarkedContentStack {
private final Deque<MarkedContent> stack = new LinkedList<>();
@ -44,7 +46,23 @@ public class MarkedContentStack {
}
public boolean currentMarkedContentContainsAny(Set<String> names) {
public boolean currentMarkedContentContainsNone(Set<String> names) {
if (stack.isEmpty()) {
return true;
}
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
while (markedContentIterator.hasNext()) {
var markedContent = markedContentIterator.next();
if (names.contains(markedContent.name())) {
return false;
}
}
return true;
}
public boolean currentMarkedContentIsKneconContent() {
if (stack.isEmpty()) {
return false;
@ -52,11 +70,12 @@ public class MarkedContentStack {
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
while (markedContentIterator.hasNext()) {
var markedContent = markedContentIterator.next();
if (names.contains(markedContent.name())) {
if (markedContent.name().startsWith(LayerIdentifier.KNECON_IDENTIFIER_PREFIX)) {
return true;
}
}
return false;
}

View File

@ -1,10 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.util.List;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
/**
 * A content stream operator together with its arguments.
 *
 * @param operator  the parsed operator
 * @param arguments the COS objects associated with the operator
 */
public record OperatorWithArguments(Operator operator, List<COSBase> arguments) {
}

View File

@ -1,13 +1,14 @@
package com.knecon.fforesight.service.viewerdoc.model;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.Optional;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.apache.pdfbox.util.Matrix;
public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional<Matrix> textMatrix, Optional<RenderingMode> renderingMode) {
public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional<AffineTransform> textMatrix, Optional<RenderingMode> renderingMode) {
public static PlacedText textFacingUp(String text, Point2D lineStart, float fontSize, Color color, EmbeddableFont font) {

View File

@ -3,7 +3,7 @@ package com.knecon.fforesight.service.viewerdoc.model;
import java.util.LinkedHashMap;
import java.util.Map;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -17,9 +17,10 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Visualizations {
ContentStreams.Identifier layer;
LayerIdentifier layer;
@Builder.Default
Map<Integer, VisualizationsOnPage> visualizationsOnPages = new LinkedHashMap<>();
boolean layerVisibilityDefaultValue;
boolean visibleByDefault;
}

View File

@ -14,6 +14,7 @@ import lombok.experimental.FieldDefaults;
public class VisualizationsOnPage {
boolean makePathsInvisible;
boolean inDeviceCoordinates;
@Builder.Default
List<PlacedText> placedTexts = new LinkedList<>();
@Builder.Default

View File

@ -1,7 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.pdf;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
/**
 * A single content stream paired with the classification assigned to it.
 *
 * @param contentStream  the wrapped content stream
 * @param classification the identifier the stream was classified as
 */
public record ClassifiedContentStream(SinglePDContentStream contentStream, ContentStreams.Identifier classification) {
}

View File

@ -1,61 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.pdf;
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.io.RandomAccessInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.Matrix;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Exposes a single {@link PDStream} through the {@link PDContentStream} interface.
 * <p>
 * Only the stream contents are provided; resources, bounding box and matrix are all {@code null}.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SinglePDContentStream implements PDContentStream {

    PDStream pdStream;


    /**
     * @return an input stream over the view returned by {@link #getContentsForRandomAccess()}
     * @throws IOException if the view cannot be created
     */
    @Override
    public InputStream getContents() throws IOException {

        RandomAccessRead contentView = getContentsForRandomAccess();
        return new RandomAccessInputStream(contentView);
    }


    /**
     * @return a view created on the COS object of the wrapped stream
     * @throws IOException if the view cannot be created
     */
    @Override
    public RandomAccessRead getContentsForRandomAccess() throws IOException {

        var cosStream = pdStream.getCOSObject();
        return cosStream.createView();
    }


    /**
     * @return always {@code null}
     */
    @Override
    public PDResources getResources() {

        return null;
    }


    /**
     * @return always {@code null}
     */
    @Override
    public PDRectangle getBBox() {

        return null;
    }


    /**
     * @return always {@code null}
     */
    @Override
    public Matrix getMatrix() {

        return null;
    }

}

View File

@ -1,121 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@UtilityClass
public class ContentStreamClassifier {
public List<ClassifiedContentStream> getClassifiedContentStreams(PDPage page) {
List<SinglePDContentStream> streams = new LinkedList<>();
page.getContentStreams().forEachRemaining(stream -> streams.add(new SinglePDContentStream(stream)));
return ContentStreamClassifier.classifySingleContentStreams(page, streams);
}
public List<ClassifiedContentStream> classifySingleContentStreams(PDPage page, List<SinglePDContentStream> streams) {
return streams.stream().map(singlePDContentStream -> classifySingleContentStream(page, singlePDContentStream)).toList();
}
private ClassifiedContentStream classifySingleContentStream(PDPage page, SinglePDContentStream singlePDContentStream) {
ContentStreams.Identifier classification = classifyContentStream(singlePDContentStream, page);
return new ClassifiedContentStream(singlePDContentStream, classification);
}
/**
* We assume all of our layers are written escaped, so only unknown content streams need to be escaped.
*
* @param classifiers List of all content streams of a page with their classification
* @return false, if any content stream with classification other is not prefixed with an ESCAPE_START and suffixed with an ESCAPE_END
*/
public boolean areAllContentStreamsEscaped(List<ClassifiedContentStream> classifiers) {
int escapeDepth = 0;
for (ClassifiedContentStream classifier : classifiers) {
if (classifier.classification().equals(ContentStreams.OTHER) && escapeDepth == 0) {
return false;
}
if (classifier.classification().equals(ContentStreams.ESCAPE_START)) {
escapeDepth++;
}
if (classifier.classification().equals(ContentStreams.ESCAPE_END)) {
escapeDepth--;
}
}
return escapeDepth == 0;
}
/**
 * Classifies a content stream by looking at its first operators (at most two are parsed).
 *
 * @param contentStream the content stream to classify
 * @param page          the page the content stream belongs to; used to look up the legacy layout grid OCG name
 * @return the matching identifier, or {@code ContentStreams.OTHER} if the stream cannot be attributed to a known layer
 */
@SneakyThrows
public ContentStreams.Identifier classifyContentStream(PDContentStream contentStream, PDPage page) {

    List<OperatorWithArguments> leadingOperators = ContentStreamUtility.parseLeadingOperators(contentStream, 2);
    if (leadingOperators.isEmpty()) {
        return ContentStreams.OTHER;
    }

    OperatorWithArguments firstOperator = leadingOperators.get(0);
    String firstOperatorName = firstOperator.operator().getName();

    // The streams we prepend and append to wrap other content consist of exactly one operator: "q" or "Q".
    boolean consistsOfSingleOperator = leadingOperators.size() == 1;
    if (consistsOfSingleOperator && firstOperatorName.equals(OperatorName.SAVE)) {
        return ContentStreams.ESCAPE_START;
    }
    if (consistsOfSingleOperator && firstOperatorName.equals(OperatorName.RESTORE)) {
        return ContentStreams.ESCAPE_END;
    }

    // Legacy documents: earlier versions did not write a marked content with an explicit name, but an
    // optional content group (OCG) named "Layout grid". PDFBox assigns that OCG a COSName (usually "oc1"),
    // which is registered in the page resources and shows up as an argument of the first operator.
    if (firstOperatorName.equals(OperatorName.BEGIN_MARKED_CONTENT_SEQ)) {
        boolean startsWithLayoutGridOCG = ContentStreamUtility.findLayoutGridOCGName(page)
                .filter(ocgName -> arumentsContainLayoutGridOCG(firstOperator, ocgName))
                .isPresent();
        if (startsWithLayoutGridOCG) {
            return ContentStreams.KNECON_LAYOUT;
        }
    }

    if (!firstOperatorName.equals(OperatorName.BEGIN_MARKED_CONTENT)) {
        return ContentStreams.OTHER;
    }

    // Current documents: the first COSName argument of the marked content operator names the layer.
    return firstOperator.arguments()
            .stream()
            .filter(COSName.class::isInstance)
            .map(COSName.class::cast)
            .findFirst()
            .flatMap(markedContentName -> ContentStreams.allContentStreams.stream()
                    .filter(identifier -> identifier.cosName().equals(markedContentName))
                    .findAny())
            .orElse(ContentStreams.OTHER);
}
/**
 * Checks whether one of the operator's arguments is a COSName equal to the given OCG name.
 *
 * @param operator          the operator whose arguments are inspected
 * @param layoutGridOCGName the COSName under which the layout grid OCG is registered
 * @return true, if any COSName argument equals {@code layoutGridOCGName}
 */
private static boolean arumentsContainLayoutGridOCG(OperatorWithArguments operator, COSName layoutGridOCGName) {

    for (var argument : operator.arguments()) {
        if (argument instanceof COSName cosName && cosName.equals(layoutGridOCGName)) {
            return true;
        }
    }
    return false;
}
}

View File

@ -1,77 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for inspecting and filtering the content streams of a PDFBox page.
 */
@UtilityClass
public class ContentStreamUtility {

    /**
     * Parses the beginning of a content stream and returns its first operators together with their arguments.
     *
     * @param contentStream           the content stream to parse
     * @param numberOfOperatorsToRead the maximum number of operators to read
     * @return the leading operators in stream order, each paired with the arguments that preceded it.
     * The list is shorter than requested if the stream ends early; arguments that are not followed by an operator are dropped.
     * @throws IOException if the parser fails to read the stream
     */
    public static List<OperatorWithArguments> parseLeadingOperators(PDContentStream contentStream,
                                                                    int numberOfOperatorsToRead) throws IOException {

        PDFStreamParser parser = new PDFStreamParser(contentStream);
        // Callers access the result by index, so an ArrayList is the better fit than a LinkedList.
        List<OperatorWithArguments> operatorsWithArguments = new ArrayList<>();
        List<COSBase> arguments = new ArrayList<>();

        while (operatorsWithArguments.size() < numberOfOperatorsToRead) {
            Object token = parser.parseNextToken();
            if (token == null) {
                // end of stream
                break;
            }
            if (token instanceof Operator operator) {
                operatorsWithArguments.add(new OperatorWithArguments(operator, arguments));
                // each operator gets its own argument list
                arguments = new ArrayList<>();
            } else {
                arguments.add((COSBase) token);
            }
        }
        return operatorsWithArguments;
    }


    /**
     * Searches the page resources for the property list whose /Name entry equals the name of
     * {@code ContentStreams.KNECON_LAYOUT} and returns the COSName it is registered under.
     *
     * @param page the page whose resources are searched
     * @return the resource name of the layout grid OCG, or empty if the page has no resources or no matching property list
     */
    public static Optional<COSName> findLayoutGridOCGName(PDPage page) {

        var resources = page.getResources();
        if (resources == null) {
            // A page without a resource dictionary cannot reference any OCG.
            return Optional.empty();
        }

        for (COSName cosName : resources.getPropertiesNames()) {
            var properties = resources.getProperties(cosName);
            if (properties == null) {
                // The entry exists but is not a property list dictionary.
                continue;
            }
            COSBase cosBase = properties.getCOSObject().getDictionaryObject(COSName.NAME);
            if (cosBase instanceof COSString string && ContentStreams.KNECON_LAYOUT.name().equals(string.getString())) {
                return Optional.of(cosName);
            }
        }
        return Optional.empty();
    }


    /**
     * Returns the streams of all content streams whose classification is not one of the given layers.
     *
     * @param layers      the classifications to drop
     * @param classifiers the classified content streams of a page
     * @return the remaining streams in their original order, as an unmodifiable list
     */
    public static List<PDStream> removeLayerFromContentStreams(Set<ContentStreams.Identifier> layers, List<ClassifiedContentStream> classifiers) {

        return classifiers.stream()
                .filter(classifiedContentStream -> !layers.contains(classifiedContentStream.classification()))
                .map(ClassifiedContentStream::contentStream)
                .map(SinglePDContentStream::getPdStream)
                .toList();
    }

}

View File

@ -1,27 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.io.File;
import java.util.List;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import io.micrometer.observation.ObservationRegistry;
/**
 * Contract for services that add visualizations to a document file.
 */
public interface IViewerDocumentService {

    /**
     * Adds the given visualizations to the document.
     *
     * @param originFile      the document to read
     * @param destinationFile the file the resulting document is written to
     * @param visualizations  the visualizations to add
     */
    void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations);


    /**
     * Attaches the page count and the names of the given layers as high cardinality key values to the
     * current observation. Does nothing if the registry is null, is a no-op registry, or has no current observation.
     *
     * @param registry      the observation registry, may be null
     * @param numberOfPages reported under the key "numberOfPages"
     * @param layers        reported under the keys "layer_0", "layer_1", ... in list order
     */
    default void enrichObservation(ObservationRegistry registry, int numberOfPages, List<ContentStreams.Identifier> layers) {

        if (registry == null) {
            return;
        }
        var currentObservation = registry.getCurrentObservation();
        if (currentObservation == null || registry.isNoop()) {
            return;
        }

        currentObservation.highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));

        int layerIndex = 0;
        for (ContentStreams.Identifier layer : layers) {
            currentObservation.highCardinalityKeyValue("layer_" + layerIndex, String.valueOf(layer.name()));
            layerIndex++;
        }
    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
package com.knecon.fforesight.service.viewerdoc.service;
import java.io.File;
import java.io.FileInputStream;
@ -11,16 +11,17 @@ import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.pdfbox.cos.COSName;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
import com.knecon.fforesight.service.viewerdoc.layers.OcrDebugLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
@ -39,16 +40,19 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
public class PDFTronViewerDocumentService implements IViewerDocumentService {
public class PDFTronViewerDocumentService {
private final ObservationRegistry registry;
public static final List<LayerGroup> ALL_LAYERS_WITH_OPTIONAL_CONTENT = List.of(LayoutGridLayerConfig.CONFIG_INSTANCE,
OcrDebugLayerConfig.CONFIG_INSTANCE,
LayoutDebugLayerConfig.CONFIG_INSTANCE,
IdpLayerConfig.CONFIG_INSTANCE);
@Override
@Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations")
@SneakyThrows
public synchronized void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
@Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations")
public synchronized void addLayerGroups(File originFile, File destinationFile, List<LayerGroup> layerGroups) {
// originFile and destinationFile might be the same, so we use a temp file.
// Otherwise, saving the document might corrupt the file
@ -62,63 +66,73 @@ public class PDFTronViewerDocumentService implements IViewerDocumentService {
) {
enrichObservation(registry,
pdfDoc.getPageCount(),
visualizations.stream()
layerGroups.stream()
.map(LayerGroup::getVisualizations)
.flatMap(Collection::stream)
.map(Visualizations::getLayer)
.toList());
Map<ContentStreams.Identifier, Group> groupMap = PdftronLayerUtility.addLayersToDocument(visualizations, pdfDoc);
Map<LayerIdentifier, Group> groupMap = PdftronLayerUtility.addLayersToDocument(layerGroups, pdfDoc);
Map<EmbeddableFont, Font> fontMap = buildFontMap(visualizations, pdfDoc);
Map<EmbeddableFont, Font> fontMap = buildFontMap(layerGroups, pdfDoc);
Set<String> markedContentToDraw = extractMarkedContentNames(visualizations.stream()
.map(Visualizations::getLayer));
Set<String> kneconMarkedContents = extractMarkedContentNames(ContentStreams.allContentStreams.stream());
Set<String> markedContentToDraw = mapMarkedContentNames(layerGroups);
PageContentCleaner pageContentCleaner = PageContentCleaner.builder()
.writer(pageWriter)
.reader(reader)
.elementBuilder(builder)
.markedContentToDraw(markedContentToDraw)
.kneconMarkedContents(kneconMarkedContents)
.markedContentToRemove(markedContentToDraw)
.build();
VisualizationWriter visualizationWriter = VisualizationWriter.builder()
.writer(pageWriter)
.builder(builder)
.groupMap(groupMap)
.visualizations(visualizations)
.layerGroups(layerGroups)
.fontMap(fontMap)
.build();
int pageNumber = 0;
boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc);
int pageNumber = 1;
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
Page page = iterator.next();
pageContentCleaner.cleanPage(page);
if (isCurrentVersion) {
pageContentCleaner.removeMarkedContent(page);
}
visualizationWriter.drawVisualizationsOnPage(pageNumber, page);
}
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc);
saveDocument(pdfDoc, destinationFile);
} finally {
assert !tmpFile.toFile().exists() || tmpFile.toFile().delete();
}
}
private static Set<String> extractMarkedContentNames(Stream<ContentStreams.Identifier> visualizations) {
private static Set<String> mapMarkedContentNames(List<LayerGroup> layerGroups) {
return visualizations.map(ContentStreams.Identifier::cosName)
.map(COSName::getName)
return layerGroups.stream()
.map(LayerGroup::getVisualizations)
.flatMap(Collection::stream)
.map(Visualizations::getLayer)
.map(LayerIdentifier::name)
.collect(Collectors.toSet());
}
private static Map<EmbeddableFont, Font> buildFontMap(List<Visualizations> visualizations, PDFDoc pdfDoc) {
private static Map<EmbeddableFont, Font> buildFontMap(List<LayerGroup> layerGroups, PDFDoc pdfDoc) {
return visualizations.stream()
return layerGroups.stream()
.map(LayerGroup::getVisualizations)
.flatMap(Collection::stream)
.map(Visualizations::getVisualizationsOnPages)
.map(Map::values)
.flatMap(Collection::stream)
@ -146,4 +160,18 @@ public class PDFTronViewerDocumentService implements IViewerDocumentService {
}
}
/**
 * Attaches the page count and the names of the given layers as high cardinality key values to the
 * current observation. Does nothing if the registry is null, is a no-op registry, or has no current observation.
 *
 * @param registry      the observation registry, may be null
 * @param numberOfPages reported under the key "numberOfPages"
 * @param layers        reported under the keys "layer_0", "layer_1", ... in list order
 */
private void enrichObservation(ObservationRegistry registry, int numberOfPages, List<LayerIdentifier> layers) {

    if (registry == null) {
        return;
    }
    var currentObservation = registry.getCurrentObservation();
    if (currentObservation == null || registry.isNoop()) {
        return;
    }

    currentObservation.highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));

    int layerIndex = 0;
    for (LayerIdentifier layer : layers) {
        currentObservation.highCardinalityKeyValue("layer_" + layerIndex, String.valueOf(layer.name()));
        layerIndex++;
    }
}
}

View File

@ -0,0 +1,83 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.util.Set;
import com.knecon.fforesight.service.viewerdoc.model.MarkedContentStack;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Page;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
/**
 * Rewrites the content of a page while leaving out elements that belong to marked content which should be removed.
 * Whether an element is kept is decided by the {@link MarkedContentStack}, which is fed with every
 * marked content begin and end encountered while reading the page.
 */
@Builder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageContentCleaner {

    ElementWriter writer;
    ElementReader reader;
    ElementBuilder elementBuilder;
    // tag names of the marked content that must not be copied
    Set<String> markedContentToRemove;

    // tracks the marked content tags that are currently open while reading a page
    @Builder.Default
    MarkedContentStack markedContentStack = new MarkedContentStack();


    /**
     * Replaces the content of the page with a copy that omits the marked content to remove.
     *
     * @param page the page to clean
     */
    @SneakyThrows
    public void removeMarkedContent(Page page) {

        // the writer is opened in replacement mode, the reader iterates the existing elements
        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
        reader.begin(page);

        copyElementsExceptMarkedContentToRemove();

        writer.end();
        reader.end();
    }


    /**
     * Copies every element from the reader to the writer for which the stack reports that the currently
     * open marked content contains none of the tags to remove.
     */
    @SneakyThrows
    private void copyElementsExceptMarkedContentToRemove() {

        Element element;
        while ((element = reader.next()) != null) {
            int elementType = element.getType();

            // A begin element is pushed before the check, so it is judged as part of the content it opens.
            if (elementType == Element.e_marked_content_begin) {
                markedContentStack.enterMarkedContent(element.getMCTag().getName());
            }

            if (markedContentStack.currentMarkedContentContainsNone(markedContentToRemove)) {
                writer.writeElement(element);
            }

            // An end element is popped after the check, so it is judged as part of the content it closes.
            if (elementType == Element.e_marked_content_end) {
                markedContentStack.leaveMarkedContent();
            }
        }
    }

}

View File

@ -0,0 +1,187 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.ocg.Config;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@UtilityClass
public class PdftronLayerUtility {
@SneakyThrows
public Map<LayerIdentifier, Group> addLayersToDocument(List<LayerGroup> layerGroups, PDFDoc pdfDoc) {
Map<LayerIdentifier, Group> optionalContentGroupMap = new HashMap<>();
for (var layerGroup : layerGroups) {
if (!layerGroup.subLayersAreOptionalContent() || layerGroup.isEmpty()) {
continue;
}
if (layerGroup.isOptionalContent()) {
Group group = addLayerToDocument(pdfDoc, layerGroup.getGroupIdentifier().name(), layerGroup.isVisibleByDefault());
optionalContentGroupMap.put(layerGroup.getGroupIdentifier(), group);
}
if (layerGroup.subLayersAreOptionalContent()) {
for (Visualizations subLayer : layerGroup.getVisualizations()) {
Group subGroup = addLayerToDocument(pdfDoc, subLayer.getLayer().name(), layerGroup.isVisibleByDefault());
optionalContentGroupMap.put(subLayer.getLayer(), subGroup);
}
}
}
setOrderArrayForPresentGroups(pdfDoc, PDFTronViewerDocumentService.ALL_LAYERS_WITH_OPTIONAL_CONTENT);
return optionalContentGroupMap;
}
@SneakyThrows
public void setOrderArrayForPresentGroups(PDFDoc pdfDoc, List<LayerGroup> layerGroups) {
Config cfg = getConfig(pdfDoc);
Obj orderArray = pdfDoc.createIndirectArray();
Map<String, Group> groupMap = findAllGroupsInDocAsMap(pdfDoc);
for (var layerGroup : layerGroups) {
Obj childOrderArray;
if (!layerGroup.subLayersAreOptionalContent()) {
continue;
}
if (layerGroup.isOptionalContent() && groupMap.containsKey(layerGroup.getGroupIdentifier().name())) {
Group group = groupMap.remove(layerGroup.getGroupIdentifier().name());
group.setInitialState(cfg, layerGroup.isVisibleByDefault());
orderArray.pushBack(group.getSDFObj());
childOrderArray = pdfDoc.createIndirectArray();
orderArray.pushBack(childOrderArray);
} else {
childOrderArray = orderArray;
}
for (Visualizations subLayer : layerGroup.getVisualizations()) {
if (groupMap.containsKey(subLayer.getLayer().name())) {
Group group = groupMap.remove(subLayer.getLayer().name());
group.setInitialState(cfg, subLayer.isVisibleByDefault());
childOrderArray.pushBack(group.getSDFObj());
}
}
}
if (!groupMap.isEmpty()) {
for (Group group : groupMap.values()) {
orderArray.pushBack(group.getSDFObj());
}
}
cfg.setOrder(orderArray);
cfg.getSDFObj().putText("ListMode", "VisiblePages");
}
private static Map<String, Group> findAllGroupsInDocAsMap(PDFDoc pdfDoc) throws PDFNetException {
Map<String, Group> groupMap = new LinkedHashMap<>();
List<Group> presentGroups = findAllGroupsInDoc(pdfDoc);
for (Group group : presentGroups) {
groupMap.put(group.getName(), group);
}
return groupMap;
}
private static Config getConfig(PDFDoc pdfDoc) throws PDFNetException {
Config cfg = pdfDoc.getOCGConfig();
if (cfg == null) {
cfg = Config.create(pdfDoc, true);
}
cfg.setName("knecon debug layer order");
cfg.setCreator("Knecon Technology GmbH");
return cfg;
}
@SneakyThrows
private Group addLayerToDocument(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) {
Optional<Group> existingGroup = findGroupInDoc(doc, layerName);
if (existingGroup.isPresent()) {
return existingGroup.get();
}
return addNewLayer(doc, layerName, layerVisibilityDefaultValue, false);
}
private Group addNewLayer(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue, boolean containsAll) throws PDFNetException {
Config cfg = getConfig(doc);
Group grp = Group.create(doc, layerName);
grp.setInitialState(cfg, layerVisibilityDefaultValue);
return grp;
}
@SneakyThrows
private Optional<Group> findGroupInDoc(PDFDoc doc, String layerName) {
Obj ocgs = doc.getOCGs();
if (ocgs != null) {
int i;
int sz = (int) ocgs.size();
for (i = 0; i < sz; ++i) {
Group ocg = new Group(ocgs.getAt(i));
if (ocg.getName().equals(layerName)) {
return Optional.of(ocg);
}
}
}
return Optional.empty();
}
@SneakyThrows
private List<Group> findAllGroupsInDoc(PDFDoc doc) {
Obj ocgs = doc.getOCGs();
if (ocgs == null) {
return Collections.emptyList();
}
List<Group> groups = new ArrayList<>(Math.toIntExact(ocgs.size()));
int i;
int sz = (int) ocgs.size();
for (i = 0; i < sz; ++i) {
Group ocg = new Group(ocgs.getAt(i));
groups.add(ocg);
}
return groups;
}
}

View File

@ -0,0 +1,70 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.io.File;
import java.util.Objects;
import java.util.Optional;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.pdftron.pdf.PDFDoc;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Writes and reads the viewer document version marker. The marker consists of the document author
 * ({@link #AUTHOR}) and the document keywords in the form {@code KNECON_VERSION:<version>}.
 */
@UtilityClass
public class ViewerDocVersioningUtility {

    public static final int currentVersion = 0;
    public static final String AUTHOR = "knecon technology GmbH";
    public static final String CUSTOM_DICT = "KNECON_VERSION";


    /**
     * Marks the document as written with the current version by setting its author and keywords.
     *
     * @param pdfDoc the document to mark
     */
    @SneakyThrows
    public void setVersionInDocument(PDFDoc pdfDoc) {

        pdfDoc.getDocInfo().setAuthor(AUTHOR);
        pdfDoc.getDocInfo().setKeywords(CUSTOM_DICT + ":" + currentVersion);
    }


    /**
     * Extracts the version number from a keywords string of the form {@code KNECON_VERSION:<version>}.
     *
     * @param keywords the keywords of a document; may be null if the document has none
     * @return the version, or empty if the keywords are missing or do not have the expected form
     */
    private static Optional<Integer> readVersionFromKeywords(String keywords) {

        if (keywords == null) {
            // A document without keywords carries no version; without this guard split() would throw.
            return Optional.empty();
        }

        String[] strings = keywords.split(":");
        if (strings.length != 2) {
            return Optional.empty();
        }
        if (!strings[0].equals(CUSTOM_DICT)) {
            return Optional.empty();
        }

        try {
            return Optional.of(Integer.parseInt(strings[1]));
        } catch (NumberFormatException e) {
            // not a number, so not a version marker written by us
            return Optional.empty();
        }
    }


    /**
     * Checks whether the PDF file carries the marker of the current version.
     *
     * @param file the PDF file to inspect
     * @return true, if author and version keyword match the current version
     */
    @SneakyThrows
    public boolean isCurrentVersion(File file) {

        try (PDDocument doc = Loader.loadPDF(file)) {
            return isCurrentVersion(doc.getDocumentInformation().getAuthor(), doc.getDocumentInformation().getKeywords());
        }
    }


    /**
     * Checks whether the opened document carries the marker of the current version.
     *
     * @param pdfDoc the document to inspect
     * @return true, if author and version keyword match the current version
     */
    @SneakyThrows
    public boolean docIsCurrentVersion(PDFDoc pdfDoc) {

        return isCurrentVersion(pdfDoc.getDocInfo().getAuthor(), pdfDoc.getDocInfo().getKeywords());
    }


    /**
     * @return true, if the author equals {@link #AUTHOR} and the keywords contain {@link #currentVersion}
     */
    private static boolean isCurrentVersion(String author, String keywords) {

        return Objects.equals(author, AUTHOR) //
                && readVersionFromKeywords(keywords).map(version -> version == currentVersion).orElse(false);
    }

}

View File

@ -1,324 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * PDFBox based implementation of {@link IViewerDocumentService}. Each visualization is appended to the
 * page content as its own marked content, optionally wrapped in an optional content group (OCG).
 */
@Slf4j
@RequiredArgsConstructor
public class ViewerDocumentService implements IViewerDocumentService {

    private final ObservationRegistry registry;


    /**
     * Removes previously written content of the given layers from every page, then draws the visualizations
     * and writes the result to the destination file.
     *
     * @param originFile      the document to read
     * @param destinationFile the file to write to; may be the same file as {@code originFile}
     * @param visualizations  the visualizations to draw, one per layer
     */
    @Override
    @Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
    @SneakyThrows
    public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {

        // originFile and destinationFile might be the same, so we use a temp file.
        // Otherwise, saving the document might corrupt the file
        Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
        PDDocument pdDocument = null;
        try {
            Files.copy(originFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
            pdDocument = openPDDocument(tmpFile.toFile());

            enrichObservation(registry,
                              pdDocument.getNumberOfPages(),
                              visualizations.stream()
                                      .map(Visualizations::getLayer)
                                      .toList());

            Set<ContentStreams.Identifier> allLayers = visualizations.stream()
                    .map(Visualizations::getLayer)
                    .collect(Collectors.toUnmodifiableSet());

            Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);

            for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {

                PDPage pdPage = pdDocument.getPage(pageNumber);
                createPageResourcesIfNotPresent(pdPage); // needed for optionalContentGroups

                List<ClassifiedContentStream> classifiers = ContentStreamClassifier.getClassifiedContentStreams(pdPage);
                pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers));

                AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);

                if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) {
                    // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
                    // e.g. not escaped matrix transformations.
                    wrapContentStreams(pdDocument, pdPage);
                }

                for (Visualizations visualization : visualizations) {
                    if (!visualization.getVisualizationsOnPages().containsKey(pageNumber)) {
                        continue;
                    }
                    appendVisualization(pdDocument,
                                        pdPage,
                                        visualization,
                                        visualization.getVisualizationsOnPages()
                                                .get(pageNumber),
                                        optionalContentGroupMap.get(visualization.getLayer()),
                                        textDeRotationMatrix);
                }

                if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
                    log.info("Incremental save after {}/{} pages", pageNumber, pdDocument.getNumberOfPages());
                    observedIncrementalSave(pdDocument, destinationFile);
                    pdDocument.close();
                    Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
                    pdDocument = openPDDocument(tmpFile.toFile());
                    // The groups in the map belong to the document that was just closed.
                    // Look them up again, so that following pages reference the groups of the re-opened document.
                    optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
                }
            }

            observedIncrementalSave(pdDocument, destinationFile);
        } finally {
            // Clean up unconditionally: the deletion must not live in an assert, since assertions are disabled by default.
            try {
                if (pdDocument != null) {
                    pdDocument.close();
                }
            } finally {
                Files.deleteIfExists(tmpFile);
            }
        }
    }


    /**
     * Appends one visualization to the page as marked content named after its layer.
     *
     * @param optionalContentGroup the OCG of the layer, or null if the layer is not optional content
     */
    private static void appendVisualization(PDDocument pdDocument,
                                            PDPage pdPage,
                                            Visualizations visualization,
                                            VisualizationsOnPage visualizationsOnPage,
                                            PDOptionalContentGroup optionalContentGroup,
                                            AffineTransform textDeRotationMatrix) throws IOException {

        // We need to append to the content stream, otherwise the content could be overlapped by following content.
        try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {

            contentStream.beginMarkedContent(visualization.getLayer().cosName());
            if (optionalContentGroup != null) {
                contentStream.beginMarkedContent(COSName.OC, optionalContentGroup);
            }

            contentStream.saveGraphicsState();
            drawVisualizationsToContentStream(pdDocument, visualizationsOnPage, contentStream, textDeRotationMatrix);
            contentStream.restoreGraphicsState();

            if (optionalContentGroup != null) {
                contentStream.endMarkedContent();
            }
            contentStream.endMarkedContent();
        }
    }


    /**
     * Registers an OCG in the document for every visualization whose layer is optional content.
     *
     * @return the OCG per layer; layers that are not optional content have no entry
     */
    private static Map<ContentStreams.Identifier, PDOptionalContentGroup> addLayersToDocument(List<Visualizations> visualizations, PDDocument pdDocument) {

        Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = new HashMap<>();
        for (Visualizations visualization : visualizations) {
            addLayerToDocument(visualization.getLayer(), pdDocument, visualization.isLayerVisibilityDefaultValue())//
                    .ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg));
        }
        return optionalContentGroupMap;
    }


    /**
     * Draws lines, rectangles, filled rectangles and texts of one page, in that order.
     *
     * @param textDeRotationMatrix used for texts that do not bring their own text matrix
     */
    private static void drawVisualizationsToContentStream(PDDocument pdDocument,
                                                          VisualizationsOnPage visualizationsOnPage,
                                                          PDPageContentStream contentStream,
                                                          AffineTransform textDeRotationMatrix) throws IOException {

        if (visualizationsOnPage.isMakePathsInvisible()) {
            // restrict painting to a 1x1 rectangle at the origin, which effectively hides what is drawn afterwards
            contentStream.addRect(0, 0, 1, 1);
            contentStream.clip();
        }

        for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
            contentStream.setLineWidth(coloredLine.lineWidth());
            contentStream.setStrokingColor(coloredLine.color());
            contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1());
            contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
            contentStream.stroke();
        }

        for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
            contentStream.setLineWidth(coloredRectangle.lineWidth());
            contentStream.setStrokingColor(coloredRectangle.color());
            Rectangle2D r = coloredRectangle.rectangle2D();
            contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
            contentStream.stroke();
        }

        for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
            contentStream.setNonStrokingColor(filledRectangle.color());
            PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
            graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
            contentStream.setGraphicsStateParameters(graphicsState);
            Rectangle2D r = filledRectangle.rectangle2D();
            contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
            contentStream.fill();
        }

        for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
            PDFont font = placedText.font().embed(pdDocument);
            contentStream.setFont(font, placedText.fontSize());
            contentStream.beginText();
            contentStream.setNonStrokingColor(placedText.color());
            if (placedText.renderingMode()
                    .isPresent()) {
                contentStream.setRenderingMode(placedText.renderingMode()
                                                       .get());
            } else {
                contentStream.setRenderingMode(RenderingMode.FILL);
            }
            contentStream.setTextMatrix(getTextMatrix(placedText, textDeRotationMatrix));
            contentStream.showText(placedText.text());
            contentStream.endText();
        }
    }


    /**
     * Encloses the existing page content in a saved graphics state by prepending a content stream that
     * saves the state and appending one that restores it.
     */
    private static void wrapContentStreams(PDDocument pdDocument, PDPage pdPage) throws IOException {

        try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
            contentStream.saveGraphicsState();
        }
        try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
            contentStream.restoreGraphicsState();
        }
    }


    /**
     * Returns the text matrix of the placed text, or, if it has none, a matrix built from the de-rotation
     * transform and translated to the start of the text line.
     */
    private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) {

        if (placedText.textMatrix().isEmpty()) {
            return new Matrix((float) textDeRotationMatrix.getScaleX(),
                              (float) textDeRotationMatrix.getShearX(),
                              (float) textDeRotationMatrix.getShearY(),
                              (float) textDeRotationMatrix.getScaleY(),
                              (float) placedText.lineStart().getX(),
                              (float) placedText.lineStart().getY());
        }
        return placedText.textMatrix()
                .get();
    }


    /**
     * Registers an OCG for the layer if the layer is optional content.
     *
     * @return the OCG, or empty if the layer is not optional content
     */
    private static Optional<PDOptionalContentGroup> addLayerToDocument(ContentStreams.Identifier layer, PDDocument pdDocument, boolean layerVisibilityDefaultValue) {

        if (layer.optionalContent()) {
            return Optional.of(addLayerToDocument(pdDocument, layer.name(), layerVisibilityDefaultValue));
        }
        return Optional.empty();
    }


    /**
     * Returns the OCG with the given name, creating the optional content properties and the group if needed,
     * and sets whether the group is enabled by default.
     */
    private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, String layerName, boolean layerVisibilityDefaultValue) {

        PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
        PDOptionalContentProperties ocprops = catalog.getOCProperties();
        if (ocprops == null) {
            ocprops = new PDOptionalContentProperties();
            catalog.setOCProperties(ocprops);
        }

        PDOptionalContentGroup layer;
        if (ocprops.hasGroup(layerName)) {
            layer = ocprops.getGroup(layerName);
        } else {
            layer = new PDOptionalContentGroup(layerName);
            ocprops.addGroup(layer);
        }
        ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
        return layer;
    }


    /**
     * Loads the document and configures it so that all security is removed when it is saved.
     */
    private static PDDocument openPDDocument(File tmpFile) throws IOException {

        PDDocument pdDocument = Loader.loadPDF(tmpFile);
        pdDocument.setAllSecurityToBeRemoved(true);
        return pdDocument;
    }


    /**
     * Saves the document uncompressed to the output file inside an observation named "incremental-save".
     */
    @SneakyThrows
    private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {

        Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> {
            try (var out = new FileOutputStream(outputFile)) {
                pdDocument.save(out, CompressParameters.NO_COMPRESSION);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
    }


    /**
     * Gives the page an empty resource dictionary if it has none.
     */
    private static void createPageResourcesIfNotPresent(PDPage pdPage) {

        PDResources resources = pdPage.getResources();
        if (resources == null) {
            resources = new PDResources();
            pdPage.setResources(resources);
        }
    }


    /**
     * Builds the quadrant rotation that is applied to texts depending on the page rotation
     * (90 -> 3 quadrants, 180 -> 2, 270 -> 1, anything else -> none).
     */
    private static AffineTransform getTextDeRotationTransform(PDPage page) {

        return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
            case 90 -> 3;
            case 180 -> 2;
            case 270 -> 1;
            default -> 0;
        });
    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
package com.knecon.fforesight.service.viewerdoc.service;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
@ -6,7 +6,10 @@ import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
@ -20,6 +23,7 @@ import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
@ -37,8 +41,9 @@ public class VisualizationWriter {
ElementWriter writer;
ElementBuilder builder;
List<Visualizations> visualizations;
Map<ContentStreams.Identifier, Group> groupMap;
ElementReader reader;
List<LayerGroup> layerGroups;
Map<LayerIdentifier, Group> groupMap;
Map<EmbeddableFont, Font> fontMap;
@ -48,27 +53,68 @@ public class VisualizationWriter {
begin(page);
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(page);
AffineTransform pageTransformation = getTextDeRotationTransform(page);
for (Visualizations visualization : visualizations) {
for (LayerGroup layerGroup : layerGroups) {
VisualizationsOnPage visualizationsOnPage = visualization.getVisualizationsOnPages()
.get(pageNumber);
Element markedContentStart = builder.createMarkedContentBeginInlineProperties(layerGroup.getGroupIdentifier().markedContentName());
writer.writeElement(markedContentStart);
if (layerGroup.isOptionalContent()) {
Element ocgStart = builder.createMarkedContentBegin("OC", groupMap.get(layerGroup.getGroupIdentifier()).getSDFObj());
writer.writeElement(ocgStart);
}
Element escape = builder.createGroupBegin();
writer.writeElement(escape);
writeVisualizations(pageNumber, layerGroup, textDeRotationMatrix);
Element escapeEnd = builder.createGroupEnd();
writer.writeElement(escapeEnd);
if (layerGroup.isOptionalContent()) {
Element ocgEnd2 = builder.createMarkedContentEnd();
writer.writeElement(ocgEnd2);
}
Element markedContentEnd = builder.createMarkedContentEnd();
writer.writeElement(markedContentEnd);
}
end();
}
private void writeVisualizations(int pageNumber, LayerGroup layerGroup, AffineTransform textDeRotationMatrix) throws PDFNetException {
for (Visualizations visualization : layerGroup.getVisualizations()) {
VisualizationsOnPage visualizationsOnPage = visualization.getVisualizationsOnPages().get(pageNumber);
if (visualizationsOnPage == null || visualizationsOnPage.isEmpty()) {
continue;
}
Element markedContentStart = builder.createMarkedContentBeginInlineProperties(visualization.getLayer().cosName().getName());
Element markedContentStart = builder.createMarkedContentBeginInlineProperties(visualization.getLayer().markedContentName());
writer.writeElement(markedContentStart);
if (visualization.getLayer().optionalContent()) {
if (layerGroup.subLayersAreOptionalContent()) {
Element ocgStart = builder.createMarkedContentBegin("OC", groupMap.get(visualization.getLayer()).getSDFObj());
writer.writeElement(ocgStart);
}
Element escape = builder.createGroupBegin();
writer.writeElement(escape);
writeVisualization(visualizationsOnPage, textDeRotationMatrix);
if (visualization.getLayer().optionalContent()) {
Element escapeEnd = builder.createGroupEnd();
writer.writeElement(escapeEnd);
if (layerGroup.subLayersAreOptionalContent()) {
Element ocgEnd = builder.createMarkedContentEnd();
writer.writeElement(ocgEnd);
}
@ -77,9 +123,6 @@ public class VisualizationWriter {
writer.writeElement(markedContentEnd);
}
end();
}
@ -123,6 +166,7 @@ public class VisualizationWriter {
writePlacedText(textDeRotationMatrix, placedText);
}
}
@ -130,26 +174,46 @@ public class VisualizationWriter {
float[] rgbComponents = placedText.color().getRGBColorComponents(null);
Font font = fontMap.get(placedText.font());
Element text = builder.createTextRun(placedText.text(), font, placedText.fontSize());
if (placedText.renderingMode()
.isPresent()) {
text.getGState()
.setRenderingIntent(placedText.renderingMode()
.get().intValue());
} else {
try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
text.getGState().setFillColor(color);
}
text.getGState().setRenderingIntent(GState.e_fill_text);
Element text = builder.createTextBegin(font, placedText.fontSize());
text.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
text.getGState().setFillColor(color);
}
try (Matrix2D textMatrix = getTextMatrix(placedText, textDeRotationMatrix)) {
text.setTextMatrix(textMatrix);
}
text.getGState()
.setTextRenderMode(placedText.renderingMode()
.map(VisualizationWriter::resolveTextRenderMode).orElse(GState.e_fill_text));
writer.writeElement(text);
text = switch (font.getType()) {
case Font.e_Type0, Font.e_CIDType0, Font.e_TrueType, Font.e_CIDType2 -> builder.createUnicodeTextRun(placedText.text());
case Font.e_Type1 -> builder.createTextRun(placedText.text());
default -> throw new IllegalStateException("Unexpected value: " + font.getType());
};
writer.writeElement(text);
text = builder.createTextEnd();
writer.writeElement(text);
}
/**
 * Translates a PDFBox {@link RenderingMode} into the matching {@link GState} text render mode constant.
 * Every enum constant is mapped explicitly, so the switch is exhaustive without a default branch.
 *
 * @param renderingMode the PDFBox rendering mode to translate
 * @return the corresponding {@code GState.e_*_text} constant
 */
private static int resolveTextRenderMode(RenderingMode renderingMode) {
    int gStateRenderMode = switch (renderingMode) {
        // plain painting modes
        case FILL -> GState.e_fill_text;
        case STROKE -> GState.e_stroke_text;
        case FILL_STROKE -> GState.e_fill_stroke_text;
        case NEITHER -> GState.e_invisible_text;
        // the same modes, additionally adding the text to the clipping path
        case FILL_CLIP -> GState.e_fill_clip_text;
        case STROKE_CLIP -> GState.e_stroke_clip_text;
        case FILL_STROKE_CLIP -> GState.e_fill_stroke_clip_text;
        case NEITHER_CLIP -> GState.e_clip_text;
    };
    return gStateRenderMode;
}
@ -220,21 +284,32 @@ public class VisualizationWriter {
Matrix2D textMatrix;
if (placedText.textMatrix().isEmpty()) {
textMatrix = new Matrix2D(textDeRotationMatrix.getScaleX(),
textDeRotationMatrix.getShearX(),
textDeRotationMatrix.getShearY(),
textDeRotationMatrix.getScaleY(),
placedText.lineStart().getX(),
placedText.lineStart().getY());
textMatrix = toMatrix2D(textDeRotationMatrix, placedText.lineStart().getX(), placedText.lineStart().getY());
} else {
var matrix = placedText.textMatrix()
.get();
textMatrix = new Matrix2D(matrix.getScaleX(), matrix.getShearX(), matrix.getShearY(), matrix.getScaleY(), matrix.getTranslateX(), matrix.getTranslateY());
var matrix = placedText.textMatrix().get();
textMatrix = toMatrix2D(matrix);
}
return textMatrix;
}
/**
 * Converts an AWT {@link AffineTransform} into a PDFTron {@link Matrix2D}, including its translation.
 * The shear components are passed as (shearY, shearX): {@code getShearY()} is the m10 entry and
 * {@code getShearX()} the m01 entry of the AWT matrix.
 * NOTE(review): this assumes Matrix2D's constructor order is (a, b, c, d, h, v) in PDF matrix convention - confirm against the PDFTron docs.
 *
 * @param matrix the AWT transform to convert
 * @return a new Matrix2D built from the transform's six coefficients
 * @throws PDFNetException if PDFTron fails to create the matrix
 */
private static Matrix2D toMatrix2D(AffineTransform matrix) throws PDFNetException {
    double a = matrix.getScaleX();
    double b = matrix.getShearY();
    double c = matrix.getShearX();
    double d = matrix.getScaleY();
    double h = matrix.getTranslateX();
    double v = matrix.getTranslateY();
    return new Matrix2D(a, b, c, d, h, v);
}
/**
 * Converts the linear part of an AWT {@link AffineTransform} into a PDFTron {@link Matrix2D},
 * using the supplied translation instead of the transform's own.
 * The shear components are passed as (shearY, shearX): {@code getShearY()} is the m10 entry and
 * {@code getShearX()} the m01 entry of the AWT matrix.
 * NOTE(review): this assumes Matrix2D's constructor order is (a, b, c, d, h, v) in PDF matrix convention - confirm against the PDFTron docs.
 *
 * @param textDeRotationMatrix the transform providing scale and shear
 * @param translateX           the horizontal translation to place in the resulting matrix
 * @param translateY           the vertical translation to place in the resulting matrix
 * @return a new Matrix2D combining the transform's linear part with the given translation
 * @throws PDFNetException if PDFTron fails to create the matrix
 */
private static Matrix2D toMatrix2D(AffineTransform textDeRotationMatrix, double translateX, double translateY) throws PDFNetException {
    double a = textDeRotationMatrix.getScaleX();
    double b = textDeRotationMatrix.getShearY();
    double c = textDeRotationMatrix.getShearX();
    double d = textDeRotationMatrix.getScaleY();
    return new Matrix2D(a, b, c, d, translateX, translateY);
}
@SneakyThrows
private static AffineTransform getTextDeRotationTransform(Page page) {

View File

@ -1,120 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
import java.util.Set;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
/**
 * Rewrites the content of a page in two passes:
 * <ol>
 * <li>everything up to the first knecon marked content is copied and, unless the page content already starts
 * with a group begin, wrapped in a group;</li>
 * <li>the remaining elements are copied, skipping everything inside the marked contents listed in
 * {@code markedContentToDraw}.</li>
 * </ol>
 * The writer, reader and element builder are supplied by the caller through the builder; this class does not close them.
 */
@Builder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageContentCleaner {

    ElementWriter writer;
    ElementReader reader;
    ElementBuilder elementBuilder;
    // Names of the marked contents that are dropped from the page in the second pass.
    Set<String> markedContentToDraw;
    // Names of the marked contents that mark the end of the first pass.
    Set<String> kneconMarkedContents;

    @Builder.Default
    MarkedContentStack markedContentStack = new MarkedContentStack();


    /**
     * Replaces the content of the given page with a cleaned copy (see class description).
     *
     * @param page the page whose content is rewritten in place
     */
    @SneakyThrows
    public void cleanPage(Page page) {

        begin(page);

        // reader.next() yields null when the page has no content elements; treat that as "not escaped"
        // instead of failing with a NullPointerException.
        Element firstElement = reader.next();
        boolean escaped = firstElement != null && firstElement.getType() == Element.e_group_begin;

        if (!escaped) {
            writer.writeElement(elementBuilder.createGroupBegin());
        }
        copyElementsUntilFirstKneconMarkedContent();
        if (!escaped) {
            writer.writeElement(elementBuilder.createGroupEnd());
        }
        copyElementsExceptMarkedContentToDraw();

        end();
    }


    /**
     * Starts writing to the page in replacement mode and starts reading its current content.
     */
    @SneakyThrows
    private void begin(Page page) {

        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
        reader.begin(page);
    }


    /**
     * Finishes writing and reading of the current page.
     */
    @SneakyThrows
    private void end() {

        writer.end();
        reader.end();
    }


    /**
     * Copies elements, starting at the reader's current element, until the first marked content listed in
     * {@code kneconMarkedContents} begins. The reader is left positioned on that begin element so the
     * following pass can process it.
     */
    @SneakyThrows
    private void copyElementsUntilFirstKneconMarkedContent() {

        for (Element element = reader.current(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_marked_content_begin -> {
                    markedContentStack.enterMarkedContent(element.getMCTag().getName());
                    if (markedContentStack.currentMarkedContentContainsAny(kneconMarkedContents)) {
                        // A plain 'break' here would only leave the switch and the loop would keep copying.
                        // Undo the enter so the next pass, which starts at this same element via
                        // reader.current(), registers it exactly once, then stop the pass.
                        // NOTE(review): assumes leaveMarkedContent() reverts the preceding enterMarkedContent() - confirm.
                        markedContentStack.leaveMarkedContent();
                        return;
                    }
                    writer.writeElement(element);
                }
                case Element.e_marked_content_end -> {
                    markedContentStack.leaveMarkedContent();
                    writer.writeElement(element);
                }
                default -> writer.writeElement(element);
            }
        }
    }


    /**
     * Copies the remaining elements, starting at the reader's current element, and omits every element
     * (including the begin and end tags) that lies inside a marked content listed in {@code markedContentToDraw}.
     */
    @SneakyThrows
    private void copyElementsExceptMarkedContentToDraw() {

        for (Element element = reader.current(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_marked_content_begin -> {
                    // Enter first so the begin tag itself is judged as part of its marked content.
                    markedContentStack.enterMarkedContent(element.getMCTag().getName());
                    if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) {
                        writer.writeElement(element);
                    }
                }
                case Element.e_marked_content_end -> {
                    // Decide before leaving so the end tag is judged as part of its marked content.
                    if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) {
                        writer.writeElement(element);
                    }
                    markedContentStack.leaveMarkedContent();
                }
                default -> {
                    if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) {
                        writer.writeElement(element);
                    }
                }
            }
        }
    }

}

View File

@ -1,96 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.ocg.Config;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Helpers for registering optional content groups (layers) in a PDFTron document.
 */
@UtilityClass
public class PdftronLayerUtility {

    /**
     * Registers a layer for every visualization whose layer is optional content.
     *
     * @param visualizations the visualizations whose layers are considered
     * @param pdfDoc         the document the layers are added to
     * @return the created or reused groups, keyed by layer identifier; layers that are not optional content are absent
     */
    public Map<ContentStreams.Identifier, Group> addLayersToDocument(List<Visualizations> visualizations, PDFDoc pdfDoc) {

        Map<ContentStreams.Identifier, Group> groupsByLayer = new HashMap<>();
        for (Visualizations visualization : visualizations) {
            ContentStreams.Identifier layer = visualization.getLayer();
            Optional<Group> group = addLayerToDocument(layer, pdfDoc, visualization.isLayerVisibilityDefaultValue());
            if (group.isPresent()) {
                groupsByLayer.put(layer, group.get());
            }
        }
        return groupsByLayer;
    }


    /**
     * Adds the layer to the document when it is optional content.
     *
     * @return the group for the layer, or empty when the layer is not optional content
     */
    private Optional<Group> addLayerToDocument(ContentStreams.Identifier layer, PDFDoc pdfDoc, boolean layerVisibilityDefaultValue) {

        if (!layer.optionalContent()) {
            return Optional.empty();
        }
        return Optional.of(addLayerToDocument(pdfDoc, layer.name(), layerVisibilityDefaultValue));
    }


    /**
     * Returns the group with the given name, reusing an existing one from the document before creating a new one.
     * An existing group is returned as-is; its initial state is not changed.
     */
    @SneakyThrows
    private Group addLayerToDocument(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) {

        Optional<Group> existing = findGroupInDoc(doc, layerName);
        return existing.isPresent() ? existing.get() : addNewLayer(doc, layerName, layerVisibilityDefaultValue);
    }


    /**
     * Creates a new group with the given name, sets its initial state and appends it to the config's order array.
     * A config named "Default" is created when the document has none.
     */
    private Group addNewLayer(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) throws PDFNetException {

        Config config = doc.getOCGConfig();
        if (config == null) {
            config = Config.create(doc, true);
            config.setName("Default");
        }

        Group group = Group.create(doc, layerName);
        group.setInitialState(config, layerVisibilityDefaultValue);

        // The order array lists the layers that should appear in the PDF viewer GUI.
        Obj order = config.getOrder();
        if (order == null) {
            order = doc.createIndirectArray();
            config.setOrder(order);
        }
        order.pushBack(group.getSDFObj());

        return group;
    }


    /**
     * Looks for a group with the given name among the document's optional content groups.
     *
     * @return the first group whose name equals {@code layerName}, or empty when there is none
     */
    @SneakyThrows
    private Optional<Group> findGroupInDoc(PDFDoc doc, String layerName) {

        Obj ocgs = doc.getOCGs();
        if (ocgs == null) {
            return Optional.empty();
        }

        int count = (int) ocgs.size();
        for (int index = 0; index < count; index++) {
            Group candidate = new Group(ocgs.getAt(index));
            if (candidate.getName().equals(layerName)) {
                return Optional.of(candidate);
            }
        }
        return Optional.empty();
    }

}

View File

@ -1,124 +0,0 @@
package com.knecon.fforesight.service.viewerdoc.service;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.File;
import java.nio.file.Files;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Tests the classification of page content streams and the removal of a layer from them,
 * using the test documents "viewerDocLayers.pdf" and "oldViewerDocLayers.pdf".
 */
@Slf4j
public class ContentStreamClassifierTest {

    @Test
    @SneakyThrows
    public void testClassification() {

        assertLayoutLayerClassification("viewerDocLayers.pdf");
    }


    @Test
    @SneakyThrows
    public void testRemoveLayoutLayer() {

        File pdfFile = loadResource("viewerDocLayers.pdf");
        File tmpFile = Files.createTempFile("removedLayout", ".pdf").toFile();

        try {
            try (PDDocument document = Loader.loadPDF(pdfFile)) {
                PDPage page = document.getPage(0);
                List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
                page.setContents(ContentStreamUtility.removeLayerFromContentStreams(Set.of(ContentStreams.KNECON_LAYOUT), classifieds));
                document.save(tmpFile);
            }

            try (PDDocument document2 = Loader.loadPDF(tmpFile)) {
                PDPage page2 = document2.getPage(0);
                List<ClassifiedContentStream> classifieds2 = ContentStreamClassifier.getClassifiedContentStreams(page2);

                logContentStreamClassifications(classifieds2);

                assertEquals(10, classifieds2.size());
                assertEscapedOriginalContent(classifieds2);
                assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds2));
            }
        } finally {
            // Clean up unconditionally: a deletion inside a Java 'assert' is skipped when assertions are
            // disabled and is never reached when an earlier assertion fails.
            Files.deleteIfExists(tmpFile.toPath());
        }
    }


    @Test
    @SneakyThrows
    public void testClassificationForOldLayers() {

        assertLayoutLayerClassification("oldViewerDocLayers.pdf");
    }


    /**
     * Loads the named resource and asserts that the first page consists of the escaped original content
     * (10 streams) followed by a single KNECON_LAYOUT stream.
     */
    @SneakyThrows
    private static void assertLayoutLayerClassification(String resourceName) {

        File pdfFile = loadResource(resourceName);

        try (PDDocument document = Loader.loadPDF(pdfFile)) {
            PDPage page = document.getPage(0);
            List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);

            logContentStreamClassifications(classifieds);

            assertEquals(11, classifieds.size());
            assertEscapedOriginalContent(classifieds);
            assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification());
            assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds));
        }
    }


    /**
     * Asserts that streams 0..9 are ESCAPE_START, eight OTHER streams and ESCAPE_END, in that order.
     */
    private static void assertEscapedOriginalContent(List<ClassifiedContentStream> classifieds) {

        assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification());
        for (int i = 1; i < 9; i++) {
            assertEquals(ContentStreams.OTHER, classifieds.get(i).classification());
        }
        assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification());
    }


    /**
     * Resolves a test resource from the context class loader to a file.
     */
    private static File loadResource(String resourceName) {

        return new File(Thread.currentThread().getContextClassLoader().getResource(resourceName).getFile());
    }


    /**
     * Logs the number of content streams and their classification names.
     */
    private static void logContentStreamClassifications(List<ClassifiedContentStream> classifieds) {

        log.info("number of content streams: {}", classifieds.size());
        log.info("Classifications: {}", classifieds.stream()//
                .map(ClassifiedContentStream::classification)//
                .map(ContentStreams.Identifier::cosName)//
                .map(COSName::getName)//
                .collect(Collectors.joining(", ")));
    }

}

View File

@ -0,0 +1,76 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.Set;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
/**
 * Manual test (disabled) that removes the KNECON_OCR_DEBUG marked content from every page of a local
 * viewer document and writes the result to a fixed location under /tmp.
 */
@Disabled
class PageContentCleanerTest {

    @BeforeEach
    public void init() {

        PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
    }


    @AfterAll
    public static void cleanup() {

        PDFNet.terminate();
    }


    @Test
    @SneakyThrows
    public void testContentCleaning() {

        // Both paths point at the local machine; the input is not part of the test resources.
        Path inputPath = Path.of("/tmp/OCR_TEST/402Study.pdf/viewerDocument.pdf");
        File cleanedFile = new File("/tmp/cleaned.pdf");

        try (var in = new FileInputStream(inputPath.toFile());//
                var doc = new PDFDoc(in);//
                var out = new FileOutputStream(cleanedFile);//
                ElementWriter pageWriter = new ElementWriter();//
                ElementReader reader = new ElementReader();//
                ElementBuilder builder = new ElementBuilder()//
        ) {
            Set<String> toRemove = Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName());
            PageContentCleaner cleaner = PageContentCleaner.builder()
                    .writer(pageWriter)
                    .reader(reader)
                    .elementBuilder(builder)
                    .markedContentToRemove(toRemove)
                    .build();

            PageIterator pages = doc.getPageIterator();
            while (pages.hasNext()) {
                Page page = pages.next();
                cleaner.removeMarkedContent(page);
            }

            doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
        }
    }

}

View File

@ -0,0 +1,58 @@
package com.knecon.fforesight.service.viewerdoc.service;
import java.awt.geom.AffineTransform;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import javax.swing.table.AbstractTableModel;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.pdftron.common.Matrix2D;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
class ViewerDocVersioningUtilityTest {
@BeforeEach
public void init() {
PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
}
@AfterAll
public static void cleanup() {
PDFNet.terminate();
}
@Test
@SneakyThrows
public void testMarking() {
File file = new ClassPathResource("files/empty.pdf").getFile();
Path tmpFile = Files.createTempFile("markedDocument", ".pdf");
try (var in = new FileInputStream(file); var doc = new PDFDoc(in); var out = new FileOutputStream(tmpFile.toFile())) {
ViewerDocVersioningUtility.setVersionInDocument(doc);
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
}
assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile());
assert tmpFile.toFile().delete();
}
}

Binary file not shown.