diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index d981b5c..4aa9dd6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -116,29 +116,14 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); -// File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); - File viewerDocumentFile = originFile; + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); - VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); - if (layoutParsingRequest.visualLayoutParsingFileId() - .isPresent()) { - visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId() - .get()); - } - - ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId() - .isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() - .get()); - } - - TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId() - .isPresent()) { - tableServiceResponse = 
layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() - .get()); - } + VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() + .map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse()); + ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() + .map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse()); + TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() + .map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse()); ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), @@ -151,16 +136,12 @@ public class LayoutParsingPipeline { log.info("Building document graph for {}", layoutParsingRequest.identifier()); Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null // - ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument); + ? 
layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), + classificationDocument); log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - layoutGridService.addLayoutGrid(viewerDocumentFile, - documentGraph, - viewerDocumentFile, - false, - layoutParsingRequest.visualLayoutParsingFileId() - .isPresent()); + layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); @@ -249,10 +230,11 @@ public class LayoutParsingPipeline { Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); + ClassificationDocument classificationDocument = new ClassificationDocument(); if (settings.isDebug() || identifier.containsKey("debug")) { - classificationDocument.getVisualizations().setActive(true); + classificationDocument.getLayoutDebugLayer().setActive(true); } List classificationPages = new ArrayList<>(); @@ -290,7 +272,7 @@ public class LayoutParsingPipeline { } stripper.getText(originDocument); List words = stripper.getTextPositionSequences(); - classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber); + classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber); PDRectangle pdr = pdPage.getMediaBox(); @@ -298,32 +280,34 @@ public class LayoutParsingPipeline { boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); PDRectangle cropbox = pdPage.getCropBox(); - classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), 
pageNumber); + classificationDocument.getLayoutDebugLayer().addRulingVisualization(stripper.getRulings(), pageNumber); CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation); - classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); + classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), - - false); + List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() - .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber(), "")) + .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), + ImageType.GRAPHIC, + false, + stripper.getPageNumber(), + "")) .toList()); ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> - redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getLayoutDebugLayer()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE, REDACT_MANAGER, 
REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> - docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); + docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType); case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> - docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); + docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType); }; classificationPage.setCleanRulings(cleanRulings); @@ -347,7 +331,7 @@ public class LayoutParsingPipeline { } } - classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); + classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. 
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); @@ -378,7 +362,7 @@ public class LayoutParsingPipeline { log.info("Calculating BodyTextFrame for {}", identifier); bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); for (ClassificationPage page : classificationDocument.getPages()) { - classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber()); + classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber()); } log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java index 7ba862f..7044d89 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingServiceProcessorConfiguration.java @@ -5,10 +5,7 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; -import com.google.common.base.Strings; -import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService; -import com.knecon.fforesight.service.viewerdoc.service.pdftron.PDFTronViewerDocumentService; -import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; +import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import 
io.micrometer.observation.ObservationRegistry; @@ -18,14 +15,9 @@ public class LayoutParsingServiceProcessorConfiguration { @Bean @Autowired - public IViewerDocumentService viewerDocumentService(ObservationRegistry registry, LayoutparserSettings settings) { - - if (!Strings.isNullOrEmpty(settings.getPdftronLicense())) { - return new PDFTronViewerDocumentService(registry); - } else { - return new ViewerDocumentService(registry); - } + public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) { + return new PDFTronViewerDocumentService(registry); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 471db6a..18fb95d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility; import com.knecon.fforesight.tenantcommons.TenantContext; import io.micrometer.observation.annotation.Observed; @@ -36,6 +37,7 @@ public class LayoutParsingStorageService { private final StorageService storageService; private final ObjectMapper 
objectMapper; + @Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file") public File getOriginFile(String storageId) throws IOException { @@ -53,11 +55,18 @@ public class LayoutParsingStorageService { } File tempFile = createTempFile("viewerDocument", ".pdf"); storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile); + + if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) { + assert tempFile.delete(); + return Optional.empty(); + } + return Optional.of(tempFile); } - public ImageServiceResponse getImagesFile(String storageId) throws IOException { + @SneakyThrows + public ImageServiceResponse getImagesFile(String storageId) { try (InputStream inputStream = getObject(storageId)) { @@ -68,7 +77,8 @@ public class LayoutParsingStorageService { } - public TableServiceResponse getTablesFile(String storageId) throws IOException { + @SneakyThrows + public TableServiceResponse getTablesFile(String storageId) { try (var tableClassificationStream = getObject(storageId)) { @@ -78,11 +88,12 @@ public class LayoutParsingStorageService { } } - public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) throws IOException { + + @SneakyThrows + public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) { try (InputStream inputStream = getObject(storageId)) { - VisualLayoutParsingResponse visualLayoutParsingResponse = objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class); - return visualLayoutParsingResponse; + return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index 2b095a4..55d3f40 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import lombok.RequiredArgsConstructor; @@ -33,7 +33,7 @@ public class DocstrumSegmentationService { private final ReadingOrderService readingOrderService; - public List segmentPage(List textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) { + public List segmentPage(List textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { List zones = new ArrayList<>(); zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO)); @@ -45,7 +45,7 @@ public class DocstrumSegmentationService { } - private List computeZones(List textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) { + private List computeZones(List textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) { List positions = textPositions.stream() .filter(t -> t.getDir() == direction) diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index 68e2d95..c87343a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -7,7 +7,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import lombok.Data; import lombok.NoArgsConstructor; @@ -25,7 +25,7 @@ public class ClassificationDocument { private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); - private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations(); + private LayoutDebugLayer layoutDebugLayer = new LayoutDebugLayer(); private boolean headlines; private long rulesVersion; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java similarity index 64% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java index 82b5275..62d41a9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/Boundary.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/TextRange.java @@ -13,13 +13,13 @@ import lombok.Setter; @Setter @EqualsAndHashCode @SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName") -public class Boundary implements Comparable { +public class TextRange implements Comparable { private int start; private int end; - public Boundary(int start, int end) { + public TextRange(int start, int end) { if (start > end) { throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); @@ -47,15 +47,15 @@ public class Boundary implements Comparable { } - public boolean contains(Boundary boundary) { + public boolean contains(TextRange textRange) { - return start <= boundary.start() && boundary.end() <= end; + return start <= textRange.start() && textRange.end() <= end; } - public boolean containedBy(Boundary boundary) { + public boolean containedBy(TextRange textRange) { - return boundary.contains(this); + return textRange.contains(this); } @@ -83,18 +83,18 @@ public class Boundary implements Comparable { } - public boolean intersects(Boundary boundary) { + public boolean intersects(TextRange textRange) { - return boundary.start() < this.end && this.start < boundary.end(); + return textRange.start() < this.end && 
this.start < textRange.end(); } - public List split(List splitIndices) { + public List split(List splitIndices) { if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) { throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this)); } - List splitBoundaries = new LinkedList<>(); + List splitBoundaries = new LinkedList<>(); int previousIndex = start; for (int splitIndex : splitIndices) { @@ -102,10 +102,10 @@ public class Boundary implements Comparable { if (splitIndex == previousIndex) { continue; } - splitBoundaries.add(new Boundary(previousIndex, splitIndex)); + splitBoundaries.add(new TextRange(previousIndex, splitIndex)); previousIndex = splitIndex; } - splitBoundaries.add(new Boundary(previousIndex, end)); + splitBoundaries.add(new TextRange(previousIndex, end)); return splitBoundaries; } @@ -114,11 +114,11 @@ public class Boundary implements Comparable { return IntStream.range(start, end); } - public static Boundary merge(Collection boundaries) { + public static TextRange merge(Collection boundaries) { - int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new); - int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new); - return new Boundary(minStart, maxEnd); + int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new); + int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new); + return new TextRange(minStart, maxEnd); } @@ -130,12 +130,12 @@ public class Boundary implements Comparable { @Override - public int compareTo(Boundary boundary) { + public int compareTo(TextRange textRange) { - if (end < boundary.end() && start < boundary.start()) { + if (end < textRange.end() && start < textRange.start()) { return -1; } - if (start > boundary.start() 
&& end > boundary.end()) { + if (start > textRange.start() && end > textRange.end()) { return 1; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/TextEntity.java similarity index 85% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/TextEntity.java index bfa9f9b..7d4d6f3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/RedactionEntity.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/entity/TextEntity.java @@ -11,7 +11,7 @@ import java.util.Map; import java.util.Set; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; @@ -28,11 +28,11 @@ import lombok.experimental.FieldDefaults; @AllArgsConstructor @FieldDefaults(level = AccessLevel.PRIVATE) @EqualsAndHashCode(onlyExplicitlyIncluded = true) -public class RedactionEntity { +public class TextEntity { // initial values @EqualsAndHashCode.Include - final 
Boundary boundary; + final TextRange textRange; @EqualsAndHashCode.Include final String type; @EqualsAndHashCode.Include @@ -47,7 +47,7 @@ public class RedactionEntity { boolean dictionaryEntry; boolean dossierDictionaryEntry; Set engines; - Set references; + Set references; @Builder.Default Deque matchedRules = new LinkedList<>(); String redactionReason; @@ -66,9 +66,9 @@ public class RedactionEntity { SemanticNode deepestFullyContainingNode; - public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) { + public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) { - return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build(); + return TextEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build(); } @@ -132,7 +132,7 @@ public class RedactionEntity { public List getRedactionPositionsPerPage() { if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) { - Map> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary); + Map> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange); Page firstPage = rectanglesPerLinePerPage.keySet() .stream() @@ -155,21 +155,21 @@ public class RedactionEntity { } - public boolean containedBy(RedactionEntity redactionEntity) { + public boolean containedBy(TextEntity textEntity) { - return this.boundary.containedBy(redactionEntity.getBoundary()); + return this.textRange.containedBy(textEntity.getTextRange()); } - public boolean contains(RedactionEntity redactionEntity) { + public boolean contains(TextEntity textEntity) { - return this.boundary.contains(redactionEntity.getBoundary()); + return this.textRange.contains(textEntity.getTextRange()); } - public boolean intersects(RedactionEntity 
redactionEntity) { + public boolean intersects(TextEntity textEntity) { - return this.boundary.intersects(redactionEntity.getBoundary()); + return this.textRange.intersects(textEntity.getTextRange()); } @@ -185,13 +185,13 @@ public class RedactionEntity { } - public void addReference(RedactionEntity reference) { + public void addReference(TextEntity reference) { references.add(reference); } - public void addReferences(List references) { + public void addReferences(List references) { this.references.addAll(references); } @@ -210,7 +210,7 @@ public class RedactionEntity { sb.append("Entity[\""); sb.append(value); sb.append("\", "); - sb.append(boundary); + sb.append(textRange); sb.append(", pages["); pages.forEach(page -> { sb.append(page.getNumber()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/AbstractSemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/AbstractSemanticNode.java index 4afdb9a..3f9e051 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/AbstractSemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/AbstractSemanticNode.java @@ -8,7 +8,7 @@ import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; @@ 
-39,7 +39,7 @@ public abstract class AbstractSemanticNode implements GenericSemanticNode { @Builder.Default @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); + Set entities = new HashSet<>(); @EqualsAndHashCode.Exclude Map bBoxCache; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index f82d3fa..db3976e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -12,7 +12,7 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -33,7 +33,7 @@ public class Document extends AbstractSemanticNode { Set pages; Integer numberOfPages; - LayoutparsingVisualizations visualizations; + LayoutDebugLayer layoutDebugLayer; @Override diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java index 
27b06a7..1588324 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java @@ -92,4 +92,16 @@ public class Image extends AbstractSemanticNode { return true; } + + public double getArea() { + + return position.getWidth() * position.getHeight(); + } + + + public boolean isFullPageImage() { + + return imageType.equals(ImageType.OCR) || getArea() >= 0.5 * page.getArea(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java index cd79bf7..743dd0f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/ImageType.java @@ -6,7 +6,6 @@ public enum ImageType { LOGO, FORMULA, SIGNATURE, - SIGNATURE_VISUAL, OTHER, OCR, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java index 3e98ab3..b82bee7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java @@ -6,7 +6,7 @@ import java.util.List; import java.util.Set; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; @@ -39,7 +39,7 @@ public class Page { @Builder.Default @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); + Set entities = new HashSet<>(); @Builder.Default @EqualsAndHashCode.Exclude @@ -60,7 +60,10 @@ public class Page { public TextBlock getMainBodyTextBlock() { - return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + return mainBody.stream() + .filter(SemanticNode::isLeaf) + .map(SemanticNode::getLeafTextBlock) + .collect(new TextBlockCollector()); } @@ -84,4 +87,10 @@ public class Page { return o instanceof Page && o.hashCode() == this.hashCode(); } + + public double getArea() { + + return height * width; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index 32369e6..b5c7410 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -14,13 +14,14 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; +import com.knecon.fforesight.service.layoutparser.processor.utils.BBoxMergingUtility; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; public interface SemanticNode { @@ -42,7 +43,9 @@ public interface SemanticNode { */ default TextBlock getTextBlock() { - return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector()); + return streamAllSubNodes().filter(SemanticNode::isLeaf) + .map(SemanticNode::getTextBlock) + .collect(new TextBlockCollector()); } @@ -52,7 +55,7 @@ public interface SemanticNode { * * @return Set of all Entities associated with this Node */ - Set getEntities(); + Set getEntities(); /** @@ -68,7 +71,10 @@ public interface SemanticNode { default Page 
getFirstPage() { - return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); + return getTextBlock().getPages() + .stream() + .min(Comparator.comparingInt(Page::getNumber)) + .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); } @@ -77,18 +83,19 @@ public interface SemanticNode { * * @return Set of PageNodes this node appears on. */ - default Set getPages(Boundary boundary) { + default Set getPages(TextRange textRange) { - if (!getBoundary().contains(boundary)) { - throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary())); + if (!getBoundary().contains(textRange)) { + throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary())); } - return getTextBlock().getPages(boundary); + return getTextBlock().getPages(textRange); } default boolean isOnPage(int pageNumber) { - return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber); + return getPages().stream() + .anyMatch(page -> page.getNumber() == pageNumber); } @@ -203,7 +210,9 @@ public interface SemanticNode { */ default boolean hasEntitiesOfType(String type) { - return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type)); + return getEntities().stream() + .filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)) + .anyMatch(redactionEntity -> redactionEntity.getType().equals(type)); } @@ -213,9 +222,11 @@ public interface SemanticNode { * @param type string representing the type of entities to return * @return List of RedactionEntities of any the type */ - default List getEntitiesOfType(String type) { + default List getEntitiesOfType(String type) { - return 
getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList(); + return getEntities().stream() + .filter(redactionEntity -> redactionEntity.getType().equals(type)) + .toList(); } @@ -225,9 +236,11 @@ public interface SemanticNode { * @param types A list of strings representing the types of entities to return * @return List of RedactionEntities of any provided type */ - default List getEntitiesOfType(List types) { + default List getEntitiesOfType(List types) { - return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList(); + return getEntities().stream() + .filter(redactionEntity -> redactionEntity.isAnyType(types)) + .toList(); } @@ -241,7 +254,8 @@ public interface SemanticNode { TextBlock textBlock = getTextBlock(); if (!textBlock.getAtomicTextBlocks().isEmpty()) { - return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage(); + return getTextBlock().getAtomicTextBlocks() + .get(0).getNumberOnPage(); } else { return -1; } @@ -279,7 +293,8 @@ public interface SemanticNode { */ default boolean containsStrings(List strings) { - return strings.stream().allMatch(this::containsString); + return strings.stream() + .allMatch(this::containsString); } @@ -303,7 +318,8 @@ public interface SemanticNode { */ default boolean containsAnyString(List strings) { - return strings.stream().anyMatch(this::containsString); + return strings.stream() + .anyMatch(this::containsString); } @@ -315,7 +331,8 @@ public interface SemanticNode { */ default boolean containsAnyStringIgnoreCase(List strings) { - return strings.stream().anyMatch(this::containsStringIgnoreCase); + return strings.stream() + .anyMatch(this::containsStringIgnoreCase); } @@ -323,19 +340,19 @@ public interface SemanticNode { * This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity. 
* It sets the fields accordingly and recursively calls this function on all its children. * - * @param redactionEntity RedactionEntity, which is being inserted into the graph + * @param textEntity RedactionEntity, which is being inserted into the graph */ - default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) { + default void addThisToEntityIfIntersects(TextEntity textEntity) { TextBlock textBlock = getTextBlock(); - if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) { - if (textBlock.containsBoundary(redactionEntity.getBoundary())) { - redactionEntity.setDeepestFullyContainingNode(this); + if (textBlock.getTextRange().intersects(textEntity.getTextRange())) { + if (textBlock.containsBoundary(textEntity.getTextRange())) { + textEntity.setDeepestFullyContainingNode(this); } - redactionEntity.addIntersectingNode(this); - streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary())) - .forEach(node -> node.addThisToEntityIfIntersects(redactionEntity)); + textEntity.addIntersectingNode(this); + streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(textEntity.getTextRange())) + .forEach(node -> node.addThisToEntityIfIntersects(textEntity)); } } @@ -386,7 +403,8 @@ public interface SemanticNode { */ default Stream streamAllSubNodes() { - return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode); + return getDocumentTree().allSubEntriesInOrder(getTreeId()) + .map(DocumentTree.Entry::getNode); } @@ -397,7 +415,9 @@ public interface SemanticNode { */ default Stream streamAllSubNodesOfType(NodeType nodeType) { - return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode); + return getDocumentTree().allSubEntriesInOrder(getTreeId()) + .filter(entry -> entry.getType().equals(nodeType)) + .map(DocumentTree.Entry::getNode); } @@ -406,9 +426,9 @@ public 
interface SemanticNode { * * @return Boundary of this Node's TextBlock */ - default Boundary getBoundary() { + default TextRange getBoundary() { - return getTextBlock().getBoundary(); + return getTextBlock().getTextRange(); } @@ -453,17 +473,19 @@ public interface SemanticNode { */ private Map getBBoxFromChildren() { - Map bBoxPerPage = new HashMap<>(); - List> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList(); - Set pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet()); - for (Page page : pages) { - Rectangle2D bBoxOnPage = childrenBBoxes.stream() - .filter(childBboxPerPage -> childBboxPerPage.containsKey(page)) - .map(childBboxPerPage -> childBboxPerPage.get(page)) - .collect(RectangleTransformations.collectBBox()); - bBoxPerPage.put(page, bBoxOnPage); + List> childrenBBoxes = streamChildren().filter(child -> !isFullPageImage(child)) + .map(SemanticNode::getBBox) + .toList(); + return BBoxMergingUtility.mergeBBoxes(childrenBBoxes); + } + + + private static boolean isFullPageImage(SemanticNode child) { + + if (!child.getType().equals(NodeType.IMAGE)) { + return false; } - return bBoxPerPage; + return ((Image) child).isFullPageImage(); } @@ -473,7 +495,9 @@ public interface SemanticNode { private Map getBBoxFromLeafTextBlock() { Map bBoxPerPage = new HashMap<>(); - Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage)); + Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks() + .stream() + .collect(Collectors.groupingBy(AtomicTextBlock::getPage)); atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs))); return bBoxPerPage; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index 18f3ef5..9d1e656 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -15,7 +15,7 @@ import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; @@ -43,7 +43,7 @@ public class Table implements SemanticNode { @Builder.Default @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); + Set entities = new HashSet<>(); @EqualsAndHashCode.Exclude Map bBoxCache; @@ -54,7 +54,7 @@ public class Table implements SemanticNode { * @param strings Strings to check whether a row contains them * @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings */ - public Stream streamEntitiesWhereRowContainsStringsIgnoreCase(List strings) { + public Stream streamEntitiesWhereRowContainsStringsIgnoreCase(List strings) { return IntStream.range(0, numberOfRows).boxed() .filter(row -> rowContainsStringsIgnoreCase(row, strings)) @@ -88,7 +88,7 @@ public class Table implements SemanticNode { * @param value the 
string which the table cell should contain * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value. */ - public Stream streamEntitiesWhereRowHasHeaderAndValue(String header, String value) { + public Stream streamEntitiesWhereRowHasHeaderAndValue(String header, String value) { List vertebrateStudyCols = streamHeaders().filter(headerNode -> headerNode.containsString(header)) .map(TableCell::getCol) @@ -107,7 +107,7 @@ public class Table implements SemanticNode { * @param values the strings which the table cell should contain * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value. */ - public Stream streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List values) { + public Stream streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List values) { List colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header)) .map(TableCell::getCol) @@ -125,7 +125,7 @@ public class Table implements SemanticNode { * @param types type strings to check whether a row contains an entity like them * @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types. */ - public Stream streamEntitiesWhereRowContainsEntitiesOfType(List types) { + public Stream streamEntitiesWhereRowContainsEntitiesOfType(List types) { List rowsWithEntityOfType = IntStream.range(0, numberOfRows).boxed() .filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream() @@ -145,7 +145,7 @@ public class Table implements SemanticNode { * @param types type strings to check whether a row doesn't contain an entity like it * @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types. 
*/ - public Stream streamEntitiesWhereRowContainsNoEntitiesOfType(List types) { + public Stream streamEntitiesWhereRowContainsNoEntitiesOfType(List types) { List rowsWithNoEntityOfType = IntStream.range(0, numberOfRows).boxed() .filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream() @@ -163,7 +163,7 @@ public class Table implements SemanticNode { return streamRow(rowNumber).map(TableCell::getEntities) .flatMap(Collection::stream) - .map(RedactionEntity::getType) + .map(TextEntity::getType) .distinct(); } @@ -304,12 +304,12 @@ public class Table implements SemanticNode { * Finds all entities of the provided type, which appear in the same row that the provided entity appears in. * * @param type the type of entities to search for - * @param redactionEntity the entity, which appears in the row to search + * @param textEntity the entity, which appears in the row to search * @return List of all entities of the provided type, which appear in the same row that the provided entity appears in. 
*/ - public List getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity) { + public List getEntitiesOfTypeInSameRow(String type, TextEntity textEntity) { - return redactionEntity.getIntersectingNodes() + return textEntity.getIntersectingNodes() .stream() .filter(node -> node instanceof TableCell) .map(node -> (TableCell) node) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java index 33d9427..4749c82 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/AtomicTextBlock.java @@ -13,7 +13,7 @@ import java.util.Map; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; @@ -36,14 +36,14 @@ public class AtomicTextBlock implements TextBlock { Page page; //string coordinates - Boundary boundary; + TextRange textRange; String searchText; @Builder.Default List lineBreaks = new ArrayList<>(); @Builder.Default - List boldTextBoundaries = new 
ArrayList<>(); + List boldTextBoundaries = new ArrayList<>(); @Builder.Default - List italicTextBoundaries = new ArrayList<>(); + List italicTextBoundaries = new ArrayList<>(); String orientation; int textDirection; @@ -66,8 +66,8 @@ public class AtomicTextBlock implements TextBlock { public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText, List lineBreaks, - List boldTextBoundaries, - List italicTextBoundaries, + List boldTextBoundaries, + List italicTextBoundaries, List positions, List stringIdxToPositionIdx, long idx, @@ -89,7 +89,7 @@ public class AtomicTextBlock implements TextBlock { .italicTextBoundaries(italicTextBoundaries) .positions(positions) .stringIdxToPositionIdx(stringIdxToPositionIdx) - .boundary(new Boundary(offset, offset + searchText.length())) + .textRange(new TextRange(offset, offset + searchText.length())) .textDirection(textDirection) .orientation(orientation) .build(); @@ -100,7 +100,7 @@ public class AtomicTextBlock implements TextBlock { return AtomicTextBlock.builder() .id(textBlockIdx) - .boundary(new Boundary(stringOffset, stringOffset)) + .textRange(new TextRange(stringOffset, stringOffset)) .searchText("") .page(page) .numberOnPage(numberOnPage) @@ -118,7 +118,7 @@ public class AtomicTextBlock implements TextBlock { .id(documentTextData.getId()) .numberOnPage(documentTextData.getNumberOnPage()) .page(page) - .boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd())) + .textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd())) .searchText(documentTextData.getSearchText()) .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList()) .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList()) @@ -140,11 +140,11 @@ public class AtomicTextBlock implements TextBlock { throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines())); } if 
(lineNumber == 0) { - return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start()); + return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start()); } else if (lineNumber == numberOfLines() - 1) { - return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end()); + return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end()); } - return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start()); + return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start()); } @@ -159,9 +159,9 @@ public class AtomicTextBlock implements TextBlock { public int getNextLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak > fromIndex - boundary.start()) // + .filter(linebreak -> linebreak > fromIndex - textRange.start()) // .findFirst() // - .orElse(searchText.length()) + boundary.start(); + .orElse(searchText.length()) + textRange.start(); } @@ -169,43 +169,43 @@ public class AtomicTextBlock implements TextBlock { public int getPreviousLinebreak(int fromIndex) { return lineBreaks.stream()// - .filter(linebreak -> linebreak <= fromIndex - boundary.start())// + .filter(linebreak -> linebreak <= fromIndex - textRange.start())// .reduce((a, b) -> b)// - .orElse(0) + boundary.start(); + .orElse(0) + textRange.start(); } @Override public Rectangle2D getPosition(int stringIdx) { - return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start())); + return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start())); } @Override - public List getPositions(Boundary stringBoundary) { + public List getPositions(TextRange stringTextRange) { - if (!containsBoundary(stringBoundary)) { - throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary)); + if (!containsBoundary(stringTextRange)) { + throw 
new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange)); } - if (stringBoundary.length() == 0) { + if (stringTextRange.length() == 0) { return Collections.emptyList(); } - int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()); + int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start()); - if (stringBoundary.end() == this.boundary.end()) { + if (stringTextRange.end() == this.textRange.end()) { return positions.subList(startPositionIdx, positions.size()); } - return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start())); + return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start())); } - public Map> getPositionsPerPage(Boundary stringBoundary) { + public Map> getPositionsPerPage(TextRange stringTextRange) { - List rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary)) + List rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange)) .stream() .map(this::getPositions) .map(RectangleTransformations::rectangleBBoxWithGaps) @@ -217,9 +217,9 @@ public class AtomicTextBlock implements TextBlock { } - private List getAllLineBreaksInBoundary(Boundary boundary) { + protected List getAllLineBreaksInBoundary(TextRange textRange) { - return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList(); + return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java index d48170b..c1ad087 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java @@ -11,7 +11,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import lombok.AccessLevel; @@ -24,7 +24,7 @@ public class ConcatenatedTextBlock implements TextBlock { List atomicTextBlocks; String searchText; - Boundary boundary; + TextRange textRange; public static ConcatenatedTextBlock empty() { @@ -37,12 +37,12 @@ public class ConcatenatedTextBlock implements TextBlock { this.atomicTextBlocks = new LinkedList<>(); if (atomicTextBlocks.isEmpty()) { - boundary = new Boundary(-1, -1); + textRange = new TextRange(-1, -1); return; } var firstTextBlock = atomicTextBlocks.get(0); this.atomicTextBlocks.add(firstTextBlock); - boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end()); + textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end()); atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat); } @@ -50,16 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock { public ConcatenatedTextBlock concat(TextBlock textBlock) { - int start = textBlock.getBoundary().start(); - int end = textBlock.getBoundary().end(); + int start = 
textBlock.getTextRange().start(); + int end = textBlock.getTextRange().end(); if (this.atomicTextBlocks.isEmpty()) { - boundary.setStart(start); - boundary.setEnd(end); - } else if (boundary.end() != start) { - throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary())); + textRange.setStart(start); + textRange.setEnd(end); + } else if (textRange.end() != start) { + throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange())); } this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks()); - boundary.setEnd(end); + textRange.setEnd(end); this.searchText = null; return this; } @@ -67,13 +67,13 @@ public class ConcatenatedTextBlock implements TextBlock { private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) { - return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new); + return atomicTextBlocks.stream().filter(textBlock -> textBlock.getTextRange().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new); } - private List getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) { + private List getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) { - return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList(); + return atomicTextBlocks.stream().filter(tb -> tb.getTextRange().intersects(textRange)).toList(); } @@ -125,47 +125,47 @@ public class ConcatenatedTextBlock implements TextBlock { @Override - public List getPositions(Boundary stringBoundary) { + public List getPositions(TextRange stringTextRange) { - List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary); + List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); if (textBlocks.size() == 
1) { - return textBlocks.get(0).getPositions(stringBoundary); + return textBlocks.get(0).getPositions(stringTextRange); } AtomicTextBlock firstTextBlock = textBlocks.get(0); - List positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()))); + List positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()))); for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { positions.addAll(textBlock.getPositions()); } var lastTextBlock = textBlocks.get(textBlocks.size() - 1); - positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end()))); + positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end()))); return positions; } @Override - public Map> getPositionsPerPage(Boundary stringBoundary) { + public Map> getPositionsPerPage(TextRange stringTextRange) { - List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary); + List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange); if (textBlocks.size() == 1) { - return textBlocks.get(0).getPositionsPerPage(stringBoundary); + return textBlocks.get(0).getPositionsPerPage(stringTextRange); } AtomicTextBlock firstTextBlock = textBlocks.get(0); - Map> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())); + Map> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())); for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { - rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary())); + rectanglesPerLinePerPage = 
mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange())); } AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1); rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, - lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end()))); + lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end()))); return rectanglesPerLinePerPage; } @@ -187,14 +187,14 @@ public class ConcatenatedTextBlock implements TextBlock { @Override - public List getBoldTextBoundaries() { + public List getBoldTextBoundaries() { return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList(); } @Override - public List getItalicTextBoundaries() { + public List getItalicTextBoundaries() { return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java index df9c427..01727ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/TextBlock.java @@ -10,7 +10,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; 
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; public interface TextBlock extends CharSequence { @@ -21,10 +21,10 @@ public interface TextBlock extends CharSequence { List getAtomicTextBlocks(); - List getBoldTextBoundaries(); + List getBoldTextBoundaries(); - List getItalicTextBoundaries(); + List getItalicTextBoundaries(); String getOrientation(); @@ -33,7 +33,7 @@ public interface TextBlock extends CharSequence { int getTextDirection(); - Boundary getBoundary(); + TextRange getTextRange(); int getNextLinebreak(int fromIndex); @@ -48,10 +48,10 @@ public interface TextBlock extends CharSequence { Rectangle2D getPosition(int stringIdx); - List getPositions(Boundary stringBoundary); + List getPositions(TextRange stringTextRange); - Map> getPositionsPerPage(Boundary stringBoundary); + Map> getPositionsPerPage(TextRange stringTextRange); int numberOfLines(); @@ -59,7 +59,7 @@ public interface TextBlock extends CharSequence { default int indexOf(String searchTerm) { - return indexOf(searchTerm, getBoundary().start()); + return indexOf(searchTerm, getTextRange().start()); } @@ -69,10 +69,10 @@ public interface TextBlock extends CharSequence { } - default Set getPages(Boundary boundary) { + default Set getPages(TextRange textRange) { return getAtomicTextBlocks().stream() - .filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary)) + .filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange)) .map(AtomicTextBlock::getPage) .collect(Collectors.toUnmodifiableSet()); } @@ -80,38 +80,38 @@ public interface TextBlock extends CharSequence { default int indexOf(String searchTerm, int startOffset) { - int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start()); + int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start()); if (start == -1) { return -1; } - return start + getBoundary().start(); + return start + getTextRange().start(); } default 
CharSequence getFirstLine() { - return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start())); + return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start())); } - default boolean containsBoundary(Boundary boundary) { + default boolean containsBoundary(TextRange textRange) { - if (boundary.end() < boundary.start()) { - throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary)); + if (textRange.end() < textRange.start()) { + throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange)); } - return getBoundary().contains(boundary); + return getTextRange().contains(textRange); } default boolean containsIndex(int stringIndex) { - return getBoundary().contains(stringIndex); + return getTextRange().contains(stringIndex); } - default CharSequence subSequence(Boundary boundary) { + default CharSequence subSequence(TextRange textRange) { - return subSequence(boundary.start(), boundary.end()); + return subSequence(textRange.start(), textRange.end()); } @@ -128,21 +128,21 @@ public interface TextBlock extends CharSequence { @Override default CharSequence subSequence(int start, int end) { - return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start()); + return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start()); } @Override default int length() { - return getBoundary().length(); + return getTextRange().length(); } @Override default char charAt(int index) { - return getSearchText().charAt(index - getBoundary().start()); + return getSearchText().charAt(index - getTextRange().start()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java index d1c8e74..1c8c2bf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -12,12 +12,14 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import io.micrometer.observation.annotation.Observed; import lombok.extern.slf4j.Slf4j; @Service @Slf4j public class OutlineValidationService { + @Observed(name = "OutlineValidationService", contextualName = "create-toc") public TableOfContents createToC(List headlines) { List mainSections = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index c2c33dd..0820a49 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -33,7 +33,7 @@ public class BodyTextFrameService { for (ClassificationPage page : classificationDocument.getPages()) { var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame); setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); - 
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber()); + classificationDocument.getLayoutDebugLayer().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index e0a046b..6248251 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -17,7 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import lombok.RequiredArgsConstructor; @@ -35,7 +35,7 @@ public class DocstrumBlockificationService { public ClassificationPage blockify(List textPositions, CleanRulings rulings, boolean xyOrder, - LayoutparsingVisualizations visualizations, + LayoutDebugLayer visualizations, LayoutParsingType layoutParsingType) { CleanRulings usedRulings = 
rulings.withoutTextRulings(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index a2c7085..35d4edc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -1,9 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockification; -import static java.util.stream.Collectors.toSet; - import java.util.ArrayList; -import java.util.Comparator; import java.util.Iterator; import java.util.List; @@ -11,13 +8,11 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; +import 
com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; @SuppressWarnings("all") @Service @@ -35,7 +30,7 @@ public class RedactManagerBlockificationService { * @param visualizations * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) { + public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) { CleanRulings usedRulings = cleanRulings.withoutTextRulings(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index d74d080..a6dd95e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -22,6 +22,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; @@ -31,7 +32,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem; @@ -55,7 +55,7 @@ public class DocumentGraphFactory { Document documentGraph = new Document(); - documentGraph.setVisualizations(document.getVisualizations()); + documentGraph.setLayoutDebugLayer(document.getLayoutDebugLayer()); Context context = new Context(documentGraph); @@ -280,7 +280,8 @@ public class DocumentGraphFactory { return pages.keySet() .stream() .filter(page -> page.getNumber() == pageIndex) - .findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); + .findFirst() + .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java index 96118cd..48bf5de 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionDto.java @@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D; import java.util.Collections; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import lombok.AccessLevel; import lombok.Builder; @@ -19,8 +19,8 @@ public class SearchTextWithTextPositionDto { String searchText; List lineBreaks; List stringIdxToPositionIdx; - List boldTextBoundaries; - List italicTextBoundaries; + List boldTextBoundaries; + List italicTextBoundaries; List positions; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 0d9fd8f..8acdf00 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -9,7 +9,7 @@ import java.util.List; import java.util.Locale; import java.util.Objects; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import 
com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -118,23 +118,23 @@ public class SearchTextWithTextPositionFactory { } - private static List mergeToBoundaries(List integers) { + private static List mergeToBoundaries(List integers) { if (integers.isEmpty()) { return Collections.emptyList(); } - List boundaries = new LinkedList<>(); + List boundaries = new LinkedList<>(); int start = integers.get(0); int end = integers.get(0) + 1; for (int current : integers) { if (current > end + 1) { - boundaries.add(new Boundary(start, end)); + boundaries.add(new TextRange(start, end)); start = current; } end = current + 1; } if (boundaries.isEmpty()) { - boundaries.add(new Boundary(start, end)); + boundaries.add(new TextRange(start, end)); } return boundaries; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java index b4b20d8..3a0a076 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java @@ -116,8 +116,8 @@ public class DocumentDataMapper { .page(atomicTextBlock.getPage().getNumber().longValue()) .searchText(atomicTextBlock.getSearchText()) .numberOnPage(atomicTextBlock.getNumberOnPage()) - .start(atomicTextBlock.getBoundary().start()) - .end(atomicTextBlock.getBoundary().end()) + 
.start(atomicTextBlock.getTextRange().start()) + .end(atomicTextBlock.getTextRange().end()) .lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks())) .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java index cee038c..952a7a6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/TaasDocumentDataMapper.java @@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; @@ -82,15 +82,15 @@ public class TaasDocumentDataMapper { } - private static Range toRange(Boundary boundary) { + private static Range toRange(TextRange textRange) { - return new Range(boundary.start(), boundary.end()); + return new Range(textRange.start(), textRange.end()); } - private static List toRange(List boundary) { + private 
static List toRange(List textRange) { - return boundary.stream().map(TaasDocumentDataMapper::toRange).toList(); + return textRange.stream().map(TaasDocumentDataMapper::toRange).toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 857bd0a..ac4f680 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -1,41 +1,17 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualization; -import java.awt.Color; -import java.awt.geom.Line2D; -import java.awt.geom.Point2D; -import java.awt.geom.Rectangle2D; -import java.awt.geom.RectangularShape; import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import java.util.stream.Stream; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; -import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.viewerdoc.ContentStreams; -import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; -import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; -import com.knecon.fforesight.service.viewerdoc.model.LayoutGrid; -import com.knecon.fforesight.service.viewerdoc.model.PlacedText; -import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; -import com.knecon.fforesight.service.viewerdoc.model.Visualizations; -import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid; +import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import io.micrometer.observation.annotation.Observed; import lombok.AccessLevel; @@ -48,451 +24,41 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) public class LayoutGridService { - IViewerDocumentService viewerDocumentService; - - static float FONT_SIZE = 10f; - static float LINE_WIDTH = 1f; - static Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); - - static Color INNER_LINES_COLOR = new Color(255, 175, 175); - static Color PARAGRAPH_COLOR = new Color(70, 130, 
180); - - static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101); - static Color TABLE_COLOR = new Color(102, 205, 170); - static Color SECTION_COLOR = new Color(50, 50, 50); - static Color HEADLINE_COLOR = new Color(162, 56, 56); - static Color HEADER_COLOR = new Color(171, 131, 6); - static Color IMAGE_COLOR = new Color(253, 63, 146); - - private record RectangleIdentifier(List treeId, Integer pageNumber) { - - } - - HashMap rectangleMap = new HashMap<>(); + PDFTronViewerDocumentService viewerDocumentService; @SneakyThrows @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document") public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) { - List allVisualizations; - Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false); - if (writeVisualLayoutParsingGrid) { - Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true); - allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll()) - .toList(); + LayoutGrid layoutGrid = createLayoutGrid(document); + layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue); +// Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true); + if (document.getLayoutDebugLayer().isActive()) { + viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer())); } else { - allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll()) - .toList(); + viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid)); + } - - viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations); } - @SneakyThrows - @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document") - 
public Visualizations addLayoutGrid(Document document, boolean layerVisibilityDefaultValue, boolean visualParsingGrid) { + private LayoutGrid createLayoutGrid(Document document) { - LayoutGrid layoutGrid = createLayoutGrid(document, visualParsingGrid); - - return Visualizations.builder() - .layer(visualParsingGrid ? ContentStreams.KNECON_VISUAL_PARSING : ContentStreams.KNECON_LAYOUT) - .visualizationsOnPages(layoutGrid.getVisualizationsPerPages()) - .layerVisibilityDefaultValue(layerVisibilityDefaultValue) - .build(); - } - - - private LayoutGrid createLayoutGrid(Document document, boolean visualParsingGrid) { - - LayoutGrid layoutGrid = new LayoutGrid(document.getNumberOfPages()); + LayoutGrid layoutGrid = new LayoutGrid(); document.streamAllSubNodes() - .filter(node -> (node.getEngines().contains(LayoutEngine.AI) && visualParsingGrid) || (node.getEngines().contains(LayoutEngine.ALGORITHM) && !visualParsingGrid)) + .peek(layoutGrid::addTreeId) .forEach(semanticNode -> { - Color color = switch (semanticNode.getType()) { - case PARAGRAPH -> PARAGRAPH_COLOR; - case TABLE -> TABLE_COLOR; - case SECTION, SUPER_SECTION -> SECTION_COLOR; - case HEADLINE -> HEADLINE_COLOR; - case HEADER, FOOTER -> HEADER_COLOR; - case IMAGE -> IMAGE_COLOR; - default -> null; - }; - - if (semanticNode instanceof DuplicatedParagraph) { - color = DUPLICATE_PARAGRAPH_COLOR; - } - - if (isNotSectionOrTableCellOrDocument(semanticNode)) { - addAsRectangle(semanticNode, layoutGrid, color); - } - if (semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) { - addSection(semanticNode, layoutGrid, color); - } - if (semanticNode.getType().equals(NodeType.TABLE)) { - Table table = (Table) semanticNode; - addInnerTableLines(table, layoutGrid); + switch (semanticNode.getType()) { + case SECTION, SUPER_SECTION -> layoutGrid.addSection(semanticNode); + case HEADLINE -> layoutGrid.addHeadline((Headline) semanticNode); + case PARAGRAPH -> 
layoutGrid.addParagraph((Paragraph) semanticNode); + case TABLE -> layoutGrid.addTable((Table) semanticNode); + case IMAGE -> layoutGrid.addImage((Image) semanticNode); + case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode); } }); return layoutGrid; } - - private void addInnerTableLines(Table table, LayoutGrid layoutGrid) { - - if (table.getNumberOfCols() < 1 || table.getNumberOfRows() < 1) { - return; - } - for (Page page : table.getPages()) { - - Optional optionalFirstRowOnPage = table.streamCol(0) - .filter(tableCell -> tableCell.isOnPage(page.getNumber())) - .map(TableCell::getRow) - .findFirst(); - if (optionalFirstRowOnPage.isEmpty()) { - continue; - } - int firstRowOnPage = optionalFirstRowOnPage.get(); - - Stream xStream = switch (page.getRotation()) { - case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX); - case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX); - case 270 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxX); - default -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinX); - }; - List xs = xStream.collect(Collectors.toList()); - xs.remove(0); - - Stream yStream = switch (page.getRotation()) { - case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY); - case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY); - case 270 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxY); - default -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxY); - }; - List ys = yStream.collect(Collectors.toList()); - ys.remove(0); - - Rectangle2D tableBBox = table.getBBox() - .get(page); - List coloredLines = layoutGrid.getVisualizationsPerPages() - .get(page.getNumber() - 1).getColoredLines(); - xs.forEach(x -> { - Line2D line = new 
Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY())); - coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH)); - }); - ys.forEach(y -> { - Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y)); - coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH)); - }); - } - } - - - private static Stream streamBBoxOfCellsOnPage(Stream table, Page page) { - - return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())) - .map(TableCell::getBBox) - .map(bBoxMap -> bBoxMap.get(page)); - } - - - private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) { - - Map bBoxMap = semanticNode.getBBox(); - - List subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION) - .toList(); - Integer maxChildDepth = subSections.stream() - .map(node -> node.getTreeId().size()) - .max(Integer::compareTo) - .orElse(semanticNode.getTreeId().size()); - int ownDepth = semanticNode.getTreeId().size(); - - Page firstPage = semanticNode.getFirstPage(); - String treeIdString = buildTreeIdString(semanticNode); - - if (bBoxMap.values().size() == 1) { - handleSinglePage(semanticNode, layoutGrid, color, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth); - return; - } - List pagesInOrder = bBoxMap.keySet() - .stream() - .sorted(Comparator.comparingInt(Page::getNumber)) - .collect(Collectors.toList()); - pagesInOrder.remove(0); - handleFirstPageOfSection(semanticNode, color, firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid, maxChildDepth, ownDepth); - if (semanticNode instanceof SuperSection) { - return; - } - for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) { - handleForMiddlePageOfSection(semanticNode, color, middlePage, bBoxMap.get(middlePage), treeIdString, layoutGrid, maxChildDepth, ownDepth); - } - var lastPage = pagesInOrder.remove(pagesInOrder.size() - 
1); - handleLastPageOfSection(semanticNode, color, lastPage, bBoxMap.get(lastPage), treeIdString, layoutGrid, maxChildDepth, ownDepth); - } - - - @SneakyThrows - private void addPlacedText(Page page, Rectangle2D textBBox, Rectangle2D highestParentRect, String s, LayoutGrid layoutGrid, Integer maxChildDepth) { - - // translates text, such that its right edge is a bit to the left of the drawn box - float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + LINE_WIDTH + 2 * maxChildDepth); - - Point2D upperLeftCorner; - Point2D translationVector; - switch (page.getRotation()) { - case 90 -> { - if (highestParentRect != null) { - upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMinY()); - } else { - upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY()); - } - translationVector = new Point2D.Double(FONT_SIZE, -translationAmount); - } - case 180 -> { - if (highestParentRect != null) { - upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMinY()); - } else { - upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY()); - } - translationVector = new Point2D.Double(translationAmount, FONT_SIZE); - } - case 270 -> { - - if (highestParentRect != null) { - upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMaxY()); - } else { - upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY()); - } - translationVector = new Point2D.Double(-FONT_SIZE, translationAmount); - } - default -> { - - if (highestParentRect != null) { - upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMaxY()); - } else { - upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY()); - } - translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE); - } - } - - upperLeftCorner = add(upperLeftCorner, translationVector); - - List placedTexts = layoutGrid.getVisualizationsPerPages() - .get(page.getNumber() 
- 1).getPlacedTexts(); - - PlacedText newText = PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT); - - Optional conflictingText = placedTexts.stream() - .filter(pt -> Math.abs(pt.lineStart().getY() - newText.lineStart().getY()) <= FONT_SIZE) - .findFirst(); - - if (conflictingText.isPresent()) { - PlacedText existingText = conflictingText.get(); - if (newText.text().length() > existingText.text().length()) { - placedTexts.remove(existingText); - placedTexts.add(newText); - } - } else { - placedTexts.add(newText); - } - - } - - - private void handleSinglePage(SemanticNode semanticNode, - LayoutGrid layoutGrid, - Color color, - Page page, - Rectangle2D rectangle2D, - String treeIdString, - Integer maxChildDepth, - Integer ownDepth) { - - RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth); - // add string to top line - var firstLine = result.pageLines().remove(0); - result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH)); - for (Line2D line : result.pageLines()) { - result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH)); - } - } - - - private void handleFirstPageOfSection(SemanticNode semanticNode, - Color color, - Page firstPage, - Rectangle2D rectangle2D, - String treeIdString, - LayoutGrid layoutGrid, - Integer maxChildDepth, - Integer ownDepth) { - - RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth); - // remove bottom line - result.pageLines().remove(2); - // add string to top line - var firstLine = result.pageLines().remove(0); - result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH)); - for (Line2D line : result.pageLines()) { - result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH)); - } - } - - - private void handleForMiddlePageOfSection(SemanticNode semanticNode, - Color color, - Page 
middlePage, - Rectangle2D rectangle2D, - String treeIdString, - LayoutGrid layoutGrid, - Integer maxChildDepth, - Integer ownDepth) { - - RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth); - // remove top line - result.pageLines().remove(0); - // remove bottom line - result.pageLines().remove(1); - // add string to left line - var leftLine = result.pageLines().remove(1); - result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH)); - for (Line2D line : result.pageLines()) { - result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH)); - } - } - - - private void handleLastPageOfSection(SemanticNode semanticNode, - Color color, - Page lastPage, - Rectangle2D rectangle2D, - String treeIdString, - LayoutGrid layoutGrid, - Integer maxChildDepth, - Integer ownDepth) { - - RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth); - // remove top line - result.pageLines().remove(0); - // add string to left line - var leftLine = result.pageLines().remove(2); - result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH)); - for (Line2D line : result.pageLines()) { - result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH)); - } - } - - - private RectangleAndLinesResult createLinesAndPlaceText(SemanticNode semanticNode, - Page page, - Rectangle2D rectangle2D, - String treeIdString, - LayoutGrid layoutGrid, - Integer maxChildDepth, - Integer ownDepth) { - - List coloredLines = layoutGrid.getVisualizationsPerPages() - .get(page.getNumber() - 1).getColoredLines(); - int lineWidthModifier = maxChildDepth - ownDepth; - Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox() - .get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier)); - var lastPageLines = createLinesFromRectangle(r, page.getRotation()); - - 
SemanticNode highestParent = semanticNode.getHighestParent(); - Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber())); - addPlacedText(page, rectangle2D, highestParentRect, treeIdString, layoutGrid, maxChildDepth); - - if (semanticNode instanceof SuperSection) { - rectangleMap.put(new RectangleIdentifier(semanticNode.getTreeId(), page.getNumber()), r); - } - - return new RectangleAndLinesResult(coloredLines, r, lastPageLines); - } - - - private record RectangleAndLinesResult(List coloredLines, Rectangle2D rectangle, List pageLines) { - - } - - - private String buildTreeIdString(SemanticNode semanticNode) { - - return semanticNode.getTreeId() - .stream() - .map(Object::toString) - .collect(Collectors.joining(".")); - } - - - /* - A __________________ B - | | - | | - | | - | | - D|__________________| C - The returned List are the lines [AB, BC, DC, AD] - The List is reordered, such that the order of the returned lines are always as viewed on the page. 
- */ - private List createLinesFromRectangle(Rectangle2D r, int pageRotation) { - // +0.5 to join the lines - List lines = new ArrayList<>(4); - float lineWidthCorrection = LINE_WIDTH * 0.5f; - Point2D.Float a = new Point2D.Float((float) r.getMinX(), (float) r.getMaxY()); - Point2D.Float a1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMaxY()); - Point2D.Float b = new Point2D.Float((float) r.getMaxX(), (float) r.getMaxY()); - Point2D.Float b1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMaxY()); - Point2D.Float c = new Point2D.Float((float) r.getMaxX(), (float) r.getMinY()); - Point2D.Float c1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMinY()); - Point2D.Float d = new Point2D.Float((float) r.getMinX(), (float) r.getMinY()); - Point2D.Float d1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMinY()); - lines.add(new Line2D.Float(a1, b1)); - lines.add(new Line2D.Float(b, c)); - lines.add(new Line2D.Float(d1, c1)); - lines.add(new Line2D.Float(a, d)); - - return switch (pageRotation) { - case 90 -> { - Collections.rotate(lines, 1); - yield lines; - } - case 180 -> { - Collections.rotate(lines, 2); - yield lines; - } - case 270 -> { - Collections.rotate(lines, 3); - yield lines; - } - - default -> lines; - }; - } - - - private static boolean isNotSectionOrTableCellOrDocument(SemanticNode semanticNode) { - - return !(semanticNode.getType().equals(NodeType.DOCUMENT) - || semanticNode.getType().equals(NodeType.SECTION) - || semanticNode.getType().equals(NodeType.SUPER_SECTION) - || semanticNode.getType().equals(NodeType.TABLE_CELL)); - } - - - private void addAsRectangle(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) { - - semanticNode.getBBox() - .forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages() - .get(page.getNumber() - 1).getColoredRectangles().add(new ColoredRectangle(textBBox, color, LINE_WIDTH))); - } - - - 
private Point2D add(Point2D a, Point2D b) { - - return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY()); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/BBoxMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/BBoxMergingUtility.java new file mode 100644 index 0000000..e629d1b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/BBoxMergingUtility.java @@ -0,0 +1,34 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.awt.geom.Rectangle2D; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class BBoxMergingUtility { + + public Map mergeBBoxes(List> bboxesToMerge) { + + Map bBoxPerPage = new HashMap<>(); + Set pages = bboxesToMerge.stream() + .flatMap(map -> map.keySet() + .stream()) + .collect(Collectors.toSet()); + for (Page page : pages) { + Rectangle2D bBoxOnPage = bboxesToMerge.stream() + .filter(childBboxPerPage -> childBboxPerPage.containsKey(page)) + .map(childBboxPerPage -> childBboxPerPage.get(page)) + .collect(RectangleTransformations.collectBBox()); + bBoxPerPage.put(page, bBoxOnPage); + } + return bBoxPerPage; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/ConnectionLineUtil.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/ConnectionLineUtil.java new file mode 100644 index 0000000..1ca0a0a --- 
/dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/ConnectionLineUtil.java @@ -0,0 +1,111 @@ +package com.knecon.fforesight.service.layoutparser.processor.visualization; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class ConnectionLineUtil { + + public static Line2D[] splitRectangleIntoLines(Rectangle2D rect) { + + double x = rect.getX(); + double y = rect.getY(); + double width = rect.getWidth(); + double height = rect.getHeight(); + + Line2D[] lines = new Line2D[4]; + lines[0] = new Line2D.Double(x, y, x + width, y); // Top + lines[1] = new Line2D.Double(x + width, y, x + width, y + height); // Right + lines[2] = new Line2D.Double(x + width, y + height, x, y + height); // Bottom + lines[3] = new Line2D.Double(x, y + height, x, y); // Left + + return lines; + } + + + public static Line2D transform(Line2D line2D, AffineTransform affineTransform) { + + var p1 = affineTransform.transform(line2D.getP1(), null); + var p2 = affineTransform.transform(line2D.getP2(), null); + return new Line2D.Double(p1, p2); + } + + + public static double length(Line2D line2D) { + + return line2D.getP1().distance(line2D.getP2()); + } + + + public static Line2D findClosestMidpointLine(Rectangle2D rect1, Rectangle2D rect2) { + + Line2D[] lines1 = splitRectangleIntoLines(rect1); + Line2D[] lines2 = splitRectangleIntoLines(rect2); + + Line2D closestLine1 = null; + Line2D closestLine2 = null; + double minDistance = Double.MAX_VALUE; + + for (Line2D line1 : lines1) { + for (Line2D line2 : lines2) { + double distance = lineDistance(line1, line2); + if (distance < minDistance) { + minDistance = distance; + closestLine1 = line1; + closestLine2 = line2; + } + } + } + + if (closestLine1 == null || closestLine2 == null) { + throw new 
IllegalStateException("Could not find closest lines"); + } + + Point2D midpoint1 = getMidpoint(closestLine1); + Point2D midpoint2 = getMidpoint(closestLine2); + + return new Line2D.Double(midpoint1, midpoint2); + } + + + private static double lineDistance(Line2D line1, Line2D line2) { + + return Math.abs(getMidpoint(line1).distance(getMidpoint(line2))); + } + + + private static Point2D getMidpoint(Line2D line) { + + double x = (line.getX1() + line.getX2()) / 2; + double y = (line.getY1() + line.getY2()) / 2; + return new Point2D.Double(x, y); + } + + + public static Line2D[] createArrowHead(Line2D line, double arrowLength) { + + Point2D start = line.getP1(); + Point2D end = line.getP2(); + + // Calculate the angle of the line + double angle = Math.atan2(end.getY() - start.getY(), end.getX() - start.getX()); + + // Calculate the points for the two arrow lines + double arrowHeadAngle = Math.PI / 6; + double x1 = end.getX() - arrowLength * Math.cos(angle - arrowHeadAngle); + double y1 = end.getY() - arrowLength * Math.sin(angle - arrowHeadAngle); + double x2 = end.getX() - arrowLength * Math.cos(angle + arrowHeadAngle); + double y2 = end.getY() - arrowLength * Math.sin(angle + arrowHeadAngle); + + // Create and return the two arrow lines + Line2D arrow1 = new Line2D.Double(end, new Point2D.Double(x1, y1)); + Line2D arrow2 = new Line2D.Double(end, new Point2D.Double(x2, y2)); + + return new Line2D[]{arrow1, arrow2}; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java similarity index 71% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java rename to 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index e89ef31..74b724a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -7,7 +7,6 @@ import java.awt.geom.Rectangle2D; import java.util.Collection; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Stream; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; @@ -21,12 +20,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; import com.knecon.fforesight.service.viewerdoc.model.PlacedText; -import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; -import com.knecon.fforesight.service.viewerdoc.model.Visualizations; import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; import lombok.AccessLevel; @@ -36,72 +33,15 @@ import lombok.NoArgsConstructor; import lombok.Setter; import lombok.experimental.FieldDefaults; +@Setter @Getter @NoArgsConstructor @AllArgsConstructor @FieldDefaults(level = AccessLevel.PRIVATE) -public class 
LayoutparsingVisualizations { +public class LayoutDebugLayer extends LayoutDebugLayerConfig { - static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); - - static final Color WORDS_COLOR = new Color(68, 84, 147); - static final Color LINES_COLOR = new Color(152, 45, 179); - static final Color ZONES_COLOR = new Color(131, 38, 38); - - static final Color RULINGS_COLOR = new Color(21, 221, 174); - static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175); - static final Color HEADER_RULING_COLOR = new Color(171, 131, 6); - static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2); - static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171); - static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6); - - static final Color CELLS_COLOR = new Color(31, 214, 27); - - static final Color MAIN_BODY_COLOR = new Color(171, 131, 6); - static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6); - - static final List ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51), - new Color(255, 195, 0), - new Color(76, 175, 80), - new Color(33, 150, 243), - new Color(155, 89, 182), - new Color(233, 30, 99), - new Color(0, 188, 212), - new Color(121, 85, 72)); - - @Setter boolean active; - final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build(); - final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build(); - final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build(); - final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build(); - final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build(); - final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build(); - final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build(); - final Visualizations markedContent = 
Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build(); - final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build(); - final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build(); - - - public Stream streamAll() { - - if (!active) { - return Stream.empty(); - } - return Stream.of(characters, // - neighbours,// - words, // - lines, // - zones, // - rulings, // - clean_rulings, // - cells, // - mainBody, // - markedContent // - ); - } - public void addTextVisualizations(List textPositionSequences, int pageNumber) { @@ -130,6 +70,7 @@ public class LayoutparsingVisualizations { .toList()); } + public void addRulingVisualization(List rulings, int pageNumber) { if (!active) { @@ -137,8 +78,7 @@ public class LayoutparsingVisualizations { } VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings); visualizationsOnPage.getColoredLines() - .addAll(rulings - .stream() + .addAll(rulings.stream() .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f)) .toList()); } @@ -295,16 +235,4 @@ public class LayoutparsingVisualizations { } - - private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) { - - if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) { - return visualizations.getVisualizationsOnPages() - .get(page - 1); - } - VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build(); - visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage); - return visualizationsOnPage; - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java new file mode 100644 index 
0000000..439f33d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java @@ -0,0 +1,430 @@ +package com.knecon.fforesight.service.layoutparser.processor.visualization; + +import java.awt.Color; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.awt.geom.RectangularShape; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; +import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle; +import 
com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; + +@FieldDefaults(level = AccessLevel.PRIVATE) +public class LayoutGrid extends LayoutGridLayerConfig { + + @Getter + @Setter + boolean visibleByDefault; + + final HashMap rectangleMap = new HashMap<>(); + + + public void addParagraph(Paragraph paragraph) { + + if (paragraph instanceof DuplicatedParagraph) { + addAsRectangle(paragraph, paragraphs, DUPLICATE_PARAGRAPH_COLOR); + } else { + addAsRectangle(paragraph, paragraphs, PARAGRAPH_COLOR); + } + } + + + public void addImage(Image image) { + + if (image.isFullPageImage()) { + addAsRectangle(image, images, IMAGE_COLOR); + } else { + addAsRectangle(image, figures, IMAGE_COLOR); + } + } + + + public void addHeadline(Headline headline) { + + addAsRectangle(headline, headlines, HEADLINE_COLOR); + } + + + public void addHeaderOrFooter(SemanticNode header) { + + addAsRectangle(header, headerFooter, HEADER_COLOR); + } + + + public void addTreeId(SemanticNode semanticNode) { + + Page page = semanticNode.getFirstPage(); + addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR); + } + + + public void addTable(Table table) { + + addAsRectangle(table, tables, TABLE_COLOR); + addInnerTableLines(table); + addHeaderCells(table); + } + + + private void addHeaderCells(Table table) { + + table.streamHeaders() + .map(TableCell::getBBox) + .forEach(map -> map.forEach((page, textBBox) -> getOrCreateVisualizationsOnPage(page.getNumber(), tables).getFilledRectangles() + .add(new FilledRectangle(textBBox, HEADER_CELL_COLOR, 0.1f)))); + } + + + public void addSection(SemanticNode section) { + + Map bBoxMap = section.getBBox(); + + List subSections = 
section.streamAllSubNodesOfType(NodeType.SECTION) + .toList(); + Integer maxChildDepth = subSections.stream() + .map(node -> node.getTreeId().size()) + .max(Integer::compareTo).orElse(section.getTreeId().size()); + int ownDepth = section.getTreeId().size(); + + Page firstPage = section.getFirstPage(); + String treeIdString = buildTreeIdString(section); + + if (bBoxMap.values().size() == 1) { + handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth); + return; + } + List pagesInOrder = bBoxMap.keySet() + .stream() + .sorted(Comparator.comparingInt(Page::getNumber)) + .collect(Collectors.toList()); + pagesInOrder.remove(0); + handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth); + if (section instanceof SuperSection) { + return; + } + for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) { + handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth); + } + var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1); + handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth); + } + + + private String buildTreeIdString(SemanticNode semanticNode) { + + return semanticNode.getTreeId() + .stream() + .map(Object::toString) + .collect(Collectors.joining(".")); + } + + + @SneakyThrows + private void addPlacedText(Page page, Rectangle2D textBBox, Rectangle2D highestParentRect, String s, Integer maxChildDepth, Visualizations visualizations, Color color) { + + // translates text, such that its right edge is a bit to the left of the drawn box + float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + LINE_WIDTH + 2 * maxChildDepth); + + Point2D upperLeftCorner; + Point2D translationVector; + switch (page.getRotation()) { + case 90 -> { + if (highestParentRect != null) { + upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), 
textBBox.getMinY()); + } else { + upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY()); + } + translationVector = new Point2D.Double(FONT_SIZE, -translationAmount); + } + case 180 -> { + if (highestParentRect != null) { + upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMinY()); + } else { + upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY()); + } + translationVector = new Point2D.Double(translationAmount, FONT_SIZE); + } + case 270 -> { + + if (highestParentRect != null) { + upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMaxY()); + } else { + upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY()); + } + translationVector = new Point2D.Double(-FONT_SIZE, translationAmount); + } + default -> { + + if (highestParentRect != null) { + upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMaxY()); + } else { + upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY()); + } + translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE); + } + } + + upperLeftCorner = add(upperLeftCorner, translationVector); + + List placedTexts = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getPlacedTexts(); + + PlacedText newText = PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, color, FONT); + + Optional conflictingText = placedTexts.stream() + .filter(pt -> Math.abs(pt.lineStart().getY() - newText.lineStart().getY()) <= FONT_SIZE) + .findFirst(); + + if (conflictingText.isPresent()) { + PlacedText existingText = conflictingText.get(); + if (newText.text().length() > existingText.text().length()) { + placedTexts.remove(existingText); + placedTexts.add(newText); + } + } else { + placedTexts.add(newText); + } + + } + + + private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) { + + 
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, maxChildDepth, ownDepth); + // add string to top line + var firstLine = result.pageLines().remove(0); + result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH)); + for (Line2D line : result.pageLines()) { + result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH)); + } + } + + + private void handleFirstPageOfSection(SemanticNode semanticNode, Page firstPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) { + + RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, maxChildDepth, ownDepth); + // remove bottom line + result.pageLines().remove(2); + // add string to top line + var firstLine = result.pageLines().remove(0); + result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH)); + for (Line2D line : result.pageLines()) { + result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH)); + } + } + + + private void handleForMiddlePageOfSection(SemanticNode semanticNode, Page middlePage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) { + + RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, maxChildDepth, ownDepth); + // remove top line + result.pageLines().remove(0); + // remove bottom line + result.pageLines().remove(1); + // add string to left line + var leftLine = result.pageLines().remove(1); + result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH)); + for (Line2D line : result.pageLines()) { + result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH)); + } + } + + + private void handleLastPageOfSection(SemanticNode semanticNode, Page lastPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) { + + 
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, maxChildDepth, ownDepth); + // remove top line + result.pageLines().remove(0); + // add string to left line + var leftLine = result.pageLines().remove(2); + result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH)); + for (Line2D line : result.pageLines()) { + result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH)); + } + } + + + private RectangleAndLinesResult createLinesAndPlaceText(SemanticNode semanticNode, + Page page, + Rectangle2D rectangle2D, + String treeIdString, + Integer maxChildDepth, + Integer ownDepth) { + + List coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), sections).getColoredLines(); + int lineWidthModifier = maxChildDepth - ownDepth; + Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier)); + + SemanticNode highestParent = semanticNode.getHighestParent(); + Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber())); + addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, sections, SECTION_COLOR); + var lastPageLines = createLinesFromRectangle(r, page.getRotation()); + + if (semanticNode instanceof SuperSection) { + rectangleMap.put(new RectangleIdentifier(semanticNode.getTreeId(), page.getNumber()), r); + } + + return new RectangleAndLinesResult(coloredLines, r, lastPageLines); + } + + + private void addInnerTableLines(Table table) { + + if (table.getNumberOfCols() < 1 || table.getNumberOfRows() < 1) { + return; + } + for (Page page : table.getPages()) { + + Optional optionalFirstRowOnPage = table.streamCol(0) + .filter(tableCell -> tableCell.isOnPage(page.getNumber())) + .map(TableCell::getRow) + .findFirst(); + if (optionalFirstRowOnPage.isEmpty()) { + continue; + } + int firstRowOnPage = 
optionalFirstRowOnPage.get(); + + Stream xStream = switch (page.getRotation()) { + case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX); + case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX); + case 270 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxX); + default -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinX); + }; + List xs = xStream.collect(Collectors.toList()); + xs.remove(0); + + Stream yStream = switch (page.getRotation()) { + case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY); + case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY); + case 270 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxY); + default -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxY); + }; + List ys = yStream.collect(Collectors.toList()); + ys.remove(0); + + Rectangle2D tableBBox = table.getBBox().get(page); + List coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines(); + + xs.forEach(x -> { + Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY())); + coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH)); + }); + ys.forEach(y -> { + Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y)); + coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH)); + }); + } + } + + + private static Stream streamBBoxOfCellsOnPage(Stream table, Page page) { + + return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())) + .map(TableCell::getBBox) + .map(bBoxMap -> bBoxMap.get(page)); + } + + + private void addAsRectangle(SemanticNode semanticNode, Visualizations 
visualizations, Color color) { + + addAsRectangle(semanticNode.getBBox(), visualizations, color); + } + + + private void addAsRectangle(Map bbox, Visualizations visualizations, Color color) { + + bbox.forEach((page, textBBox) -> getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredRectangles() + .add(new ColoredRectangle(textBBox, color, LINE_WIDTH))); + } + + + private record RectangleAndLinesResult(List coloredLines, Rectangle2D rectangle, List pageLines) { + + } + + private record RectangleIdentifier(List treeId, Integer pageNumber) { + + } + + + /* + A __________________ B + | | + | | + | | + | | + D|__________________| C + The returned List are the lines [AB, BC, DC, AD] + The List is reordered, such that the order of the returned lines are always as viewed on the page. + */ + private List createLinesFromRectangle(Rectangle2D r, int pageRotation) { + // +0.5 to join the lines + List lines = new ArrayList<>(4); + float lineWidthCorrection = LINE_WIDTH * 0.5f; + Point2D.Float a = new Point2D.Float((float) r.getMinX(), (float) r.getMaxY()); + Point2D.Float a1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMaxY()); + Point2D.Float b = new Point2D.Float((float) r.getMaxX(), (float) r.getMaxY()); + Point2D.Float b1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMaxY()); + Point2D.Float c = new Point2D.Float((float) r.getMaxX(), (float) r.getMinY()); + Point2D.Float c1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMinY()); + Point2D.Float d = new Point2D.Float((float) r.getMinX(), (float) r.getMinY()); + Point2D.Float d1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMinY()); + lines.add(new Line2D.Float(a1, b1)); + lines.add(new Line2D.Float(b, c)); + lines.add(new Line2D.Float(d1, c1)); + lines.add(new Line2D.Float(a, d)); + + return switch (pageRotation) { + case 90 -> { + Collections.rotate(lines, 1); + yield lines; + } + 
case 180 -> { + Collections.rotate(lines, 2); + yield lines; + } + case 270 -> { + Collections.rotate(lines, 3); + yield lines; + } + + default -> lines; + }; + } + + + private Point2D add(Point2D a, Point2D b) { + + return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY()); + } + +} diff --git a/layoutparser-service/layoutparser-service-server/build.gradle.kts b/layoutparser-service/layoutparser-service-server/build.gradle.kts index 2c81aa6..e25d176 100644 --- a/layoutparser-service/layoutparser-service-server/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-server/build.gradle.kts @@ -38,7 +38,7 @@ dependencies { implementation("com.amazonaws:aws-java-sdk-s3:1.12.536") implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4") implementation("net.logstash.logback:logstash-logback-encoder:7.4") - implementation("com.pdftron:PDFNet:10.5.0") + implementation("com.pdftron:PDFNet:10.7.0") // for integration testing only testImplementation(project(":viewer-doc-processor")) diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java index 604b094..0fbf9d7 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/PDFNetInitializer.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.layoutparser.server; +import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; import com.google.common.base.Strings; @@ -17,7 +18,8 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class PDFNetInitializer { - private final 
LayoutparserSettings settings; + @Value("${pdftron.license:}") + private String pdftronLicense; @SneakyThrows @@ -25,13 +27,13 @@ public class PDFNetInitializer { // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError. public void init() { - if (Strings.isNullOrEmpty(settings.getPdftronLicense())) { + if (Strings.isNullOrEmpty(pdftronLicense)) { return; } log.info("Initializing Native Libraries"); - log.info("Setting pdftron license: {}", settings.getPdftronLicense()); + log.info("Setting pdftron license: {}", pdftronLicense); PDFNet.setTempPath("/tmp/pdftron"); - PDFNet.initialize(settings.getPdftronLicense()); + PDFNet.initialize(pdftronLicense); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index cb8bf7e..bdb21ea 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -50,7 +50,7 @@ public class BdrJsonBuildTest extends AbstractTest { protected Document buildGraph(File file) { return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND, - layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND, + layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND, file, new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index 15d0e8d..c51c7ae 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -97,7 +97,7 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, - layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, + layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, pdfFileResource.getFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java index b96b411..0ed162b 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -12,6 +12,7 @@ import java.util.Map; import java.util.function.Predicate; import org.apache.commons.lang3.StringUtils; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; @@ -32,18 +33,29 @@ import 
com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; -import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; +import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; +import jakarta.annotation.PostConstruct; import lombok.SneakyThrows; public class OutlineDetectionTest extends AbstractTest { - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); @Autowired protected LayoutParsingPipeline layoutParsingPipeline; + @Autowired + PDFNetInitializer pdfNetInitializer; + + + @BeforeEach + public void init() { + + pdfNetInitializer.init(); + } + @Test @SneakyThrows @@ -60,28 +72,17 @@ public class OutlineDetectionTest extends AbstractTest { OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); assertEquals(outlineObjectTree.getRootNodes().size(), 8); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(1).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(3).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(4).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(5).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(6).size(), 2); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(7).size(), 3); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(8).size(), 2); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(10).size(), 1); - 
assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(11).size(), 4); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(12).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage() - .get(13).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(1).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(3).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 3); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(10).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 4); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 2); assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values() .stream() .flatMap(Collection::stream) @@ -98,29 +99,15 @@ public class OutlineDetectionTest extends AbstractTest { .stream() .map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle())) .toList()); - assertEquals(tableOfContents.getMainSections() - .get(5).getChildren().size(), 6); - assertEquals(tableOfContents.getMainSections() - .get(7).getChildren().size(), 3); - assertEquals(tableOfContents.getMainSections() - .get(8).getChildren().size(), 3); - assertEquals(tableOfContents.getMainSections() - .get(8).getChildren() - .get(2).getChildren().size(), 1); - assertEquals(tableOfContents.getMainSections() - .get(8).getChildren() - .get(2).getChildren() - .get(0).getChildren().size(), 3); + assertEquals(tableOfContents.getMainSections().get(5).getChildren().size(), 
6); + assertEquals(tableOfContents.getMainSections().get(7).getChildren().size(), 3); + assertEquals(tableOfContents.getMainSections().get(8).getChildren().size(), 3); + assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().size(), 1); + assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().size(), 3); - assertEquals(tableOfContents.getMainSections() - .get(0).getImages().size(), 1); - assertEquals(tableOfContents.getMainSections() - .get(6).getImages().size(), 1); - assertEquals(tableOfContents.getMainSections() - .get(8).getChildren() - .get(2).getChildren() - .get(0).getChildren() - .get(2).getImages().size(), 1); + assertEquals(tableOfContents.getMainSections().get(0).getImages().size(), 1); + assertEquals(tableOfContents.getMainSections().get(6).getImages().size(), 1); + assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().get(2).getImages().size(), 1); Document document = buildGraph(fileName, classificationDocument); @@ -159,17 +146,14 @@ public class OutlineDetectionTest extends AbstractTest { .count(), 3 + 1); assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren() .filter(isSectionOrSuperSection) - .toList() - .get(3).streamChildren() + .toList().get(3).streamChildren() .filter(isSectionOrSuperSection) .count(), 1 + 1); assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren() .filter(isSectionOrSuperSection) - .toList() - .get(3).streamChildren() + .toList().get(3).streamChildren() .filter(isSectionOrSuperSection) - .toList() - .get(1).streamChildren() + .toList().get(1).streamChildren() .filter(isSectionOrSuperSection) .count(), 3 + 1); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java index ab7e4ce..a818b31 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java @@ -3,11 +3,8 @@ package com.knecon.fforesight.service.layoutparser.server; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Map; -import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; import org.junit.jupiter.api.Test; @@ -16,12 +13,8 @@ import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; -import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorageService; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; @@ -30,7 +23,6 @@ 
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedS import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; -import io.micrometer.observation.Observation; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java deleted file mode 100644 index 4c704f7..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.graph; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Collections; -import java.util.List; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary; - -class BoundaryTest { - - Boundary startBoundary; - - - @BeforeEach - void setUp() { - - startBoundary = new Boundary(10, 100); - } - - - @Test - void testContains() { - - assertTrue(startBoundary.contains(11)); - assertTrue(startBoundary.contains(50)); - assertFalse(startBoundary.contains(9)); - assertFalse(startBoundary.contains(100)); - assertFalse(startBoundary.contains(150)); - assertFalse(startBoundary.contains(-123)); - assertTrue(startBoundary.contains(new Boundary(11, 99))); - assertTrue(startBoundary.contains(new Boundary(10, 100))); - 
assertTrue(startBoundary.contains(new Boundary(11, 11))); - assertFalse(startBoundary.contains(9, 100)); - assertTrue(startBoundary.contains(100, 100)); - assertFalse(startBoundary.contains(100, 101)); - assertFalse(startBoundary.contains(150, 151)); - } - - - @Test - void testIntersects() { - - assertTrue(startBoundary.intersects(new Boundary(1, 11))); - assertTrue(startBoundary.intersects(new Boundary(11, 12))); - assertTrue(startBoundary.intersects(new Boundary(11, 100))); - assertFalse(startBoundary.intersects(new Boundary(100, 101))); - assertTrue(startBoundary.intersects(new Boundary(99, 101))); - } - - - @Test - void testSplit() { - - assertEquals(4, startBoundary.split(List.of(12, 40, 90)).size()); - assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(List.of(12, 40, 90))); - assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40))); - assertEquals(1, startBoundary.split(Collections.emptyList()).size()); - assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size()); - assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0))); - assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100))); - assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100))); - } - -} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 8d8597a..bac8d86 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -57,7 +57,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest { private void writeJsons(Path filename) { Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java new file mode 100644 index 0000000..385feb9 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/TextRangeTest.java @@ -0,0 +1,71 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange; + +class TextRangeTest { + + TextRange startTextRange; + + + @BeforeEach + void setUp() { + + startTextRange = new TextRange(10, 100); + } + + + @Test + void testContains() { + + assertTrue(startTextRange.contains(11)); + assertTrue(startTextRange.contains(50)); + 
assertFalse(startTextRange.contains(9)); + assertFalse(startTextRange.contains(100)); + assertFalse(startTextRange.contains(150)); + assertFalse(startTextRange.contains(-123)); + assertTrue(startTextRange.contains(new TextRange(11, 99))); + assertTrue(startTextRange.contains(new TextRange(10, 100))); + assertTrue(startTextRange.contains(new TextRange(11, 11))); + assertFalse(startTextRange.contains(9, 100)); + assertTrue(startTextRange.contains(100, 100)); + assertFalse(startTextRange.contains(100, 101)); + assertFalse(startTextRange.contains(150, 151)); + } + + + @Test + void testIntersects() { + + assertTrue(startTextRange.intersects(new TextRange(1, 11))); + assertTrue(startTextRange.intersects(new TextRange(11, 12))); + assertTrue(startTextRange.intersects(new TextRange(11, 100))); + assertFalse(startTextRange.intersects(new TextRange(100, 101))); + assertTrue(startTextRange.intersects(new TextRange(99, 101))); + } + + + @Test + void testSplit() { + + assertEquals(4, startTextRange.split(List.of(12, 40, 90)).size()); + assertEquals(List.of(new TextRange(10, 12), new TextRange(12, 40), new TextRange(40, 90), new TextRange(90, 100)), startTextRange.split(List.of(12, 40, 90))); + assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40))); + assertEquals(1, startTextRange.split(Collections.emptyList()).size()); + assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size()); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0))); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100))); + assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100))); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 3351eb0..bad41e3 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -4,11 +4,18 @@ import java.io.File; import java.nio.file.Path; import java.util.Map; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.mockito.MockitoAnnotations; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.mock.mockito.MockBean; import org.springframework.core.io.ClassPathResource; import com.iqser.red.commons.jackson.ObjectMapperFactory; +import com.iqser.red.storage.commons.service.StorageService; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; @@ -16,17 +23,30 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; +import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; -import 
com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; +import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; +import com.knecon.fforesight.tenantcommons.TenantsClient; +import com.pdftron.pdf.PDFNet; +import jakarta.annotation.PostConstruct; import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentTest { - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + @Autowired + PDFNetInitializer pdfNetInitializer; + PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); + @BeforeEach + public void init() { + + pdfNetInitializer.init(); + } + + @Test @SneakyThrows public void testViewerDocument() { @@ -63,7 +83,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { tableResponse, new VisualLayoutParsingResponse(), Map.of("file", Path.of(fileName).getFileName().toFile().toString())); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/main/resources/application-dev.yml b/layoutparser-service/layoutparser-service-server/src/test/resources/application-dev.yml similarity index 99% rename from layoutparser-service/layoutparser-service-server/src/main/resources/application-dev.yml rename to layoutparser-service/layoutparser-service-server/src/test/resources/application-dev.yml index 626c3ba..7f8dd04 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/resources/application-dev.yml +++ 
b/layoutparser-service/layoutparser-service-server/src/test/resources/application-dev.yml @@ -9,3 +9,5 @@ storage: key: minioadmin secret: minioadmin + + diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml index 8cee17b..10dcef7 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml @@ -28,6 +28,11 @@ spring: max-interval: 15000 prefetch: 1 +layoutparser: + debug: true + +pdftron.license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a + management: endpoint: metrics.enabled: ${monitoring.enabled:false} diff --git a/layoutparser-service/viewer-doc-processor/build.gradle b/layoutparser-service/viewer-doc-processor/build.gradle index cdfe7e4..c97d1a2 100644 --- a/layoutparser-service/viewer-doc-processor/build.gradle +++ b/layoutparser-service/viewer-doc-processor/build.gradle @@ -12,7 +12,7 @@ dependencies { implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") implementation("org.slf4j:slf4j-api:1.7.25") implementation("com.knecon.fforesight:tracing-commons:0.5.0") - implementation("com.pdftron:PDFNet:10.5.0") + implementation("com.pdftron:PDFNet:10.7.0") testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1") testImplementation("org.junit.jupiter:junit-jupiter") diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java deleted file mode 100644 index 560da8a..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/ContentStreams.java +++ /dev/null @@ -1,72 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc; 
- -import java.util.List; - -import org.apache.pdfbox.cos.COSName; - -import lombok.AccessLevel; -import lombok.experimental.FieldDefaults; - -@FieldDefaults(makeFinal = true, level = AccessLevel.PUBLIC) -public class ContentStreams { - - public static Identifier KNECON_LAYOUT = new Identifier("Layout grid", COSName.getPDFName("KNECON_LAYOUT"), true); - - public static Identifier KNECON_VISUAL_PARSING = new Identifier("Layout grid - visual", COSName.getPDFName("KNECON_VISUAL_PARSING"), true); - - public static Identifier KNECON_OCR = new Identifier("OCR", COSName.getPDFName("KNECON_OCR"), false); - - public static Identifier KNECON_OCR_TEXT_DEBUG = new Identifier("OCR Text", COSName.getPDFName("KNECON_OCR_TEXT_DEBUG"), true); - - public static Identifier KNECON_OCR_BBOX_DEBUG = new Identifier("OCR Boxes", COSName.getPDFName("KNECON_OCR_BBOX_DEBUG"), true); - - public static Identifier OTHER = new Identifier("other", COSName.getPDFName("OTHER"), false); - - public static Identifier ESCAPE_START = new Identifier("escape start", COSName.getPDFName("ESCAPE_START"), false); - - public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false); - - public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true); - - public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true); - - public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true); - - public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true); - - public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true); - - public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true); - - public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true); - - public 
static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true); - - public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true); - - public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true); - - public static List allContentStreams = List.of(KNECON_LAYOUT, - KNECON_VISUAL_PARSING, - KNECON_OCR, - KNECON_OCR_BBOX_DEBUG, - KNECON_OCR_TEXT_DEBUG, - OTHER, - ESCAPE_START, - ESCAPE_END, - RULINGS, - CLEAN_RULINGS, - WORDS, - ZONES, - LINES, - MAIN_BODY, - MARKED_CONTENT, - NEIGHBOURS, - CHARACTERS, - CELLS); - - public record Identifier(String name, COSName cosName, boolean optionalContent) { - - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java new file mode 100644 index 0000000..0926878 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java @@ -0,0 +1,77 @@ +package com.knecon.fforesight.service.viewerdoc; + +import org.apache.pdfbox.cos.COSName; + +/* +These identifiers are used to mark content in the pdf, such that it may be found later. The markedContentName must therefore be unique. +The String "name" is only used to display optional content in the optional content view in the pdf. +Therefore, it may be null, if optionalContent is false. +If optionalContent is false, the layer will not be created as an OCG, and will not be listed in the OCG view.
+ */ +public record LayerIdentifier(String name, String markedContentName) { + + public String markedContentName() { + // The prefix KNECON_ is used to identify marked contents as knecon contents later on + return KNECON_IDENTIFIER_PREFIX + markedContentName; + } + + + public COSName cosName() { + + return COSName.getPDFName(markedContentName); + } + + + public static final String KNECON_IDENTIFIER_PREFIX = "KNECON_"; + + public static final LayerIdentifier KNECON_OCR = new LayerIdentifier(null, "OCR"); + public static final LayerIdentifier KNECON_OCR_TEXT = new LayerIdentifier(null, "OCR_TEXT"); + public static final LayerIdentifier KNECON_OCR_LINES = new LayerIdentifier(null, "OCR_LINES"); + + // layers + // layout grid + public static final LayerIdentifier KNECON_LAYOUT = new LayerIdentifier("Layout grid", "LAYOUT"); + public static final LayerIdentifier KNECON_LAYOUT_SECTION = new LayerIdentifier("Section", "LAYOUT_SECTION"); + public static final LayerIdentifier KNECON_LAYOUT_PARAGRAPH = new LayerIdentifier("Paragraph ", "LAYOUT_PARAGRAPH"); + public static final LayerIdentifier KNECON_LAYOUT_KEY_VALUE = new LayerIdentifier("Key-Value Pairs ", "LAYOUT_KEY_VALUE"); + public static final LayerIdentifier KNECON_LAYOUT_HEADLINE = new LayerIdentifier("Headline", "LAYOUT_HEADLINE"); + public static final LayerIdentifier KNECON_LAYOUT_HEADER_FOOTER = new LayerIdentifier("Header/Footer", "LAYOUT_HEADER_FOOTER"); + public static final LayerIdentifier KNECON_LAYOUT_TABLE = new LayerIdentifier("Tables", "LAYOUT_TABLE"); + public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES"); + public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES"); + public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs"); + + //layout grid debug + public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", 
"DEBUG_LAYOUT"); + public static final LayerIdentifier CLEAN_RULINGS = new LayerIdentifier("Classified Rulings", "CLEAN_RULINGS"); + public static final LayerIdentifier RULINGS = new LayerIdentifier("Rulings", "RULINGS"); + public static final LayerIdentifier WORDS = new LayerIdentifier("Words", "WORDS"); + public static final LayerIdentifier ZONES = new LayerIdentifier("Text Zones", "ZONES"); + public static final LayerIdentifier LINES = new LayerIdentifier("Text Lines", "LINES"); + public static final LayerIdentifier CELLS = new LayerIdentifier("Cells", "CELLS"); + public static final LayerIdentifier MAIN_BODY = new LayerIdentifier("Main Text Body", "MAIN_BODY"); + public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT"); + public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS"); + public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS"); + + public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING"); + + //ocr + public static final LayerIdentifier KNECON_OCR_DEBUG = new LayerIdentifier("OCR", "OCR_DEBUG"); + public static final LayerIdentifier KNECON_OCR_TEXT_DEBUG = new LayerIdentifier("OCR Text", "OCR_TEXT_DEBUG"); + public static final LayerIdentifier KNECON_OCR_BBOX_DEBUG = new LayerIdentifier("OCR Words", "OCR_BBOX_DEBUG"); + public static final LayerIdentifier KNECON_OCR_LINE_DEBUG = new LayerIdentifier("OCR Lines", "OCR_LINE_DEBUG"); + public static final LayerIdentifier KNECON_OCR_OVERLAPPED_TEXT = new LayerIdentifier("OCR overlapped Text", "OCR_OVERLAPPED_TEXT_DEBUG"); + + //azure idp + public static final LayerIdentifier KNECON_AZURE_IDP = new LayerIdentifier("IDP", "IDP"); + public static final LayerIdentifier IDP_FIGURES = new LayerIdentifier("IDP Figures", "IDP_FIGURES"); + public static final LayerIdentifier IDP_TABLES = new LayerIdentifier("IDP Tables", 
"IDP_TABLES"); + public static final LayerIdentifier IDP_KV_PAIRS = new LayerIdentifier("IDP Key Value Pair", "IDP_KV_PAIRS"); + public static final LayerIdentifier IDP_SECTIONS = new LayerIdentifier("IDP Sections", "IDP_SECTIONS"); + public static final LayerIdentifier IDP_LINES = new LayerIdentifier("IDP Lines", "IDP_LINES"); + public static final LayerIdentifier IDP_PARAGRAPHS = new LayerIdentifier("IDP Paragraphs", "IDP_PARAGRAPHS"); + public static final LayerIdentifier IDP_LIST = new LayerIdentifier("IDP Lists", "IDP_LISTS"); + public static final LayerIdentifier IDP_BARCODES = new LayerIdentifier("IDP Barcodes", "IDP_BARCODES"); + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/AbstractLayerGroup.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/AbstractLayerGroup.java new file mode 100644 index 0000000..b2219e8 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/AbstractLayerGroup.java @@ -0,0 +1,19 @@ +package com.knecon.fforesight.service.viewerdoc.layers; + +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; +import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; + +public abstract class AbstractLayerGroup implements LayerGroup { + + protected VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) { + + if (visualizations.getVisualizationsOnPages().containsKey(page)) { + return visualizations.getVisualizationsOnPages() + .get(page); + } + VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build(); + visualizations.getVisualizationsOnPages().put(page, visualizationsOnPage); + return visualizationsOnPage; + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/IdpLayerConfig.java 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/IdpLayerConfig.java new file mode 100644 index 0000000..6441d99 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/IdpLayerConfig.java @@ -0,0 +1,49 @@ +package com.knecon.fforesight.service.viewerdoc.layers; + +import java.awt.Color; +import java.util.List; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +import lombok.Getter; + +public class IdpLayerConfig extends AbstractLayerGroup { + + @Getter + public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_AZURE_IDP; + + public static final LayerGroup CONFIG_INSTANCE = new IdpLayerConfig(); + + protected final Visualizations figures = Visualizations.builder().layer(LayerIdentifier.IDP_FIGURES).visibleByDefault(true).build(); + protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.IDP_TABLES).visibleByDefault(true).build(); + protected final Visualizations keyValuePairs = Visualizations.builder().layer(LayerIdentifier.IDP_KV_PAIRS).visibleByDefault(true).build(); + protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.IDP_PARAGRAPHS).build(); + protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.IDP_SECTIONS).build(); + protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.IDP_LINES).build(); + protected final Visualizations lists = Visualizations.builder().layer(LayerIdentifier.IDP_LIST).visibleByDefault(true).build(); + protected final Visualizations barcodes = Visualizations.builder().layer(LayerIdentifier.IDP_BARCODES).visibleByDefault(true).build(); + + protected static final Color TABLE_COLOR = new Color(102, 205, 170); + protected static final Color INNER_LINES_COLOR = new Color(255, 175, 175); + 
protected static final Color SECTION_COLOR = new Color(50, 50, 50); + protected static final Color SECTION_HEADING_COLOR = new Color(162, 56, 56); + protected static final Color TITLE_COLOR = new Color(221, 25, 25); + protected static final Color HEADER_FOOTER_COLOR = new Color(171, 131, 6); + protected static final Color FOOTNOTE_COLOR = new Color(6, 64, 171); + protected static final Color FORMULA_COLOR = new Color(80, 171, 6); + protected static final Color PARAGRAPH_COLOR = new Color(70, 130, 180); + protected static final Color IMAGE_COLOR = new Color(253, 63, 146); + protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85); + protected static final Color KEY_COLOR = new Color(30, 92, 172); + protected static final Color VALUE_COLOR = new Color(30, 172, 146); + protected static final Color LINES_COLOR = new Color(152, 45, 179); + + + @Override + public List getVisualizations() { + + return List.of(paragraphs, sections, figures, tables, keyValuePairs, lines, lists, barcodes); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayerGroup.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayerGroup.java new file mode 100644 index 0000000..310cea8 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayerGroup.java @@ -0,0 +1,62 @@ +package com.knecon.fforesight.service.viewerdoc.layers; + +import java.util.List; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +public interface LayerGroup { + + LayerIdentifier getGroupIdentifier(); + + + List getVisualizations(); + + + default List getSubLayers() { + + return getVisualizations().stream() + .map(Visualizations::getLayer) + .toList(); + + } + + + default boolean isVisibleByDefault() { + + return false; + } + + 
+ + /* + indicates the sub layers are all optional content + */ + default boolean subLayersAreOptionalContent() { + + return true; + } + + + /* + indicates the LayerGroup is also an optional content group, and should be displayed as such: + layer + - sublayer0 + - sublayer1 + + see note in specification 8.11.4.3 + */ + default boolean isOptionalContent() { + + return true; + } + + + default boolean isEmpty() { + + return getVisualizations().isEmpty(); + } + + + + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java new file mode 100644 index 0000000..ffbaf61 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java @@ -0,0 +1,73 @@ +package com.knecon.fforesight.service.viewerdoc.layers; + +import java.awt.Color; +import java.util.List; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +import lombok.Getter; + +public class LayoutDebugLayerConfig extends AbstractLayerGroup { + + @Getter + public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_LAYOUT_DEBUG; + + public static final LayerGroup CONFIG_INSTANCE = new LayoutDebugLayerConfig(); + + protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); + + protected static final Color WORDS_COLOR = new Color(68, 84, 147); + protected static final Color LINES_COLOR = new Color(152, 45, 179); + protected static final Color ZONES_COLOR = new Color(131, 38, 38); + + protected static final Color RULINGS_COLOR = new Color(21, 221, 174); + protected static final Color TABLE_RULINGS_COLOR = new
Color(255, 175, 175); + protected static final Color HEADER_RULING_COLOR = new Color(171, 131, 6); + protected static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2); + protected static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171); + protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6); + + protected static final Color CELLS_COLOR = new Color(31, 214, 27); + + protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6); + protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6); + + protected static final List ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51), + new Color(255, 195, 0), + new Color(76, 175, 80), + new Color(33, 150, 243), + new Color(155, 89, 182), + new Color(233, 30, 99), + new Color(0, 188, 212), + new Color(121, 85, 72)); + + protected final Visualizations words = Visualizations.builder().layer(LayerIdentifier.WORDS).visibleByDefault(true).build(); + protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.LINES).build(); + protected final Visualizations zones = Visualizations.builder().layer(LayerIdentifier.ZONES).build(); + protected final Visualizations mainBody = Visualizations.builder().layer(LayerIdentifier.MAIN_BODY).build(); + protected final Visualizations clean_rulings = Visualizations.builder().layer(LayerIdentifier.CLEAN_RULINGS).build(); + protected final Visualizations rulings = Visualizations.builder().layer(LayerIdentifier.RULINGS).build(); + protected final Visualizations cells = Visualizations.builder().layer(LayerIdentifier.CELLS).build(); + protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build(); + protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build(); + protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build(); + + + public List 
getVisualizations() { + + return List.of(characters, // + neighbours,// + words, // + lines, // + zones, // + rulings, // + clean_rulings, // + cells, // + mainBody, // + markedContent // + ); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java new file mode 100644 index 0000000..7b95a3d --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java @@ -0,0 +1,55 @@ +package com.knecon.fforesight.service.viewerdoc.layers; + +import java.awt.Color; +import java.util.List; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +import lombok.Getter; + +public class LayoutGridLayerConfig extends AbstractLayerGroup { + + @Getter + public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_LAYOUT; + + public static final LayerGroup CONFIG_INSTANCE = new LayoutGridLayerConfig(); + + protected static final float FONT_SIZE = 10f; + protected static final float LINE_WIDTH = 1f; + protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); + + protected static final Color INNER_LINES_COLOR = new Color(255, 175, 175); + protected static final Color HEADER_CELL_COLOR = new Color(156, 21, 48); + protected static final Color PARAGRAPH_COLOR = new Color(70, 130, 180); + + protected static final Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101); + protected static final Color TABLE_COLOR = new Color(102, 205, 170); + protected static final Color SECTION_COLOR = new Color(50, 50, 50); + protected static final Color HEADLINE_COLOR = new Color(162, 56, 56); + 
protected static final Color HEADER_COLOR = new Color(171, 131, 6); + protected static final Color IMAGE_COLOR = new Color(253, 63, 146); + protected static final Color TREEID_COLOR = new Color(53, 53, 53); + protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85); + protected static final Color KEY_COLOR = new Color(30, 92, 172); + protected static final Color VALUE_COLOR = new Color(30, 172, 146); + + protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_SECTION).visibleByDefault(true).build(); + protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_PARAGRAPH).visibleByDefault(true).build(); + protected final Visualizations headlines = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_HEADLINE).visibleByDefault(true).build(); + protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TABLE).visibleByDefault(true).build(); + protected final Visualizations figures = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_FIGURES).visibleByDefault(true).build(); + protected final Visualizations headerFooter = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_HEADER_FOOTER).visibleByDefault(true).build(); + protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build(); + protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build(); + protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build(); + + + @Override + public List getVisualizations() { + + return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java new file mode 100644 index 0000000..44cf52d --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrDebugLayerConfig.java @@ -0,0 +1,38 @@ +package com.knecon.fforesight.service.viewerdoc.layers; + +import java.awt.Color; +import java.util.List; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +import lombok.Getter; + +public class OcrDebugLayerConfig extends AbstractLayerGroup { + + @Getter + public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_OCR_DEBUG; + + public static final LayerGroup CONFIG_INSTANCE = new OcrDebugLayerConfig(); + + protected static final Color REGULAR_COLOR = new Color(6, 39, 171); + protected static final Color BOLD_COLOR = new Color(50, 246, 246); + protected static final Color ITALIC_COLOR = new Color(171, 105, 6); + protected static final Color BOLD_ITALIC_COLOR = new Color(6, 171, 102); + protected static final Color HANDWRITTEN_COLOR = new Color(171, 64, 6); + protected static final Color OVERLAPPED_COLOR = new Color(142, 8, 8); + protected static final Color TABLE_LINES_COLOR = new Color(21, 221, 174); + + protected final Visualizations debugText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_TEXT_DEBUG).visibleByDefault(true).build(); + protected final Visualizations tableLines = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_LINE_DEBUG).visibleByDefault(true).build(); + protected final Visualizations overlappedText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_OVERLAPPED_TEXT).visibleByDefault(false).build(); + protected final Visualizations debugBBox = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_BBOX_DEBUG).visibleByDefault(false).build(); + + + @Override + public List 
getVisualizations() { + + return List.of(debugText, tableLines, debugBBox, overlappedText); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrTextLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrTextLayerConfig.java new file mode 100644 index 0000000..781da6d --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/OcrTextLayerConfig.java @@ -0,0 +1,40 @@ +package com.knecon.fforesight.service.viewerdoc.layers; + +import java.util.List; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +public class OcrTextLayerConfig extends AbstractLayerGroup { + + protected final Visualizations ocrText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_TEXT).build(); + protected final Visualizations tableLines = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_LINES).build(); + + + @Override + public LayerIdentifier getGroupIdentifier() { + + return LayerIdentifier.KNECON_OCR; + } + + + @Override + public List getVisualizations() { + + return List.of(ocrText, tableLines); + } + + + @Override + public boolean subLayersAreOptionalContent() { + + return false; + } + + @Override + public boolean isOptionalContent() { + + return false; + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/LayoutGrid.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/LayoutGrid.java deleted file mode 100644 index d8f47ca..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/LayoutGrid.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.model; - -import 
java.util.HashMap; -import java.util.Map; - -import lombok.AccessLevel; -import lombok.Getter; -import lombok.experimental.FieldDefaults; - -@Getter -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class LayoutGrid { - - int numberOfPages; - Map visualizationsPerPages; - - - public LayoutGrid(int numberOfPages) { - - this.numberOfPages = numberOfPages; - this.visualizationsPerPages = new HashMap<>(); - for (int i = 0; i < numberOfPages; i++) { - this.visualizationsPerPages.put(i, VisualizationsOnPage.builder().build()); - } - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/MarkedContentStack.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/MarkedContentStack.java similarity index 67% rename from layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/MarkedContentStack.java rename to layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/MarkedContentStack.java index eedd1fe..21d346f 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/MarkedContentStack.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/MarkedContentStack.java @@ -1,10 +1,12 @@ -package com.knecon.fforesight.service.viewerdoc.service.pdftron; +package com.knecon.fforesight.service.viewerdoc.model; import java.util.Deque; import java.util.Iterator; import java.util.LinkedList; import java.util.Set; +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; + public class MarkedContentStack { private final Deque stack = new LinkedList<>(); @@ -44,7 +46,23 @@ public class MarkedContentStack { } - public boolean currentMarkedContentContainsAny(Set names) { + public boolean 
currentMarkedContentContainsNone(Set names) { + + if (stack.isEmpty()) { + return true; + } + Iterator markedContentIterator = stack.descendingIterator(); + while (markedContentIterator.hasNext()) { + var markedContent = markedContentIterator.next(); + if (names.contains(markedContent.name())) { + return false; + } + } + return true; + } + + + public boolean currentMarkedContentIsKneconContent() { if (stack.isEmpty()) { return false; @@ -52,11 +70,12 @@ public class MarkedContentStack { Iterator markedContentIterator = stack.descendingIterator(); while (markedContentIterator.hasNext()) { var markedContent = markedContentIterator.next(); - if (names.contains(markedContent.name())) { + if (markedContent.name().startsWith(LayerIdentifier.KNECON_IDENTIFIER_PREFIX)) { return true; } } return false; + } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/OperatorWithArguments.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/OperatorWithArguments.java deleted file mode 100644 index c9a1b9d..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/OperatorWithArguments.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.model; - -import java.util.List; - -import org.apache.pdfbox.contentstream.operator.Operator; -import org.apache.pdfbox.cos.COSBase; - -public record OperatorWithArguments(Operator operator, List arguments) { - -} \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/PlacedText.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/PlacedText.java index ff9e449..97d5065 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/PlacedText.java +++ 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/PlacedText.java @@ -1,13 +1,14 @@ package com.knecon.fforesight.service.viewerdoc.model; import java.awt.Color; +import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.util.Optional; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; import org.apache.pdfbox.util.Matrix; -public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional textMatrix, Optional renderingMode) { +public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional textMatrix, Optional renderingMode) { public static PlacedText textFacingUp(String text, Point2D lineStart, float fontSize, Color color, EmbeddableFont font) { diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java index 6af80b9..ed83aab 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Visualizations.java @@ -3,7 +3,7 @@ package com.knecon.fforesight.service.viewerdoc.model; import java.util.LinkedHashMap; import java.util.Map; -import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -17,9 +17,10 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class Visualizations { - ContentStreams.Identifier layer; + LayerIdentifier layer; @Builder.Default Map visualizationsOnPages = new LinkedHashMap<>(); - boolean 
layerVisibilityDefaultValue; + + boolean visibleByDefault; } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java index 0001805..f2d59a9 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/VisualizationsOnPage.java @@ -14,6 +14,7 @@ import lombok.experimental.FieldDefaults; public class VisualizationsOnPage { boolean makePathsInvisible; + boolean inDeviceCoordinates; @Builder.Default List placedTexts = new LinkedList<>(); @Builder.Default diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/ClassifiedContentStream.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/ClassifiedContentStream.java deleted file mode 100644 index dba3e6b..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/ClassifiedContentStream.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.pdf; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; - -public record ClassifiedContentStream(SinglePDContentStream contentStream, ContentStreams.Identifier classification) { - -} \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/SinglePDContentStream.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/SinglePDContentStream.java deleted file mode 100644 index f429639..0000000 --- 
a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/pdf/SinglePDContentStream.java +++ /dev/null @@ -1,61 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.pdf; - -import java.io.IOException; -import java.io.InputStream; - -import org.apache.pdfbox.contentstream.PDContentStream; -import org.apache.pdfbox.io.RandomAccessInputStream; -import org.apache.pdfbox.io.RandomAccessRead; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.common.PDStream; -import org.apache.pdfbox.util.Matrix; - -import lombok.AccessLevel; -import lombok.Getter; -import lombok.RequiredArgsConstructor; -import lombok.experimental.FieldDefaults; - -@Getter -@RequiredArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class SinglePDContentStream implements PDContentStream { - - PDStream pdStream; - - - @Override - public InputStream getContents() throws IOException { - - return new RandomAccessInputStream(getContentsForRandomAccess()); - } - - - @Override - public RandomAccessRead getContentsForRandomAccess() throws IOException { - - return pdStream.getCOSObject().createView(); - } - - - @Override - public PDResources getResources() { - - return null; - } - - - @Override - public PDRectangle getBBox() { - - return null; - } - - - @Override - public Matrix getMatrix() { - - return null; - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifier.java deleted file mode 100644 index e34148b..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifier.java +++ /dev/null @@ -1,121 +0,0 @@ -package 
com.knecon.fforesight.service.viewerdoc.service; - -import java.util.LinkedList; -import java.util.List; -import java.util.Optional; - -import org.apache.pdfbox.contentstream.PDContentStream; -import org.apache.pdfbox.contentstream.operator.OperatorName; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDPage; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; -import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments; -import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; -import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream; - -import lombok.SneakyThrows; -import lombok.experimental.UtilityClass; - -@UtilityClass -public class ContentStreamClassifier { - - public List getClassifiedContentStreams(PDPage page) { - - List streams = new LinkedList<>(); - page.getContentStreams().forEachRemaining(stream -> streams.add(new SinglePDContentStream(stream))); - return ContentStreamClassifier.classifySingleContentStreams(page, streams); - } - - - public List classifySingleContentStreams(PDPage page, List streams) { - - return streams.stream().map(singlePDContentStream -> classifySingleContentStream(page, singlePDContentStream)).toList(); - } - - - private ClassifiedContentStream classifySingleContentStream(PDPage page, SinglePDContentStream singlePDContentStream) { - - ContentStreams.Identifier classification = classifyContentStream(singlePDContentStream, page); - return new ClassifiedContentStream(singlePDContentStream, classification); - } - - - /** - * We assume all of our layers are written escaped, so only unknown content streams need to be escaped. 
- * - * @param classifiers List of all content streams of a page with their classification - * @return false, if any content stream with classification other is not prefixed with an ESCAPE_START and suffixed with an ESCAPE_END - */ - public boolean areAllContentStreamsEscaped(List classifiers) { - - int escapeDepth = 0; - for (ClassifiedContentStream classifier : classifiers) { - if (classifier.classification().equals(ContentStreams.OTHER) && escapeDepth == 0) { - return false; - } - if (classifier.classification().equals(ContentStreams.ESCAPE_START)) { - escapeDepth++; - } - if (classifier.classification().equals(ContentStreams.ESCAPE_END)) { - escapeDepth--; - } - } - return escapeDepth == 0; - } - - - @SneakyThrows - public ContentStreams.Identifier classifyContentStream(PDContentStream contentStream, PDPage page) { - - List operatorsWithArguments = ContentStreamUtility.parseLeadingOperators(contentStream, 2); - if (operatorsWithArguments.isEmpty()) { - return ContentStreams.OTHER; - } - OperatorWithArguments firstOperator = operatorsWithArguments.get(0); - - // If we wrap the content streams we append and prepend a content stream with exactly one operator "q" or "Q". - if (operatorsWithArguments.size() == 1) { - if (firstOperator.operator().getName().equals(OperatorName.SAVE)) { - return ContentStreams.ESCAPE_START; - } - if (firstOperator.operator().getName().equals(OperatorName.RESTORE)) { - return ContentStreams.ESCAPE_END; - } - } - - // In previous versions we did not set a marked content with an explicit name. Instead, we wrote an optional content group (OCG) with the name "Layout grid". - // This OCG is then assigned a COSName by PDFBox. Usually its "oc1". - // Thus, in order to find this name we need to look in the page resources to find the COSName assigned to the OCG. - // This COSName can then be found as an argument for the first operator in the content stream. 
- if (firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT_SEQ)) { - Optional layoutGridOCGName = ContentStreamUtility.findLayoutGridOCGName(page); - if (layoutGridOCGName.isPresent()) { - if (arumentsContainLayoutGridOCG(firstOperator, layoutGridOCGName.get())) { - return ContentStreams.KNECON_LAYOUT; - } - } - } - - if (!firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT)) { - return ContentStreams.OTHER; - } - - Optional firstCOSNameFromArguments = firstOperator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).findFirst(); - - if (firstCOSNameFromArguments.isEmpty()) { - return ContentStreams.OTHER; - } - - var cosName = firstCOSNameFromArguments.get(); - - return ContentStreams.allContentStreams.stream().filter(identifier -> identifier.cosName().equals(cosName)).findAny().orElse(ContentStreams.OTHER); - } - - - private static boolean arumentsContainLayoutGridOCG(OperatorWithArguments operator, COSName layoutGridOCGName) { - - return operator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).anyMatch(cosName -> cosName.equals(layoutGridOCGName)); - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamUtility.java deleted file mode 100644 index c672cb1..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamUtility.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.service; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Optional; -import java.util.Set; - -import org.apache.pdfbox.contentstream.PDContentStream; -import 
org.apache.pdfbox.contentstream.operator.Operator; -import org.apache.pdfbox.cos.COSBase; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.cos.COSString; -import org.apache.pdfbox.pdfparser.PDFStreamParser; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDStream; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; -import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments; -import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; -import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class ContentStreamUtility { - - public static List parseLeadingOperators(PDContentStream contentStream, - int numberOfOperatorsToRead) throws IOException { - - List arguments = new ArrayList<>(); - PDFStreamParser parser = new PDFStreamParser(contentStream); - List operatorsWithArguments = new LinkedList<>(); - for (int i = 0; i < numberOfOperatorsToRead; ) { - Object token = parser.parseNextToken(); - if (token == null) { - break; - } - if (token instanceof Operator operator) { - operatorsWithArguments.add(new OperatorWithArguments(operator, arguments)); - arguments = new ArrayList<>(); - i++; - } else { - arguments.add((COSBase) token); - } - - } - return operatorsWithArguments; - } - - - public static Optional findLayoutGridOCGName(PDPage page) { - - var resourceIterator = page.getResources().getPropertiesNames(); - for (COSName cosName : resourceIterator) { - COSBase cosBase = page.getResources().getProperties(cosName).getCOSObject().getDictionaryObject(COSName.NAME); - if (cosBase instanceof COSString string) { - if (ContentStreams.KNECON_LAYOUT.name().equals(string.getString())) { - return Optional.of(cosName); - } - } - } - return Optional.empty(); - } - - - public static List removeLayerFromContentStreams(Set layers, List classifiers) { - - return classifiers.stream() - 
.filter(classifiedContentStream -> !layers.contains(classifiedContentStream.classification())) - .map(ClassifiedContentStream::contentStream) - .map(SinglePDContentStream::getPdStream) - .toList(); - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/IViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/IViewerDocumentService.java deleted file mode 100644 index 76bfd37..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/IViewerDocumentService.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.service; - -import java.io.File; -import java.util.List; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; -import com.knecon.fforesight.service.viewerdoc.model.Visualizations; - -import io.micrometer.observation.ObservationRegistry; - -public interface IViewerDocumentService { - - void addVisualizationsOnPage(File originFile, File destinationFile, List visualizations); - - default void enrichObservation(ObservationRegistry registry, int numberOfPages, List layers) { - - if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) { - return; - } - registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages)); - for (int i = 0; i < layers.size(); i++) { - ContentStreams.Identifier layer = layers.get(i); - - registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name())); - } - } -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PDFTronViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java similarity index 54% rename from 
layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PDFTronViewerDocumentService.java rename to layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java index dcbb69a..ee2ded6 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PDFTronViewerDocumentService.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.viewerdoc.service.pdftron; +package com.knecon.fforesight.service.viewerdoc.service; import java.io.File; import java.io.FileInputStream; @@ -11,16 +11,17 @@ import java.util.Map; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.pdfbox.cos.COSName; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig; +import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup; +import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig; +import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig; +import com.knecon.fforesight.service.viewerdoc.layers.OcrDebugLayerConfig; import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont; import com.knecon.fforesight.service.viewerdoc.model.PlacedText; import com.knecon.fforesight.service.viewerdoc.model.Visualizations; import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; -import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService; import com.pdftron.pdf.ElementBuilder; import com.pdftron.pdf.ElementReader; import com.pdftron.pdf.ElementWriter; @@ 
-39,16 +40,19 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @RequiredArgsConstructor -public class PDFTronViewerDocumentService implements IViewerDocumentService { +public class PDFTronViewerDocumentService { private final ObservationRegistry registry; + public static final List ALL_LAYERS_WITH_OPTIONAL_CONTENT = List.of(LayoutGridLayerConfig.CONFIG_INSTANCE, + OcrDebugLayerConfig.CONFIG_INSTANCE, + LayoutDebugLayerConfig.CONFIG_INSTANCE, + IdpLayerConfig.CONFIG_INSTANCE); + - @Override - @Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations") @SneakyThrows - public synchronized void addVisualizationsOnPage(File originFile, File destinationFile, List visualizations) { - + @Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations") + public synchronized void addLayerGroups(File originFile, File destinationFile, List layerGroups) { // originFile and destinationFile might be the same, so we use a temp file. // Otherwise, saving the document might corrupt the file @@ -62,63 +66,73 @@ public class PDFTronViewerDocumentService implements IViewerDocumentService { ) { enrichObservation(registry, pdfDoc.getPageCount(), - visualizations.stream() + layerGroups.stream() + .map(LayerGroup::getVisualizations) + .flatMap(Collection::stream) .map(Visualizations::getLayer) .toList()); - Map groupMap = PdftronLayerUtility.addLayersToDocument(visualizations, pdfDoc); + Map groupMap = PdftronLayerUtility.addLayersToDocument(layerGroups, pdfDoc); - Map fontMap = buildFontMap(visualizations, pdfDoc); + Map fontMap = buildFontMap(layerGroups, pdfDoc); - Set markedContentToDraw = extractMarkedContentNames(visualizations.stream() - .map(Visualizations::getLayer)); - - Set kneconMarkedContents = extractMarkedContentNames(ContentStreams.allContentStreams.stream()); + Set markedContentToDraw = mapMarkedContentNames(layerGroups); PageContentCleaner pageContentCleaner = PageContentCleaner.builder() .writer(pageWriter) .reader(reader) 
.elementBuilder(builder) - .markedContentToDraw(markedContentToDraw) - .kneconMarkedContents(kneconMarkedContents) + .markedContentToRemove(markedContentToDraw) .build(); VisualizationWriter visualizationWriter = VisualizationWriter.builder() .writer(pageWriter) .builder(builder) .groupMap(groupMap) - .visualizations(visualizations) + .layerGroups(layerGroups) .fontMap(fontMap) .build(); - int pageNumber = 0; + boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc); + + int pageNumber = 1; for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) { Page page = iterator.next(); - pageContentCleaner.cleanPage(page); + if (isCurrentVersion) { + pageContentCleaner.removeMarkedContent(page); + } visualizationWriter.drawVisualizationsOnPage(pageNumber, page); - } + ViewerDocVersioningUtility.setVersionInDocument(pdfDoc); + saveDocument(pdfDoc, destinationFile); + } finally { + assert !tmpFile.toFile().exists() || tmpFile.toFile().delete(); } } - private static Set extractMarkedContentNames(Stream visualizations) { + private static Set mapMarkedContentNames(List layerGroups) { - return visualizations.map(ContentStreams.Identifier::cosName) - .map(COSName::getName) + return layerGroups.stream() + .map(LayerGroup::getVisualizations) + .flatMap(Collection::stream) + .map(Visualizations::getLayer) + .map(LayerIdentifier::name) .collect(Collectors.toSet()); } - private static Map buildFontMap(List visualizations, PDFDoc pdfDoc) { + private static Map buildFontMap(List layerGroups, PDFDoc pdfDoc) { - return visualizations.stream() + return layerGroups.stream() + .map(LayerGroup::getVisualizations) + .flatMap(Collection::stream) .map(Visualizations::getVisualizationsOnPages) .map(Map::values) .flatMap(Collection::stream) @@ -146,4 +160,18 @@ public class PDFTronViewerDocumentService implements IViewerDocumentService { } } + + private void enrichObservation(ObservationRegistry registry, int numberOfPages, List layers) { + 
+ if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) { + return; + } + registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages)); + for (int i = 0; i < layers.size(); i++) { + LayerIdentifier layer = layers.get(i); + + registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name())); + } + } + } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleaner.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleaner.java new file mode 100644 index 0000000..991cf7d --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleaner.java @@ -0,0 +1,83 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.util.Set; + +import com.knecon.fforesight.service.viewerdoc.model.MarkedContentStack; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.Page; + +import lombok.AccessLevel; +import lombok.Builder; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; + +@Builder +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class PageContentCleaner { + + ElementWriter writer; + ElementReader reader; + ElementBuilder elementBuilder; + Set markedContentToRemove; + + @Builder.Default + MarkedContentStack markedContentStack = new MarkedContentStack(); + + + @SneakyThrows + public void removeMarkedContent(Page page) { + + begin(page); + copyElementsExceptMarkedContentToRemove(); + end(); + } + + + @SneakyThrows + private void begin(Page page) { + + writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); + reader.begin(page); + } + + + 
@SneakyThrows + private void end() { + + writer.end(); + reader.end(); + } + + + @SneakyThrows + private void copyElementsExceptMarkedContentToRemove() { + + for (Element element = reader.next(); element != null; element = reader.next()) { + switch (element.getType()) { + case Element.e_marked_content_begin -> { + markedContentStack.enterMarkedContent(element.getMCTag().getName()); + if (markedContentStack.currentMarkedContentContainsNone(markedContentToRemove)) { + writer.writeElement(element); + } + } + case Element.e_marked_content_end -> { + if (markedContentStack.currentMarkedContentContainsNone(markedContentToRemove)) { + writer.writeElement(element); + } + markedContentStack.leaveMarkedContent(); + } + default -> { + if (markedContentStack.currentMarkedContentContainsNone(markedContentToRemove)) { + writer.writeElement(element); + } + } + } + + } + + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PdftronLayerUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PdftronLayerUtility.java new file mode 100644 index 0000000..fdc6f7b --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PdftronLayerUtility.java @@ -0,0 +1,187 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.ocg.Config; +import com.pdftron.pdf.ocg.Group; +import com.pdftron.sdf.Obj; + 
+import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class PdftronLayerUtility { + + @SneakyThrows + public Map addLayersToDocument(List layerGroups, PDFDoc pdfDoc) { + + Map optionalContentGroupMap = new HashMap<>(); + + for (var layerGroup : layerGroups) { + + if (!layerGroup.subLayersAreOptionalContent() || layerGroup.isEmpty()) { + continue; + } + + if (layerGroup.isOptionalContent()) { + Group group = addLayerToDocument(pdfDoc, layerGroup.getGroupIdentifier().name(), layerGroup.isVisibleByDefault()); + optionalContentGroupMap.put(layerGroup.getGroupIdentifier(), group); + } + + if (layerGroup.subLayersAreOptionalContent()) { + for (Visualizations subLayer : layerGroup.getVisualizations()) { + Group subGroup = addLayerToDocument(pdfDoc, subLayer.getLayer().name(), layerGroup.isVisibleByDefault()); + optionalContentGroupMap.put(subLayer.getLayer(), subGroup); + } + } + } + + setOrderArrayForPresentGroups(pdfDoc, PDFTronViewerDocumentService.ALL_LAYERS_WITH_OPTIONAL_CONTENT); + + return optionalContentGroupMap; + } + + + @SneakyThrows + public void setOrderArrayForPresentGroups(PDFDoc pdfDoc, List layerGroups) { + + Config cfg = getConfig(pdfDoc); + Obj orderArray = pdfDoc.createIndirectArray(); + + Map groupMap = findAllGroupsInDocAsMap(pdfDoc); + + for (var layerGroup : layerGroups) { + + Obj childOrderArray; + + if (!layerGroup.subLayersAreOptionalContent()) { + continue; + } + + if (layerGroup.isOptionalContent() && groupMap.containsKey(layerGroup.getGroupIdentifier().name())) { + Group group = groupMap.remove(layerGroup.getGroupIdentifier().name()); + group.setInitialState(cfg, layerGroup.isVisibleByDefault()); + orderArray.pushBack(group.getSDFObj()); + childOrderArray = pdfDoc.createIndirectArray(); + orderArray.pushBack(childOrderArray); + } else { + childOrderArray = orderArray; + } + + for (Visualizations subLayer : layerGroup.getVisualizations()) { + if (groupMap.containsKey(subLayer.getLayer().name())) { + 
Group group = groupMap.remove(subLayer.getLayer().name()); + group.setInitialState(cfg, subLayer.isVisibleByDefault()); + childOrderArray.pushBack(group.getSDFObj()); + } + } + } + + if (!groupMap.isEmpty()) { + for (Group group : groupMap.values()) { + orderArray.pushBack(group.getSDFObj()); + } + } + + cfg.setOrder(orderArray); + cfg.getSDFObj().putText("ListMode", "VisiblePages"); + } + + + private static Map findAllGroupsInDocAsMap(PDFDoc pdfDoc) throws PDFNetException { + + Map groupMap = new LinkedHashMap<>(); + + List presentGroups = findAllGroupsInDoc(pdfDoc); + + for (Group group : presentGroups) { + groupMap.put(group.getName(), group); + } + return groupMap; + } + + + private static Config getConfig(PDFDoc pdfDoc) throws PDFNetException { + + Config cfg = pdfDoc.getOCGConfig(); + if (cfg == null) { + cfg = Config.create(pdfDoc, true); + } + cfg.setName("knecon debug layer order"); + cfg.setCreator("Knecon Technology GmbH"); + return cfg; + } + + + @SneakyThrows + private Group addLayerToDocument(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) { + + Optional existingGroup = findGroupInDoc(doc, layerName); + + if (existingGroup.isPresent()) { + return existingGroup.get(); + } + + return addNewLayer(doc, layerName, layerVisibilityDefaultValue, false); + } + + + private Group addNewLayer(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue, boolean containsAll) throws PDFNetException { + + Config cfg = getConfig(doc); + Group grp = Group.create(doc, layerName); + grp.setInitialState(cfg, layerVisibilityDefaultValue); + + return grp; + } + + + @SneakyThrows + private Optional findGroupInDoc(PDFDoc doc, String layerName) { + + Obj ocgs = doc.getOCGs(); + if (ocgs != null) { + int i; + int sz = (int) ocgs.size(); + for (i = 0; i < sz; ++i) { + Group ocg = new Group(ocgs.getAt(i)); + if (ocg.getName().equals(layerName)) { + return Optional.of(ocg); + } + } + } + return Optional.empty(); + } + + + @SneakyThrows + private List 
findAllGroupsInDoc(PDFDoc doc) { + + Obj ocgs = doc.getOCGs(); + + if (ocgs == null) { + return Collections.emptyList(); + } + + List groups = new ArrayList<>(Math.toIntExact(ocgs.size())); + int i; + int sz = (int) ocgs.size(); + for (i = 0; i < sz; ++i) { + Group ocg = new Group(ocgs.getAt(i)); + groups.add(ocg); + } + return groups; + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtility.java new file mode 100644 index 0000000..9fa3e83 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtility.java @@ -0,0 +1,70 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.io.File; +import java.util.Objects; +import java.util.Optional; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; + +import com.pdftron.pdf.PDFDoc; + +import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class ViewerDocVersioningUtility { + + public static final int currentVersion = 0; + public static final String AUTHOR = "knecon technology GmbH"; + public static final String CUSTOM_DICT = "KNECON_VERSION"; + + + @SneakyThrows + public void setVersionInDocument(PDFDoc pdfDoc) { + + pdfDoc.getDocInfo().setAuthor(AUTHOR); + pdfDoc.getDocInfo().setKeywords(CUSTOM_DICT + ":" + currentVersion); + } + + + private static Optional readVersionFromKeywords(String keywords) { + + String[] strings = keywords.split(":"); + if (strings.length != 2) { + return Optional.empty(); + } + if (!strings[0].equals(CUSTOM_DICT)) { + return Optional.empty(); + } + try { + return Optional.of(Integer.parseInt(strings[1])); + } catch (NumberFormatException e) { + return Optional.empty(); + } + } + + + 
@SneakyThrows + public boolean isCurrentVersion(File file) { + + try (PDDocument doc = Loader.loadPDF(file)) { + return isCurrentVersion(doc.getDocumentInformation().getAuthor(), doc.getDocumentInformation().getKeywords()); + } + } + + + @SneakyThrows + public boolean docIsCurrentVersion(PDFDoc pdfDoc) { + + return isCurrentVersion(pdfDoc.getDocInfo().getAuthor(), pdfDoc.getDocInfo().getKeywords()); + } + + + private static boolean isCurrentVersion(String author, String keywords) { + + return Objects.equals(author, AUTHOR) // + && readVersionFromKeywords(keywords).map(version -> version == currentVersion).orElse(false); + } + +} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java deleted file mode 100644 index 9fd713c..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java +++ /dev/null @@ -1,324 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.service; - -import java.awt.geom.AffineTransform; -import java.awt.geom.Rectangle2D; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdfwriter.compress.CompressParameters; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentCatalog; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.PDResources; 
-import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties; -import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState; -import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; -import org.apache.pdfbox.util.Matrix; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; -import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; -import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; -import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle; -import com.knecon.fforesight.service.viewerdoc.model.PlacedText; -import com.knecon.fforesight.service.viewerdoc.model.Visualizations; -import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; -import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; - -import io.micrometer.observation.Observation; -import io.micrometer.observation.ObservationRegistry; -import io.micrometer.observation.annotation.Observed; -import lombok.RequiredArgsConstructor; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@RequiredArgsConstructor -public class ViewerDocumentService implements IViewerDocumentService { - - private final ObservationRegistry registry; - - - @Observed(name = "ViewerDocumentService", contextualName = "add-visualizations") - @SneakyThrows - public void addVisualizationsOnPage(File originFile, File destinationFile, List visualizations) { - - // originFile and destinationFile might be the same, so we use a temp file. 
- // Otherwise, saving the document might corrupt the file - Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf"); - Files.copy(originFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING); - - PDDocument pdDocument = openPDDocument(tmpFile.toFile()); - - enrichObservation(registry, - pdDocument.getNumberOfPages(), - visualizations.stream() - .map(Visualizations::getLayer) - .toList()); - - Set allLayers = visualizations.stream() - .map(Visualizations::getLayer) - .collect(Collectors.toUnmodifiableSet()); - - Map optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument); - - for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { - PDPage pdPage = pdDocument.getPage(pageNumber); - createPageResourcesIfNotPresent(pdPage); // needed for optionalContentGroups - - List classifiers = ContentStreamClassifier.getClassifiedContentStreams(pdPage); - - pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers)); - - AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage); - - if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) { - // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects, - // e.g. not escaped matrix transformations. - wrapContentStreams(pdDocument, pdPage); - } - - for (Visualizations visualization : visualizations) { - if (!visualization.getVisualizationsOnPages().containsKey(pageNumber)) { - continue; - } - // We need to append to the content stream, otherwise the content could be overlapped by following content. 
- try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) { - - contentStream.beginMarkedContent(visualization.getLayer().cosName()); - - if (optionalContentGroupMap.containsKey(visualization.getLayer())) { - contentStream.beginMarkedContent(COSName.OC, optionalContentGroupMap.get(visualization.getLayer())); - } - - contentStream.saveGraphicsState(); - - drawVisualizationsToContentStream(pdDocument, - visualization.getVisualizationsOnPages() - .get(pageNumber), - contentStream, - textDeRotationMatrix); - - contentStream.restoreGraphicsState(); - - if (optionalContentGroupMap.containsKey(visualization.getLayer())) { - contentStream.endMarkedContent(); - } - - contentStream.endMarkedContent(); - } - - } - if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM - log.info("Incremental save after {}/{} pages", pageNumber, pdDocument.getNumberOfPages()); - observedIncrementalSave(pdDocument, destinationFile); - pdDocument.close(); - Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING); - pdDocument = openPDDocument(tmpFile.toFile()); - } - } - observedIncrementalSave(pdDocument, destinationFile); - - pdDocument.close(); - assert tmpFile.toFile().delete(); - } - - - private static Map addLayersToDocument(List visualizations, PDDocument pdDocument) { - - Map optionalContentGroupMap = new HashMap<>(); - for (Visualizations visualization : visualizations) { - addLayerToDocument(visualization.getLayer(), pdDocument, visualization.isLayerVisibilityDefaultValue())// - .ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg)); - } - return optionalContentGroupMap; - } - - - private static void drawVisualizationsToContentStream(PDDocument pdDocument, - VisualizationsOnPage visualizationsOnPage, - PDPageContentStream contentStream, - AffineTransform textDeRotationMatrix) throws IOException { - - if 
(visualizationsOnPage.isMakePathsInvisible()) { - contentStream.addRect(0, 0, 1, 1); - contentStream.clip(); - } - - for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) { - contentStream.setLineWidth(coloredLine.lineWidth()); - contentStream.setStrokingColor(coloredLine.color()); - contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1()); - contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2()); - contentStream.stroke(); - } - - for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) { - contentStream.setLineWidth(coloredRectangle.lineWidth()); - contentStream.setStrokingColor(coloredRectangle.color()); - Rectangle2D r = coloredRectangle.rectangle2D(); - contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); - contentStream.stroke(); - } - - for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) { - contentStream.setNonStrokingColor(filledRectangle.color()); - PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState(); - graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha()); - contentStream.setGraphicsStateParameters(graphicsState); - Rectangle2D r = filledRectangle.rectangle2D(); - contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); - contentStream.fill(); - } - - for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) { - PDFont font = placedText.font().embed(pdDocument); - contentStream.setFont(font, placedText.fontSize()); - contentStream.beginText(); - contentStream.setNonStrokingColor(placedText.color()); - if (placedText.renderingMode() - .isPresent()) { - contentStream.setRenderingMode(placedText.renderingMode() - .get()); - } else { - contentStream.setRenderingMode(RenderingMode.FILL); - } - Matrix textMatrix = getTextMatrix(placedText, textDeRotationMatrix); - 
contentStream.setTextMatrix(textMatrix); - contentStream.showText(placedText.text()); - contentStream.endText(); - } - } - - - private void enrichObservation(int numberOfPages, List layers) { - - if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) { - return; - } - registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages)); - for (int i = 0; i < layers.size(); i++) { - ContentStreams.Identifier layer = layers.get(i); - - registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name())); - } - } - - - private static void wrapContentStreams(PDDocument pdDocument, PDPage pdPage) throws IOException { - - try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) { - contentStream.saveGraphicsState(); - } - try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) { - contentStream.restoreGraphicsState(); - } - } - - - private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) { - - Matrix textMatrix; - if (placedText.textMatrix().isEmpty()) { - textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(), - (float) textDeRotationMatrix.getShearX(), - (float) textDeRotationMatrix.getShearY(), - (float) textDeRotationMatrix.getScaleY(), - (float) placedText.lineStart().getX(), - (float) placedText.lineStart().getY()); - } else { - textMatrix = placedText.textMatrix() - .get(); - } - return textMatrix; - } - - - private static Optional addLayerToDocument(ContentStreams.Identifier layer, PDDocument pdDocument, boolean layerVisibilityDefaultValue) { - - if (layer.optionalContent()) { - return Optional.of(addLayerToDocument(pdDocument, layer.name(), layerVisibilityDefaultValue)); - - } - return Optional.empty(); - - } - - - private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, 
String layerName, boolean layerVisibilityDefaultValue) { - - PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); - PDOptionalContentProperties ocprops = catalog.getOCProperties(); - if (ocprops == null) { - ocprops = new PDOptionalContentProperties(); - catalog.setOCProperties(ocprops); - } - PDOptionalContentGroup layer = null; - if (ocprops.hasGroup(layerName)) { - layer = ocprops.getGroup(layerName); - } else { - layer = new PDOptionalContentGroup(layerName); - ocprops.addGroup(layer); - } - ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue); - return layer; - } - - - private static PDDocument openPDDocument(File tmpFile) throws IOException { - - PDDocument pdDocument; - pdDocument = Loader.loadPDF(tmpFile); - pdDocument.setAllSecurityToBeRemoved(true); - return pdDocument; - } - - - @SneakyThrows - private void observedIncrementalSave(PDDocument pdDocument, File outputFile) { - - Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> { - try (var out = new FileOutputStream(outputFile)) { - pdDocument.save(out, CompressParameters.NO_COMPRESSION); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - - - private static void createPageResourcesIfNotPresent(PDPage pdPage) { - - PDResources resources = pdPage.getResources(); - if (resources == null) { - resources = new PDResources(); - pdPage.setResources(resources); - } - } - - - private static AffineTransform getTextDeRotationTransform(PDPage page) { - - return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) { - case 90 -> 3; - case 180 -> 2; - case 270 -> 1; - default -> 0; - }); - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/VisualizationWriter.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/VisualizationWriter.java similarity index 59% rename 
from layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/VisualizationWriter.java rename to layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/VisualizationWriter.java index cf68d61..d949b09 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/VisualizationWriter.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/VisualizationWriter.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.viewerdoc.service.pdftron; +package com.knecon.fforesight.service.viewerdoc.service; import java.awt.geom.AffineTransform; import java.awt.geom.Line2D; @@ -6,7 +6,10 @@ import java.awt.geom.Rectangle2D; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont; @@ -20,6 +23,7 @@ import com.pdftron.pdf.ColorPt; import com.pdftron.pdf.ColorSpace; import com.pdftron.pdf.Element; import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementReader; import com.pdftron.pdf.ElementWriter; import com.pdftron.pdf.Font; import com.pdftron.pdf.GState; @@ -37,8 +41,9 @@ public class VisualizationWriter { ElementWriter writer; ElementBuilder builder; - List visualizations; - Map groupMap; + ElementReader reader; + List layerGroups; + Map groupMap; Map fontMap; @@ -48,27 +53,68 @@ public class VisualizationWriter { begin(page); AffineTransform textDeRotationMatrix = 
getTextDeRotationTransform(page); + AffineTransform pageTransformation = getTextDeRotationTransform(page); - for (Visualizations visualization : visualizations) { + for (LayerGroup layerGroup : layerGroups) { - VisualizationsOnPage visualizationsOnPage = visualization.getVisualizationsOnPages() - .get(pageNumber); + Element markedContentStart = builder.createMarkedContentBeginInlineProperties(layerGroup.getGroupIdentifier().markedContentName()); + writer.writeElement(markedContentStart); + + if (layerGroup.isOptionalContent()) { + Element ocgStart = builder.createMarkedContentBegin("OC", groupMap.get(layerGroup.getGroupIdentifier()).getSDFObj()); + writer.writeElement(ocgStart); + } + + Element escape = builder.createGroupBegin(); + writer.writeElement(escape); + + writeVisualizations(pageNumber, layerGroup, textDeRotationMatrix); + + Element escapeEnd = builder.createGroupEnd(); + writer.writeElement(escapeEnd); + + if (layerGroup.isOptionalContent()) { + Element ocgEnd2 = builder.createMarkedContentEnd(); + writer.writeElement(ocgEnd2); + } + + Element markedContentEnd = builder.createMarkedContentEnd(); + writer.writeElement(markedContentEnd); + } + + end(); + + } + + + private void writeVisualizations(int pageNumber, LayerGroup layerGroup, AffineTransform textDeRotationMatrix) throws PDFNetException { + + for (Visualizations visualization : layerGroup.getVisualizations()) { + + VisualizationsOnPage visualizationsOnPage = visualization.getVisualizationsOnPages().get(pageNumber); if (visualizationsOnPage == null || visualizationsOnPage.isEmpty()) { continue; } - Element markedContentStart = builder.createMarkedContentBeginInlineProperties(visualization.getLayer().cosName().getName()); + Element markedContentStart = builder.createMarkedContentBeginInlineProperties(visualization.getLayer().markedContentName()); writer.writeElement(markedContentStart); - if (visualization.getLayer().optionalContent()) { + if (layerGroup.subLayersAreOptionalContent()) { Element 
ocgStart = builder.createMarkedContentBegin("OC", groupMap.get(visualization.getLayer()).getSDFObj()); writer.writeElement(ocgStart); } + Element escape = builder.createGroupBegin(); + writer.writeElement(escape); + + writeVisualization(visualizationsOnPage, textDeRotationMatrix); - if (visualization.getLayer().optionalContent()) { + Element escapeEnd = builder.createGroupEnd(); + writer.writeElement(escapeEnd); + + if (layerGroup.subLayersAreOptionalContent()) { Element ocgEnd = builder.createMarkedContentEnd(); writer.writeElement(ocgEnd); } @@ -77,9 +123,6 @@ public class VisualizationWriter { writer.writeElement(markedContentEnd); } - - end(); - } @@ -123,6 +166,7 @@ public class VisualizationWriter { writePlacedText(textDeRotationMatrix, placedText); } + } @@ -130,26 +174,46 @@ public class VisualizationWriter { float[] rgbComponents = placedText.color().getRGBColorComponents(null); Font font = fontMap.get(placedText.font()); - - Element text = builder.createTextRun(placedText.text(), font, placedText.fontSize()); - - if (placedText.renderingMode() - .isPresent()) { - text.getGState() - .setRenderingIntent(placedText.renderingMode() - .get().intValue()); - } else { - try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) { - text.getGState().setFillColor(color); - } - text.getGState().setRenderingIntent(GState.e_fill_text); + Element text = builder.createTextBegin(font, placedText.fontSize()); + text.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) { + text.getGState().setFillColor(color); } try (Matrix2D textMatrix = getTextMatrix(placedText, textDeRotationMatrix)) { text.setTextMatrix(textMatrix); } + text.getGState() + .setTextRenderMode(placedText.renderingMode() + .map(VisualizationWriter::resolveTextRenderMode).orElse(GState.e_fill_text)); writer.writeElement(text); + + text = switch (font.getType()) { + case 
Font.e_Type0, Font.e_CIDType0, Font.e_TrueType, Font.e_CIDType2 -> builder.createUnicodeTextRun(placedText.text()); + case Font.e_Type1 -> builder.createTextRun(placedText.text()); + default -> throw new IllegalStateException("Unexpected value: " + font.getType()); + }; + + writer.writeElement(text); + text = builder.createTextEnd(); + writer.writeElement(text); + + } + + + private static int resolveTextRenderMode(RenderingMode renderingMode) { + + return switch (renderingMode) { + case FILL -> GState.e_fill_text; + case STROKE -> GState.e_stroke_text; + case FILL_STROKE -> GState.e_fill_stroke_text; + case NEITHER -> GState.e_invisible_text; + case FILL_CLIP -> GState.e_fill_clip_text; + case STROKE_CLIP -> GState.e_stroke_clip_text; + case FILL_STROKE_CLIP -> GState.e_fill_stroke_clip_text; + case NEITHER_CLIP -> GState.e_clip_text; + }; } @@ -220,21 +284,32 @@ public class VisualizationWriter { Matrix2D textMatrix; if (placedText.textMatrix().isEmpty()) { - textMatrix = new Matrix2D(textDeRotationMatrix.getScaleX(), - textDeRotationMatrix.getShearX(), - textDeRotationMatrix.getShearY(), - textDeRotationMatrix.getScaleY(), - placedText.lineStart().getX(), - placedText.lineStart().getY()); + textMatrix = toMatrix2D(textDeRotationMatrix, placedText.lineStart().getX(), placedText.lineStart().getY()); } else { - var matrix = placedText.textMatrix() - .get(); - textMatrix = new Matrix2D(matrix.getScaleX(), matrix.getShearX(), matrix.getShearY(), matrix.getScaleY(), matrix.getTranslateX(), matrix.getTranslateY()); + var matrix = placedText.textMatrix().get(); + textMatrix = toMatrix2D(matrix); } return textMatrix; } + private static Matrix2D toMatrix2D(AffineTransform matrix) throws PDFNetException { + + return new Matrix2D(matrix.getScaleX(), matrix.getShearY(), matrix.getShearX(), matrix.getScaleY(), matrix.getTranslateX(), matrix.getTranslateY()); + } + + + private static Matrix2D toMatrix2D(AffineTransform textDeRotationMatrix, double translateX, double translateY) 
throws PDFNetException { + + return new Matrix2D(textDeRotationMatrix.getScaleX(), + textDeRotationMatrix.getShearY(), + textDeRotationMatrix.getShearX(), + textDeRotationMatrix.getScaleY(), + translateX, + translateY); + } + + @SneakyThrows private static AffineTransform getTextDeRotationTransform(Page page) { diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PageContentCleaner.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PageContentCleaner.java deleted file mode 100644 index d6615f4..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PageContentCleaner.java +++ /dev/null @@ -1,120 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.service.pdftron; - -import java.util.Set; - -import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementBuilder; -import com.pdftron.pdf.ElementReader; -import com.pdftron.pdf.ElementWriter; -import com.pdftron.pdf.Page; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; - -@Builder -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class PageContentCleaner { - - ElementWriter writer; - ElementReader reader; - ElementBuilder elementBuilder; - Set markedContentToDraw; - Set kneconMarkedContents; - - @Builder.Default - MarkedContentStack markedContentStack = new MarkedContentStack(); - - - @SneakyThrows - public void cleanPage(Page page) { - - begin(page); - boolean escaped = reader.next().getType() == Element.e_group_begin; - - if (!escaped) { - writer.writeElement(elementBuilder.createGroupBegin()); - } - - copyElementsUntilFirstKneconMarkedContent(); - - if (!escaped) { - writer.writeElement(elementBuilder.createGroupEnd()); - } - - copyElementsExceptMarkedContentToDraw(); - 
end(); - } - - - @SneakyThrows - private void begin(Page page) { - - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - reader.begin(page); - } - - - @SneakyThrows - private void end() { - - writer.end(); - reader.end(); - } - - - @SneakyThrows - private void copyElementsUntilFirstKneconMarkedContent() { - - for (Element element = reader.current(); element != null; element = reader.next()) { - - switch (element.getType()) { - case Element.e_marked_content_begin -> { - markedContentStack.enterMarkedContent(element.getMCTag().getName()); - if (markedContentStack.currentMarkedContentContainsAny(kneconMarkedContents)) { - break; - } - writer.writeElement(element); - } - case Element.e_marked_content_end -> { - markedContentStack.leaveMarkedContent(); - writer.writeElement(element); - } - default -> writer.writeElement(element); - } - } - } - - - @SneakyThrows - private void copyElementsExceptMarkedContentToDraw() { - - for (Element element = reader.current(); element != null; element = reader.next()) { - - switch (element.getType()) { - case Element.e_marked_content_begin -> { - markedContentStack.enterMarkedContent(element.getMCTag().getName()); - if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) { - writer.writeElement(element); - } - } - case Element.e_marked_content_end -> { - if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) { - writer.writeElement(element); - } - markedContentStack.leaveMarkedContent(); - } - default -> { - if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) { - writer.writeElement(element); - } - } - } - - } - - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PdftronLayerUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PdftronLayerUtility.java deleted file mode 100644 
index 0359196..0000000 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/pdftron/PdftronLayerUtility.java +++ /dev/null @@ -1,96 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.service.pdftron; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; -import com.knecon.fforesight.service.viewerdoc.model.Visualizations; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.ocg.Config; -import com.pdftron.pdf.ocg.Group; -import com.pdftron.sdf.Obj; - -import lombok.SneakyThrows; -import lombok.experimental.UtilityClass; - -@UtilityClass -public class PdftronLayerUtility { - - public Map addLayersToDocument(List visualizations, PDFDoc pdfDoc) { - - Map optionalContentGroupMap = new HashMap<>(); - for (Visualizations visualization : visualizations) { - addLayerToDocument(visualization.getLayer(), pdfDoc, visualization.isLayerVisibilityDefaultValue())// - .ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg)); - } - return optionalContentGroupMap; - } - - - private Optional addLayerToDocument(ContentStreams.Identifier layer, PDFDoc pdfDoc, boolean layerVisibilityDefaultValue) { - - if (layer.optionalContent()) { - return Optional.of(addLayerToDocument(pdfDoc, layer.name(), layerVisibilityDefaultValue)); - - } - return Optional.empty(); - - } - - - @SneakyThrows - private Group addLayerToDocument(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) { - - Optional existingGroup = findGroupInDoc(doc, layerName); - - if (existingGroup.isPresent()) { - return existingGroup.get(); - } - - return addNewLayer(doc, layerName, layerVisibilityDefaultValue); - } - - - private Group addNewLayer(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) throws PDFNetException { - - Config cfg = doc.getOCGConfig(); - 
if (cfg == null) { - cfg = Config.create(doc, true); - cfg.setName("Default"); - } - Group grp = Group.create(doc, layerName); - grp.setInitialState(cfg, layerVisibilityDefaultValue); - - // Add the new OCG to the list of layers that should appear in PDF viewer GUI. - Obj layerOrderArray = cfg.getOrder(); - if (layerOrderArray == null) { - layerOrderArray = doc.createIndirectArray(); - cfg.setOrder(layerOrderArray); - } - layerOrderArray.pushBack(grp.getSDFObj()); - - return grp; - } - - - @SneakyThrows - private Optional findGroupInDoc(PDFDoc doc, String layerName) { - - Obj ocgs = doc.getOCGs(); - if (ocgs != null) { - int i; - int sz = (int) ocgs.size(); - for (i = 0; i < sz; ++i) { - Group ocg = new Group(ocgs.getAt(i)); - if (ocg.getName().equals(layerName)) { - return Optional.of(ocg); - } - } - } - return Optional.empty(); - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifierTest.java b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifierTest.java deleted file mode 100644 index fd33307..0000000 --- a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ContentStreamClassifierTest.java +++ /dev/null @@ -1,124 +0,0 @@ -package com.knecon.fforesight.service.viewerdoc.service; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.io.File; -import java.nio.file.Files; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.junit.jupiter.api.Test; - -import com.knecon.fforesight.service.viewerdoc.ContentStreams; -import 
com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -public class ContentStreamClassifierTest { - - @Test - @SneakyThrows - public void testClassification() { - - File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("viewerDocLayers.pdf").getFile()); - - try (PDDocument document = Loader.loadPDF(pdfFile)) { - - PDPage page = document.getPage(0); - - List classifieds = ContentStreamClassifier.getClassifiedContentStreams(page); - - logContentStreamClassifications(classifieds); - - assertEquals(11, classifieds.size()); - assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification()); - for (int i = 1; i < 9; i++) { - assertEquals(ContentStreams.OTHER, classifieds.get(i).classification()); - } - assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification()); - assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification()); - assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds)); - } - } - - - @Test - @SneakyThrows - public void testRemoveLayoutLayer() { - - File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("viewerDocLayers.pdf").getFile()); - File tmpFile = Files.createTempFile("removedLayout", ".pdf").toFile(); - - try (PDDocument document = Loader.loadPDF(pdfFile)) { - - PDPage page = document.getPage(0); - - List classifieds = ContentStreamClassifier.getClassifiedContentStreams(page); - page.setContents(ContentStreamUtility.removeLayerFromContentStreams(Set.of(ContentStreams.KNECON_LAYOUT), classifieds)); - - document.save(tmpFile); - } - try (PDDocument document2 = Loader.loadPDF(tmpFile)) { - - PDPage page2 = document2.getPage(0); - - List classifieds2 = ContentStreamClassifier.getClassifiedContentStreams(page2); - - logContentStreamClassifications(classifieds2); - - assertEquals(10, classifieds2.size()); - 
assertEquals(ContentStreams.ESCAPE_START, classifieds2.get(0).classification()); - for (int i = 1; i < 9; i++) { - assertEquals(ContentStreams.OTHER, classifieds2.get(i).classification()); - } - assertEquals(ContentStreams.ESCAPE_END, classifieds2.get(9).classification()); - assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds2)); - } - assert tmpFile.delete(); - } - - - @Test - @SneakyThrows - public void testClassificationForOldLayers() { - - File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("oldViewerDocLayers.pdf").getFile()); - try (PDDocument document = Loader.loadPDF(pdfFile)) { - - PDPage page = document.getPage(0); - - List classifieds = ContentStreamClassifier.getClassifiedContentStreams(page); - - logContentStreamClassifications(classifieds); - - assertEquals(11, classifieds.size()); - assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification()); - for (int i = 1; i < 9; i++) { - assertEquals(ContentStreams.OTHER, classifieds.get(i).classification()); - } - assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification()); - assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification()); - assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds)); - } - } - - - private static void logContentStreamClassifications(List classifieds) { - - log.info("number of content streams: {}", classifieds.size()); - log.info("Classifications: {}", classifieds.stream()// - .map(ClassifiedContentStream::classification)// - .map(ContentStreams.Identifier::cosName)// - .map(COSName::getName)// - .collect(Collectors.joining(", "))); - } - -} diff --git a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java new file mode 100644 index 
0000000..c1a92aa --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java @@ -0,0 +1,76 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.nio.file.Path; +import java.util.Set; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.viewerdoc.LayerIdentifier; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.PDFNet; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; + +@Disabled +class PageContentCleanerTest { + + @BeforeEach + public void init() { + + PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a"); + } + + + @AfterAll + public static void cleanup() { + + PDFNet.terminate(); + } + + + @Test + @SneakyThrows + public void testContentCleaning() { + + Path file = Path.of("/tmp/OCR_TEST/402Study.pdf/viewerDocument.pdf"); + File tmpFile = new File("/tmp/cleaned.pdf"); + try (var in = new FileInputStream(file.toFile());// + var doc = new PDFDoc(in);// + var out = new FileOutputStream(tmpFile);// + ElementWriter pageWriter = new ElementWriter();// + ElementReader reader = new ElementReader();// + ElementBuilder builder = new ElementBuilder()// + ) { + + PageContentCleaner pageContentCleaner = PageContentCleaner.builder() + .writer(pageWriter) + .reader(reader) + .elementBuilder(builder) + .markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName())) + .build(); + + for (PageIterator iterator = doc.getPageIterator(); iterator.hasNext(); ) { + + Page page 
= iterator.next(); + + pageContentCleaner.removeMarkedContent(page); + } + + doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null); + } + + } + +} \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtilityTest.java b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtilityTest.java new file mode 100644 index 0000000..678a873 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtilityTest.java @@ -0,0 +1,58 @@ +package com.knecon.fforesight.service.viewerdoc.service; + +import java.awt.geom.AffineTransform; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +import javax.swing.table.AbstractTableModel; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; + +import com.pdftron.common.Matrix2D; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.PDFNet; +import com.pdftron.pdf.Page; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; + +class ViewerDocVersioningUtilityTest { + + @BeforeEach + public void init() { + + PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a"); + } + + + @AfterAll + public static void cleanup() { + + PDFNet.terminate(); + } + + + @Test + @SneakyThrows + public void testMarking() { + + File file = new ClassPathResource("files/empty.pdf").getFile(); + Path tmpFile = Files.createTempFile("markedDocument", ".pdf"); + try (var in = new FileInputStream(file); var doc = new PDFDoc(in); var 
out = new FileOutputStream(tmpFile.toFile())) { + ViewerDocVersioningUtility.setVersionInDocument(doc); + doc.save(out, SDFDoc.SaveMode.LINEARIZED, null); + } + assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile()); + assert tmpFile.toFile().delete(); + } + +} \ No newline at end of file diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/files/empty.pdf b/layoutparser-service/viewer-doc-processor/src/test/resources/files/empty.pdf new file mode 100644 index 0000000..babc484 --- /dev/null +++ b/layoutparser-service/viewer-doc-processor/src/test/resources/files/empty.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d2b7421795d7b5eab33cef0e7cd63265f01bdd92740766fc9715fbc5c27ef5 +size 3087 diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf b/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf deleted file mode 100644 index 9b3f010..0000000 Binary files a/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf and /dev/null differ diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf b/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf deleted file mode 100644 index 8848184..0000000 Binary files a/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf and /dev/null differ